In [1]:
# Basic packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import boto3

import os

In [2]:
!pip install sagemaker==1.72.0

# Sagemaker
import sagemaker

Collecting sagemaker==1.72.0
  Downloading sagemaker-1.72.0.tar.gz (297 kB)
[K     |████████████████████████████████| 297 kB 6.1 MB/s eta 0:00:01
Collecting smdebug-rulesconfig==0.1.4
  Downloading smdebug_rulesconfig-0.1.4-py2.py3-none-any.whl (10 kB)
Building wheels for collected packages: sagemaker
  Building wheel for sagemaker (setup.py) ... [?25ldone
[?25h  Created wheel for sagemaker: filename=sagemaker-1.72.0-py2.py3-none-any.whl size=386358 sha256=fc15066e97086c855a47d52a80b66876ffd0a579ad4955325c3b2da36ec7edc9
  Stored in directory: /home/ec2-user/.cache/pip/wheels/c3/58/70/85faf4437568bfaa4c419937569ba1fe54d44c5db42406bbd7
Successfully built sagemaker
Installing collected packages: smdebug-rulesconfig, sagemaker
  Attempting uninstall: smdebug-rulesconfig
    Found existing installation: smdebug-rulesconfig 1.0.1
    Uninstalling smdebug-rulesconfig-1.0.1:
      Successfully uninstalled smdebug-rulesconfig-1.0.1
  Attempting uninstall: sagemaker
    Found existing install

In [15]:
X_train = pd.read_csv('../output/xtrain.csv', header = None)
X_test =  pd.read_csv('../output/xtest.csv', header = None)

y_train = pd.read_csv('../output/ytrain.csv', header = None)
y_test =  pd.read_csv('../output/ytest.csv', header = None)

In [16]:
X_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,500
0,183,44,1,23,763,44,1,454,1,23,...,0,0,0,0,0,0,0,0,0,0
1,131,166,166,13,166,166,22,433,414,5,...,0,0,0,0,0,0,0,0,0,0
2,101,21,42,33,16,12,120,1,1,401,...,0,0,0,0,0,0,0,0,0,0
3,161,1,600,1,663,1,142,320,172,28,...,0,0,0,0,0,0,0,0,0,0
4,64,11,238,44,330,633,1,978,61,1,...,0,0,0,0,0,0,0,0,0,0


## 2. Upload the data to S3

In [17]:
assert len(y_train) == len(X_train)
#len(X_train)

In [18]:
data_dir = '../output/'

In [19]:
aws_data = pd.concat([y_train, X_train], axis=1)

aws_data.to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)

In [20]:
aws_data.head()

Unnamed: 0,0,0.1,1,2,3,4,5,6,7,8,...,491,492,493,494,495,496,497,498,499,500
0,0,52,7,93,819,1,269,201,1,180,...,0,0,0,0,0,0,0,0,0,0
1,0,69,1,122,1,2,1,758,186,391,...,0,0,0,0,0,0,0,0,0,0
2,0,60,994,894,637,849,1,69,51,40,...,0,0,0,0,0,0,0,0,0,0
3,0,134,7,11,51,99,1,48,26,4,...,0,0,0,0,0,0,0,0,0,0
4,1,88,1,1,316,27,4,178,308,688,...,0,0,0,0,0,0,0,0,0,0


### 2.2. Uploading training data

In [21]:
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
role = sagemaker.get_execution_role()

In [43]:
data_dir = "../output"
prefix = 'positiveness-lyrics'

# upload all data to S3
data = sagemaker_session.upload_data(path=data_dir, bucket=bucket, key_prefix=prefix)
print(data)

s3://sagemaker-us-east-2-890904620905/positiveness-lyrics


In [44]:
# confirm that data is in S3 bucket
empty_check = []
for obj in boto3.resource('s3').Bucket(bucket).objects.all():
    empty_check.append(obj.key)
    print(obj.key)

assert len(empty_check) !=0, 'S3 bucket is empty.'
print('Test passed!')

positiveness-lyrics/sagemaker-scikit-learn-2021-05-02-17-05-43-232/debug-output/training_job_end.ts
positiveness-lyrics/sagemaker-scikit-learn-2021-05-02-17-05-43-232/output/model.tar.gz
positiveness-lyrics/train.csv
positiveness-lyrics/word_dict.pkl
positiveness-lyrics/xtest.csv
positiveness-lyrics/xtrain.csv
positiveness-lyrics/ytest.csv
positiveness-lyrics/ytrain.csv
sagemaker-pytorch-2021-05-02-17-27-11-658/source/sourcedir.tar.gz
sagemaker-pytorch-2021-05-02-17-27-41-946/source/sourcedir.tar.gz
sagemaker-pytorch-2021-05-02-17-33-06-896/source/sourcedir.tar.gz
sagemaker-pytorch-2021-05-02-17-40-39-002/source/sourcedir.tar.gz
sagemaker-pytorch-2021-05-02-17-45-18-317/source/sourcedir.tar.gz
sagemaker-scikit-learn-2021-05-02-17-05-43-232/source/sourcedir.tar.gz
Test passed!


### Train Base Model - naiveBayes

In [24]:
# your import and estimator code, here
from sagemaker.sklearn.estimator import SKLearn

output_path = 's3://{}/{}'.format(bucket, prefix)

# instantiate our custom SKLearn estimator
estimator = SKLearn(entry_point='train.py',
                    source_dir='../utilis/base_model',
                    role=role,
                    train_instance_count=1,
                    train_instance_type='ml.c4.xlarge',
                    output_path=output_path,
                    py_version='py3',
                    framework_version='0.23-1',
                    sagemaker_session=sagemaker_session
                   
                   )

In [25]:
%%time

# Train your estimator on S3 training data
estimator.fit({'train': data})

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


2021-05-02 17:05:43 Starting - Starting the training job...
2021-05-02 17:05:45 Starting - Launching requested ML instances......
2021-05-02 17:06:51 Starting - Preparing the instances for training......
2021-05-02 17:08:10 Downloading - Downloading input data
2021-05-02 17:08:10 Training - Downloading the training image...
2021-05-02 17:08:43 Training - Training image download completed. Training in progress..[34m2021-05-02 17:08:43,817 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2021-05-02 17:08:43,820 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-05-02 17:08:43,830 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2021-05-02 17:08:44,176 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-05-02 17:08:44,401 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021

### Deploy and evaluate base model

In [26]:
predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


-----------------!

In [27]:
from sklearn.metrics import accuracy_score

# generate
y_preds  = predictor.predict(X_test)
print(y_preds)
accuracy = accuracy_score(y_preds, y_test)

print(f"The estimator had an accuracy of {accuracy:.2%} in the test set")

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
The estimator had an accuracy of 51.50% in the test set


## Main model - LSTM Network

In [28]:
import torch
import torch.utils.data

train_sample = pd.read_csv(os.path.join(data_dir, 'train.csv'), header=None, names=None)

# Turn the input pandas dataframe into tensors
train_sample_y = torch.from_numpy(train_sample[[0]].values).float().squeeze()
train_sample_X = torch.from_numpy(train_sample.drop([0], axis=1).values).long()

# Build the dataset
train_sample_ds = torch.utils.data.TensorDataset(train_sample_X, train_sample_y)
# Build the dataloader
train_sample_dl = torch.utils.data.DataLoader(train_sample_ds, batch_size=50)

In [29]:
train_sample

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,492,493,494,495,496,497,498,499,500,501
0,0,52,7,93,819,1,269,201,1,180,...,0,0,0,0,0,0,0,0,0,0
1,0,69,1,122,1,2,1,758,186,391,...,0,0,0,0,0,0,0,0,0,0
2,0,60,994,894,637,849,1,69,51,40,...,0,0,0,0,0,0,0,0,0,0
3,0,134,7,11,51,99,1,48,26,4,...,0,0,0,0,0,0,0,0,0,0
4,1,88,1,1,316,27,4,178,308,688,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,0,144,1,1,794,885,813,1,1,57,...,0,0,0,0,0,0,0,0,0,0
796,1,254,10,53,53,53,343,12,53,53,...,0,0,0,0,0,0,0,0,0,0
797,1,56,676,326,78,1,35,207,61,814,...,0,0,0,0,0,0,0,0,0,0
798,1,271,1,1,1,181,4,217,341,1,...,0,0,0,0,0,0,0,0,0,0


In [30]:
def train(model, train_loader, epochs, optimizer, loss_fn, device):
    for epoch in range(1, epochs + 1):
        model.train()
        total_loss = 0
        for batch in train_loader:         
            batch_X, batch_y = batch
            
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)
            
            # clear gradiants
            optimizer.zero_grad()
            
            # compute the model output
            out = model(batch_X)
            
            # calculate loss
            loss = loss_fn(out, batch_y)
            
            # backward propagation
            loss.backward()
            
            # update model weights with optimizer
            optimizer.step()
            
            total_loss += loss.data.item()
        print(f"Epoch: {epoch}, BCELoss: {total_loss / len(train_loader)}")

In [31]:
import torch.optim as optim
from utilis.lstm_model.model import LSTMClassifier

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMClassifier(32, 100, 5000).to(device)
optimizer = optim.Adam(model.parameters())
loss_fn = torch.nn.BCELoss()

train(model, train_sample_dl, 5, optimizer, loss_fn, device)

Epoch: 1, BCELoss: 0.6898287311196327
Epoch: 2, BCELoss: 0.6772803962230682
Epoch: 3, BCELoss: 0.6649945974349976
Epoch: 4, BCELoss: 0.6455747671425343
Epoch: 5, BCELoss: 0.6209660843014717


## Train model

In [61]:
from sagemaker.pytorch import PyTorch

estimator = PyTorch(entry_point="train.py",
                    source_dir="./utilis/lstm_model",
                    role=role,
                    framework_version='0.4.0',
                    train_instance_count=1,
                    train_instance_type='ml.c4.xlarge',
                    hyperparameters={
                        'epochs': 20,
                        'hidden_dim': 200,
                    })

In [62]:
estimator.fit({'training': data})

'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


2021-05-02 18:22:39 Starting - Starting the training job...
2021-05-02 18:22:41 Starting - Launching requested ML instances......
2021-05-02 18:24:03 Starting - Preparing the instances for training......
2021-05-02 18:24:54 Downloading - Downloading input data...
2021-05-02 18:25:34 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-05-02 18:25:35,462 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-05-02 18:25:35,464 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-05-02 18:25:35,476 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-05-02 18:25:36,101 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-05-02 18:25:36,362 sagemaker-containers INFO    

In [None]:
project_predictor = estimator.deploy(initial_instance_count = 1, instance_type = 'ml.m4.xlarge')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.
'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


------------

## Test Model

In [51]:
# We split the data into chunks and send each chunk seperately, accumulating the results.

def predict(data, rows=512):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = np.array([])
    for array in split_array:
        predictions = np.append(predictions, project_predictor.predict(array))
    
    return predictions

In [52]:
predictions = predict(X_test.values)
predictions = [round(num) for num in predictions]

In [53]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)

0.48

## test individual lyric

In [None]:
test = "Imagine there's no heaven It's easy if you try No hell below us Above us only sky Imagine all the people Living for today"

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *

import re
from bs4 import BeautifulSoup

def lyrics_to_words(lyrics):
    '''
    helper function to clean out song lyrics. We apply porter Stemmer algorithm and remove stopwords
    '''
    stopwords = nltk.corpus.stopwords.words('english')
    newStopWords = ['verse','1', '2', 'chorus', 'bridge', 'talking', 'refrain', 'explain', 'request']
    stopwords.extend(newStopWords)
    stemmer = PorterStemmer()
    
    words_english = set(nltk.corpus.words.words())

    remove_non_english = " ".join(w for w in nltk.wordpunct_tokenize(lyrics) if w.lower() in words_english or not w.isalpha())
    
    text = re.sub(r"[^a-zA-Z0-9]", " ", remove_non_english.lower()) # Convert to lower case
    words = text.split() # Split string into words
    words = [w for w in words if w not in stopwords] # Remove stopwords
    words = [PorterStemmer().stem(w) for w in words] # stem
    
    return words

In [None]:
def convert_and_pad(word_dict, sentence, pad=500):
    NOWORD = 0 # We will use 0 to represent the 'no word' category
    INFREQ = 1 # and we use 1 to represent the infrequent words, i.e., words not appearing in word_dict
    
    working_sentence = [NOWORD] * pad
    
    for word_index, word in enumerate(sentence[:pad]):
        if word in word_dict:
            working_sentence[word_index] = word_dict[word]
        else:
            working_sentence[word_index] = INFREQ
            
    return working_sentence, min(len(sentence), pad)

In [None]:
process_data = lyrics_to_words(test)

data_processed, length = convert_and_pad(word_dict, process_data)

test_data = np.append(length, np.array(data_processed)).reshape(1,-1) # add the length in the beggining

## Clean up Resources


In [56]:
project_predictor.endpoint

'sagemaker-pytorch-2021-05-02-17-52-48-438'

In [58]:
boto3.client('sagemaker').delete_endpoint(EndpointName=project_predictor.endpoint)


{'ResponseMetadata': {'RequestId': 'fbb6e7c9-6a14-44e4-99f2-74145bce0eac',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'fbb6e7c9-6a14-44e4-99f2-74145bce0eac',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Sun, 02 May 2021 18:15:37 GMT'},
  'RetryAttempts': 0}}

In [None]:
bucket_to_delete = boto3.resource('s3').Bucket(bucket)
bucket_to_delete.objects.all().delete()