In [1]:
# Basic packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import boto3

import os

In [2]:
#!pip install joblib

In [3]:
#!pip install sagemaker==1.72.0

# Sagemaker
import sagemaker

In [4]:
X_train = pd.read_csv('../output/xtrain.csv', header = None)
X_test =  pd.read_csv('../output/xtest.csv', header = None)

y_train = pd.read_csv('../output/ytrain.csv', header = None)
y_test =  pd.read_csv('../output/ytest.csv', header = None)

In [5]:
X_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,44,1171,23,763,44,1415,454,1,23,2,...,0,0,0,0,0,0,0,0,0,0
1,166,166,13,166,166,22,433,414,5,1,...,0,0,0,0,0,0,0,0,0,0
2,21,42,33,16,12,120,1422,1578,401,195,...,0,0,0,0,0,0,0,0,0,0
3,1569,600,1,663,1562,142,320,172,28,6,...,0,0,0,0,0,0,0,0,0,0
4,11,238,44,330,633,1680,978,61,1,316,...,0,0,0,0,0,0,0,0,0,0


## 2. Upload the data to S3

In [7]:
assert len(y_train) == len(X_train)
#len(X_train)

In [8]:
data_dir = '../output/'

In [9]:
aws_data = pd.concat([y_train, X_train], axis=1)

aws_data.to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)

In [10]:
aws_data.head()

Unnamed: 0,0,0.1,1,2,3,4,5,6,7,8,...,490,491,492,493,494,495,496,497,498,499
0,0,7,93,819,1,269,201,1856,180,88,...,0,0,0,0,0,0,0,0,0,0
1,0,1368,122,1275,2,2577,758,186,391,93,...,0,0,0,0,0,0,0,0,0,0
2,0,994,894,637,849,2152,69,51,40,821,...,0,0,0,0,0,0,0,0,0,0
3,0,7,11,51,99,1,48,26,4,3,...,0,0,0,0,0,0,0,0,0,0
4,1,1512,1049,316,27,4,178,308,688,11,...,0,0,0,0,0,0,0,0,0,0


### 2.2. Uploading training data

In [11]:
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
role = sagemaker.get_execution_role()

In [12]:
data_dir = "../output"
prefix = 'positiveness-lyrics'

# upload all data to S3
data = sagemaker_session.upload_data(path=data_dir, bucket=bucket, key_prefix=prefix)
print(data)

s3://sagemaker-us-east-2-890904620905/positiveness-lyrics


In [13]:
# confirm that data is in S3 bucket
empty_check = []
for obj in boto3.resource('s3').Bucket(bucket).objects.all():
    empty_check.append(obj.key)
    print(obj.key)

assert len(empty_check) !=0, 'S3 bucket is empty.'
print('Test passed!')

positiveness-lyrics/train.csv
positiveness-lyrics/xtest.csv
positiveness-lyrics/xtrain.csv
positiveness-lyrics/ytest.csv
positiveness-lyrics/ytrain.csv
Test passed!


### Train Base Model - naiveBayes

In [30]:
# your import and estimator code, here
from sagemaker.sklearn.estimator import SKLearn

output_path = 's3://{}/{}'.format(bucket, prefix)

# instantiate our custom SKLearn estimator
estimator = SKLearn(entry_point='train.py',
                    source_dir='../utilis/base_model',
                    role=role,
                    train_instance_count=1,
                    train_instance_type='ml.c4.xlarge',
                    output_path=output_path,
                    py_version='py3',
                    framework_version='0.23-1',
                    sagemaker_session=sagemaker_session
                   
                   )

In [31]:
%%time

# Train your estimator on S3 training data
estimator.fit({'train': data})

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


2021-04-29 20:13:45 Starting - Starting the training job...
2021-04-29 20:13:47 Starting - Launching requested ML instances......
2021-04-29 20:15:00 Starting - Preparing the instances for training.........
2021-04-29 20:16:19 Downloading - Downloading input data...
2021-04-29 20:17:01 Training - Downloading the training image..[34m2021-04-29 20:17:23,636 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2021-04-29 20:17:23,639 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-04-29 20:17:23,650 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2021-04-29 20:17:24,072 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-04-29 20:17:27,099 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-04-29 20:17:27,112 sagemaker-training-toolkit INFO     No GPUs detected (normal if 

### Deploy and evaluate base model

In [32]:
predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


-------------------!

In [44]:
from sklearn.metrics import accuracy_score

# generate
y_preds  = predictor.predict(X_test)
print(y_preds)
accuracy = accuracy_score(y_preds, y_test)

print(f"The estimator had an accuracy of {accuracy:.2%} in the test set")

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
The estimator had an accuracy of 51.50% in the test set


## Main model - LSTM Network

In [16]:
import torch
import torch.utils.data

train_sample = pd.read_csv(os.path.join(data_dir, 'train.csv'), header=None, names=None, nrows=250)

# Turn the input pandas dataframe into tensors
train_sample_y = torch.from_numpy(train_sample[[0]].values).float().squeeze()
train_sample_X = torch.from_numpy(train_sample.drop([0], axis=1).values).long()

# Build the dataset
train_sample_ds = torch.utils.data.TensorDataset(train_sample_X, train_sample_y)
# Build the dataloader
train_sample_dl = torch.utils.data.DataLoader(train_sample_ds, batch_size=50)

In [22]:
train_sample

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,500
0,0,7,93,819,1,269,201,1856,180,88,...,0,0,0,0,0,0,0,0,0,0
1,0,1368,122,1275,2,2577,758,186,391,93,...,0,0,0,0,0,0,0,0,0,0
2,0,994,894,637,849,2152,69,51,40,821,...,0,0,0,0,0,0,0,0,0,0
3,0,7,11,51,99,1,48,26,4,3,...,0,0,0,0,0,0,0,0,0,0
4,1,1512,1049,316,27,4,178,308,688,11,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,0,113,1,15,113,188,35,430,2,23,...,0,0,0,0,0,0,0,0,0,0
246,1,415,673,14,151,104,1118,156,764,1257,...,0,0,0,0,0,0,0,0,0,0
247,0,67,189,30,67,316,40,779,37,16,...,0,0,0,0,0,0,0,0,0,0
248,1,1478,514,548,101,2931,155,116,64,1478,...,0,0,0,0,0,0,0,0,0,0


In [17]:
def train(model, train_loader, epochs, optimizer, loss_fn, device):
    for epoch in range(1, epochs + 1):
        model.train()
        total_loss = 0
        for batch in train_loader:         
            batch_X, batch_y = batch
            
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)
            
            # TODO: Complete this train method to train the model provided.
            # clear gradiants
            optimizer.zero_grad()
            
            # compute the model output
            out = model(batch_X)
            
            # calculate loss
            loss = loss_fn(out, batch_y)
            
            # backward propagation
            loss.backward()
            
            # update model weights with optimizer
            optimizer.step()
            
            total_loss += loss.data.item()
        print("Epoch: {}, BCELoss: {}".format(epoch, total_loss / len(train_loader)))

In [23]:
import torch.optim as optim
from utilis.lstm_model.model import LSTMClassifier

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMClassifier(32, 100, 5000).to(device)
optimizer = optim.Adam(model.parameters())
loss_fn = torch.nn.BCELoss()

train(model, train_sample_dl, 5, optimizer, loss_fn, device)

IndexError: index 1367 is out of bounds for dimension 0 with size 499

## Clean up Resources


In [None]:
predictor.endpoint

In [None]:
boto3.client('sagemaker').delete_endpoint(EndpointName=predictor.endpoint)


In [None]:
bucket_to_delete = boto3.resource('s3').Bucket(bucket)
bucket_to_delete.objects.all().delete()