In [1]:
# %mkdir ../data
# !wget -O ../data/aclImdb_v1.tar.gz http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# !tar -zxf ../data/aclImdb_v1.tar.gz -C ../data

In [7]:
import os
import glob

def read_imdb_data(data_dir='../data/aclImdb'):
    data = {}
    labels = {}
    
    for data_type in ['train', 'test']:
        data[data_type] = {}
        labels[data_type] = {}
        
        for sentiment in ['pos', 'neg']:
            data[data_type][sentiment] = []
            labels[data_type][sentiment] = []
            
            path = os.path.join(data_dir, data_type, sentiment, '*.txt')
            files = glob.glob(path)
            
            for f in files:
                with open(f) as review:
                    data[data_type][sentiment].append(review.read())
                    # Here we represent a positive review by '1' and a negative review by '0'
                    labels[data_type][sentiment].append(1 if sentiment == 'pos' else 0)
                    
            assert len(data[data_type][sentiment]) == len(labels[data_type][sentiment]), \
                    "{}/{} data size does not match labels size".format(data_type, sentiment)
                
    return data, labels

In [8]:
data, labels = read_imdb_data()
print("IMDB reviews: train = {} pos / {} neg, test = {} pos / {} neg".format(
            len(data['train']['pos']), len(data['train']['neg']),
            len(data['test']['pos']), len(data['test']['neg'])))

IMDB reviews: train = 12500 pos / 12500 neg, test = 12500 pos / 12500 neg


In [9]:
from sklearn.utils import shuffle

def prepare_imdb_data(data, labels):
        #Combine positive and negative reviews and labels
    data_train = data['train']['pos'] + data['train']['neg']
    data_test = data['test']['pos'] + data['test']['neg']
    labels_train = labels['train']['pos'] + labels['train']['neg']
    labels_test = labels['test']['pos'] + labels['test']['neg']
    
    #Shuffle reviews and corresponding labels within training and test sets
    data_train, labels_train = shuffle(data_train, labels_train)
    data_test, labels_test = shuffle(data_test, labels_test)
    
    return data_train, data_test, labels_train, labels_test

In [10]:
train_X, test_X, train_y, test_y = prepare_imdb_data(data, labels)
print("IMDb reviews (combined): train = {}, test = {}".format(len(train_X), len(test_X)))

IMDb reviews (combined): train = 25000, test = 25000


In [11]:
print(train_X[100])
print(train_y[100])

Expecting to see a "cute little film" from mainland China, I was ill-prepared. Family dynamics, community and the inevitability of change have rarely been explored so expertly on film. Every character is solid and I was completely drawn into the story. The organization is much more complex than American audiences will be accustomed to. Yet, there is no difficulty following the progression, even while reading subtitles. Jiang Wu, as the retarded brother, is a constant shining light. Leave your cynicism in your locker. It will be there when you check out.
1


In [12]:
print(len(train_X[100]))

559


In [13]:
!pip install nltk



In [14]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *

import re
from bs4 import BeautifulSoup

def review_to_words(review):
    nltk.download("stopwords", quiet=True)
    stemmer = PorterStemmer()
    
    text = BeautifulSoup(review, "html.parser").get_text() # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower()) # Convert to lower case
    words = text.split() # Split string into words
    words = [w for w in words if w not in stopwords.words("english")] # Remove stopwords
    words = [PorterStemmer().stem(w) for w in words] # stem
    
    return words

In [15]:
review_to_words(train_X[100])

['expect',
 'see',
 'cute',
 'littl',
 'film',
 'mainland',
 'china',
 'ill',
 'prepar',
 'famili',
 'dynam',
 'commun',
 'inevit',
 'chang',
 'rare',
 'explor',
 'expertli',
 'film',
 'everi',
 'charact',
 'solid',
 'complet',
 'drawn',
 'stori',
 'organ',
 'much',
 'complex',
 'american',
 'audienc',
 'accustom',
 'yet',
 'difficulti',
 'follow',
 'progress',
 'even',
 'read',
 'subtitl',
 'jiang',
 'wu',
 'retard',
 'brother',
 'constant',
 'shine',
 'light',
 'leav',
 'cynic',
 'locker',
 'check']

In [16]:
import pickle

cache_dir = os.path.join("../cache", "sentiment_analysis")  
os.makedirs(cache_dir, exist_ok=True)  

def preprocess_data(data_train, data_test, labels_train, labels_test,
                    cache_dir=cache_dir, cache_file="preprocessed_data.pkl"):
    cache_data = None
    if cache_file is not None:
        try:
            with open(os.path.join(cache_dir, cache_file), "rb") as f:
                cache_data = pickle.load(f)
            print("Read preprocessed data from cache file:", cache_file)
        except:
            pass  
 is missing, then do the heavy lifting
    if cache_data is None:
        words_train = [review_to_words(review) for review in data_train]
        words_test = [review_to_words(review) for review in data_test]
                if cache_file is not None:
            cache_data = dict(words_train=words_train, words_test=words_test,
                              labels_train=labels_train, labels_test=labels_test)
            with open(os.path.join(cache_dir, cache_file), "wb") as f:
                pickle.dump(cache_data, f)
            print("Wrote preprocessed data to cache file:", cache_file)
    else:
        words_train, words_test, labels_train, labels_test = (cache_data['words_train'],
                cache_data['words_test'], cache_data['labels_train'], cache_data['labels_test'])
    return words_train, words_test, labels_train, labels_test

In [17]:


train_X, test_X, train_y, test_y = preprocess_data(train_X, test_X, train_y, test_y)

Read preprocessed data from cache file: preprocessed_data.pkl


In [18]:
len(test_X)

25000

In [19]:
import numpy as np

def build_dict(data, vocab_size = 5000):
    word_count = {} 
    
    for item in data:
        for word in item:
            if word not in word_count:
                word_count[word] = 1
            else:
                word_count[word] +=1

    sorted_words = sorted(word_count, key=word_count.get, reverse=True)
    
    word_dict = {} 
    for idx, word in enumerate(sorted_words[:vocab_size - 2]): 
        word_dict[word] = idx + 2                             
        
    return word_dict

In [20]:
word_dict = build_dict(train_X)

In [21]:
top_five = []
for word, index in word_dict.items():
    if len(top_five) >= 5:
        break
    top_five.append(word)
top_five

['movi', 'film', 'one', 'like', 'time']

In [22]:
data_dir = '../data/pytorch'
if not os.path.exists(data_dir): 
    os.makedirs(data_dir)

In [23]:
with open(os.path.join(data_dir, 'word_dict.pkl'), "wb") as f:
    pickle.dump(word_dict, f)

In [24]:
def convert_and_pad(word_dict, sentence, pad=500):
    NOWORD = 0 
    INFREQ = 1 
    working_sentence = [NOWORD] * pad
    for word_index, word in enumerate(sentence[:pad]):
        if word in word_dict:
            working_sentence[word_index] = word_dict[word]
        else:
            working_sentence[word_index] = INFREQ
            
    return working_sentence, min(len(sentence), pad)
def convert_and_pad_data(word_dict, data, pad=500):
    result = []
    lengths = []
    
    for sentence in data:
        converted, leng = convert_and_pad(word_dict, sentence, pad)
        result.append(converted)
        lengths.append(leng)
        
    return np.array(result), np.array(lengths)

In [25]:
train_X, train_X_len = convert_and_pad_data(word_dict, train_X)
test_X, test_X_len = convert_and_pad_data(word_dict, test_X)

In [26]:
len(train_X)

25000

In [27]:
print(train_X[100])
print(train_X_len[100])

[ 591  573   46   23  512 1016 2043 1573 2681  174    2  562   21 4967
   90    3  311   11   79  132 1422   47    2  172 2251   12 2310 1573
 2681    2   59 1573 2681    2  311  184  113  685  822  728    3 1016
 2043 3204 1386 1310   30   59 1573 2681  371    3 1239  793   33   17
    1 1205  404 2956 2681    2  178    5    4    3   14  848 3185    1
 1994    1  871 2043  177   43   36  129 2750 1442    1  265    3   57
   98   28  145  881  416  523 1573 2681 4641  331  822   28 2681 2681
 1455  167  822   87  366  228 2681   37  139   93  256    2   47   28
    3  197   76 4781 2886  220   37  822  517  129 1292   42    3  368
  509   33 1534   93  169    3  704 1121    1 1633  890   13  117 2681
  459  193  881  249  416   90   28  648  144  179 3671  958  108    1
   22    2  573 3642   45   64  606   61  887   92   39 1101 2043 1311
    1  881   23  149  521 3281  571   22   28 1016 2043   22   14 1849
 3281  146  584 2714  909 1016 2043    1    1    8  196  495  152 1719
    4 

In [28]:
import pandas as pd
    
pd.concat([pd.DataFrame(train_y), pd.DataFrame(train_X_len), pd.DataFrame(train_X)], axis=1) \
        .to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)

In [29]:
import sagemaker

sagemaker_session = sagemaker.Session()

bucket = sagemaker_session.default_bucket()
prefix = 'sagemaker/sentiment_rnn'

role = sagemaker.get_execution_role()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [30]:
input_data = sagemaker_session.upload_data(path=data_dir, bucket=bucket, key_prefix=prefix)

In [31]:
!pygmentize train/model.py

[34mimport[39;49;00m [04m[36mtorch[39;49;00m[04m[36m.[39;49;00m[04m[36mnn[39;49;00m [34mas[39;49;00m [04m[36mnn[39;49;00m[37m[39;49;00m
[37m[39;49;00m
[34mclass[39;49;00m [04m[32mLSTMClassifier[39;49;00m(nn.Module):[37m[39;49;00m
[37m    [39;49;00m[33m"""[39;49;00m
[33m    This is the simple RNN model we will be using to perform Sentiment Analysis.[39;49;00m
[33m    """[39;49;00m[37m[39;49;00m
[37m[39;49;00m
    [34mdef[39;49;00m [32m__init__[39;49;00m([36mself[39;49;00m, embedding_dim, hidden_dim, vocab_size):[37m[39;49;00m
[37m        [39;49;00m[33m"""[39;49;00m
[33m        Initialize the model by settingg up the various layers.[39;49;00m
[33m        """[39;49;00m[37m[39;49;00m
        [36msuper[39;49;00m(LSTMClassifier, [36mself[39;49;00m).[32m__init__[39;49;00m()[37m[39;49;00m
[37m[39;49;00m
        [36mself[39;49;00m.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=[34m0[39;49;00m)[37m[39;49;00

In [32]:
!pip install torchvision 




In [33]:
import torch
import torch.utils.data

# Read in only the first 250 rows
train_sample = pd.read_csv(os.path.join(data_dir, 'train.csv'), header=None, names=None, nrows=250)

# Turn the input pandas dataframe into tensors
train_sample_y = torch.from_numpy(train_sample[[0]].values).float().squeeze()
train_sample_X = torch.from_numpy(train_sample.drop([0], axis=1).values).long()

# Build the dataset
train_sample_ds = torch.utils.data.TensorDataset(train_sample_X, train_sample_y)
# Build the dataloader
train_sample_dl = torch.utils.data.DataLoader(train_sample_ds, batch_size=50)

In [34]:
def train(model, train_loader, epochs, optimizer, loss_fn, device):
    for epoch in range(1, epochs + 1):
        model.train()
        total_loss = 0
        for batch in train_loader:         
            batch_X, batch_y = batch
            
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)
            
            # TODO: Complete this train method to train the model provided.
            optimizer.zero_grad()
            out = model.forward(batch_X)
            loss = loss_fn(out, batch_y)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.data.item()
        print("Epoch: {}, BCELoss: {}".format(epoch, total_loss / len(train_loader)))

In [35]:
import torch.optim as optim
from train.model import LSTMClassifier

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = LSTMClassifier(32, 100, 5000).to(device)
optimizer = optim.Adam(model.parameters())
loss_fn = torch.nn.BCELoss()

train(model, train_sample_dl, 5, optimizer, loss_fn, device)

cpu
Epoch: 1, BCELoss: 0.6934341311454773
Epoch: 2, BCELoss: 0.6841942071914673
Epoch: 3, BCELoss: 0.6762064814567565
Epoch: 4, BCELoss: 0.6671162843704224
Epoch: 5, BCELoss: 0.6555431365966797


In [None]:
from sagemaker.pytorch import PyTorch

# Define the PyTorch estimator
estimator = PyTorch(entry_point="train.py",
                    source_dir="train",
                    role=role,
                    framework_version='1.8.1',  # Adjust the version to the latest available
                    py_version="py3",   
                    train_instance_count=1,
                    train_instance_type='ml.m4.xlarge',  # Use CPU instance type
                    hyperparameters={
                        'epochs': 10,
                        'hidden_dim': 200,
                    })

# Fit the estimator with the training data
estimator.fit({'training': input_data})

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: pytorch-training-2023-11-18-11-20-21-374


Using provided s3_resource
2023-11-18 11:20:21 Starting - Starting the training job...
2023-11-18 11:20:48 Starting - Preparing the instances for training.........
2023-11-18 11:22:05 Downloading - Downloading input data...
2023-11-18 11:22:35 Training - Downloading the training image......
2023-11-18 11:23:31 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-11-18 11:23:47,037 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-11-18 11:23:47,040 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-11-18 11:23:47,051 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-11-18 11:23:47,053 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2023-11-18 11

[34mEpoch: 1, BCELoss: 0.6690408940217933[0m
[34mEpoch: 4, BCELoss: 0.448875014271055[0m
[34mEpoch: 5, BCELoss: 0.4460091006999113[0m
[34mEpoch: 6, BCELoss: 0.38000536208250085[0m
[34mEpoch: 7, BCELoss: 0.33870953078172644[0m
[34mEpoch: 8, BCELoss: 0.3191174469432052[0m
[34mEpoch: 9, BCELoss: 0.30293984133370067[0m

2023-11-18 12:05:50 Uploading - Uploading generated training model[34mEpoch: 10, BCELoss: 0.2937456564027436[0m
[34m2023-11-18 12:05:44,412 sagemaker-training-toolkit INFO     Reporting training SUCCESS[0m

2023-11-18 12:06:01 Completed - Training job completed
Training seconds: 2636
Billable seconds: 2636


In [None]:
# TODO: Deploy the trained model
estimator_predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

INFO:sagemaker:Repacking model artifact (s3://sagemaker-us-east-1-575437452043/pytorch-training-2023-11-18-11-20-21-374/output/model.tar.gz), script artifact (s3://sagemaker-us-east-1-575437452043/pytorch-training-2023-11-18-11-20-21-374/source/sourcedir.tar.gz), and dependencies ([]) into single tar.gz file located at s3://sagemaker-us-east-1-575437452043/pytorch-training-2023-11-18-12-06-34-304/model.tar.gz. This may take some time depending on model size...
INFO:sagemaker:Creating model with name: pytorch-training-2023-11-18-12-06-34-304
INFO:sagemaker:Creating endpoint-config with name pytorch-training-2023-11-18-12-06-34-304
INFO:sagemaker:Creating endpoint with name pytorch-training-2023-11-18-12-06-34-304


-------!

In [58]:
test_X = pd.concat([pd.DataFrame(test_X_len), pd.DataFrame(test_X)], axis=1)

In [59]:
# We split the data into chunks and send each chunk seperately, accumulating the results.

def predict(data, rows=512):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = np.array([])
    for array in split_array:
        predictions = np.append(predictions, estimator_predictor.predict(array))
    
    return predictions

In [60]:
predictions = predict(test_X.values)
predictions = [round(num) for num in predictions]

In [62]:
from sklearn.metrics import accuracy_score
accuracy_score(test_y, predictions)

0.84304

In [63]:
test_review = 'The simplest pleasures in life are the best, and this film is one of them. Combining a rather basic storyline of love and adventure this movie transcends the usual weekend fair with wit and unmitigated charm.'

In [64]:
test_review_X, test_review_len = convert_and_pad(word_dict, review_to_words(test_review))

In [65]:
test_data = np.hstack((test_review_len, test_review_X))
test_data = test_data.reshape(1, -1)

In [66]:
test_data.shape

(1, 501)

In [67]:
estimator_predictor.predict(test_data)

array(0.96593529)

In [68]:
estimator.delete_endpoint()

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [69]:
!pygmentize serve/predict.py

[37m# import argparse[39;49;00m[37m[39;49;00m
[37m# import json[39;49;00m[37m[39;49;00m
[37m# import os[39;49;00m[37m[39;49;00m
[37m# import pickle[39;49;00m[37m[39;49;00m
[37m# import sys[39;49;00m[37m[39;49;00m
[37m# import pandas as pd[39;49;00m[37m[39;49;00m
[37m# import numpy as np[39;49;00m[37m[39;49;00m
[37m# import torch[39;49;00m[37m[39;49;00m
[37m# import torch.nn as nn[39;49;00m[37m[39;49;00m
[37m# import torch.optim as optim[39;49;00m[37m[39;49;00m
[37m# import torch.utils.data[39;49;00m[37m[39;49;00m
[37m[39;49;00m
[37m# from model import LSTMClassifier[39;49;00m[37m[39;49;00m
[37m[39;49;00m
[37m# from utils import review_to_words, convert_and_pad[39;49;00m[37m[39;49;00m
[37m[39;49;00m
[37m# def model_fn(model_dir):[39;49;00m[37m[39;49;00m
[37m#     """Load the PyTorch model from the `model_dir` directory."""[39;49;00m[37m[39;49;00m
[37m#     print("Loading model.")[39;49;00m[37m[39;49;

In [70]:
from sagemaker.predictor import RealTimePredictor
from sagemaker.pytorch import PyTorchModel

class StringPredictor(RealTimePredictor):
    def __init__(self, endpoint_name, sagemaker_session):
        super(StringPredictor, self).__init__(endpoint_name, sagemaker_session, content_type='text/plain')

model = PyTorchModel(model_data=estimator.model_data,
                     role = role,
                     framework_version='1.8.1',
                     py_version="py3",   
                     entry_point='predict.py',
                     source_dir='serve',
                     predictor_cls=StringPredictor)
predictor = model.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


INFO:sagemaker:Repacking model artifact (s3://sagemaker-us-east-1-575437452043/pytorch-training-2023-11-18-11-20-21-374/output/model.tar.gz), script artifact (serve), and dependencies ([]) into single tar.gz file located at s3://sagemaker-us-east-1-575437452043/pytorch-inference-2023-11-18-12-21-58-895/model.tar.gz. This may take some time depending on model size...
INFO:sagemaker:Creating model with name: pytorch-inference-2023-11-18-12-21-59-693
INFO:sagemaker:Creating endpoint-config with name pytorch-inference-2023-11-18-12-22-00-405
INFO:sagemaker:Creating endpoint with name pytorch-inference-2023-11-18-12-22-00-405


------!

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [77]:
def test_reviews(predictor, data_dir='../data/aclImdb', stop=250):
    results = []
    ground = []

    # We make sure to test both positive and negative reviews
    for sentiment in ['pos', 'neg']:
        path = os.path.join(data_dir, 'test', sentiment, '*.txt')
        files = glob.glob(path)

        files_read = 0

        print('Starting ', sentiment, ' files')

        # Iterate through the files and send them to the predictor
        for f in files:
            with open(f) as review:
                # First, we store the ground truth (was the review positive or negative)
                if sentiment == 'pos':
                    ground.append(1)
                else:
                    ground.append(0)
                # Read in the review and convert to 'utf-8' for transmission via HTTP
                review_input = review.read().encode('utf-8')
                # Send the review to the predictor and store the results
                results.append(float(predictor.predict(review_input)))

            # Sending reviews to our endpoint one at a time takes a while, so we
            # only send a small number of reviews
            files_read += 1
            if files_read == stop:
                break

    return ground, results


In [84]:
# Assuming you have a SageMaker Predictor object named 'predictor'
accept_header = 'text/plain'  # Specify the expected response content type
endpoint_name = 'pytorch-inference-2023-11-18-12-22-00-405'  # Replace with your actual endpoint name
predictor = sagemaker.predictor.RealTimePredictor(endpoint_name=endpoint_name, content_type=accept_header)

ground, results = test_reviews(predictor)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
Starting  pos  files
Starting  neg  files


In [86]:
from sklearn.metrics import accuracy_score
accuracy_score(ground, results)

0.854

In [87]:
predictor.predict(test_review)

b'1.0'

In [88]:
predictor.endpoint

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


'pytorch-inference-2023-11-18-12-22-00-405'

In [89]:
predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: pytorch-inference-2023-11-18-12-22-00-405
INFO:sagemaker:Deleting endpoint with name: pytorch-inference-2023-11-18-12-22-00-405
