In [1]:
# %mkdir ../data
# !wget -O ../data/aclImdb_v1.tar.gz http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# !tar -zxf ../data/aclImdb_v1.tar.gz -C ../data

In [1]:
import os
import glob

def read_imdb_data(data_dir='../data/aclImdb'):
    """Read the raw IMDb reviews and their sentiment labels from disk.

    Files are looked up as ``<data_dir>/<train|test>/<pos|neg>/*.txt``.

    Args:
        data_dir: Root directory of the extracted dataset.

    Returns:
        A ``(data, labels)`` tuple of nested dicts indexed as
        ``[data_type][sentiment]``. ``data`` holds the review texts and
        ``labels`` holds the matching labels (1 for 'pos', 0 for 'neg').
    """
    data = {}
    labels = {}

    for data_type in ['train', 'test']:
        data[data_type] = {}
        labels[data_type] = {}

        for sentiment in ['pos', 'neg']:
            path = os.path.join(data_dir, data_type, sentiment, '*.txt')
            # glob returns matches in arbitrary order; sort so repeated runs
            # see the reviews in the same order.
            files = sorted(glob.glob(path))

            # Here we represent a positive review by '1' and a negative review by '0'
            label = 1 if sentiment == 'pos' else 0

            reviews = []
            for f in files:
                # Read explicitly as UTF-8 instead of relying on the platform
                # default encoding, which can fail on non-ASCII characters.
                with open(f, encoding='utf-8') as review:
                    reviews.append(review.read())

            data[data_type][sentiment] = reviews
            labels[data_type][sentiment] = [label] * len(reviews)

            assert len(data[data_type][sentiment]) == len(labels[data_type][sentiment]), \
                    "{}/{} data size does not match labels size".format(data_type, sentiment)

    return data, labels

In [2]:
data, labels = read_imdb_data()
print("IMDB reviews: train = {} pos / {} neg, test = {} pos / {} neg".format(
            len(data['train']['pos']), len(data['train']['neg']),
            len(data['test']['pos']), len(data['test']['neg'])))

IMDB reviews: train = 12500 pos / 12500 neg, test = 12500 pos / 12500 neg


In [3]:
from sklearn.utils import shuffle

def prepare_imdb_data(data, labels, random_state=None):
    """Prepare training and test sets from IMDb movie reviews.

    Args:
        data: Nested dict of review texts indexed as [data_type][sentiment].
        labels: Nested dict of labels with the same layout as ``data``.
        random_state: Optional seed passed to both shuffles. Set it to get
            the same ordering on every run; the default (None) keeps the
            previous unseeded behaviour.

    Returns:
        data_train, data_test, labels_train, labels_test, where each data
        list has been shuffled together with its labels list.
    """

    # Combine positive and negative reviews and labels
    data_train = data['train']['pos'] + data['train']['neg']
    data_test = data['test']['pos'] + data['test']['neg']
    labels_train = labels['train']['pos'] + labels['train']['neg']
    labels_test = labels['test']['pos'] + labels['test']['neg']

    # Shuffle reviews and corresponding labels within training and test sets.
    # Each call shuffles data and labels together so pairs stay aligned.
    data_train, labels_train = shuffle(data_train, labels_train, random_state=random_state)
    data_test, labels_test = shuffle(data_test, labels_test, random_state=random_state)

    # Return unified training data, test data, training labels, test labels
    return data_train, data_test, labels_train, labels_test

In [4]:
train_X, test_X, train_y, test_y = prepare_imdb_data(data, labels)
print("IMDb reviews (combined): train = {}, test = {}".format(len(train_X), len(test_X)))

IMDb reviews (combined): train = 25000, test = 25000


In [5]:
print(train_X[100])
print(train_y[100])

In a famous essay he wrote about Charles Dickens, George Orwell points out that many readers always regretted that Dickens never continued writing like he did in PICKWICK PAPERS: that is, he did not stick to writing funny episodic novels for the rest of his career. This would not have been too difficult for Dickens. His contemporary Robert Surtees did precisely that, only concentrating on the misadventures of the fox hunting set (MR. FANCY ROMFORD'S HOUNDS is a title of one of his novels). Among hunters and horse lovers Surtees still has a following but most people find his novels unreadable. Dickens was determined to show he was more than a funny man (and don't forget, his first book, SKETCHES BY BOZ, was also a funny book). So Dickens third book is OLIVER TWIST (which got pretty grim at points). Orwell says that for any author to grow they have to change the style of their books. Dickens would definitely (and successfully) have agreed to that.<br /><br />But Orwell overlooked the gen

In [6]:
print(len(train_X[100]))

5034


In [7]:
!pip install nltk



In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *

import re
from bs4 import BeautifulSoup

_REVIEW_RESOURCES = {}  # filled on first use: English stopword set and a shared stemmer


def _review_resources():
    """Return the (stopword set, stemmer) pair, creating it on first use."""
    if not _REVIEW_RESOURCES:
        nltk.download("stopwords", quiet=True)
        # A frozenset gives constant-time membership tests in the filter below.
        _REVIEW_RESOURCES["stopwords"] = frozenset(stopwords.words("english"))
        _REVIEW_RESOURCES["stemmer"] = PorterStemmer()
    return _REVIEW_RESOURCES["stopwords"], _REVIEW_RESOURCES["stemmer"]


def review_to_words(review):
    """Convert one raw review string into a list of stemmed words.

    Steps: strip HTML tags, lower-case, replace every character that is not
    a letter or digit with a space, split on whitespace, drop English
    stopwords, then stem each remaining word.

    Args:
        review: Raw review text, possibly containing HTML markup.

    Returns:
        List of stemmed, lower-case tokens in their original order.
    """
    # Previously nltk.download ran on every call, the stopword list was
    # rebuilt for every word and a new stemmer was created per word; the
    # shared resources are now created once and reused.
    stop_words, stemmer = _review_resources()

    text = BeautifulSoup(review, "html.parser").get_text() # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower()) # Lower-case, non-alphanumerics -> spaces
    words = text.split() # Split string into words
    words = [w for w in words if w not in stop_words] # Remove stopwords
    words = [stemmer.stem(w) for w in words] # stem

    return words

In [9]:
review_to_words(train_X[100])

['famou',
 'essay',
 'wrote',
 'charl',
 'dicken',
 'georg',
 'orwel',
 'point',
 'mani',
 'reader',
 'alway',
 'regret',
 'dicken',
 'never',
 'continu',
 'write',
 'like',
 'pickwick',
 'paper',
 'stick',
 'write',
 'funni',
 'episod',
 'novel',
 'rest',
 'career',
 'would',
 'difficult',
 'dicken',
 'contemporari',
 'robert',
 'surte',
 'precis',
 'concentr',
 'misadventur',
 'fox',
 'hunt',
 'set',
 'mr',
 'fanci',
 'romford',
 'hound',
 'titl',
 'one',
 'novel',
 'among',
 'hunter',
 'hors',
 'lover',
 'surte',
 'still',
 'follow',
 'peopl',
 'find',
 'novel',
 'unread',
 'dicken',
 'determin',
 'show',
 'funni',
 'man',
 'forget',
 'first',
 'book',
 'sketch',
 'boz',
 'also',
 'funni',
 'book',
 'dicken',
 'third',
 'book',
 'oliv',
 'twist',
 'got',
 'pretti',
 'grim',
 'point',
 'orwel',
 'say',
 'author',
 'grow',
 'chang',
 'style',
 'book',
 'dicken',
 'would',
 'definit',
 'success',
 'agre',
 'orwel',
 'overlook',
 'genr',
 'writer',
 'transcend',
 'fellow',
 'surte',
 's

In [29]:
import pickle

cache_dir = os.path.join("../cache", "sentiment_analysis")  # where to store cache files
os.makedirs(cache_dir, exist_ok=True)  # ensure cache directory exists

def preprocess_data(data_train, data_test, labels_train, labels_test,
                    cache_dir=cache_dir, cache_file="preprocessed_data.pkl"):
    """Convert each review to words, reading from / writing to a pickle cache.

    Args:
        data_train: Iterable of raw training reviews.
        data_test: Iterable of raw test reviews.
        labels_train: Labels matching ``data_train``.
        labels_test: Labels matching ``data_test``.
        cache_dir: Directory holding the cache file.
        cache_file: Cache file name, or None to disable caching entirely.

    Returns:
        words_train, words_test, labels_train, labels_test. On a cache hit
        all four values come from the cache file, so the labels passed in
        are ignored in favour of the cached ones.
    """
    required_keys = ('words_train', 'words_test', 'labels_train', 'labels_test')
    cache_path = None if cache_file is None else os.path.join(cache_dir, cache_file)

    cache_data = None
    if cache_path is not None:
        try:
            with open(cache_path, "rb") as f:
                # NOTE: pickle.load can execute arbitrary code; only point this
                # at cache files produced by this notebook.
                cache_data = pickle.load(f)
        except FileNotFoundError:
            pass  # no cache yet, fall through and compute
        except (OSError, EOFError, pickle.UnpicklingError,
                AttributeError, ImportError, IndexError) as err:
            # Unreadable or corrupt cache: report it and recompute instead of
            # silently hiding every error (including KeyboardInterrupt).
            print("Could not read cache file {} ({}); recomputing.".format(cache_file, err))
        else:
            if isinstance(cache_data, dict) and all(key in cache_data for key in required_keys):
                print("Read preprocessed data from cache file:", cache_file)
            else:
                print("Cache file {} has an unexpected layout; recomputing.".format(cache_file))
                cache_data = None

    if cache_data is None:
        words_train = [review_to_words(review) for review in data_train]
        words_test = [review_to_words(review) for review in data_test]

        # Write to cache file for future runs
        if cache_path is not None:
            cache_data = dict(words_train=words_train, words_test=words_test,
                              labels_train=labels_train, labels_test=labels_test)
            # Make sure a caller-supplied cache_dir exists before writing.
            os.makedirs(cache_dir, exist_ok=True)
            with open(cache_path, "wb") as f:
                pickle.dump(cache_data, f)
            print("Wrote preprocessed data to cache file:", cache_file)
    else:
        # Unpack data loaded from cache file
        words_train, words_test, labels_train, labels_test = (cache_data['words_train'],
                cache_data['words_test'], cache_data['labels_train'], cache_data['labels_test'])

    return words_train, words_test, labels_train, labels_test

In [30]:
# Preprocess data
train_X, test_X, train_y, test_y = preprocess_data(train_X, test_X, train_y, test_y)

Read preprocessed data from cache file: preprocessed_data.pkl


In [34]:
len(test_X)

25000

In [35]:
import numpy as np

def build_dict(data, vocab_size = 5000):
    """Build a word -> integer mapping for the most frequent words.

    Indices 0 and 1 are never assigned by this function, so at most
    ``vocab_size - 2`` words are kept and numbering starts at 2.

    Args:
        data: Iterable of tokenised sentences (each an iterable of words).
        vocab_size: Total vocabulary size including the two unassigned
            indices. Values below 2 yield an empty dictionary.

    Returns:
        Dict mapping each kept word to its index; the most frequent word
        gets 2, the next 3, and so on. Ties keep first-seen order.
    """
    word_count = {} # A dict storing the words that appear in the reviews along with how often they occur

    for item in data:
        for word in item:
            word_count[word] = word_count.get(word, 0) + 1

    # sorted() is stable, so equally frequent words stay in first-seen order.
    sorted_words = sorted(word_count, key=word_count.get, reverse=True)

    # Clamp at zero: a negative slice bound would count from the end of the
    # list and keep almost every word (e.g. vocab_size=1 gave [:-1]).
    keep = max(vocab_size - 2, 0)

    # This is what we are building, a dictionary that translates words into integers
    word_dict = {word: idx + 2 for idx, word in enumerate(sorted_words[:keep])}

    return word_dict

In [38]:
word_dict = build_dict(train_X)
word_dict

{'movi': 2,
 'film': 3,
 'one': 4,
 'like': 5,
 'time': 6,
 'good': 7,
 'make': 8,
 'charact': 9,
 'get': 10,
 'see': 11,
 'watch': 12,
 'stori': 13,
 'even': 14,
 'would': 15,
 'realli': 16,
 'well': 17,
 'scene': 18,
 'look': 19,
 'show': 20,
 'much': 21,
 'end': 22,
 'peopl': 23,
 'bad': 24,
 'go': 25,
 'great': 26,
 'also': 27,
 'first': 28,
 'love': 29,
 'think': 30,
 'way': 31,
 'act': 32,
 'play': 33,
 'made': 34,
 'thing': 35,
 'could': 36,
 'know': 37,
 'say': 38,
 'seem': 39,
 'work': 40,
 'plot': 41,
 'two': 42,
 'actor': 43,
 'year': 44,
 'come': 45,
 'mani': 46,
 'seen': 47,
 'take': 48,
 'life': 49,
 'want': 50,
 'never': 51,
 'littl': 52,
 'best': 53,
 'tri': 54,
 'man': 55,
 'ever': 56,
 'give': 57,
 'better': 58,
 'still': 59,
 'perform': 60,
 'find': 61,
 'feel': 62,
 'part': 63,
 'back': 64,
 'use': 65,
 'someth': 66,
 'director': 67,
 'actual': 68,
 'interest': 69,
 'lot': 70,
 'real': 71,
 'old': 72,
 'cast': 73,
 'though': 74,
 'live': 75,
 'star': 76,
 'enjoy': 7

In [39]:
# The first five keys of word_dict, in the dict's insertion order.
top_five = list(word_dict)[:5]
top_five

['movi', 'film', 'one', 'like', 'time']

In [40]:
# Directory that the later cells write word_dict.pkl and train.csv into.
data_dir = '../data/pytorch'
# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(data_dir, exist_ok=True)

In [41]:
with open(os.path.join(data_dir, 'word_dict.pkl'), "wb") as f:
    pickle.dump(word_dict, f)

In [42]:
def convert_and_pad(word_dict, sentence, pad=500):
    """Encode one tokenised sentence as a fixed-length list of integers.

    Args:
        word_dict: Mapping from word to integer index.
        sentence: Sequence of words to encode.
        pad: Length of the returned list; longer sentences are truncated.

    Returns:
        A tuple ``(encoded, length)``: ``encoded`` has ``pad`` entries, with
        0 in unused positions and 1 for words missing from ``word_dict``;
        ``length`` is the number of positions actually filled.
    """
    NOWORD = 0  # value left in positions past the end of the sentence
    INFREQ = 1  # value used for words that are not in word_dict

    encoded = [NOWORD] * pad

    for position, token in enumerate(sentence[:pad]):
        encoded[position] = word_dict.get(token, INFREQ)

    used = min(len(sentence), pad)
    return encoded, used

def convert_and_pad_data(word_dict, data, pad=500):
    """Apply ``convert_and_pad`` to every sentence in ``data``.

    Args:
        word_dict: Mapping from word to integer index.
        data: Iterable of tokenised sentences.
        pad: Fixed length passed through to ``convert_and_pad``.

    Returns:
        A tuple of two NumPy arrays: the encoded sentences and the filled
        length of each sentence, in the same order as ``data``.
    """
    encoded_pairs = [convert_and_pad(word_dict, sentence, pad) for sentence in data]

    padded_rows = [pair[0] for pair in encoded_pairs]
    row_lengths = [pair[1] for pair in encoded_pairs]

    return np.array(padded_rows), np.array(row_lengths)

In [43]:
train_X, train_X_len = convert_and_pad_data(word_dict, train_X)
test_X, test_X_len = convert_and_pad_data(word_dict, test_X)

In [45]:
train_X

array([[  47,  480,  672, ...,    0,    0,    0],
       [   2, 4248,    1, ...,    0,    0,    0],
       [2117,  334, 1681, ...,    0,    0,    0],
       ...,
       [ 466,  192,    1, ...,    0,    0,    0],
       [ 254,  214,  326, ...,    0,    0,    0],
       [1009,  342,  357, ...,    0,    0,    0]])

In [46]:
print(train_X[100])
print(train_X_len[100])

[ 591  573   46   23  512 1016 2043 1573 2681  174    2  562   21 4967
   90    3  311   11   79  132 1422   47    2  172 2251   12 2310 1573
 2681    2   59 1573 2681    2  311  184  113  685  822  728    3 1016
 2043 3204 1386 1310   30   59 1573 2681  371    3 1239  793   33   17
    1 1205  404 2956 2681    2  178    5    4    3   14  848 3185    1
 1994    1  871 2043  177   43   36  129 2750 1442    1  265    3   57
   98   28  145  881  416  523 1573 2681 4641  331  822   28 2681 2681
 1455  167  822   87  366  228 2681   37  139   93  256    2   47   28
    3  197   76 4781 2886  220   37  822  517  129 1292   42    3  368
  509   33 1534   93  169    3  704 1121    1 1633  890   13  117 2681
  459  193  881  249  416   90   28  648  144  179 3671  958  108    1
   22    2  573 3642   45   64  606   61  887   92   39 1101 2043 1311
    1  881   23  149  521 3281  571   22   28 1016 2043   22   14 1849
 3281  146  584 2714  909 1016 2043    1    1    8  196  495  152 1719
    4 

In [47]:
import pandas as pd
    
pd.concat([pd.DataFrame(train_y), pd.DataFrame(train_X_len), pd.DataFrame(train_X)], axis=1) \
        .to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [48]:
import sagemaker

sagemaker_session = sagemaker.Session()

bucket = sagemaker_session.default_bucket()
prefix = 'sagemaker/sentiment_rnn'

role = sagemaker.get_execution_role()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [50]:
input_data = sagemaker_session.upload_data(path=data_dir, bucket=bucket, key_prefix=prefix)

In [51]:
!pip install torchvision 




In [52]:
import torch
import torch.utils.data
train_sample = pd.read_csv(os.path.join(data_dir, 'train.csv'), header=None, names=None, nrows=250)
train_sample_y = torch.from_numpy(train_sample[[0]].values).float().squeeze()
train_sample_X = torch.from_numpy(train_sample.drop([0], axis=1).values).long()
train_sample_ds = torch.utils.data.TensorDataset(train_sample_X, train_sample_y)
train_sample_dl = torch.utils.data.DataLoader(train_sample_ds, batch_size=50)

In [53]:
def train(model, train_loader, epochs, optimizer, loss_fn, device):
    """Run a simple training loop and print the mean loss of every epoch.

    Args:
        model: The PyTorch module to train.
        train_loader: Iterable yielding ``(batch_X, batch_y)`` pairs.
        epochs: Number of passes over ``train_loader``.
        optimizer: Optimizer updating the model's parameters.
        loss_fn: Callable taking ``(output, target)`` and returning the loss.
        device: Device the batches are moved to before the forward pass.
    """
    for epoch in range(1, epochs + 1):
        model.train()
        total_loss = 0.0
        batch_count = 0
        for batch_X, batch_y in train_loader:
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)

            optimizer.zero_grad()
            # Call the module itself rather than .forward() so that any
            # registered hooks run as well.
            out = model(batch_X)
            loss = loss_fn(out, batch_y)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            batch_count += 1

        # Count batches while iterating instead of using len(train_loader):
        # this works for loaders without __len__ and avoids dividing by zero
        # when the loader is empty.
        mean_loss = total_loss / batch_count if batch_count else float("nan")
        print("Epoch: {}, BCELoss: {}".format(epoch, mean_loss))

In [54]:
import torch.optim as optim
from train.model import LSTMClassifier

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = LSTMClassifier(32, 100, 5000).to(device)
optimizer = optim.Adam(model.parameters())
loss_fn = torch.nn.BCELoss()

train(model, train_sample_dl, 5, optimizer, loss_fn, device)

cpu
Epoch: 1, BCELoss: 0.6927214384078979
Epoch: 2, BCELoss: 0.6826396107673645
Epoch: 3, BCELoss: 0.6736706495285034
Epoch: 4, BCELoss: 0.6635082960128784
Epoch: 5, BCELoss: 0.6507992506027221


In [55]:
from sagemaker.pytorch import PyTorch

# Configure the SageMaker PyTorch training job. The code in the local
# "train" directory is uploaded and "train.py" is used as the entry point.
# `instance_count` / `instance_type` are the sagemaker>=2 names of the former
# `train_instance_count` / `train_instance_type` arguments, which the SDK
# reports as renamed.
estimator = PyTorch(entry_point="train.py",
                    source_dir="train",
                    role=role,
                    framework_version='1.8.1',
                    py_version="py3",
                    instance_count=1,
                    instance_type='ml.m4.xlarge',
                    hyperparameters={
                        'epochs': 10,
                        'hidden_dim': 200,
                    })

# Start the training job, passing the uploaded data as the 'training' channel.
estimator.fit({'training': input_data})

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: pytorch-training-2023-11-20-06-14-08-087


Using provided s3_resource
2023-11-20 06:14:08 Starting - Starting the training job...
2023-11-20 06:14:33 Starting - Preparing the instances for training.........
2023-11-20 06:15:55 Downloading - Downloading input data...
2023-11-20 06:16:30 Training - Downloading the training image......
2023-11-20 06:17:25 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-11-20 06:17:40,399 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-11-20 06:17:40,402 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-11-20 06:17:40,413 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-11-20 06:17:40,416 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2023-11-20 06

[34mEpoch: 1, BCELoss: 0.6690408940217933[0m
[34mEpoch: 2, BCELoss: 0.5850774889089623[0m
[34mEpoch: 3, BCELoss: 0.5253639409736711[0m
[34mEpoch: 4, BCELoss: 0.448875014271055[0m
[34mEpoch: 5, BCELoss: 0.4460091006999113[0m
[34mEpoch: 6, BCELoss: 0.38000536208250085[0m
[34mEpoch: 7, BCELoss: 0.33870953078172644[0m
[34mEpoch: 8, BCELoss: 0.3191174469432052[0m
[34mEpoch: 9, BCELoss: 0.30293984133370067[0m

2023-11-20 07:01:36 Uploading - Uploading generated training model[34mEpoch: 10, BCELoss: 0.2937456564027436[0m
[34m2023-11-20 07:01:30,474 sagemaker-training-toolkit INFO     Reporting training SUCCESS[0m

2023-11-20 07:01:52 Completed - Training job completed
Training seconds: 2757
Billable seconds: 2757


In [None]:
estimator_predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

In [None]:
test_X = pd.concat([pd.DataFrame(test_X_len), pd.DataFrame(test_X)], axis=1)

In [None]:
def predict(data, rows=512):
    """Send ``data`` to the deployed endpoint in chunks and collect the results.

    Uses the module-level ``estimator_predictor``.

    Args:
        data: NumPy array whose first axis indexes the samples.
        rows: Maximum number of samples sent per request.

    Returns:
        1-D NumPy array with the flattened predictions of all chunks, in
        input order. Empty input returns an empty array without calling
        the endpoint.

    Raises:
        ValueError: If ``rows`` is not positive.
    """
    if rows <= 0:
        raise ValueError("rows must be a positive number")

    total = data.shape[0]
    if total == 0:
        return np.array([])

    # Smallest number of chunks such that no chunk exceeds `rows` samples.
    n_chunks = int(np.ceil(total / float(rows)))

    # Collect the per-chunk results and join them once; repeatedly calling
    # np.append copies the whole accumulated array on every iteration.
    parts = [np.ravel(estimator_predictor.predict(chunk))
             for chunk in np.array_split(data, n_chunks)]

    # Leading empty float array keeps the same dtype promotion as the former
    # np.append-based accumulation.
    return np.concatenate([np.array([])] + parts)

In [None]:
predictions = predict(test_X.values)
predictions = [round(num) for num in predictions]

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(test_y, predictions)

In [None]:
test_review = 'The simplest pleasures in life are the best, and this film is one of them. Combining a rather basic storyline of love and adventure this movie transcends the usual weekend fair with wit and unmitigated charm.'

In [None]:
test_review_X, test_review_len = convert_and_pad(word_dict, review_to_words(test_review))

In [None]:
test_data = np.hstack((test_review_len, test_review_X))
test_data = test_data.reshape(1, -1)

In [15]:
test_data.shape

NameError: name 'test_data' is not defined

In [16]:
estimator_predictor.predict(test_data)

NameError: name 'estimator_predictor' is not defined