In [1]:
!rm -f aclImdb_v1.tar.gz
!rm -Rf aclImdb
!rm -f movie_data.csv
!rm -f movie_data_train.csv
!rm -f movie_data_validation.csv

In [2]:
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

--2018-09-26 21:25:49--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2018-09-26 21:25:59 (8.34 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [3]:
!tar xzf aclImdb_v1.tar.gz

In [4]:
import os

numberOfFiles = 0
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = './aclImdb/%s/%s' % (s, l)
        numberOfFiles += len(os.listdir(path))

print 'Number of files in dataset ' + str(numberOfFiles)

Number of files in dataset 50000


In [5]:
!pip install pyprind
!pip install nltk



In [6]:
import pyprind
import pandas as pd
import os

pbar = pyprind.ProgBar(numberOfFiles)

labels = {'pos':'1', 'neg':'0'}


df = pd.DataFrame()
for s in ('test', 'train'):
	for l in ('pos', 'neg'):
		path ='./aclImdb/%s/%s' % (s, l)
		
		for file in os.listdir(path):
			with open(os.path.join(path, file), 'r') as infile:
				txt = infile.read()
				df = df.append([['__label__' + labels[l], txt]], ignore_index=True)
			pbar.update()

df.columns = ['label', 'text']


0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:01:28


In [7]:
import numpy as np

np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('./movie_data.csv', index=False)

In [8]:
df = pd.read_csv('./movie_data.csv')
df_row_count = len(df.index)
df.head(5)

Unnamed: 0,label,text
0,__label__1,"I came out of ""Dark Blue World"" feeling (sigh)..."
1,__label__0,"About five years ago, my friend and I went to ..."
2,__label__0,The one sheets and newspaper campaign suggeste...
3,__label__1,The Williams family live on a ranch located in...
4,__label__0,This film wasn't programmed in Italian cinemas...


In [9]:
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

porter = PorterStemmer()
nltk.download('stopwords')
stop = stopwords.words('english')

def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower())
    text = text + ''.join(emoticons).replace('-', '')
    text = tokenizer_porter(text)
    text = [w for w in text if w not in stop]
    text = ' '.join(text)
    return text

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
pbar = pyprind.ProgBar(df_row_count)

for index, row in df.iterrows():
    df.at[index, 'text'] = preprocessor(row['text'])
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:07:13


In [11]:
df.head(5)

Unnamed: 0,label,text
0,__label__1,came dark blue world feel sigh sad yet wa defi...
1,__label__0,five year ago friend went video rental store g...
2,__label__0,one sheet newspap campaign suggest often far l...
3,__label__1,william famili live ranch locat middl remot de...
4,__label__0,thi film program italian cinema seen manifest ...


In [12]:
from sklearn.model_selection import train_test_split

train, validation = train_test_split(df, test_size=0.2)

In [13]:
import csv

train.to_csv('movie_data_train.csv', header = False, sep='\t', index=False, quoting=csv.QUOTE_NONE)
validation.to_csv('movie_data_validation.csv', header = False, sep='\t', index=False, quoting=csv.QUOTE_NONE)

In [14]:
import sagemaker
from sagemaker import get_execution_role
import json
import boto3

sess = sagemaker.Session()

role = get_execution_role()
print(role)

bucket = 'dc-machine-learning'
prefix = 'sagemaker_imdb_sentiment_analysis'

train_channel = prefix + '/train'
validation_channel = prefix + '/validation'

s3_train_data = 's3://{}/{}'.format(bucket, train_channel)
s3_validation_data = 's3://{}/{}'.format(bucket, validation_channel)
s3_output_location = 's3://{}/{}/output'.format(bucket, prefix)

sess.upload_data(path='./movie_data_train.csv', bucket=bucket, key_prefix=train_channel)
sess.upload_data(path='./movie_data_validation.csv', bucket=bucket, key_prefix=validation_channel)

arn:aws:iam::748487307014:role/service-role/AmazonSageMaker-ExecutionRole-20180926T205031


's3://dc-machine-learning/sagemaker_imdb_sentiment_analysis/validation/movie_data_validation.csv'

In [15]:
region_name = boto3.Session().region_name
container = sagemaker.amazon.amazon_estimator.get_image_uri(region_name, "blazingtext", "latest")
print('Using SageMaker BlazingText container: {} ({})'.format(container, region_name))

bt_model = sagemaker.estimator.Estimator(container,
                                         role, 
                                         train_instance_count=1, 
                                         train_instance_type='ml.m4.xlarge',
                                         train_volume_size = 30,
                                         train_max_run = 30,
                                         input_mode= 'File',
                                         output_path=s3_output_location,
                                         sagemaker_session=sess)

bt_model.set_hyperparameters(mode="supervised",
                            epochs=10,
                            min_count=2,
                            learning_rate=0.05,
                            vector_dim=10,
                            early_stopping=True,
                            patience=4,
                            min_epochs=5,
                            word_ngrams=2)

train_data = sagemaker.session.s3_input(s3_train_data, distribution='FullyReplicated', 
                        content_type='text/plain', s3_data_type='S3Prefix')
validation_data = sagemaker.session.s3_input(s3_validation_data, distribution='FullyReplicated', 
                             content_type='text/plain', s3_data_type='S3Prefix')
data_channels = {'train': train_data, 'validation': validation_data}

bt_model.fit(inputs=data_channels, logs=True)

INFO:sagemaker:Creating training-job with name: blazingtext-2018-09-26-21-34-52-840


Using SageMaker BlazingText container: 813361260812.dkr.ecr.eu-central-1.amazonaws.com/blazingtext:latest (eu-central-1)
2018-09-26 21:34:53 Starting - Starting the training job...
Launching requested ML instances......
Preparing the instances for training...
2018-09-26 21:36:50 Downloading - Downloading input data...
2018-09-26 21:37:05 Training - Training image download completed. Training in progress.
[31mArguments: train[0m
[31m[09/26/2018 21:37:12 INFO 139882596902720] nvidia-smi took: 0.0252239704132 secs to identify 0 gpus[0m
[31m[09/26/2018 21:37:12 INFO 139882596902720] Running single machine CPU BlazingText training using supervised mode.[0m
[31m[09/26/2018 21:37:12 INFO 139882596902720] Processing /opt/ml/input/data/train/movie_data_train.csv . File size: 29 MB[0m
[31m[09/26/2018 21:37:12 INFO 139882596902720] Processing /opt/ml/input/data/validation/movie_data_validation.csv . File size: 7 MB[0m
[31mRead 5M words[0m
[31mNumber of words:  39191[0m
[31mLoading 

In [16]:
text_classifier = bt_model.deploy(initial_instance_count = 1, instance_type = 'ml.m4.xlarge')

INFO:sagemaker:Creating model with name: blazingtext-2018-09-26-21-38-04-734
INFO:sagemaker:Creating endpoint with name blazingtext-2018-09-26-21-34-52-840


--------------------------------------------------------------!

In [27]:
sentences = [
    "I love latest product update :)",
    "I hate what was implemented recently :("
]

tokenized_sentences = [''.join(preprocessor(sent)) for sent in sentences]

print(tokenized_sentences)

payload = {"instances" : tokenized_sentences}

response = text_classifier.predict(json.dumps(payload))

predictions = json.loads(response)
print(json.dumps(predictions, indent=2))

[u'love latest product updat :)', u'hate wa implement recent :(']
[
  {
    "prob": [
      0.9930997490882874
    ], 
    "label": [
      "__label__1"
    ]
  }, 
  {
    "prob": [
      0.8289283514022827
    ], 
    "label": [
      "__label__0"
    ]
  }
]
