In [1]:
import boto3
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri

sess = sagemaker.Session()
role = get_execution_role()
region_name = boto3.Session().region_name

s3 = boto3.client('s3')

In [2]:
container = get_image_uri(region_name, 'ntm')

bucket_name = 'e15-nlp-amazon-review-polarity'
train_data_key = 'train.csv'
test_data_key = 'test.csv'

train_data = pd.read_csv(s3.get_object(Bucket = bucket_name,
                                Key = train_data_key)['Body'], 
                                header = None, 
                                names = ['class_label','review_title',
                                        'review_text'])
train_data.shape

(3600000, 3)

In [18]:
test_data = pd.read_csv(s3.get_object(Bucket=bucket_name,
                                        Key=test_data_key)['Body'],
                                        header=None,
                                        names=['class_label','review_title',
                                        'review_text'])

In [3]:
#common_spanish_words = ['libro','que','el','la','una','para','bien','eso',
                      # 'aquí','todo','vamos','muey','estoy','tengo','cuando'
                      # 'lo','pero','para','esta','los']
#common_spanish_words = '|'.join(common_spanish_words)

train_data_neg = train_data[train_data['class_label']==1]
comprehend = boto3.client('comprehend')

In [23]:
#import re
#train_data = train_data[train_data['class_label'] == 1]
##train_data_en = train_data[~mask]
#train_data_esp = train_data[mask]

In [4]:
train_data.reset_index(inplace = True, drop = True)
def try_and_find_lan(text):
    try:
        result = comprehend.detect_dominant_language(Text = text)
        return result['Languages'][0]['LanguageCode']
    except Exception as e:
        print(e)
        return 'N/A'

#train_data['language'] = train_data['review_text'].apply(lambda x: try_and_find_lan(x))


In [4]:
def batch_detect_language(text_column):
    import math
    
    textList = text_column.tolist()
    
    batch_size = 25
    
    num_batches = math.ceil(len(textList) / batch_size)
    
    langs = [''] * len(textList)
    
    for batch in range(num_batches):
        
        start = batch * batch_size
        end = batch_size * (batch + 1)
        
        try:
            textBatch = textList[start:end]
            
        except IndexError:
            textBatch = textList[start:]
    
        results = comprehend.batch_detect_dominant_language(TextList=textBatch)
    
        for result in results['ResultList']:

            index = result['Index'] + start

            lang = result['Languages']

            if len(lang) == 1:
                lang = lang[0]['LanguageCode']
            else:
                lang = 'multi'

            langs[index] = lang
    
    return langs

In [8]:
train_sample = train_data_neg.sample(n=100000)

textList = train_sample['review_text']

train_sample['lang'] = batch_detect_language(textList)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 6.68 µs


In [16]:
train_set = train_sample[train_sample['lang'] == 'en']

train_set_idx = train_set.index

In [12]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
import re
token_pattern = re.compile(r"(?u)\b\w\w+\b")

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in doc.split() if len(t) >= 2 and re.match("[a-z].*",t) 
                and re.match(token_pattern, t)]

[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [40]:
import time
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import gc

train_doc_list_dirty = train_set['review_text'].tolist()
val_doc_list_dirty = train_data_neg.drop(train_set_idx, axis=0)['review_text'].tolist()
test_doc_list_dirty = test_data['review_text'].tolist()

print(len(train_doc_list_dirty), len(val_doc_list_dirty), len(test_doc_list_dirty))
#train_doc_list, test_doc_list = train_test_split(train_data)
#test_doc_list, val_doc_list = train_test_split(test_doc_list)
#del train_data
#gc.collect()

99826 1700174 400000


In [41]:
#Get rid of all punctuation

import re

s = re.sub(r'[^\w\s]','',s)

train_doc_list = [re.sub(r'[^\w\s]','',s) for s in train_doc_list_dirty]
val_doc_list = [re.sub(r'[^\w\s]','',s) for s in val_doc_list_dirty]
test_doc_list = [re.sub(r'[^\w\s]','',s) for s in test_doc_list_dirty]

In [42]:
print('Lemmatizing and counting, this may take a few minutes...')
start_time = time.time()
vectorizer = CountVectorizer(input='content', analyzer='word', stop_words='english',
                             tokenizer=LemmaTokenizer(), max_df=0.9, min_df=3)

train_vectors = vectorizer.fit_transform(train_doc_list)
print('done_with_train')
val_vectors = vectorizer.transform(val_doc_list)
test_vectors = vectorizer.transform(test_doc_list)

vocab_list = vectorizer.get_feature_names()
vocab_size = len(vocab_list)
print('vocab size:', vocab_size)
print('Done. Time elapsed: {:.2f}s'.format(time.time() - start_time))

Lemmatizing and counting, this may take a few minutes...


  'stop_words.' % sorted(inconsistent))


done_with_train
vocab size: 35654
Done. Time elapsed: 973.57s


In [43]:
import scipy.sparse as sparse

def shuffle_and_dtype(vectors):
    idx = np.arange(vectors.shape[0])
    np.random.shuffle(idx)
    vectors = vectors[idx]
    vectors = sparse.csr_matrix(vectors, dtype=np.float32)
    print(type(vectors), vectors.dtype)
    return vectors

train_vectors = shuffle_and_dtype(train_vectors)
val_vectors = shuffle_and_dtype(val_vectors)
test_vectors = shuffle_and_dtype(test_vectors)

<class 'scipy.sparse.csr.csr_matrix'> float32
<class 'scipy.sparse.csr.csr_matrix'> float32
<class 'scipy.sparse.csr.csr_matrix'> float32


In [44]:
import os
import shutil

def check_create_dir(dir):
    if os.path.exists(dir):    # cleanup existing data folder
        shutil.rmtree(dir)
    os.mkdir(dir)

data_dir = os.getcwd()
check_create_dir(data_dir)
os.chdir(data_dir)
print('Current directory: ', os.getcwd())

Current directory:  /home/ec2-user/SageMaker/amazon-nlp


In [45]:
def split_convert(sparray, prefix, fname_template='data_part{}.pbr', n_parts=2):
    import io
    import sagemaker.amazon.common as smac

    chunk_size = sparray.shape[0] // n_parts
    for i in range(n_parts):

        # Calculate start and end indices
        start = i * chunk_size
        end = (i + 1) * chunk_size
        if i + 1 == n_parts:
            end = sparray.shape[0]

        # Convert to record protobuf
        buf = io.BytesIO()
        smac.write_spmatrix_to_sparse_tensor(array=sparray[start:end], file=buf, labels=None)
        buf.seek(0)

        fname = os.path.join(prefix, fname_template.format(i))
        with open(fname, 'wb') as f:
            f.write(buf.getvalue())
        print('Saved data to {}'.format(fname))
        
train_data_dir = os.path.join(data_dir, 'train')
val_data_dir = os.path.join(data_dir, 'validation')
test_data_dir = os.path.join(data_dir, 'test')

check_create_dir(train_data_dir)
check_create_dir(val_data_dir)
check_create_dir(test_data_dir)

split_convert(train_vectors, prefix=train_data_dir, fname_template='train_part{}.pbr', n_parts=1)
split_convert(val_vectors, prefix=val_data_dir, fname_template='val_part{}.pbr', n_parts=10)
split_convert(test_vectors, prefix=test_data_dir, fname_template='test_part{}.pbr', n_parts=1)

Saved data to /home/ec2-user/SageMaker/amazon-nlp/train/train_part0.pbr
Saved data to /home/ec2-user/SageMaker/amazon-nlp/validation/val_part0.pbr
Saved data to /home/ec2-user/SageMaker/amazon-nlp/validation/val_part1.pbr
Saved data to /home/ec2-user/SageMaker/amazon-nlp/validation/val_part2.pbr
Saved data to /home/ec2-user/SageMaker/amazon-nlp/validation/val_part3.pbr
Saved data to /home/ec2-user/SageMaker/amazon-nlp/validation/val_part4.pbr
Saved data to /home/ec2-user/SageMaker/amazon-nlp/validation/val_part5.pbr
Saved data to /home/ec2-user/SageMaker/amazon-nlp/validation/val_part6.pbr
Saved data to /home/ec2-user/SageMaker/amazon-nlp/validation/val_part7.pbr
Saved data to /home/ec2-user/SageMaker/amazon-nlp/validation/val_part8.pbr
Saved data to /home/ec2-user/SageMaker/amazon-nlp/validation/val_part9.pbr
Saved data to /home/ec2-user/SageMaker/amazon-nlp/test/test_part0.pbr


In [46]:
aux_data_dir = os.path.join(data_dir,'auxiliary')
check_create_dir(aux_data_dir)
with open(os.path.join(aux_data_dir,'vocab.txt'),'w',encoding = 'utf-8') as f:
    for item in vocab_list:
        f.write(item+'\n')

In [47]:
import os
import sagemaker
role = sagemaker.get_execution_role()
bucket = bucket_name
prefix = 'ntm/' + 'amazon_review'
train_prefix = os.path.join(prefix, 'train')
val_prefix = os.path.join(prefix, 'val')
aux_prefix = os.path.join(prefix, 'auxiliary')
test_prefix = os.path.join(prefix, 'test')
output_prefix = os.path.join(prefix, 'output')
s3_train_data = os.path.join('s3://', bucket, train_prefix)
s3_val_data = os.path.join('s3://', bucket, val_prefix)
s3_aux_data = os.path.join('s3://', bucket, aux_prefix)
s3_test_data = os.path.join('s3://', bucket, test_prefix)
output_path = os.path.join('s3://', bucket, output_prefix)
print('Training set location', s3_train_data)
print('Validation set location', s3_val_data)
print('Auxiliary data location', s3_aux_data)
print('Test data location', s3_test_data)
print('Trained model will be saved at', output_path)

Training set location s3://e15-nlp-amazon-review-polarity/ntm/amazon_review/train
Validation set location s3://e15-nlp-amazon-review-polarity/ntm/amazon_review/val
Auxiliary data location s3://e15-nlp-amazon-review-polarity/ntm/amazon_review/auxiliary
Test data location s3://e15-nlp-amazon-review-polarity/ntm/amazon_review/test
Trained model will be saved at s3://e15-nlp-amazon-review-polarity/ntm/amazon_review/output


In [48]:
import subprocess

cmd_train = 'aws s3 cp ' + train_data_dir + ' ' + s3_train_data + ' --recursive' 
p=subprocess.Popen(cmd_train, shell=True,stdout=subprocess.PIPE)
p.communicate()

(b'Completed 256.0 KiB/21.2 MiB (2.2 MiB/s) with 1 file(s) remaining\rCompleted 512.0 KiB/21.2 MiB (4.4 MiB/s) with 1 file(s) remaining\rCompleted 768.0 KiB/21.2 MiB (6.5 MiB/s) with 1 file(s) remaining\rCompleted 1.0 MiB/21.2 MiB (8.6 MiB/s) with 1 file(s) remaining  \rCompleted 1.2 MiB/21.2 MiB (10.6 MiB/s) with 1 file(s) remaining \rCompleted 1.5 MiB/21.2 MiB (12.7 MiB/s) with 1 file(s) remaining \rCompleted 1.8 MiB/21.2 MiB (14.7 MiB/s) with 1 file(s) remaining \rCompleted 2.0 MiB/21.2 MiB (16.6 MiB/s) with 1 file(s) remaining \rCompleted 2.2 MiB/21.2 MiB (18.6 MiB/s) with 1 file(s) remaining \rCompleted 2.5 MiB/21.2 MiB (20.4 MiB/s) with 1 file(s) remaining \rCompleted 2.8 MiB/21.2 MiB (22.3 MiB/s) with 1 file(s) remaining \rCompleted 3.0 MiB/21.2 MiB (24.2 MiB/s) with 1 file(s) remaining \rCompleted 3.2 MiB/21.2 MiB (25.9 MiB/s) with 1 file(s) remaining \rCompleted 3.5 MiB/21.2 MiB (27.6 MiB/s) with 1 file(s) remaining \rCompleted 3.8 MiB/21.2 MiB (29.3 MiB/s) with 1 file(s) rema

In [49]:
cmd_val = 'aws s3 cp ' + val_data_dir + ' ' + s3_val_data + ' --recursive' 
p=subprocess.Popen(cmd_val, shell=True,stdout=subprocess.PIPE)
p.communicate()

(b'Completed 256.0 KiB/358.9 MiB (2.7 MiB/s) with 10 file(s) remaining\rCompleted 512.0 KiB/358.9 MiB (5.2 MiB/s) with 10 file(s) remaining\rCompleted 768.0 KiB/358.9 MiB (7.7 MiB/s) with 10 file(s) remaining\rCompleted 1.0 MiB/358.9 MiB (10.1 MiB/s) with 10 file(s) remaining \rCompleted 1.2 MiB/358.9 MiB (12.4 MiB/s) with 10 file(s) remaining \rCompleted 1.5 MiB/358.9 MiB (14.8 MiB/s) with 10 file(s) remaining \rCompleted 1.8 MiB/358.9 MiB (17.0 MiB/s) with 10 file(s) remaining \rCompleted 2.0 MiB/358.9 MiB (19.0 MiB/s) with 10 file(s) remaining \rCompleted 2.2 MiB/358.9 MiB (21.3 MiB/s) with 10 file(s) remaining \rCompleted 2.5 MiB/358.9 MiB (23.3 MiB/s) with 10 file(s) remaining \rCompleted 2.8 MiB/358.9 MiB (25.2 MiB/s) with 10 file(s) remaining \rCompleted 3.0 MiB/358.9 MiB (27.1 MiB/s) with 10 file(s) remaining \rCompleted 3.2 MiB/358.9 MiB (29.3 MiB/s) with 10 file(s) remaining \rCompleted 3.5 MiB/358.9 MiB (31.1 MiB/s) with 10 file(s) remaining \rCompleted 3.8 MiB/358.9 MiB (32

In [50]:
cmd_test = 'aws s3 cp ' + test_data_dir + ' ' + s3_test_data + ' --recursive' 
p=subprocess.Popen(cmd_test, shell=True,stdout=subprocess.PIPE)
p.communicate()

(b'Completed 256.0 KiB/82.0 MiB (2.1 MiB/s) with 1 file(s) remaining\rCompleted 512.0 KiB/82.0 MiB (4.1 MiB/s) with 1 file(s) remaining\rCompleted 768.0 KiB/82.0 MiB (6.1 MiB/s) with 1 file(s) remaining\rCompleted 1.0 MiB/82.0 MiB (7.9 MiB/s) with 1 file(s) remaining  \rCompleted 1.2 MiB/82.0 MiB (9.9 MiB/s) with 1 file(s) remaining  \rCompleted 1.5 MiB/82.0 MiB (11.6 MiB/s) with 1 file(s) remaining \rCompleted 1.8 MiB/82.0 MiB (13.5 MiB/s) with 1 file(s) remaining \rCompleted 2.0 MiB/82.0 MiB (15.4 MiB/s) with 1 file(s) remaining \rCompleted 2.2 MiB/82.0 MiB (16.7 MiB/s) with 1 file(s) remaining \rCompleted 2.5 MiB/82.0 MiB (18.2 MiB/s) with 1 file(s) remaining \rCompleted 2.8 MiB/82.0 MiB (19.7 MiB/s) with 1 file(s) remaining \rCompleted 3.0 MiB/82.0 MiB (21.4 MiB/s) with 1 file(s) remaining \rCompleted 3.2 MiB/82.0 MiB (22.6 MiB/s) with 1 file(s) remaining \rCompleted 3.5 MiB/82.0 MiB (24.2 MiB/s) with 1 file(s) remaining \rCompleted 3.8 MiB/82.0 MiB (25.6 MiB/s) with 1 file(s) rema

In [51]:
cmd_aux = 'aws s3 cp ' + aux_data_dir + ' ' + s3_aux_data + ' --recursive' 
p=subprocess.Popen(cmd_aux, shell=True,stdout=subprocess.PIPE)
p.communicate()

(b'Completed 256.0 KiB/288.4 KiB (4.9 MiB/s) with 1 file(s) remaining\rCompleted 288.4 KiB/288.4 KiB (2.4 MiB/s) with 1 file(s) remaining\rupload: auxiliary/vocab.txt to s3://e15-nlp-amazon-review-polarity/ntm/amazon_review/auxiliary/vocab.txt\n',
 None)

In [52]:
import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri
container = get_image_uri(boto3.Session().region_name,'ntm')

In [60]:
import sagemaker
sess = sagemaker.Session()
ntm = sagemaker.estimator.Estimator(container,
                                   role,
                                   train_instance_count = 1,
                                   train_instance_type = 'ml.p3.2xlarge',
                                   output_path = output_path,
                                   sagemaker_session = sess)

In [115]:
num_topics = 12
ntm.set_hyperparameters(num_topics = num_topics, feature_dim = vocab_size, mini_batch_size = 60, epochs = 50, sub_sample = 1.0)

In [116]:
from sagemaker.session import s3_input
s3_train = s3_input(s3_train_data, distribution='ShardedByS3Key',
                    content_type='application/x-recordio-protobuf')
s3_val = s3_input(s3_val_data, distribution='FullyReplicated',
                  content_type='application/x-recordio-protobuf')
s3_test = s3_input(s3_test_data, distribution='FullyReplicated',
                  content_type='application/x-recordio-protobuf')

s3_aux = s3_input(s3_aux_data, distribution='FullyReplicated', content_type='text/plain')

In [None]:
ntm.fit({'train' : s3_train,'validation' : s3_val,'auxiliary' : s3_aux, 'test' : s3_test})

2019-08-02 19:55:13 Starting - Starting the training job...
2019-08-02 19:55:15 Starting - Launching requested ML instances...
2019-08-02 19:55:46 Starting - Insufficient capacity error from EC2 while launching instances, retrying!......
2019-08-02 19:56:45 Starting - Preparing the instances for training...
2019-08-02 19:57:42 Downloading - Downloading input data...
2019-08-02 19:57:53 Training - Downloading the training image.
[31mDocker entrypoint called with argument(s): train[0m
  from numpy.testing import nosetester[0m
[31m[08/02/2019 19:58:19 INFO 140227500922688] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/default-input.json: {u'num_patience_epochs': u'3', u'clip_gradient': u'Inf', u'encoder_layers': u'auto', u'optimizer': u'adadelta', u'_kvstore': u'auto_gpu', u'rescale_gradient': u'1.0', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'learning_rate': u'0.01', u'_data_format': u'record', u'sub_sample': u'1.0', u'epochs': u

In [58]:
print('Training job name : {}'.format(ntm.latest_training_job.job_name))

Training job name : ntm-2019-08-02-18-17-28-202


In [71]:
ntm_predictor2 = ntm.deploy(initial_instance_count=1,
                           instance_type='ml.m4.xlarge')

-------------------------------------------------------------------------------------------------------!

In [65]:
from sagemaker.predictor import csv_serializer, json_deserializer

In [66]:
ntm_predictor.content_type = 'text/csv'
ntm_predictor.serializer = csv_serializer
ntm_predictor.deserializer = json_deserializer

In [86]:
ntm_predictor2.content_type = 'text/csv'
ntm_predictor2.serializer = csv_serializer
ntm_predictor2.deserializer = json_deserializer

In [110]:
def make_prediction(review, topic_list=None):
    review_vec = vectorizer.transform([review]).toarray()
    results = ntm_predictor.predict(review_vec)
    predictions = np.array([prediction['topic_weights'] for prediction in results['predictions']])[0]
    
    if not topic_list:
        topic_list = list(range(len(predictions)))
    
    return pd.DataFrame({'Topic':topic_list, 'Confidence':predictions})

In [113]:
topic_list = ['Rock Music', 'Software', 'Music', 'Books', 'Kitchen/Bathroom', 'Maintenance', 'Wut', 'Music']

review = "My battery died."

make_prediction(review)

Unnamed: 0,Topic,Confidence
0,0,0.043943
1,1,0.033241
2,2,0.041997
3,3,0.041366
4,4,0.043546
5,5,0.040585
6,6,0.038901
7,7,0.040161
8,8,0.043018
9,9,0.044864
