In [1]:
import boto3
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri

# Plain boto3 handles: an S3 client for object reads, and the region name
# reported by a default boto3 session.
s3 = boto3.client('s3')
region_name = boto3.Session().region_name

# SageMaker handles: a session object and the role returned by
# get_execution_role().
sess = sagemaker.Session()
role = get_execution_role()

In [2]:
# Image URI of the 'ntm' algorithm container for the current region.
container = get_image_uri(region_name, 'ntm')

# S3 bucket and object keys of the review CSV files.
bucket_name = 'e15-nlp-amazon-review-polarity'
train_data_key = 'train.csv'
test_data_key = 'test.csv'

# The CSV is read with header=None, so the column names are supplied here.
column_names = ['class_label', 'review_title', 'review_text']

# Fetch the training object and parse its 'Body' entry directly with pandas.
train_object = s3.get_object(Bucket=bucket_name, Key=train_data_key)
train_data = pd.read_csv(train_object['Body'], header=None, names=column_names)
train_data.shape

(3600000, 3)

In [3]:
# Keep only the rows whose class_label equals 1.
is_label_one = train_data['class_label'] == 1
train_data = train_data[is_label_one]

In [4]:
# Model settings used by later cells.
# NOTE(review): feature_dim is fixed at 2 here, but the vocabulary size is
# only computed further down - confirm this value is what the consumer expects.
num_topics, feature_dim = 5, 2

In [5]:
import re

import nltk
from nltk.stem import WordNetLemmatizer

# Download the 'wordnet' NLTK package before the lemmatizer is used.
nltk.download('wordnet')

# Unicode-aware pattern: a run of two or more word characters delimited by
# word boundaries. LemmaTokenizer uses it as a token filter.
token_pattern = re.compile(r"(?u)\b\w\w+\b")

class LemmaTokenizer(object):
    """Callable tokenizer that whitespace-splits a document and lemmatizes it.

    A token is kept only when it is at least two characters long, begins with
    a lowercase ASCII letter, and matches ``token_pattern`` at its start.
    Kept tokens are passed through ``WordNetLemmatizer.lemmatize``.

    NOTE(review): tokens come from ``str.split()`` and the regex checks only
    anchor at the start, so trailing punctuation (e.g. ``"good."``) stays
    attached to the token that is lemmatized.
    """

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmatized tokens of ``doc`` that pass the filters."""
        lemmas = []
        for word in doc.split():
            # Guard clauses mirror the filter order: length, leading
            # lowercase letter, then the shared token pattern.
            if len(word) < 2:
                continue
            if not re.match("[a-z].*", word):
                continue
            if not re.match(token_pattern, word):
                continue
            lemmas.append(self.wnl.lemmatize(word))
        return lemmas

[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [8]:
import time
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import gc

# Fixed seed so the train/validation/test partition (and everything fitted on
# it) is the same after a kernel restart.
split_seed = 42

# Extract the review bodies into a plain list under a new name, so that
# `train_data` is still a DataFrame if a later line in this cell fails.
# Rows with a missing review_text are dropped: a NaN entry is a float, which
# the text vectorizer cannot tokenize. This is a no-op when nothing is missing.
review_texts = train_data['review_text'].dropna().tolist()

# First split: training documents vs. a held-out remainder
# (train_test_split's default test_size applies).
train_doc_list, test_doc_list = train_test_split(review_texts,
                                                 random_state=split_seed)
# Second split: divide the held-out remainder into test and validation sets.
test_doc_list, val_doc_list = train_test_split(test_doc_list,
                                               random_state=split_seed)

# Free the DataFrame and the intermediate list; only the three document lists
# are needed from here on.
del train_data, review_texts
gc.collect()

245

In [None]:
print('Lemmatizing and counting, this may take a few minutes...')
start_time = time.time()

# Bag-of-words counts using the custom lemmatizing tokenizer. Terms that
# appear in more than 90% of documents or in fewer than 3 documents are
# excluded from the vocabulary.
# NOTE(review): stop_words='english' is combined with a custom tokenizer, so
# scikit-learn may warn that the stop words are inconsistent with the
# tokenizer's output.
vectorizer = CountVectorizer(input='content', analyzer='word', stop_words='english',
                             tokenizer=LemmaTokenizer(), max_df=0.9, min_df=3)

# The vocabulary is learned from the training documents only; validation and
# test documents are projected onto that same vocabulary.
train_vectors = vectorizer.fit_transform(train_doc_list)
print('done_with_train')
val_vectors = vectorizer.transform(val_doc_list)
test_vectors = vectorizer.transform(test_doc_list)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back to the old
# method on older installations. .tolist() keeps vocab_list a plain list.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab_list = vectorizer.get_feature_names_out().tolist()
else:
    vocab_list = vectorizer.get_feature_names()
vocab_size = len(vocab_list)
print('vocab size:', vocab_size)
print('Done. Time elapsed: {:.2f}s'.format(time.time() - start_time))

Lemmatizing and counting, this may take a few minutes...


  'stop_words.' % sorted(inconsistent))
