### Read datafile as CSV

In [3]:
import pandas as pd

# Location of the training CSV and the column names to assign to it.
# Because `names=` is supplied without `header=`, pandas treats the first
# line of the file as data rather than as a header row.
TRAIN_CSV_URI = "s3://sagemaker-us-east-1-023375022819/NLP_AWS/NLP_AWS/train/train.csv"
COLUMN_NAMES = ["Label", "Title", "Review"]

df = pd.read_csv(TRAIN_CSV_URI, names=COLUMN_NAMES)

In [4]:
# Keep only the rows whose Label equals 1.
# NOTE(review): the variable name suggests label 1 marks negative reviews -
# confirm against the dataset description.
is_label_one = df['Label'] == 1
neg_df = df.loc[is_label_one]

In [5]:
# Preview the first rows of the filtered frame.
neg_df.head()

Unnamed: 0,Label,Title,Review
6,1,Buyer beware,"This is a self-published book, and if you want..."
10,1,The Worst!,A complete waste of time. Typographical errors...
13,1,Oh please,I guess you have to be a romance novel lover f...
14,1,Awful beyond belief!,I feel I have to write to keep others from was...
15,1,Don't try to fool us with fake reviews.,It's glaringly obvious that all of the glowing...


In [6]:
# Summarize the filtered frame: index range, column dtypes and memory usage.
neg_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1800000 entries, 6 to 3599998
Data columns (total 3 columns):
Label     int64
Title     object
Review    object
dtypes: int64(1), object(2)
memory usage: 54.9+ MB


In [7]:
# Take the 'Review' text of the first 10,000 rows of the filtered frame.
# The previous `neg_df.iloc[:, 0:10000]` sliced the *column* axis (the second
# iloc position); on a three-column frame that selects every column and every
# row, so no row limit was actually applied.
extract_df = neg_df['Review'].iloc[:10000]

In [8]:
# Show the first ten review texts.
extract_df.head(10)

6     This is a self-published book, and if you want...
10    A complete waste of time. Typographical errors...
13    I guess you have to be a romance novel lover f...
14    I feel I have to write to keep others from was...
15    It's glaringly obvious that all of the glowing...
19    sizes are much smaller than what is recomended...
20    This model may be ok for sedentary types, but ...
22    Rather than scratches and insect droppings, th...
25    I have had the charger for more than two years...
26    I bought one of these chargers..the instructio...
Name: Review, dtype: object

### Deploy Neural Topic Modeling

In [10]:
# Working sample: the first 10,000 reviews, selected by position.
short_df = extract_df.iloc[:10000]

In [11]:
# Number of non-null reviews in the working sample.
short_df.count()

10000

In [12]:
!pip install nltk
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
import re
# Matches a run of two or more word characters bounded by word boundaries.
token_pattern = re.compile(r"(?u)\b\w\w+\b")


class LemmaTokenizer(object):
    """Callable tokenizer: NLTK word tokenization, filtering, WordNet lemmatization."""

    def __init__(self):
        # One lemmatizer instance is reused for every document.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the lemmatized tokens of ``doc`` that pass all filters.

        A token is kept only when it is at least two characters long, begins
        with a lowercase ASCII letter, and ``token_pattern`` matches at its
        start. Kept tokens are lemmatized in their original order.
        """
        lemmas = []
        for tok in word_tokenize(doc):
            if len(tok) < 2:
                continue
            if not re.match("[a-z].*", tok):
                continue
            if not re.match(token_pattern, tok):
                continue
            lemmas.append(self.wnl.lemmatize(tok))
        return lemmas

[33mYou are using pip version 10.0.1, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
from random import shuffle
import multiprocessing
from multiprocessing import Pool
import csv
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
import time
import numpy as np

In [15]:
!conda install -y scipy

Solving environment: done


  current version: 4.5.12
  latest version: 4.7.11

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.



In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
# Cap on the number of vocabulary terms; passed to CountVectorizer as max_features.
vocab_size = 2000
print('Tokenizing and counting, this may take a few minutes...')
# Wall-clock start; the elapsed time is reported once vectorization has finished.
start_time = time.time()

Tokenizing and counting, this may take a few minutes...


In [18]:
# Build a bag-of-words matrix from the review sample.
#   stop_words='english' : drop scikit-learn's built-in English stop words
#   tokenizer            : custom lemmatizing tokenizer defined earlier
#   max_features         : keep at most `vocab_size` of the most frequent terms
#   max_df / min_df      : ignore terms found in more than 95% of documents
#                          or in fewer than 2 documents
vectorizer = CountVectorizer(input='content', analyzer='word', stop_words='english',
                             tokenizer=LemmaTokenizer(), max_features=vocab_size, max_df=0.95, min_df=2)
vectors = vectorizer.fit_transform(short_df)

# get_feature_names() is gone from newer scikit-learn releases; use
# get_feature_names_out() when the installed version has it and fall back to
# the legacy method otherwise, so the cell runs on both.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab_list = vectorizer.get_feature_names_out().tolist()
else:
    vocab_list = vectorizer.get_feature_names()
print('vocab size:', len(vocab_list))

# Random shuffle of the document rows. A fixed seed makes the row order (and
# anything derived from it downstream) reproducible across kernel restarts.
shuffle_seed = 42
idx = np.random.RandomState(shuffle_seed).permutation(vectors.shape[0])
vectors = vectors[idx]

print('Done. Time elapsed: {:.2f}s'.format(time.time() - start_time))

  'stop_words.' % sorted(inconsistent))


vocab size: 2000
Done. Time elapsed: 37.62s


In [20]:
# Preview the learned vocabulary. The earlier label read 'vocab size:' even
# though the value printed is the term list itself (the size is already
# printed by the vectorization cell); only a slice is shown to avoid dumping
# the whole vocabulary into the output.
print('vocab sample:', vocab_list[:50])



In [21]:
threshold = 25
# Row sums give each document's total count of in-vocabulary tokens; keep only
# the documents whose total is strictly greater than the threshold.
doc_lengths = np.asarray(vectors.sum(axis=1)).ravel()
vectors = vectors[doc_lengths > threshold]
# The filter also drops documents with exactly `threshold` tokens, so the
# message reports '<=' (it previously said '<', which did not match the filter).
print('removed short docs (<={} words)'.format(threshold))
print(vectors.shape)

# Inspect the container type, value dtype and the first remaining document.
print(type(vectors), vectors.dtype)
print(vectors[0])

removed short docs (<25 words)
(4463, 2000)
<class 'scipy.sparse.csr.csr_matrix'> int64
  (0, 1975)	2
  (0, 1922)	1
  (0, 1791)	1
  (0, 1901)	1
  (0, 753)	1
  (0, 1110)	1
  (0, 1536)	1
  (0, 641)	1
  (0, 1516)	1
  (0, 153)	1
  (0, 982)	1
  (0, 1126)	4
  (0, 224)	1
  (0, 548)	1
  (0, 1698)	1
  (0, 185)	1
  (0, 1927)	1
  (0, 634)	1
  (0, 422)	1
  (0, 1603)	1
  (0, 1885)	2


In [22]:
import scipy.sparse as sparse
# Rebuild the count matrix as CSR with float32 values (the earlier cell output
# shows int64) and print the type and dtype to confirm the conversion.
vectors = sparse.csr_matrix(vectors, dtype=np.float32)
print(type(vectors), vectors.dtype)

<class 'scipy.sparse.csr.csr_matrix'> float32
