In [59]:
import pandas as pd
import numpy as np
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import ssl
import re

import gensim.downloader as api

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /Users/david/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/david/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

1. import and prepare data, review, clean

In [3]:
df = pd.read_csv('Reviews.csv')
df.head(10)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
5,6,B006K2ZZ7K,ADT0SRK1MGOEU,Twoapennything,0,0,4,1342051200,Nice Taffy,I got a wild hair for taffy and ordered this f...
6,7,B006K2ZZ7K,A1SP2KVKFXXRU1,David C. Sullivan,0,0,5,1340150400,Great! Just as good as the expensive brands!,This saltwater taffy had great flavors and was...
7,8,B006K2ZZ7K,A3JRGQVEQN31IQ,Pamela G. Williams,0,0,5,1336003200,"Wonderful, tasty taffy",This taffy is so good. It is very soft and ch...
8,9,B000E7L2R4,A1MZYO9TZK0BBI,R. James,1,1,5,1322006400,Yay Barley,Right now I'm mostly just sprouting this so my...
9,10,B00171APVA,A21BT40VZCCYT4,Carol A. Reed,0,0,5,1351209600,Healthy Dog Food,This is a very healthy dog food. Good for thei...


In [4]:
df = df[['Summary', 'Score']]
df

Unnamed: 0,Summary,Score
0,Good Quality Dog Food,5
1,Not as Advertised,1
2,"""Delight"" says it all",4
3,Cough Medicine,2
4,Great taffy,5
...,...,...
568449,Will not do without,5
568450,disappointed,2
568451,Perfect for our maltipoo,5
568452,Favorite Training and reward treat,5


df = df.dropna()

remove stopwords

In [9]:
df['filtered'] = df['Summary'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stopwords.words('english')]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['filtered'] = df['Summary'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stopwords.words('english')]))


In [11]:
df

Unnamed: 0,Summary,Score,filtered
0,Good Quality Dog Food,5,Good Quality Dog Food
1,Not as Advertised,1,Advertised
2,"""Delight"" says it all",4,"""Delight"" says"
3,Cough Medicine,2,Cough Medicine
4,Great taffy,5,Great taffy
...,...,...,...
568449,Will not do without,5,without
568450,disappointed,2,disappointed
568451,Perfect for our maltipoo,5,Perfect maltipoo
568452,Favorite Training and reward treat,5,Favorite Training reward treat


stemming

In [14]:
df['filtered'] = df['filtered'].apply(lambda x: ' '.join([word for word in x.split() if PorterStemmer().stem(word)]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['filtered'] = df['filtered'].apply(lambda x: ' '.join([word for word in x.split() if PorterStemmer().stem(word)]))


In [15]:
df

Unnamed: 0,Summary,Score,filtered
0,Good Quality Dog Food,5,Good Quality Dog Food
1,Not as Advertised,1,Advertised
2,"""Delight"" says it all",4,"""Delight"" says"
3,Cough Medicine,2,Cough Medicine
4,Great taffy,5,Great taffy
...,...,...,...
568449,Will not do without,5,without
568450,disappointed,2,disappointed
568451,Perfect for our maltipoo,5,Perfect maltipoo
568452,Favorite Training and reward treat,5,Favorite Training reward treat


remove punctuation 

In [23]:
df['filtered'] = df['filtered'].apply(lambda x: re.sub('[^a-zA-Z0-9]', ' ', x.lower()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['filtered'] = df['filtered'].apply(lambda x: re.sub('[^a-zA-Z0-9]', ' ', x.lower()))


In [24]:
df

Unnamed: 0,Summary,Score,filtered
0,Good Quality Dog Food,5,good quality dog food
1,Not as Advertised,1,advertised
2,"""Delight"" says it all",4,delight says
3,Cough Medicine,2,cough medicine
4,Great taffy,5,great taffy
...,...,...,...
568449,Will not do without,5,without
568450,disappointed,2,disappointed
568451,Perfect for our maltipoo,5,perfect maltipoo
568452,Favorite Training and reward treat,5,favorite training reward treat


In [26]:
def sentiment(score):
    if score > 3:
        return 'positive'
    else:
        return 'negative'
    
df['Sentiment'] = df['Score'].apply(sentiment)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Sentiment'] = df['Score'].apply(sentiment)


Unnamed: 0,Summary,Score,filtered,Sentiment
0,Good Quality Dog Food,5,good quality dog food,positive
1,Not as Advertised,1,advertised,negative
2,"""Delight"" says it all",4,delight says,positive
3,Cough Medicine,2,cough medicine,negative
4,Great taffy,5,great taffy,positive
...,...,...,...,...
568449,Will not do without,5,without,positive
568450,disappointed,2,disappointed,negative
568451,Perfect for our maltipoo,5,perfect maltipoo,positive
568452,Favorite Training and reward treat,5,favorite training reward treat,positive


2. Data preprocessing with unigram, bigram or tfidf method

In [30]:
vectorizer = TfidfVectorizer(ngram_range = (1, 2))
matrix = vectorizer.fit_transform(df['filtered'])
"""
in matrix, row is for each line in filtered, and column is for each unique feature that appears in the line
"""

here is an example of what the matrix would look like for first three rows

In [36]:
vectorizer = TfidfVectorizer(ngram_range = (1, 2))
matrix = vectorizer.fit_transform(df['filtered'][0:3])
print(df['filtered'][0:3])
print(matrix)
print(matrix.toarray())

0    good quality dog food
1               advertised
2            delight  says
Name: filtered, dtype: object
  (0, 4)	0.37796447300922725
  (0, 9)	0.37796447300922725
  (0, 7)	0.37796447300922725
  (0, 5)	0.37796447300922725
  (0, 3)	0.37796447300922725
  (0, 8)	0.37796447300922725
  (0, 6)	0.37796447300922725
  (1, 0)	1.0
  (2, 2)	0.5773502691896257
  (2, 10)	0.5773502691896257
  (2, 1)	0.5773502691896257
[[0.         0.         0.         0.37796447 0.37796447 0.37796447
  0.37796447 0.37796447 0.37796447 0.37796447 0.        ]
 [1.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.        ]
 [0.         0.57735027 0.57735027 0.         0.         0.
  0.         0.         0.         0.         0.57735027]]


In [39]:
model = api.load("word2vec-google-news-300")

[--------------------------------------------------] 1.4% 23.4/1662.8MB downloaded

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[===-----------------------------------------------] 6.1% 101.4/1662.8MB downloaded

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[====----------------------------------------------] 9.9% 165.2/1662.8MB downloaded

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)






In [50]:
def sumToVec(summary, model):
    words = summary.split()
    vectors = [model[word] for word in words if word in model.key_to_index]
    if len(vectors) > 0:
        return np.mean(vectors, axis = 0)
    else:
        return np.zeros(model.vector_size)

word_vectors = np.array([sumToVec(summary, model=model) for summary in df['filtered']])
word_vectors

array([[-0.08740234,  0.11917114, -0.05917358, ...,  0.05981445,
        -0.05622578,  0.03524017],
       [-0.04443359, -0.01745605, -0.10986328, ..., -0.28125   ,
        -0.0703125 , -0.26953125],
       [ 0.12194824, -0.14501953,  0.14453125, ...,  0.21630859,
         0.13378906, -0.05081177],
       ...,
       [ 0.00445557, -0.06591797, -0.12915039, ...,  0.02392578,
        -0.03546143,  0.0838623 ],
       [-0.06140137,  0.06338501,  0.13397217, ...,  0.01080322,
         0.08422852, -0.01985168],
       [ 0.03361511,  0.04101562, -0.09869385, ..., -0.08501434,
         0.09405518, -0.07481384]])

In [53]:
len(word_vectors) == len(df['Sentiment'])

True

3. train a model

In [55]:
X = word_vectors
y = np.where(df['Score'] > 3, 1, 0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rfc = RandomForestClassifier(n_estimators = 100, random_state = 42)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

In [60]:
print(y_pred)
print(y_test)

[1 1 1 ... 0 1 1]
[1 1 1 ... 0 1 1]


4. evaluate the model

In [63]:
acc = accuracy_score(y_test, y_pred)
acc

0.8960294143518112

In [65]:
roc_auc = roc_auc_score(y_test, y_pred)
roc_auc

0.7928221145468539

5. implement trained model for real life date

In [85]:
new_data = ['This dog food is excellent', 'terrible experience', 'disappointed food', 'disappointed cat food', 'disappointed disappointed cat food']

W = np.array([sumToVec(summary, model=model) for summary in new_data])
W

newy_pred = rfc.predict(W)
newy_pred

array([1, 0, 0, 1, 0])

To summarize, The model 'rfc' trained is accurate for given data. However, it is noticable that it is not sensitive to certain words, for example 'cat food', because in the training process, cat food was described as positive, which means there is a good chance that when the model encounter the word cat, it will predict the sentiment to be positive because the positive weight it holds is noticable. 

To improve upon this model, we can add some steps in data cleaning process where we strictly filter out unnecessary words, specifically those that are not adjectives. At the same time, there are also tricky cases that contribute to the unprecise of the model. For example, "Will not do without" will be filted out to "without", which is a waste of data. 

In general, the model is accurate for the amazon review purpose, but there is room for further improvement.