In [1]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cluster import KMeans 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.metrics import pairwise_distances
from sklearn.metrics import accuracy_score
from sklearn.metrics.cluster import adjusted_rand_score
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from scipy.stats import multivariate_normal as mvn
import os
from os import path
from PIL import Image
#from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the dataset
df = pd.read_csv("SampleReviewData.csv")

In [3]:
df

Unnamed: 0,No.,No. 2,No. 3,Time,Review text,Rater 1 - Valence,Rater 1 - Arousal,Rater 2 - Valence,Rater 2 - Arousal
0,1,158674,247,10/27/11,The best ramen I've ever had. EVER. All of the...,9,7,7,6
1,2,31960,277,2/18/12,"cheap, reliable, filling and open late. Dont c...",8,7,9,8
2,3,68445,555,5/13/13,I really don't understand why this place is ra...,2,4,1,5
3,4,99410,247,4/22/13,Thank god they take down your phone number so ...,5,6,7,8
4,5,90106,452,3/23/13,We saw the B rating on the door...we should ha...,5,7,3,6
...,...,...,...,...,...,...,...,...,...
1995,1996,88406,443,7/17/14,"Delicious, fresh and carefully prepared food, ...",8,7,8,6
1996,1997,7776,27,3/6/13,This place is great. A perfect and tasty veget...,8,6,8,6
1997,1998,22754,345,8/29/14,Overall a great place. The snacks are sized fo...,9,6,7,4
1998,1999,15147,662,2/21/12,OMG G-O-O-D!!! I have to give credit where cre...,9,9,9,9


In [7]:
df = df.drop(['No.', 'No. 2', 'No. 3'], axis=1, inplace =  False)

In [9]:
#Rename columns
df = df.rename({'Review text': 'review_text', 'Rater 1 - Valence': 'rater1_valence', 'Rater 1 - Arousal': 'rater1_arousal', 'Rater 2 - Valence':'rater2_valence', 'Rater 2 - Arousal': 'rater2_arousal'}, axis=1, inplace=False)

In [14]:
df

Unnamed: 0,Time,review_text,rater1_valence,rater1_arousal,rater2_valence,rater2_arousal
0,10/27/11,The best ramen I've ever had. EVER. All of the...,9,7,7,6
1,2/18/12,"cheap, reliable, filling and open late. Dont c...",8,7,9,8
2,5/13/13,I really don't understand why this place is ra...,2,4,1,5
3,4/22/13,Thank god they take down your phone number so ...,5,6,7,8
4,3/23/13,We saw the B rating on the door...we should ha...,5,7,3,6
...,...,...,...,...,...,...
1995,7/17/14,"Delicious, fresh and carefully prepared food, ...",8,7,8,6
1996,3/6/13,This place is great. A perfect and tasty veget...,8,6,8,6
1997,8/29/14,Overall a great place. The snacks are sized fo...,9,6,7,4
1998,2/21/12,OMG G-O-O-D!!! I have to give credit where cre...,9,9,9,9


In [12]:
df1

0       The best ramen I've ever had. EVER. All of the...
1       cheap, reliable, filling and open late. Dont c...
2       I really don't understand why this place is ra...
3       Thank god they take down your phone number so ...
4       We saw the B rating on the door...we should ha...
                              ...                        
1995    Delicious, fresh and carefully prepared food, ...
1996    This place is great. A perfect and tasty veget...
1997    Overall a great place. The snacks are sized fo...
1998    OMG G-O-O-D!!! I have to give credit where cre...
1999    Best brunch ever! Thanks to my dear friend who...
Name: review_text, Length: 2000, dtype: object

# Sentiment Analysis

In [15]:
# Remove any 'neutral' ratings equal to 5
df = df[df['rater1_valence'] != 5]

In [17]:
# Encode r>=6 as 1 (rated positively)
# Encode r<6 as 0 (rated poorly)
df['Positively Rated'] = np.where(df['rater1_valence'] > 5, 1, 0)
df.head(10)

Unnamed: 0,Time,review_text,rater1_valence,rater1_arousal,rater2_valence,rater2_arousal,Positively Rated
0,10/27/11,The best ramen I've ever had. EVER. All of the...,9,7,7,6,1
1,2/18/12,"cheap, reliable, filling and open late. Dont c...",8,7,9,8,1
2,5/13/13,I really don't understand why this place is ra...,2,4,1,5,0
5,6/25/13,this place was great. I've tried about every ...,8,6,7,5,1
6,1/6/12,The pizza was okay. Not anywhere near what th...,7,5,5,4,1
7,8/11/13,I don't think I can ever have any other burger...,9,9,8,8,1
8,1/11/11,"Worth every dime. Great staff, great food, gre...",9,7,7,6,1
9,5/9/11,"A very fun ambiance, the place is always jam p...",7,8,8,6,1
10,7/16/14,"Very nice pizza slice, price $ 4 or 4.50 but I...",4,5,6,3,0
11,6/6/12,"Big fan, beautiful space, very sweet manager, ...",9,5,6,7,1


In [18]:
# Most ratings are positive
df['Positively Rated'].mean()

0.8664202745512144

In [19]:
from sklearn.model_selection import train_test_split

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['review_text'], 
                                                    df['Positively Rated'], 
                                                    random_state=0)

In [20]:
print('X_train first entry:\n\n', X_train.iloc[0])
print('\n\nX_train shape: ', X_train.shape)

X_train first entry:



X_train shape:  (1420,)


# CountVectorizer

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the CountVectorizer to the training data
vect = CountVectorizer().fit(X_train)

In [22]:
vect.get_feature_names()[::2000]

['00', 'concentrated', 'grilling', 'nyc', 'situated', 'zha']

In [23]:
len(vect.get_feature_names())

10016

In [24]:
# transform the documents in the training data to a document-term matrix
X_train_vectorized = vect.transform(X_train)

X_train_vectorized

<1420x10016 sparse matrix of type '<class 'numpy.int64'>'
	with 109172 stored elements in Compressed Sparse Row format>

In [25]:
from sklearn.linear_model import LogisticRegression

# Train the model
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

LogisticRegression()

In [26]:
from sklearn.metrics import roc_auc_score

# Predict the transformed test documents
predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.7436565867222802


In [27]:
# get the feature names as numpy array
feature_names = np.array(vect.get_feature_names())

# Sort the coefficients from the model
sorted_coef_index = model.coef_[0].argsort()

# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1] 
# so the list returned is in order of largest to smallest
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['bad' 'rude' 'way' 'overpriced' 'ok' 'other' 'not' 'rated' 'disappointed'
 'won']

Largest Coefs: 
['delicious' 'great' 'amazing' 'little' 'excellent' 'say' 'best' 'tasty'
 'super' 'fantastic']


In [28]:
#Tfidf
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TfidfVectorizer to the training data specifiying a minimum document frequency of 5
vect = TfidfVectorizer(min_df=5).fit(X_train)
len(vect.get_feature_names())

2485

In [43]:
X_train_vectorized = vect.transform(X_train)

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.7119105549762486


In [30]:
feature_names = np.array(vect.get_feature_names())

sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

print('Smallest tfidf:\n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest tfidf: \n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))

Smallest tfidf:
['heirloom' 'cover' 'executed' 'heavily' 'sucker' 'slight' 'swear'
 'palate' 'shredded' 'heading']

Largest tfidf: 
['delicious' 'amazing' 'beer' 'sushi' 'pizza' 'bad' 'yum' 'star' 'crepe'
 'spicy']


In [31]:
sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['not' 'bad' 'overpriced' 'rude' 'way' 'ok' 'he' 'was' 'terrible' 'us']

Largest Coefs: 
['great' 'delicious' 'and' 'amazing' 'excellent' 'best' 'perfect'
 'definitely' 'try' 'love']


In [33]:
# These reviews are treated the same by our current model
print(model.predict(vect.transform(["not a good place recommend it",
                                    'they offer a wonderful service'])))

[1 1]


# n-grams

In [39]:
# Fit the CountVectorizer to the training data specifiying a minimum 
# document frequency of 5 and extracting 1-grams and 2-grams
vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)

X_train_vectorized = vect.transform(X_train)

len(vect.get_feature_names())

6971

In [40]:
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.7119105549762486


In [36]:
feature_names = np.array(vect.get_feature_names())

sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['bad' 'rude' 'way' 'overpriced' 'nothing special' 'not' 'is not' 'other'
 'terrible' 'nothing']

Largest Coefs: 
['delicious' 'great' 'amazing' 'excellent' 'to the' 'that the' 'little'
 'say' 'best' 'tasty']
