In [47]:
import pandas as pd
import numpy as np
import string
import re

In [48]:
df_reviews = pd.read_csv('final_restaurant_review_data.csv', lineterminator='\n')

In [49]:
df_reviews['full_review']

0         Actually ordered online with Deliveroo! disapp...
1         Hate to write bad reviews but.... Bad service,...
2         Tempting pizzas I went with a couple a family ...
3         Quick lunch pizza Great pizza. Easy to order a...
4         Nice italian thin crust pizzas Good for casual...
                                ...                        
400936    Poor service and food is below average. Wanted...
400937    Questionable Service Very poor service indeed....
400938    Terrible Experience Waited for 2 hours for my ...
400939    Not worth it. Order via deliveroo. Will be my ...
400940    A genuine review.. Guys, never ever order onli...
Name: full_review, Length: 400941, dtype: object

#### Converting full_review column to str for pre-processing

In [50]:
df_reviews['full_review'] = df_reviews['full_review'].astype(str)
df_reviews['full_review'].head()

0    Actually ordered online with Deliveroo! disapp...
1    Hate to write bad reviews but.... Bad service,...
2    Tempting pizzas I went with a couple a family ...
3    Quick lunch pizza Great pizza. Easy to order a...
4    Nice italian thin crust pizzas Good for casual...
Name: full_review, dtype: object

### Text Pre-processing

#### Punctuation removal
##### All the punctuations from the text are removed. string library of Python contains some pre-defined list of punctuations such as ‘!”#$%&'()*+,-./:;?@[\]^_`{|}~’

In [15]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [18]:
# function to remove punctuation
def remove_punctuation(review):
    review_without_punctuation = "".join([i for i in review if i not in string.punctuation])
    return review_without_punctuation

df_reviews['cleaned_review'] = df_reviews['full_review'].apply(lambda x: remove_punctuation(x))
df_reviews.head()

Unnamed: 0,rating,date,title,description,date_of_visit,url,num_of_img_uploaded,full_review,num_of_tokens_title,num_of_tokens_description,num_of_tokens_full_review,title_sentiment,description_sentiment,full_review_sentiment,review_sentiment_category
0,3.0,2021-12-04,Actually ordered online with Deliveroo! disapp...,Ordered a chicken Cobb salad which is meant to...,2021-12-01,https://www.tripadvisor.com.sg/Restaurant_Revi...,0.0,Actually ordered online with Deliveroo disappo...,6.0,61.0,67.0,-0.3,-0.3625,-0.3625,0.0
1,1.0,2021-01-26,Hate to write bad reviews but....,"Bad service, bad attitude of owner and average...",2021-01-01,https://www.tripadvisor.com.sg/Restaurant_Revi...,0.0,Hate to write bad reviews but Bad service bad ...,6.0,16.0,22.0,-0.75,-0.13,-0.13,0.0
2,4.0,2020-08-21,Tempting pizzas,I went with a couple a family friend pair who ...,2020-03-01,https://www.tripadvisor.com.sg/Restaurant_Revi...,0.0,Tempting pizzas I went with a couple a family ...,2.0,34.0,36.0,0.0,0.566667,0.566667,1.0
3,5.0,2019-11-09,Quick lunch pizza,Great pizza. Easy to order and delivery was ri...,2019-11-01,https://www.tripadvisor.com.sg/Restaurant_Revi...,1.0,Quick lunch pizza Great pizza Easy to order an...,3.0,19.0,22.0,0.333333,0.479762,0.479762,1.0
4,3.0,2019-11-06,Nice italian thin crust pizzas,Good for casual dining with friends. Tasty ita...,2019-10-01,https://www.tripadvisor.com.sg/Restaurant_Revi...,0.0,Nice italian thin crust pizzas Good for casual...,5.0,21.0,26.0,0.066667,0.15,0.15,1.0


#### Or we can remove punctuations manually

In [51]:
df_reviews['cleaned_review'] = df_reviews['full_review'].apply(lambda x: ' '.join(re.sub("[.,!?:;-='...@#_]", " ", x).split()))
df_reviews['cleaned_review'].head()

0    Actually ordered online with Deliveroo disappo...
1    Hate to write bad reviews but Bad service bad ...
2    Tempting pizzas I went with a couple a family ...
3    Quick lunch pizza Great pizza Easy to order an...
4    Nice italian thin crust pizzas Good for casual...
Name: cleaned_review, dtype: object

#### Removing numbers as they do not hold information

In [52]:
df_reviews['cleaned_review'].replace('\d', '', regex=True, inplace=True)
df_reviews['cleaned_review'].head()

0    Actually ordered online with Deliveroo disappo...
1    Hate to write bad reviews but Bad service bad ...
2    Tempting pizzas I went with a couple a family ...
3    Quick lunch pizza Great pizza Easy to order an...
4    Nice italian thin crust pizzas Good for casual...
Name: cleaned_review, dtype: object

#### Removing emojis as they do not hold information

In [53]:
def remove_emojis(review):
    return review.encode('ascii', 'ignore').decode('ascii')
df_reviews['cleaned_review'] = df_reviews['cleaned_review'].apply(lambda x: remove_emojis(x))
df_reviews['cleaned_review'].head()

0    Actually ordered online with Deliveroo disappo...
1    Hate to write bad reviews but Bad service bad ...
2    Tempting pizzas I went with a couple a family ...
3    Quick lunch pizza Great pizza Easy to order an...
4    Nice italian thin crust pizzas Good for casual...
Name: cleaned_review, dtype: object

#### Removing nltk english stopwords

In [54]:
# import nltk library
import nltk
nltk.download('stopwords')
# import stopwords fron nltk
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

# function to remove nltk english stopwords 
def remove_stopwords(review):
    review_words = review.split()
    noise_free_words = [word for word in review_words if word not in stopwords]
    noise_free_review = " ".join(noise_free_words)
    return noise_free_review

df_reviews['cleaned_sw_review'] = df_reviews['cleaned_review'].apply(lambda x: remove_stopwords(x))
df_reviews[['cleaned_review', 'cleaned_sw_review']]

[nltk_data] Downloading package stopwords to /Users/dylan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,cleaned_review,cleaned_sw_review
0,Actually ordered online with Deliveroo disappo...,Actually ordered online Deliveroo disappointin...
1,Hate to write bad reviews but Bad service bad ...,Hate write bad reviews Bad service bad attitud...
2,Tempting pizzas I went with a couple a family ...,Tempting pizzas I went couple family friend pa...
3,Quick lunch pizza Great pizza Easy to order an...,Quick lunch pizza Great pizza Easy order deliv...
4,Nice italian thin crust pizzas Good for casual...,Nice italian thin crust pizzas Good casual din...
...,...,...
400936,Poor service and food is below average Wanted ...,Poor service food average Wanted try indian cu...
400937,Questionable Service Very poor service indeed ...,Questionable Service Very poor service indeed ...
400938,Terrible Experience Waited for hours for my f...,Terrible Experience Waited hours food delivery...
400939,Not worth it Order via deliveroo Will be my fi...,Not worth Order via deliveroo Will first last ...


#### Stemming

In [55]:
# string split for Stemming
df_reviews['cleaned_stem_review'] = df_reviews['cleaned_sw_review'].apply(lambda x: x.split())

#importing the Stemming function from nltk library
from nltk.stem import SnowballStemmer
snowball_stemmer = SnowballStemmer(language='english')

# function for snowball stemming
def snowball_stemming(review):
    stem_review = [snowball_stemmer.stem(word) for word in review]
    return stem_review

df_reviews['cleaned_stem_review'] = df_reviews['cleaned_stem_review'].apply(lambda x: snowball_stemming(x))
df_reviews[['cleaned_review', 'cleaned_sw_review', 'cleaned_stem_review']]

Unnamed: 0,cleaned_review,cleaned_sw_review,cleaned_stem_review
0,Actually ordered online with Deliveroo disappo...,Actually ordered online Deliveroo disappointin...,"[actual, order, onlin, deliveroo, disappoint, ..."
1,Hate to write bad reviews but Bad service bad ...,Hate write bad reviews Bad service bad attitud...,"[hate, write, bad, review, bad, servic, bad, a..."
2,Tempting pizzas I went with a couple a family ...,Tempting pizzas I went couple family friend pa...,"[tempt, pizza, i, went, coupl, famili, friend,..."
3,Quick lunch pizza Great pizza Easy to order an...,Quick lunch pizza Great pizza Easy order deliv...,"[quick, lunch, pizza, great, pizza, easi, orde..."
4,Nice italian thin crust pizzas Good for casual...,Nice italian thin crust pizzas Good casual din...,"[nice, italian, thin, crust, pizza, good, casu..."
...,...,...,...
400936,Poor service and food is below average Wanted ...,Poor service food average Wanted try indian cu...,"[poor, servic, food, averag, want, tri, indian..."
400937,Questionable Service Very poor service indeed ...,Questionable Service Very poor service indeed ...,"[question, servic, veri, poor, servic, inde, w..."
400938,Terrible Experience Waited for hours for my f...,Terrible Experience Waited hours food delivery...,"[terribl, experi, wait, hour, food, deliveri, ..."
400939,Not worth it Order via deliveroo Will be my fi...,Not worth Order via deliveroo Will first last ...,"[not, worth, order, via, deliveroo, will, firs..."


#### Lemmatization

In [59]:
# importing WordNetLemmatizer
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

#defining the function for lemmatization
def lemmatizer(review):
    lemm_review = [wordnet_lemmatizer.lemmatize(word) for word in review]
    return lemm_review

df_reviews['cleaned_lem_review'] = df_reviews['cleaned_stem_review'].apply(lambda x: lemmatizer(x))
df_reviews[['cleaned_sw_review', 'cleaned_stem_review', 'cleaned_lem_review']]


[nltk_data] Downloading package wordnet to /Users/dylan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,cleaned_sw_review,cleaned_stem_review,cleaned_lem_review
0,Actually ordered online Deliveroo disappointin...,"[actual, order, onlin, deliveroo, disappoint, ...","[actual, order, onlin, deliveroo, disappoint, ..."
1,Hate write bad reviews Bad service bad attitud...,"[hate, write, bad, review, bad, servic, bad, a...","[hate, write, bad, review, bad, servic, bad, a..."
2,Tempting pizzas I went couple family friend pa...,"[tempt, pizza, i, went, coupl, famili, friend,...","[tempt, pizza, i, went, coupl, famili, friend,..."
3,Quick lunch pizza Great pizza Easy order deliv...,"[quick, lunch, pizza, great, pizza, easi, orde...","[quick, lunch, pizza, great, pizza, easi, orde..."
4,Nice italian thin crust pizzas Good casual din...,"[nice, italian, thin, crust, pizza, good, casu...","[nice, italian, thin, crust, pizza, good, casu..."
...,...,...,...
400936,Poor service food average Wanted try indian cu...,"[poor, servic, food, averag, want, tri, indian...","[poor, servic, food, averag, want, tri, indian..."
400937,Questionable Service Very poor service indeed ...,"[question, servic, veri, poor, servic, inde, w...","[question, servic, veri, poor, servic, inde, w..."
400938,Terrible Experience Waited hours food delivery...,"[terribl, experi, wait, hour, food, deliveri, ...","[terribl, experi, wait, hour, food, deliveri, ..."
400939,Not worth Order via deliveroo Will first last ...,"[not, worth, order, via, deliveroo, will, firs...","[not, worth, order, via, deliveroo, will, firs..."


#### a) 
#### Naive Bayes Model, using CountVectorizer and TFIDVectorizer
#### Model Training & Prediction on cleaned_review  

In [110]:
df_reviews = df_reviews[df_reviews['review_sentiment_category'].notna()]
df_reviews['review_sentiment_category'].astype(int)
df_reviews['rating'].isnull().values.any()

False

In [111]:
# define X and y
X = df_reviews['cleaned_review']
y = df_reviews['review_sentiment_category']

# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2, shuffle=True)

In [132]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

# define a function that accepts a vectorizer and calculates the accuracy
def tokenize_test(vect, X_train, y_train, X_test, y_test):
    
    # create document-term matrices using the vectorizer
    X_train_dtm = vect.fit_transform(X_train)
    X_test_dtm = vect.transform(X_test)
    
    # use Multinomial Naive Bayes to predict the review_sentiment_category
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    predictions = nb.predict(X_test_dtm)
    
    # print the training accuracy
    print('Training Accuracy: ', metrics.accuracy_score(y_train, nb.predict(X_train_dtm)))

    # print the accuracy of its predictions
    print('Accuracy: ', metrics.accuracy_score(y_test, predictions))

    # print the precision of the model
    print('Precision: ', precision_score(y_test, predictions, average="weighted"))

    # print the recall of the model
    print('Recall: ', recall_score(y_test, predictions, average="weighted"))

    # print confusion matrix
    print('Confusion Matrix: ', confusion_matrix(y_test, predictions))

    # for i, j in enumerate(nb.classes_):
    #     coefficients = nb.coef_[i]
    #     weights = list(zip(vect.get_feature_names(), coefficients))
    #     print('Most Positive Coefficients:')
    #     print(sorted(weights,key=lambda x: -x[1])[:10])
    #     print('Most Negative Coefficients:')
    #     print(sorted(weights,key=lambda x: x[1])[:10])

In [124]:
countvect = CountVectorizer()
tokenize_test(countvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.8831464807701901
Accuracy:  0.8785479124058463
Precision:  0.8845469561226508
Recall:  0.8785479124058463
Confusion Matrix:  [[ 7482  4206]
 [ 5533 62967]]
Most Positive Coefficients:
[('the', -2.87193141332296), ('and', -3.245440193787214), ('to', -3.836335628181656), ('was', -3.9601327336590533), ('of', -4.119155989391038), ('is', -4.1524402829814555), ('for', -4.189394082620948), ('food', -4.194227862354735), ('we', -4.4132936243422325), ('good', -4.4911345315831035)]
Most Negative Coefficients:
[('aaaaahhh', -16.673068161259952), ('aaaarrgghhh', -16.673068161259952), ('aaargh', -16.673068161259952), ('aaid', -16.673068161259952), ('aain', -16.673068161259952), ('aalloo', -16.673068161259952), ('aamchi', -16.673068161259952), ('aampapad', -16.673068161259952), ('aarea', -16.673068161259952), ('aasis', -16.673068161259952)]




IndexError: index 1 is out of bounds for axis 0 with size 1

In [133]:
countvect = CountVectorizer()
tokenize_test(countvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.8831464807701901
Accuracy:  0.8785479124058463
Precision:  0.8845469561226508
Recall:  0.8785479124058463
Confusion Matrix:  [[ 7482  4206]
 [ 5533 62967]]


In [82]:
tfidvect = TfidfVectorizer()
tokenize_test(tfidvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.8659805706589515
Accuracy:  0.8622736569062702
Precision:  0.867377302181875
Recall:  0.8622736569062702
Confusion Matrix:  [[  725 10963]
 [   81 68419]]


#### Naive Bayes Model, using CountVectorizer and TFIDVectorizer
#### Model Training & Prediction on cleaned_sw_review (cleaned reviews that have stop words removed)  

In [85]:
# define X and y
X = df_reviews['cleaned_sw_review']
y = df_reviews['review_sentiment_category']

# split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2, shuffle=True)

In [86]:
countvect = CountVectorizer()
tokenize_test(countvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.886363919788497
Accuracy:  0.8810420511797277
Precision:  0.8850284259871829
Recall:  0.8810420511797277
Confusion Matrix:  [[ 7374  4314]
 [ 5225 63275]]


In [87]:
tfidvect = TfidfVectorizer()
tokenize_test(tfidvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.869734249513643
Accuracy:  0.8650047388636704
Precision:  0.8667622785006162
Recall:  0.8650047388636704
Confusion Matrix:  [[ 1002 10686]
 [  139 68361]]


#### Naive Bayes Model, using CountVectorizer and TFIDVectorizer
#### Model Training & Prediction on cleaned_stem_review (cleaned reviews that have stop words removed and stemmatized)  

In [90]:
# pre-processing step to convert list column into text column
df_reviews['cleaned_stem_review'] = df_reviews['cleaned_stem_review'].apply(lambda x: ' '.join(x) )


# define X and y
X = df_reviews['cleaned_stem_review']
y = df_reviews['review_sentiment_category']

# split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2, shuffle=True)

In [91]:
countvect = CountVectorizer()
tokenize_test(countvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.8848237890956253
Accuracy:  0.8801815733027386
Precision:  0.8847095531969069
Recall:  0.8801815733027386
Confusion Matrix:  [[ 7396  4292]
 [ 5316 63184]]


In [92]:
tfidvect = TfidfVectorizer()
tokenize_test(tfidvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.8686337107796678
Accuracy:  0.8642315558437671
Precision:  0.8673913957620113
Recall:  0.8642315558437671
Confusion Matrix:  [[  917 10771]
 [  116 68384]]


#### Naive Bayes Model, using CountVectorizer and TFIDVectorizer
#### Model Training & Prediction on cleaned_lem_review (cleaned reviews that have stop words removed, stemmatized and lemmatized)  

In [93]:
# pre-processing step to convert list column into text column
df_reviews['cleaned_lem_review'] = df_reviews['cleaned_lem_review'].apply(lambda x: ' '.join(x) )


# define X and y
X = df_reviews['cleaned_lem_review']
y = df_reviews['review_sentiment_category']

# split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2, shuffle=True)

In [94]:
countvect = CountVectorizer()
tokenize_test(countvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.8847084351773332
Accuracy:  0.8801691026088692
Precision:  0.8847387856390609
Recall:  0.8801691026088692
Confusion Matrix:  [[ 7400  4288]
 [ 5321 63179]]


In [95]:
tfidvect = TfidfVectorizer()
tokenize_test(tfidvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.8687615353918292
Accuracy:  0.8643562627824612
Precision:  0.8671427114871177
Recall:  0.8643562627824612
Confusion Matrix:  [[  932 10756]
 [  121 68379]]


#### b) 
#### Logistic Regression, using CountVectorizer and TFIDVectorizer
#### Model Training & Prediction on cleaned_review 

In [134]:
# define X and y
X = df_reviews['cleaned_review']
y = df_reviews['review_sentiment_category']

# split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2, shuffle=True)

In [139]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

# define a function that accepts a vectorizer and calculates the accuracy
def tokenize_test(vect, X_train, y_train, X_test, y_test):
    
    # create document-term matrices using the vectorizer
    X_train_dtm = vect.fit_transform(X_train)
    X_test_dtm = vect.transform(X_test)
    
    # use Multinomial Naive Bayes to predict the review_sentiment_category
    lr = LogisticRegression(max_iter=3000)
    lr.fit(X_train_dtm, y_train)
    predictions = lr.predict(X_test_dtm)
    
    # print the training accuracy
    print('Training Accuracy: ', metrics.accuracy_score(y_train, lr.predict(X_train_dtm)))

    # print the accuracy of its predictions
    print('Accuracy: ', metrics.accuracy_score(y_test, predictions))

    # print the precision of the model
    print('Precision: ', precision_score(y_test, predictions, average="weighted"))

    # print the recall of the model
    print('Recall: ', recall_score(y_test, predictions, average="weighted"))

    # print confusion matrix
    print('Confusion Matrix: ', confusion_matrix(y_test, predictions))

In [140]:
countvect = CountVectorizer()
tokenize_test(countvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.9582979997007034
Accuracy:  0.932171896044296
Precision:  0.9299158669542914
Recall:  0.932171896044296
Confusion Matrix:  [[ 8388  3300]
 [ 2139 66361]]


In [141]:
tfidvect = TfidfVectorizer()
tokenize_test(tfidvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.9419083902828354
Accuracy:  0.9320970718810795
Precision:  0.9288691646671952
Recall:  0.9320970718810795
Confusion Matrix:  [[ 7832  3856]
 [ 1589 66911]]


#### Logistic Regression, using CountVectorizer and TFIDVectorizer
#### Model Training & Prediction on cleaned_sw_review (cleaned reviews that have stop words removed)  

In [142]:
# define X and y
X = df_reviews['cleaned_sw_review']
y = df_reviews['review_sentiment_category']

# split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2, shuffle=True)

In [143]:
countvect = CountVectorizer()
tokenize_test(countvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.955077443008929
Accuracy:  0.9275203272310071
Precision:  0.9247740596181879
Recall:  0.9275203272310071
Confusion Matrix:  [[ 8115  3573]
 [ 2239 66261]]


In [144]:
tfidvect = TfidfVectorizer()
tokenize_test(tfidvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.9389996009377962
Accuracy:  0.9290292811892054
Precision:  0.9254245403061127
Recall:  0.9290292811892054
Confusion Matrix:  [[ 7622  4066]
 [ 1625 66875]]


#### Logistic Regression, using CountVectorizer and TFIDVectorizer
#### Model Training & Prediction on cleaned_stem_review (cleaned reviews that have stop words removed and stemmatized)  

In [145]:
# define X and y
X = df_reviews['cleaned_stem_review']
y = df_reviews['review_sentiment_category']

# split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2, shuffle=True)

In [146]:
countvect = CountVectorizer()
tokenize_test(countvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.9470961989325086
Accuracy:  0.9249014815184317
Precision:  0.9216687702434239
Recall:  0.9249014815184317
Confusion Matrix:  [[ 7897  3791]
 [ 2231 66269]]


In [147]:
tfidvect = TfidfVectorizer()
tokenize_test(tfidvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.9349216840425001
Accuracy:  0.9261859629869806
Precision:  0.922276077518517
Recall:  0.9261859629869806
Confusion Matrix:  [[ 7533  4155]
 [ 1764 66736]]


#### Logistic Regression, using CountVectorizer and TFIDVectorizer
#### Model Training & Prediction on cleaned_lem_review (cleaned reviews that have stop words removed, stemmatized and lemmatized)  

In [148]:
# define X and y
X = df_reviews['cleaned_lem_review']
y = df_reviews['review_sentiment_category']

# split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2, shuffle=True)

In [149]:
countvect = CountVectorizer()
tokenize_test(countvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.9469746096672819
Accuracy:  0.9249264229061706
Precision:  0.9216729183282276
Recall:  0.9249264229061706
Confusion Matrix:  [[ 7890  3798]
 [ 2222 66278]]


In [150]:
tfidvect = TfidfVectorizer()
tokenize_test(tfidvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.9348686835935551
Accuracy:  0.92619843368085
Precision:  0.9222940738422084
Recall:  0.92619843368085
Confusion Matrix:  [[ 7538  4150]
 [ 1768 66732]]


#### c) 
#### Random Forest, using CountVectorizer and TFIDVectorizer
#### Model Training & Prediction on cleaned_review 

In [151]:
# define X and y
X = df_reviews['cleaned_review']
y = df_reviews['review_sentiment_category']

# split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2, shuffle=True)

In [154]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

# define a function that accepts a vectorizer and calculates the accuracy
def tokenize_test(vect, X_train, y_train, X_test, y_test):
    
    # create document-term matrices using the vectorizer
    X_train_dtm = vect.fit_transform(X_train)
    X_test_dtm = vect.transform(X_test)
    
    # use Multinomial Naive Bayes to predict the review_sentiment_category
    randomforest = RandomForestClassifier(n_estimators=101, criterion='entropy')
    randomforest.fit(X_train_dtm, y_train)
    predictions = randomforest.predict(X_test_dtm)
    
    # print the training accuracy
    print('Training Accuracy: ', metrics.accuracy_score(y_train, randomforest.predict(X_train_dtm)))

    # print the accuracy of its predictions
    print('Accuracy: ', metrics.accuracy_score(y_test, predictions))

    # print the precision of the model
    print('Precision: ', precision_score(y_test, predictions, average="weighted"))

    # print the recall of the model
    print('Recall: ', recall_score(y_test, predictions, average="weighted"))

    # print confusion matrix
    print('Confusion Matrix: ', confusion_matrix(y_test, predictions))

In [155]:
countvect = CountVectorizer()
tokenize_test(countvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.9999875293061306
Accuracy:  0.868733476330623
Precision:  0.8771579803819611
Recall:  0.868733476330623
Confusion Matrix:  [[ 1252 10436]
 [   90 68410]]


In [156]:
tfidvect = TfidfVectorizer()
tokenize_test(tfidvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.9999875293061306
Accuracy:  0.8677607622088093
Precision:  0.8771270214770107
Recall:  0.8677607622088093
Confusion Matrix:  [[ 1160 10528]
 [   76 68424]]
