In [2]:
import pandas as pd
import numpy as np
import string
import re

In [3]:
df_reviews = pd.read_csv('final_restaurant_review_data.csv', lineterminator='\n')

In [4]:
df_reviews['full_review']

0         Actually ordered online with Deliveroo! disapp...
1         Hate to write bad reviews but.... Bad service,...
2         Tempting pizzas I went with a couple a family ...
3         Quick lunch pizza Great pizza. Easy to order a...
4         Nice italian thin crust pizzas Good for casual...
                                ...                        
400936    Poor service and food is below average. Wanted...
400937    Questionable Service Very poor service indeed....
400938    Terrible Experience Waited for 2 hours for my ...
400939    Not worth it. Order via deliveroo. Will be my ...
400940    A genuine review.. Guys, never ever order onli...
Name: full_review, Length: 400941, dtype: object

#### Converting full_review column to str for pre-processing

In [5]:
df_reviews['full_review'] = df_reviews['full_review'].astype(str)
df_reviews['full_review'].head()

0    Actually ordered online with Deliveroo! disapp...
1    Hate to write bad reviews but.... Bad service,...
2    Tempting pizzas I went with a couple a family ...
3    Quick lunch pizza Great pizza. Easy to order a...
4    Nice italian thin crust pizzas Good for casual...
Name: full_review, dtype: object

### Text Pre-processing

#### Punctuation removal
##### All the punctuations from the text are removed. string library of Python contains some pre-defined list of punctuations such as ‘!”#$%&'()*+,-./:;?@[\]^_`{|}~’

In [6]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [7]:
# function to remove punctuation
def remove_punctuation(review):
    review_without_punctuation = "".join([i for i in review if i not in string.punctuation])
    return review_without_punctuation

df_reviews['cleaned_review'] = df_reviews['full_review'].apply(lambda x: remove_punctuation(x))
df_reviews.head()

Unnamed: 0,rating,date,title,description,date_of_visit,url,num_of_img_uploaded,full_review,num_of_tokens_title,num_of_tokens_description,num_of_tokens_full_review,title_sentiment,description_sentiment,full_review_sentiment,review_sentiment_category,cleaned_review
0,3.0,2021-12-04,Actually ordered online with Deliveroo! disapp...,Ordered a chicken Cobb salad which is meant to...,2021-12-01,https://www.tripadvisor.com.sg/Restaurant_Revi...,0.0,Actually ordered online with Deliveroo! disapp...,6.0,61.0,67.0,-0.3,-0.3625,-0.3625,0.0,Actually ordered online with Deliveroo disappo...
1,1.0,2021-01-26,Hate to write bad reviews but....,"Bad service, bad attitude of owner and average...",2021-01-01,https://www.tripadvisor.com.sg/Restaurant_Revi...,0.0,"Hate to write bad reviews but.... Bad service,...",6.0,16.0,22.0,-0.75,-0.13,-0.13,0.0,Hate to write bad reviews but Bad service bad ...
2,4.0,2020-08-21,Tempting pizzas,I went with a couple a family friend pair who ...,2020-03-01,https://www.tripadvisor.com.sg/Restaurant_Revi...,0.0,Tempting pizzas I went with a couple a family ...,2.0,34.0,36.0,0.0,0.566667,0.566667,1.0,Tempting pizzas I went with a couple a family ...
3,5.0,2019-11-09,Quick lunch pizza,Great pizza. Easy to order and delivery was ri...,2019-11-01,https://www.tripadvisor.com.sg/Restaurant_Revi...,1.0,Quick lunch pizza Great pizza. Easy to order a...,3.0,19.0,22.0,0.333333,0.479762,0.479762,1.0,Quick lunch pizza Great pizza Easy to order an...
4,3.0,2019-11-06,Nice italian thin crust pizzas,Good for casual dining with friends. Tasty ita...,2019-10-01,https://www.tripadvisor.com.sg/Restaurant_Revi...,0.0,Nice italian thin crust pizzas Good for casual...,5.0,21.0,26.0,0.066667,0.15,0.15,1.0,Nice italian thin crust pizzas Good for casual...


#### Removing numbers as they do not hold information

In [8]:
df_reviews['cleaned_review'].replace('\d', '', regex=True, inplace=True)
df_reviews['cleaned_review'].head()

0    Actually ordered online with Deliveroo disappo...
1    Hate to write bad reviews but Bad service bad ...
2    Tempting pizzas I went with a couple a family ...
3    Quick lunch pizza Great pizza Easy to order an...
4    Nice italian thin crust pizzas Good for casual...
Name: cleaned_review, dtype: object

#### Removing emojis as they do not hold information

In [9]:
def remove_emojis(review):
    return review.encode('ascii', 'ignore').decode('ascii')
df_reviews['cleaned_review'] = df_reviews['cleaned_review'].apply(lambda x: remove_emojis(x))
df_reviews['cleaned_review'].head()

0    Actually ordered online with Deliveroo disappo...
1    Hate to write bad reviews but Bad service bad ...
2    Tempting pizzas I went with a couple a family ...
3    Quick lunch pizza Great pizza Easy to order an...
4    Nice italian thin crust pizzas Good for casual...
Name: cleaned_review, dtype: object

#### Removing nltk english stopwords

In [10]:
# import nltk library
import nltk
nltk.download('stopwords')
# import stopwords fron nltk
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

# function to remove nltk english stopwords 
def remove_stopwords(review):
    review_words = review.split()
    noise_free_words = [word for word in review_words if word not in stopwords]
    noise_free_review = " ".join(noise_free_words)
    return noise_free_review

df_reviews['cleaned_sw_review'] = df_reviews['cleaned_review'].apply(lambda x: remove_stopwords(x))
df_reviews[['cleaned_review', 'cleaned_sw_review']]

[nltk_data] Downloading package stopwords to /Users/dylan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,cleaned_review,cleaned_sw_review
0,Actually ordered online with Deliveroo disappo...,Actually ordered online Deliveroo disappointin...
1,Hate to write bad reviews but Bad service bad ...,Hate write bad reviews Bad service bad attitud...
2,Tempting pizzas I went with a couple a family ...,Tempting pizzas I went couple family friend pa...
3,Quick lunch pizza Great pizza Easy to order an...,Quick lunch pizza Great pizza Easy order deliv...
4,Nice italian thin crust pizzas Good for casual...,Nice italian thin crust pizzas Good casual din...
...,...,...
400936,Poor service and food is below average Wanted ...,Poor service food average Wanted try indian cu...
400937,Questionable Service Very poor service indeed ...,Questionable Service Very poor service indeed ...
400938,Terrible Experience Waited for hours for my f...,Terrible Experience Waited hours food delivery...
400939,Not worth it Order via deliveroo Will be my fi...,Not worth Order via deliveroo Will first last ...


#### Stemming

In [11]:
# string split for Stemming
df_reviews['cleaned_stem_review'] = df_reviews['cleaned_sw_review'].apply(lambda x: x.split())

#importing the Stemming function from nltk library
from nltk.stem import SnowballStemmer
snowball_stemmer = SnowballStemmer(language='english')

# function for snowball stemming
def snowball_stemming(review):
    stem_review = [snowball_stemmer.stem(word) for word in review]
    return stem_review

df_reviews['cleaned_stem_review'] = df_reviews['cleaned_stem_review'].apply(lambda x: snowball_stemming(x))
df_reviews[['cleaned_review', 'cleaned_sw_review', 'cleaned_stem_review']]

Unnamed: 0,cleaned_review,cleaned_sw_review,cleaned_stem_review
0,Actually ordered online with Deliveroo disappo...,Actually ordered online Deliveroo disappointin...,"[actual, order, onlin, deliveroo, disappoint, ..."
1,Hate to write bad reviews but Bad service bad ...,Hate write bad reviews Bad service bad attitud...,"[hate, write, bad, review, bad, servic, bad, a..."
2,Tempting pizzas I went with a couple a family ...,Tempting pizzas I went couple family friend pa...,"[tempt, pizza, i, went, coupl, famili, friend,..."
3,Quick lunch pizza Great pizza Easy to order an...,Quick lunch pizza Great pizza Easy order deliv...,"[quick, lunch, pizza, great, pizza, easi, orde..."
4,Nice italian thin crust pizzas Good for casual...,Nice italian thin crust pizzas Good casual din...,"[nice, italian, thin, crust, pizza, good, casu..."
...,...,...,...
400936,Poor service and food is below average Wanted ...,Poor service food average Wanted try indian cu...,"[poor, servic, food, averag, want, tri, indian..."
400937,Questionable Service Very poor service indeed ...,Questionable Service Very poor service indeed ...,"[question, servic, veri, poor, servic, inde, w..."
400938,Terrible Experience Waited for hours for my f...,Terrible Experience Waited hours food delivery...,"[terribl, experi, wait, hour, food, deliveri, ..."
400939,Not worth it Order via deliveroo Will be my fi...,Not worth Order via deliveroo Will first last ...,"[not, worth, order, via, deliveroo, will, firs..."


#### Lemmatization

In [12]:
# importing WordNetLemmatizer
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

#defining the function for lemmatization
def lemmatizer(review):
    lemm_review = [wordnet_lemmatizer.lemmatize(word) for word in review]
    return lemm_review

df_reviews['cleaned_lem_review'] = df_reviews['cleaned_stem_review'].apply(lambda x: lemmatizer(x))
df_reviews[['cleaned_sw_review', 'cleaned_stem_review', 'cleaned_lem_review']]


[nltk_data] Downloading package wordnet to /Users/dylan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,cleaned_sw_review,cleaned_stem_review,cleaned_lem_review
0,Actually ordered online Deliveroo disappointin...,"[actual, order, onlin, deliveroo, disappoint, ...","[actual, order, onlin, deliveroo, disappoint, ..."
1,Hate write bad reviews Bad service bad attitud...,"[hate, write, bad, review, bad, servic, bad, a...","[hate, write, bad, review, bad, servic, bad, a..."
2,Tempting pizzas I went couple family friend pa...,"[tempt, pizza, i, went, coupl, famili, friend,...","[tempt, pizza, i, went, coupl, famili, friend,..."
3,Quick lunch pizza Great pizza Easy order deliv...,"[quick, lunch, pizza, great, pizza, easi, orde...","[quick, lunch, pizza, great, pizza, easi, orde..."
4,Nice italian thin crust pizzas Good casual din...,"[nice, italian, thin, crust, pizza, good, casu...","[nice, italian, thin, crust, pizza, good, casu..."
...,...,...,...
400936,Poor service food average Wanted try indian cu...,"[poor, servic, food, averag, want, tri, indian...","[poor, servic, food, averag, want, tri, indian..."
400937,Questionable Service Very poor service indeed ...,"[question, servic, veri, poor, servic, inde, w...","[question, servic, veri, poor, servic, inde, w..."
400938,Terrible Experience Waited hours food delivery...,"[terribl, experi, wait, hour, food, deliveri, ...","[terribl, experi, wait, hour, food, deliveri, ..."
400939,Not worth Order via deliveroo Will first last ...,"[not, worth, order, via, deliveroo, will, firs...","[not, worth, order, via, deliveroo, will, firs..."


#### a) 
#### Naive Bayes Model, using CountVectorizer and TFIDVectorizer
#### Model Training & Prediction on cleaned_review  

In [13]:
df_reviews = df_reviews[df_reviews['review_sentiment_category'].notna()]
df_reviews['review_sentiment_category'].astype(int)
df_reviews['rating'].isnull().values.any()

False

In [14]:
# define X and y
X = df_reviews['cleaned_review']
y = df_reviews['review_sentiment_category']

# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2, shuffle=True)

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

# define a function that accepts a vectorizer and calculates the accuracy
def nb_tokenize_test(vect, X_train, y_train, X_test, y_test):
    
    # create document-term matrices using the vectorizer
    X_train_dtm = vect.fit_transform(X_train)
    X_test_dtm = vect.transform(X_test)
    
    # use Multinomial Naive Bayes to predict the review_sentiment_category
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    predictions = nb.predict(X_test_dtm)
    
    # print the training accuracy
    print('Training Accuracy: ', metrics.accuracy_score(y_train, nb.predict(X_train_dtm)))

    # print the accuracy of its predictions
    print('Accuracy: ', metrics.accuracy_score(y_test, predictions))

    # print the precision of the model
    print('Precision: ', precision_score(y_test, predictions, average="weighted"))

    # print the recall of the model
    print('Recall: ', recall_score(y_test, predictions, average="weighted"))

    # print confusion matrix
    print('Confusion Matrix: ', confusion_matrix(y_test, predictions))

    # for i, j in enumerate(nb.classes_):
    #     coefficients = nb.coef_[i]
    #     weights = list(zip(vect.get_feature_names(), coefficients))
    #     print('Most Positive Coefficients:')
    #     print(sorted(weights,key=lambda x: -x[1])[:10])
    #     print('Most Negative Coefficients:')
    #     print(sorted(weights,key=lambda x: x[1])[:10])

In [17]:
countvect = CountVectorizer()
nb_tokenize_test(countvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.8949125804359754
Accuracy:  0.887427046440864
Precision:  0.8845227665757122
Recall:  0.887427046440864
Confusion Matrix:  [[ 6796  4892]
 [ 4135 64365]]


In [18]:
tfidvect = TfidfVectorizer()
nb_tokenize_test(tfidvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.8571357310320746
Accuracy:  0.8548910061355813
Precision:  0.8662743849876113
Recall:  0.8548910061355813
Confusion Matrix:  [[   56 11632]
 [    4 68496]]


#### Naive Bayes Model, using CountVectorizer and TFIDVectorizer
#### Model Training & Prediction on cleaned_sw_review (cleaned reviews that have stop words removed)  

In [19]:
# define X and y
X = df_reviews['cleaned_sw_review']
y = df_reviews['review_sentiment_category']

# split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2, shuffle=True)

In [20]:
countvect = CountVectorizer()
nb_tokenize_test(countvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.8972352471691525
Accuracy:  0.8896842420312266
Precision:  0.8851804595107275
Recall:  0.8896842420312266
Confusion Matrix:  [[ 6646  5042]
 [ 3804 64696]]


In [21]:
tfidvect = TfidfVectorizer()
nb_tokenize_test(tfidvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.8590250411532898
Accuracy:  0.8555768942983988
Precision:  0.8660365119898119
Recall:  0.8555768942983988
Confusion Matrix:  [[  116 11572]
 [    9 68491]]


#### Naive Bayes Model, using CountVectorizer and TFIDVectorizer
#### Model Training & Prediction on cleaned_stem_review (cleaned reviews that have stop words removed and stemmatized)  

In [22]:
# pre-processing step to convert list column into text column
df_reviews['cleaned_stem_review'] = df_reviews['cleaned_stem_review'].apply(lambda x: ' '.join(x) )


# define X and y
X = df_reviews['cleaned_stem_review']
y = df_reviews['review_sentiment_category']

# split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2, shuffle=True)

In [23]:
countvect = CountVectorizer()
nb_tokenize_test(countvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.8955704095375867
Accuracy:  0.8891729435825809
Precision:  0.8847818500317601
Recall:  0.8891729435825809
Confusion Matrix:  [[ 6646  5042]
 [ 3845 64655]]


In [24]:
tfidvect = TfidfVectorizer()
nb_tokenize_test(tfidvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.8587413328677608
Accuracy:  0.8554771287474435
Precision:  0.8642170933226381
Recall:  0.8554771287474435
Confusion Matrix:  [[  109 11579]
 [   10 68490]]


#### Naive Bayes Model, using CountVectorizer and TFIDVectorizer
#### Model Training & Prediction on cleaned_lem_review (cleaned reviews that have stop words removed, stemmatized and lemmatized)  

In [25]:
# pre-processing step to convert list column into text column
df_reviews['cleaned_lem_review'] = df_reviews['cleaned_lem_review'].apply(lambda x: ' '.join(x) )


# define X and y
X = df_reviews['cleaned_lem_review']
y = df_reviews['review_sentiment_category']

# split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2, shuffle=True)

In [26]:
countvect = CountVectorizer()
nb_tokenize_test(countvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.8956047039457276
Accuracy:  0.8892228263580586
Precision:  0.8848612429639288
Recall:  0.8892228263580586
Confusion Matrix:  [[ 6652  5036]
 [ 3847 64653]]


In [27]:
tfidvect = TfidfVectorizer()
nb_tokenize_test(tfidvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.8588255100513793
Accuracy:  0.8555145408290518
Precision:  0.8645456655747108
Recall:  0.8555145408290518
Confusion Matrix:  [[  112 11576]
 [   10 68490]]


#### b) 
#### Logistic Regression, using CountVectorizer and TFIDVectorizer
#### Model Training & Prediction on cleaned_review 

In [28]:
# define X and y
X = df_reviews['cleaned_review']
y = df_reviews['review_sentiment_category']

# split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2, shuffle=True)

In [29]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

# define a function that accepts a vectorizer and calculates the accuracy
def lr_tokenize_test(vect, X_train, y_train, X_test, y_test):
    
    # create document-term matrices using the vectorizer
    X_train_dtm = vect.fit_transform(X_train)
    X_test_dtm = vect.transform(X_test)
    
    # use Multinomial Naive Bayes to predict the review_sentiment_category
    lr = LogisticRegression(max_iter=3000)
    lr.fit(X_train_dtm, y_train)
    predictions = lr.predict(X_test_dtm)
    
    # print the training accuracy
    print('Training Accuracy: ', metrics.accuracy_score(y_train, lr.predict(X_train_dtm)))

    # print the accuracy of its predictions
    print('Accuracy: ', metrics.accuracy_score(y_test, predictions))

    # print the precision of the model
    print('Precision: ', precision_score(y_test, predictions, average="weighted"))

    # print the recall of the model
    print('Recall: ', recall_score(y_test, predictions, average="weighted"))

    # print confusion matrix
    print('Confusion Matrix: ', confusion_matrix(y_test, predictions))

In [30]:
countvect = CountVectorizer()
lr_tokenize_test(countvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.9654873547164164
Accuracy:  0.9350526263281289
Precision:  0.9331026114692008
Recall:  0.9350526263281289
Confusion Matrix:  [[ 8566  3122]
 [ 2086 66414]]


In [31]:
tfidvect = TfidfVectorizer()
lr_tokenize_test(tfidvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.9452723599541079
Accuracy:  0.9342046191450092
Precision:  0.9312205653307022
Recall:  0.9342046191450092
Confusion Matrix:  [[ 7976  3712]
 [ 1564 66936]]


#### Logistic Regression, using CountVectorizer and TFIDVectorizer
#### Model Training & Prediction on cleaned_sw_review (cleaned reviews that have stop words removed)  

In [32]:
# define X and y
X = df_reviews['cleaned_sw_review']
y = df_reviews['review_sentiment_category']

# split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2, shuffle=True)

In [33]:
countvect = CountVectorizer()
lr_tokenize_test(countvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.9622605626777074
Accuracy:  0.9307751783309224
Precision:  0.9283363743881674
Recall:  0.9307751783309224
Confusion Matrix:  [[ 8292  3396]
 [ 2155 66345]]


In [34]:
tfidvect = TfidfVectorizer()
lr_tokenize_test(tfidvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.9419177433032374
Accuracy:  0.9304384695964484
Precision:  0.926998546769406
Recall:  0.9304384695964484
Confusion Matrix:  [[ 7689  3999]
 [ 1579 66921]]


#### Logistic Regression, using CountVectorizer and TFIDVectorizer
#### Model Training & Prediction on cleaned_stem_review (cleaned reviews that have stop words removed and stemmatized)  

In [35]:
# define X and y
X = df_reviews['cleaned_stem_review']
y = df_reviews['review_sentiment_category']

# split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2, shuffle=True)

In [36]:
countvect = CountVectorizer()
lr_tokenize_test(countvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.954606674315359
Accuracy:  0.9270713822517085
Precision:  0.924116727646855
Recall:  0.9270713822517085
Confusion Matrix:  [[ 8034  3654]
 [ 2194 66306]]


In [37]:
tfidvect = TfidfVectorizer()
lr_tokenize_test(tfidvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.9377275901631167
Accuracy:  0.9278196238838728
Precision:  0.9240977259102136
Recall:  0.9278196238838728
Confusion Matrix:  [[ 7606  4082]
 [ 1706 66794]]


#### Logistic Regression, using CountVectorizer and TFIDVectorizer
#### Model Training & Prediction on cleaned_lem_review (cleaned reviews that have stop words removed, stemmatized and lemmatized)  

In [38]:
# define X and y
X = df_reviews['cleaned_lem_review']
y = df_reviews['review_sentiment_category']

# split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2, shuffle=True)

In [39]:
countvect = CountVectorizer()
lr_tokenize_test(countvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.9544694966827955
Accuracy:  0.9271087943333167
Precision:  0.9241411084231149
Recall:  0.9271087943333167
Confusion Matrix:  [[ 8030  3658]
 [ 2187 66313]]


In [40]:
tfidvect = TfidfVectorizer()
lr_tokenize_test(tfidvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.9376932957549758
Accuracy:  0.9279318601286975
Precision:  0.9242265066752277
Recall:  0.9279318601286975
Confusion Matrix:  [[ 7616  4072]
 [ 1707 66793]]


#### c) 
#### Random Forest, using CountVectorizer and TFIDVectorizer
#### Model Training & Prediction on cleaned_review 

In [151]:
# define X and y
X = df_reviews['cleaned_review']
y = df_reviews['review_sentiment_category']

# split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2, shuffle=True)

In [154]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

# define a function that accepts a vectorizer and calculates the accuracy
def tokenize_test(vect, X_train, y_train, X_test, y_test):
    
    # create document-term matrices using the vectorizer
    X_train_dtm = vect.fit_transform(X_train)
    X_test_dtm = vect.transform(X_test)
    
    # use Multinomial Naive Bayes to predict the review_sentiment_category
    randomforest = RandomForestClassifier(n_estimators=101, criterion='entropy')
    randomforest.fit(X_train_dtm, y_train)
    predictions = randomforest.predict(X_test_dtm)
    
    # print the training accuracy
    print('Training Accuracy: ', metrics.accuracy_score(y_train, randomforest.predict(X_train_dtm)))

    # print the accuracy of its predictions
    print('Accuracy: ', metrics.accuracy_score(y_test, predictions))

    # print the precision of the model
    print('Precision: ', precision_score(y_test, predictions, average="weighted"))

    # print the recall of the model
    print('Recall: ', recall_score(y_test, predictions, average="weighted"))

    # print confusion matrix
    print('Confusion Matrix: ', confusion_matrix(y_test, predictions))

In [155]:
countvect = CountVectorizer()
tokenize_test(countvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.9999875293061306
Accuracy:  0.868733476330623
Precision:  0.8771579803819611
Recall:  0.868733476330623
Confusion Matrix:  [[ 1252 10436]
 [   90 68410]]


In [156]:
tfidvect = TfidfVectorizer()
tokenize_test(tfidvect, X_train, y_train, X_test, y_test)

Training Accuracy:  0.9999875293061306
Accuracy:  0.8677607622088093
Precision:  0.8771270214770107
Recall:  0.8677607622088093
Confusion Matrix:  [[ 1160 10528]
 [   76 68424]]
