In [1]:
# Import the required libraries
import re 
import nltk
import ssl
import pandas as pd 
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB


In [None]:

# Work around SSL certificate failures that can occur when fetching NLTK data.
# NOTE(review): this replaces the default HTTPS context for the whole kernel
# with one that does NOT verify certificates. Prefer fixing the local
# certificate store; keep this only where the download otherwise fails.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This Python build has no unverified-context factory; keep the defaults.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Fetch only the resources this notebook uses, without opening the interactive
# downloader (a bare nltk.download() waits for user input and stalls "Run All"):
#   - "stopwords" backs stopwords.words('english')
#   - the averaged perceptron tagger backs nltk.pos_tag; newer NLTK releases
#     look it up under the "_eng" identifier, so both names are requested.
for _resource in ("stopwords",
                  "averaged_perceptron_tagger",
                  "averaged_perceptron_tagger_eng"):
    if not nltk.download(_resource, quiet=True):
        print("NLTK resource not downloaded:", _resource)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [2]:
# Penn Treebank tags kept by the adjective filter:
# JJ = adjective, JJR = comparative adjective, JJS = superlative adjective
allowed_word_types = ["JJ", "JJR", "JJS"]

# English stop words from NLTK, de-duplicated through a set and stored as a list
stop_words = [*set(stopwords.words('english'))]

In [3]:
# Read the held-out and the training review sets from their CSV files
test_data, train_data = (
    pd.read_csv(csv_name)
    for csv_name in ('testing_set.csv', 'training_set.csv')
)

# Preview the first rows of the training set
train_data.head()

Unnamed: 0.1,Unnamed: 0,Review,Liked
0,300,Good beer & drink selection and good food sele...,1
1,301,Please stay away from the shrimp stir fried no...,0
2,302,The potato chip order was sad... I could proba...,0
3,303,Food was really boring.,0
4,304,Good Service-check!,1


In [4]:
def remove_stopword(data_input, custom_stop_words=None):
    """Return the tokens of ``data_input`` that are not stop words.

    Parameters
    ----------
    data_input : iterable of str
        Tokenized review (one token per element).
    custom_stop_words : iterable of str, optional
        Stop words to filter out. When omitted, the module-level
        ``stop_words`` collection is used, as before.

    Returns
    -------
    list of str
        The remaining tokens, in their original order (duplicates kept).
    """
    # Only read the global when no explicit collection was supplied.
    source = stop_words if custom_stop_words is None else custom_stop_words
    # Build the lookup once per call so each token is checked with a hash
    # lookup instead of a linear scan (tokens are expected to be strings).
    blocked = frozenset(source)
    return [word for word in data_input if word not in blocked]

            
    

In [5]:
def only_adjectives(data_input, allowed_tags=None):
    """Return the words of a POS-tagged token list whose tag is allowed.

    Parameters
    ----------
    data_input : iterable of (word, pos) pairs
        Tokens paired with their part-of-speech tag.
    allowed_tags : collection of str, optional
        Tags to keep. When omitted, the module-level ``allowed_word_types``
        list (the adjective tags) is used, as before.

    Returns
    -------
    list of str
        The matching words with their tags stripped, in original order.
    """
    # Only read the global when no explicit tag collection was supplied.
    tags = allowed_word_types if allowed_tags is None else allowed_tags
    return [word for word, pos in data_input if pos in tags]

In [6]:
# Reduce every training review to the list of adjectives it contains.
# The "Review" column is overwritten in place, so this cell expects the raw
# text column and is meant to run once per load of train_data.
corpus = []  # not used by any cell visible here; kept from the original cell


def _review_to_adjectives(review_text):
    """Turn one raw review string into the list of its adjectives."""
    # Lower-case, then replace every non-letter character with a space
    letters_only = re.sub('[^a-zA-Z]', ' ', review_text.lower())
    # Whitespace-tokenize and drop the stop words
    tokens = remove_stopword(letters_only.split())
    # POS-tag into (word, tag) pairs, then keep only the adjective words.
    # NOTE(review): tagging runs on lower-cased, stop-word-free tokens, and the
    # preview below shows words such as "shrimp" surviving as adjectives;
    # tagging the original sentence before filtering may be more accurate.
    return only_adjectives(nltk.pos_tag(tokens))


train_data["Review"] = train_data["Review"].apply(_review_to_adjectives)

# TODO: decide whether reviews left with no adjectives should be dropped, or
# whether an empty list is itself informative about sentiment.



In [7]:
# Inspect the first ten rows after cleaning: each "Review" cell now holds a
# list of adjectives, which may be empty.
train_data.head(10)

Unnamed: 0.1,Unnamed: 0,Review,Liked
0,300,"[good, good]",1
1,301,[shrimp],0
2,302,[many],0
3,303,[boring],0
4,304,[good],1
5,305,[],0
6,306,[],0
7,307,"[much, atrocious]",0
8,308,[outdoor],1
9,309,[good],1


In [8]:

# Flatten the per-review adjective lists of train_data into one flat list of
# words (duplicates are kept).
all_review_train = [
    adjective
    for adjectives in train_data["Review"]
    for adjective in adjectives
]

In [122]:
# Bag-of-words via CountVectorizer.
# Each adjective in all_review_train is passed as its own one-word document,
# so fitting only learns the vocabulary (the set of adjectives seen in the
# training reviews); no document-term matrix is produced in this cell.
cv = CountVectorizer()
cv.fit(all_review_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [123]:
# Show the learned vocabulary: a dict mapping each term to its column index
# in the matrices produced by cv.transform.
cv.vocabulary_

{'good': 135,
 'shrimp': 287,
 'many': 196,
 'boring': 30,
 'much': 206,
 'atrocious': 6,
 'outdoor': 227,
 'fantastic': 107,
 'english': 91,
 'untoasted': 336,
 'great': 138,
 'high': 149,
 'lighter': 180,
 'reasonable': 260,
 'public': 253,
 'old': 224,
 'full': 128,
 'happy': 144,
 'wrong': 362,
 'exceptional': 99,
 'couple': 57,
 'amazing': 2,
 'favorite': 109,
 'black': 25,
 'peas': 236,
 'unreal': 335,
 'disappointed': 74,
 'better': 21,
 'overall': 230,
 'sick': 288,
 'gross': 142,
 'dirty': 73,
 'gold': 133,
 'authentic': 8,
 'special': 298,
 'best': 20,
 'fresh': 120,
 'worth': 361,
 'classy': 47,
 'bread': 31,
 'multiple': 207,
 'delicious': 68,
 'terrible': 316,
 'pleasant': 242,
 'enthusiastic': 95,
 'real': 259,
 'gordon': 136,
 'next': 215,
 'wonderful': 358,
 'small': 293,
 'outstanding': 229,
 'little': 182,
 'pretty': 250,
 'large': 173,
 'wasting': 349,
 'despicable': 70,
 'sushi': 308,
 'tiny': 322,
 'comfortable': 52,
 'usual': 339,
 'green': 140,
 'interesting': 16

In [124]:
# Reload the raw training CSV: train_data["Review"] was overwritten with
# adjective lists during cleaning, and the vectorizer below needs the
# original review text.
train_data_2 = pd.read_csv('training_set.csv')


In [125]:
# Confirm the reloaded frame holds the raw review text again
train_data_2.head()

Unnamed: 0.1,Unnamed: 0,Review,Liked
0,300,Good beer & drink selection and good food sele...,1
1,301,Please stay away from the shrimp stir fried no...,0
2,302,The potato chip order was sad... I could proba...,0
3,303,Food was really boring.,0
4,304,Good Service-check!,1


In [126]:
# Encode the raw training reviews with the vectorizer fitted above.
# transform() is used instead of fit_transform() on purpose: fit_transform()
# would re-learn the vocabulary from the full review text and discard the
# adjective-only vocabulary. .toarray() converts the sparse result to a dense
# NumPy array.
X_train = cv.transform(train_data_2["Review"]).toarray()

In [127]:
# Training labels taken from the "Liked" column as a NumPy array
# (the previews above show values 0 and 1).
y_train = train_data_2["Liked"].values

In [128]:
# Train a Bernoulli naive Bayes model on the adjective bag-of-words features.
# alpha=0.8 is the additive smoothing strength; the fitted estimator's repr
# shows binarize=0.0, i.e. counts above zero are treated as "word present".
classifier = BernoulliNB(alpha=0.8)
classifier.fit(X_train, y_train)


BernoulliNB(alpha=0.8, binarize=0.0, class_prior=None, fit_prior=True)

In [129]:
# Load the testing dataset from CSV.
# NOTE(review): the same file was already read into test_data in the
# data-loading cell and is not modified by any cell visible here, so this
# re-read looks redundant.
test_data = pd.read_csv('testing_set.csv')
# The test reviews are vectorized in the next cell.

In [130]:
# Encode the test reviews with the already-fitted vectorizer (no refit), so
# the feature columns line up with those of X_train.
X_test = cv.transform(test_data["Review"]).toarray()
# Ground-truth labels for the test reviews
y_test = test_data["Liked"].values

In [131]:
# Metric helpers used by this evaluation cell
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
)

# Predict a label for every test review
y_pred = classifier.predict(X_test)

# Confusion matrix: rows are the true labels, columns the predicted labels
cm = confusion_matrix(y_test, y_pred)

# Accuracy, precision and recall of the predictions
score1 = accuracy_score(y_test, y_pred)
score2 = precision_score(y_test, y_pred)
score3 = recall_score(y_test, y_pred)

# Report: accuracy as a percentage, precision and recall as fractions
print("Confusion Matrix:\n", cm)
print("\n")
print("Accuracy is ", round(score1 * 100, 2), "%")
print("Precision is ", round(score2, 2))
print("Recall is ", round(score3, 2))

Confusion Matrix:
 [[109  26]
 [ 60 105]]


Accuracy is  71.33 %
Precision is  0.8
Recall is  0.64
