In [2]:
#Sentiment analysis
#Train a model to decide whether each restaurant review is positive or negative

In [3]:
#Bag of words -> we create a sparse matrix in which each row is a review and each column is one distinct word found in the reviews

In [4]:
#tsv file -> Tab separated values (1000 text reviews); the features and the dependent variable are separated by a tab.


In [5]:
#Import The Libraries

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [7]:
#Load the dataset: a tab-separated file holding the review text and its label
#quoting=3 (csv.QUOTE_NONE) -> double quotes " " are read as ordinary characters instead of field delimiters, which avoids parsing errors
df = pd.read_csv('Restaurant_Reviews.tsv', sep="\t", quoting=3)



In [8]:
#Peek at the first rows: a text column (Review) and a label column (Liked)
print(df.head())

                                              Review  Liked
0                           Wow... Loved this place.      1
1                                 Crust is not good.      0
2          Not tasty and the texture was just nasty.      0
3  Stopped by during the late May bank holiday of...      1
4  The selection on the menu was great and so wer...      1


In [9]:
#What are stop words? Words that give no hint whether the review is positive or negative ("the", "a", etc.)
#We do not want to keep them in the cleaned reviews because they are not relevant for prediction

In [10]:
#What is stemming? -> Keeping only the "root" of a word, which says enough about what this word means
#eg: "I loved this restaurant" -> "loved" is treated as "love"
#It reduces the dimension of the sparse matrix by collapsing variants of the same word, eg: loved, loving, lovable etc.

In [11]:
#cleaning the data -> cleaning the text
import re #simplify the text
import nltk   #to download the ensemble of stop words
nltk.download('stopwords')  #downloads stopwords
from nltk.corpus import stopwords #imports the stopwords in the notebook
from nltk.stem .porter import PorterStemmer #apply stemming on our reviews


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
corpus = list()  #starts empty; it will hold every review once it has been cleaned

In [13]:
#clean the reviews and add in corpus
def clean_review(text, stemmer, stop_words):
    """Return `text` reduced to its lower-cased, stemmed, non-stop words.

    text       -- raw review (a str)
    stemmer    -- object exposing .stem(word); a PorterStemmer in this notebook
    stop_words -- container of lower-case words to drop before stemming

    The result is a single string with the surviving stems joined by spaces.
    """
    #1st cleaning step -> "Removing punctuations": every character that is not a letter
    #becomes a space (a space, because otherwise two words could stick together)
    letters_only = re.sub('[^a-zA-Z]',' ',text)

    #2nd cleaning step -> "Same case" (letters_only is a plain str, so use str.lower),
    #then split the review into its words as a preparation for stemming
    words = letters_only.lower().split()

    #3rd cleaning step -> "Remove stop words and then apply stemming"
    stems = [stemmer.stem(word) for word in words if word not in stop_words]

    #join the stemmed words back into one string with a space in between them
    return ' '.join(stems)


ps = PorterStemmer()                             #one stemmer is reused for every review

stopwords_list = stopwords.words('english')      #originally stopwords had "not", but we want "not" to be present in the stemmed data
stopwords_list.remove('not')
stopword_set = set(stopwords_list)               #built once, instead of once per word as before

#Iterate over the column itself instead of a hard-coded range(0,1000), and rebuild the
#list from scratch so that re-running this cell does not append every review a second time
corpus = [clean_review(text, ps, stopword_set) for text in df['Review']]

print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [14]:
#You can customize use of stopwords,always check the list of stopwords 

In [15]:
def headls(ls,num):
    """Print the first `num` items of `ls`, one per line.

    ls  -- sequence to preview (must support slicing, e.g. a list)
    num -- how many items to print; 0 prints only the first item and
           negative values print nothing

    Asking for more items than `ls` holds prints everything available
    instead of raising IndexError, and an empty `ls` prints nothing.
    The function only prints; it returns None.
    """
    if num == 0:
        limit = 1               #historical behaviour: a request for 0 items still shows the first one
    else:
        limit = max(num, 0)     #a negative slice bound would drop items from the end, so clamp it

    #slicing never runs past the end of the sequence, unlike indexing with range(num)
    for item in ls[:limit]:
        print(item)
        
#headls prints the lines itself and returns None, so it is not wrapped in print()
headls(corpus,5)

wow love place
crust not good
not tasti textur nasti
stop late may bank holiday rick steve recommend love
select menu great price
None


In [16]:
#CREATE 'Bag of words' model

from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features = 1500) #cap the vocabulary at 1500 words; the value was chosen after checking the len of x below
#without the cap the length of x was 1566

In [17]:
#After removing stop words we still have some words (textur, bank, holiday, menu etc.) which are not relevant to decide the sentiment
#To remove these kinds of words, include only the most frequent words

In [18]:
#fit collects the words of every corpus entry; transform counts them per review, one matrix column per word
word_counts = cv.fit_transform(corpus)
x = word_counts.toarray()       #matrix of features, has to be a 2d array
y = df.iloc[:, -1].values       #dependent variable: the last column of the dataset

In [19]:
#Number of words resulting from the tokenization
x.shape[1]  #total number of columns in the matrix (i.e. number of different words kept among all corpus entries)

1500

In [20]:
#SPLITTING THE DATASET into the training set and test set

In [21]:
from sklearn.model_selection import train_test_split
#hold out 20% of the reviews as the test set; random_state=0 fixes the shuffle so the split is the same on every run
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)

In [22]:
#features and labels must have the same number of rows
print(x.shape, y.shape, sep="\n")

(1000, 1500)
(1000,)


In [23]:
#Training the naive bayes model 

In [24]:
from sklearn.naive_bayes import GaussianNB
#fit a Gaussian naive Bayes classifier on the training split only
#NOTE(review): the features are word counts; MultinomialNB is worth comparing against - TODO confirm which scores better
classifier = GaussianNB()
classifier.fit(x_train,y_train)

In [25]:
#Predicting the test results
y_pred = classifier.predict(x_test)
#displaying the predicted (left column) and actual (right column) values side by side
print(np.column_stack((y_pred, y_test)))

[[1 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 0]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [1 0]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [0 0]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 0]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 0]
 [1 1]
 [1 1]
 [0 0]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]

In [26]:
#Making the confusion matrix (calculate accuracy scores)

In [27]:
from sklearn.metrics import confusion_matrix,accuracy_score
#rows are the actual labels and columns the predicted labels, both in the order 0, 1
#so [[55 42], [12 91]] reads: 55 true negatives, 42 false positives, 12 false negatives, 91 true positives
cm = confusion_matrix(y_test,y_pred)
print(cm)
#accuracy = correct predictions / all predictions = (55 + 91) / 200 = 0.73
print(accuracy_score(y_test,y_pred))

[[55 42]
 [12 91]]
0.73


In [31]:
#Now if there comes a new review, classify the sentiment
#The cleaning steps repeat the ones applied to the training reviews, so that the
#resulting words line up with the vocabulary already learned by cv
new_review = 'I hate this restaurant'
new_review = re.sub('[^a-zA-Z]', ' ', new_review)   #anything that is not a letter becomes a space
new_review = new_review.lower().split()             #same case, then a list of words
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')                         #keep "not" in the cleaned text
all_stopwords_set = set(all_stopwords)              #build the lookup set once instead of once per word
new_review = ' '.join(ps.stem(word) for word in new_review if word not in all_stopwords_set)
new_corpus = [new_review]
new_X_test = cv.transform(new_corpus).toarray()     #transform only (no fit): reuse the fitted vocabulary
new_y_pred = classifier.predict(new_X_test)
print(new_y_pred)

[0]
