In [21]:
# Natural Language Processing
# Importing the libraries
import pandas as pd


In [22]:
# Load the restaurant reviews from the tab-separated file.
# quoting = 3 is csv.QUOTE_NONE: quote characters inside a review are kept as ordinary text.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep = '\t',
    quoting = 3,
)


In [24]:
# Cleaning the texts
# Each review is reduced to lower-case, stop-word-free, stemmed tokens and
# stored in `corpus` as a single space-separated string.
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import nltk.stem.snowball
snow_ball = nltk.stem.SnowballStemmer('english')

# Build the stop-word set once; rebuilding it for every word of every review
# repeats the same corpus read thousands of times for no benefit.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over every review in the column (instead of a hard-coded 0..999 range)
# so `corpus` always has exactly one entry per dataset row.
for raw_review in dataset['Review']:
    # Replace everything that is not an ASCII letter with a space
    review = re.sub('[^a-zA-Z]', ' ', raw_review)
    review = review.lower()
    review = review.split()
    # Drop stop words, then stem what remains
    review = [snow_ball.stem(word) for word in review if word not in english_stopwords]
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gaurav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [35]:
# Bag of Words model: CountVectorizer turns each cleaned review into a vector of token counts
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

# corpus holds the preprocessed text taken from the 'Review' column.
# fit_transform learns the vocabulary from corpus and counts the tokens of each review;
# toarray() then expands that result into a dense array, which is what X stores.
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

# Dimensions of X: (number of reviews, number of vocabulary terms)
print(X.shape)


(1000, 1548)


In [None]:
# X has shape (1000, 1548).
# Number of rows = 1000, i.e. one row per review.
# Number of columns = 1548, i.e. the length of each review's count vector.
# In other words, every review is described by 1548 features.


In [36]:
# Target vector: the sentiments, i.e. the 'liked' column (second column of the dataset)
#   0 = not liked (negative review)
#   1 = liked (positive review)
sentiment_column = dataset.iloc[:, 1]
y = sentiment_column.values
print(y.shape)

(1000,)


In [None]:
# y is a 1-D array of 1000 sentiment labels (Liked = 0 or 1), one per review.

In [26]:
# Splitting the dataset into the Training set and Test set
# train_test_split is imported from sklearn.model_selection: the older
# sklearn.cross_validation module was deprecated in scikit-learn 0.18 and
# removed in 0.20, so importing from it fails on current versions.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

# test_size = 0.20 means 20% of the dataset will be kept for testing;
# random_state = 0 fixes the shuffle so the split is reproducible.

In [28]:
# Train a Gaussian Naive Bayes classifier on the training split
# NOTE(review): GaussianNB treats each feature as continuous, while X holds word counts;
# comparing against MultinomialNB may be worthwhile.
from sklearn.naive_bayes import GaussianNB
# fit() returns the estimator itself, so construction and training can be chained
classifier = GaussianNB().fit(X_train, y_train)
# Bare expression so the cell still displays the fitted estimator
classifier

GaussianNB(priors=None)

In [37]:
# Predicting the Test set results
# y_pred holds the label the trained classifier assigns to each row of X_test
y_pred = classifier.predict(X_test)

In [33]:
# Accuracy of the model: fraction of test reviews whose predicted label equals the true label
from sklearn.metrics import accuracy_score
test_accuracy = accuracy_score(y_true = y_test, y_pred = y_pred)
print(test_accuracy)

0.74


In [32]:
# Confusion matrix of the test predictions
# Rows are the true labels and columns are the predicted labels, both in sorted label order.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)
print(cm)

[[56 41]
 [11 92]]
