In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer #The vectorizer
from sklearn.model_selection import cross_val_score #The accuracy check
from sklearn.naive_bayes import MultinomialNB #The algorithm
from sklearn.pipeline import make_pipeline #Chains vectorizer + model so each CV fold refits both


Written by Chad Valencia, chadvalencia@gmail.com

Data sourced from:
https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences
for the paper:
'From Group to Individual Labels using Deep Features', Kotzias et al., KDD 2015


In [None]:
%time yelp_reviews = pd.read_csv('data/yelp_labelled.txt',header=None,delimiter='\t')

In [None]:
#Display the first 20 rows of the loaded frame.
yelp_reviews.head(20)

In [None]:
#Summary statistics for the frame (by default describe() reports on numeric columns only).
yelp_reviews.describe()

In [None]:
#To-Do: Run the count vectorizer and Naive Bayes (both are carried out in the cells below).

In [None]:
#Count Vectorizer does a word count of every word in the overall vocabulary

#yelp reviews
countvec = CountVectorizer()
%time X_yelp = countvec.fit_transform(yelp_reviews[0]) 
#the new feature set based off of the vocabulary
y_yelp = yelp_reviews[1] # this is the target column

In [None]:
#Creating the models
yelp = MultinomialNB() #Naive Bayes Model
%time yelp.fit(X_yelp,y_yelp)

#Model validation using K-fold cross validation
scores = cross_val_score(yelp,X_yelp,y_yelp,cv=10)
print(scores)
avg_score = scores.mean()
print("average score (accuracy):", avg_score)#makes a prediction using Xy_test, scores accuracy to Yy_test

In [None]:
def testing(test_sentence):
    yp = yelp.predict(countvec.transform([test_sentence]))
    ypr = yelp.predict_proba(countvec.transform([test_sentence]))*100
    posneg = []
    if yp == 1:
        posneg.append('positive')
    else:
        posneg.append('negative')
    print('This review predicts '+str(posneg)+' with a positive probability of {:.04f} %'.format(ypr[0][1]))
    pass

In [None]:
#Sample review text that is passed to testing() in the next cell.
test_sentence = 'The service was great.'

In [None]:
#Classify the sample sentence; testing() prints the predicted label and positive probability.
testing(test_sentence)