In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

Written by Chad Valencia, chadvalencia@gmail.com

Data sourced from:
https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences
for the paper:
From Group to Individual Labels using Deep Features', Kotzias et. al,. KDD 2015 


In [2]:
yelp_reviews = pd.read_csv('data/yelp_labelled.txt',header=None,delimiter='\t')

In [3]:
imdb_reviews = pd.read_csv('data/imdb_labelled.txt',header=None,delimiter='\t')

In [4]:
amazon_reviews = pd.read_csv('data/amazon_cells_labelled.txt',header=None,delimiter='\t')

In [5]:
#Count Vectorizer does a word count of every word in the overall vocabulary

#yelp reviews
yelp_count = CountVectorizer()
X_yelp = yelp_count.fit_transform(yelp_reviews[0])
y_yelp = yelp_reviews[1]

yelp = MultinomialNB()
yelp.fit(X_yelp,y_yelp)
yelp_score = cross_val_score(yelp,X_yelp,y_yelp,cv=10)
print("Yelp accuracy: ",yelp_score.mean())

#imdb reviews
imdb_count = CountVectorizer()
X_imdb = imdb_count.fit_transform(imdb_reviews[0])
y_imdb = imdb_reviews[1]

imdb = MultinomialNB()
imdb.fit(X_imdb,y_imdb)
imdb_score = cross_val_score(imdb,X_imdb,y_imdb,cv=10)
print("imdb accuracy: ",imdb_score.mean())

#amazon reviews
amazon_count = CountVectorizer()
X_amzn = amazon_count.fit_transform(amazon_reviews[0])
y_amzn = amazon_reviews[1]

amazon = MultinomialNB()
amazon.fit(X_amzn,y_amzn)
amzn_score = cross_val_score(amazon,X_amzn,y_amzn,cv=10)
print("amazon accuracy: ",amzn_score.mean())

Yelp accuracy:  0.805
imdb accuracy:  0.7471896633475581
amazon accuracy:  0.8169999999999998


In [6]:
def testing(test_sentence):
    yp = yelp.predict(yelp_count.transform([test_sentence]))
    ip = imdb.predict(imdb_count.transform([test_sentence]))
    ap = amazon.predict(amazon_count.transform([test_sentence]))
    ypr = yelp.predict_proba(yelp_count.transform([test_sentence]))*100
    ipr = imdb.predict_proba(imdb_count.transform([test_sentence]))*100
    apr = amazon.predict_proba(amazon_count.transform([test_sentence]))*100
    list = [yp, ip, ap]
    list2 = []
    for i in list:
        if i == 1:
            list2.append('positive')
        else:
            list2.append('negative')
    list3 = ypr,ipr,apr
    list4 = ['yelp','imdb','amazon']
    for i in range(3):
        print(str(list4[i])+' predicts '+str(list2[i])+' with a positive probability of {:.04f} %'.format(list3[i][0][1]))
    pass

In [9]:
test_sentence = 'The service was great.'

In [10]:
testing(test_sentence)

yelp predicts positive with a positive probability of 99.1521 %
imdb predicts positive with a positive probability of 62.9439 %
amazon predicts positive with a positive probability of 85.9803 %
