In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

bnb = BernoulliNB()

In [2]:
df = pd.read_csv('yelp_labelled.txt', sep='\t', header=None)

In [3]:
df.rename(columns={0: 'Review', 1: 'Positive'}, inplace=True)

In [4]:
# replace punctuation 
df['Review'] = df.Review.str.replace(r'[^a-zA-Z\d\s:]', '')
# make lower case
df['Review'] = df['Review'].str.lower()

In [5]:
df.head()

Unnamed: 0,Review,Positive
0,wow loved this place,1
1,crust is not good,0
2,not tasty and the texture was just nasty,0
3,stopped by during the late may bank holiday of...,1
4,the selection on the menu was great and so wer...,1


In [6]:
#Words used in positive reviews:
goodwords = df.Review[df.Positive == 1].str.cat(sep=' ').split()
#Words used in negative reviews:
badwords = df.Review[df.Positive == 0].str.cat(sep=' ').split()

In [7]:
#Unique words from goodwords and badwords:
print('Positive:', len(np.unique(goodwords)), 'Negative:', len(np.unique(badwords)))

Positive: 1246 Negative: 1397


In [8]:
#"diffs" contain lists of words unique to goodwords, badwords.
gooddiff = np.setdiff1d(ar1=pd.Series(goodwords).value_counts().keys(),
                   ar2=pd.Series(badwords).value_counts().keys())
baddiff = np.setdiff1d(ar1=pd.Series(badwords).value_counts().keys().ravel(),
                   ar2=pd.Series(goodwords).value_counts().keys().ravel())

In [9]:
#Reduce gooddiff to list of words that occur three or more times:
good_df = pd.DataFrame(goodwords)
good_freq = good_df[good_df[0].isin(gooddiff)]
good_rank = pd.DataFrame(good_freq[0].value_counts()).reset_index()
top_good = good_rank[good_rank[0] >= 3]['index']
top_good.ravel()

array(['great', 'delicious', 'fantastic', 'awesome', 'loved', 'perfect',
       'excellent', 'spot', 'happy', 'wonderful', 'town', 'tender',
       'enjoyed', 'bacon', 'reasonable', 'bread', 'steaks', 'options',
       'incredible', 'fun', 'visit', 'wrong', 'moist', 'white', 'greek',
       'generous', 'ambience', 'flavorful', 'beautiful', 'hummus',
       'boyfriend', 'second', 'until', 'homemade', 'healthy', 'delish',
       'perfectly', 'interesting', 'pleased', 'mouth', 'party', 'favorite',
       'outstanding', 'decor', 'pita', 'melt', 'youre', 'wow', 'duck',
       'regular', 'butter', 'recommendation', 'patio', 'fine', 'chef',
       'cool'], dtype=object)

Remove context-specific or coincidental words (words whose definitions are not inherently positive/negative, or are not associated with positive/negative food experiences):

spot
town
bread
bacon
wrong
steaks
chef
duck
white
greek
hummus
butter
pita
melt
second
boyfriend
youre
until

In [10]:
top_good_edited = ['great', 'delicious', 'fantastic', 'awesome', 'loved', 'perfect',
       'excellent', 'happy', 'wonderful', 'tender', 'reasonable', 'enjoyed', 'incredible',
       'visit', 'fun', 'options', 'favorite', 'patio',
       'recommendation', 'fine', 'cool', 'healthy',
       'regular', 'beautiful', 'party', 'homemade', 'perfectly', 'moist',
       'flavorful', 'pleased', 'wow',
       'decor', 'delish', 'outstanding', 'mouth', 'ambience',
       'generous', 'interesting']

In [11]:
bad_df = pd.DataFrame(badwords)
bad_freq = bad_df[bad_df[0].isin(baddiff)]
bad_rank = pd.DataFrame(bad_freq[0].value_counts()).reset_index()
top_bad = bad_rank[bad_rank[0] >= 3]['index']
top_bad.ravel()

array(['minutes', 'bad', 'wasnt', 'bland', 'slow', 'terrible', 'probably',
       'waited', 'overpriced', 'rude', 'took', 'horrible', 'poor',
       'should', 'mediocre', 'her', 'hard', 'money', 'management',
       'either', 'waste', 'sick', 'anytime', '10', 'waiting', 'long', '30',
       'disappointing', 'tasteless', 'avoid', 'zero', 'asked', 'barely',
       'tables', 'business', 'elsewhere', 'sad', 'dry', 'why', 'look',
       'sucked', 'live', 'disappointment', 'unfortunately', '1', 'average',
       'dirty', 'location', '12', 'wanted', 'yourself', 'sashimi', 'ok',
       'worse', 'restaurants', 'busy', 'guess', 'insulted', 'maybe',
       'totally', 'stale', 'style', 'nasty', 'stomach', 'sucks', 'awful',
       'lacked', 'else', 'none', 'edible', 'please', 'unless', 'total',
       'frozen', 'literally', 'wouldnt', '35', 'rather', 'water', 'soggy',
       'although', 'salt', 'watched', 'star'], dtype=object)

Removed words:

probably
her
either
30
10
1
live
sashimi
star
12
35

In [12]:
top_bad_edited = ['minutes', 'bad', 'wasnt', 'bland', 'slow', 'waited',
       'terrible', 'overpriced', 'took', 'rude', 'hard', 'poor', 'management',
        'money', 'mediocre', 'should', 'horrible',
       'tasteless', 'waste', 'asked', 'waiting', 'tables',
       'anytime', 'barely', 'avoid', 'disappointing', 'long', 'sick',
       'zero', 'sucked', 'unfortunately', 'disappointment', 'location',
       'business', 'sad', 'elsewhere', 'dirty', 'why', 'dry',
       'average', 'look', 'restaurants', 'guess', 'maybe',
       'watched', 'stomach', 'none', 'lacked', 'totally', 'sucks',
       'wouldnt', 'please', 'nasty', 'else', 'wanted', 'insulted',
       'water', 'edible', 'stale', 'busy', 'yourself', 'rather',
       'literally', 'soggy', 'total', 'worse', 'salt', 'ok', 'unless',
       'frozen', 'although', 'style', 'awful']

In [13]:
top_good = top_good.tolist()
top_bad = top_bad.tolist()

In [14]:
top_words = top_good + top_bad
top_words_edited = top_good_edited + top_bad_edited

In [15]:
top_good_edited

['great',
 'delicious',
 'fantastic',
 'awesome',
 'loved',
 'perfect',
 'excellent',
 'happy',
 'wonderful',
 'tender',
 'reasonable',
 'enjoyed',
 'incredible',
 'visit',
 'fun',
 'options',
 'favorite',
 'patio',
 'recommendation',
 'fine',
 'cool',
 'healthy',
 'regular',
 'beautiful',
 'party',
 'homemade',
 'perfectly',
 'moist',
 'flavorful',
 'pleased',
 'wow',
 'decor',
 'delish',
 'outstanding',
 'mouth',
 'ambience',
 'generous',
 'interesting']

In [16]:
def test_word_list(keyword_list, spaces=True, show_heatmap=False, print_corrmat=False, show_keywords=False):
    # Create new "features" DataFrame object and include terms to match:
    features = df

    for key in keyword_list:
        if spaces == True:
            features[str(key)] = features.Review.str.contains(' '+ str(key) + ' ', case=False)
        elif spaces == False:
            features[str(key)] = features.Review.str.contains(str(key), case=False)
    
    # Prepare Bernoulli Naïve Bayes arguments:
    
    data = features[keyword_list]
    target = features['Positive']    

    bnb.fit(data, target)
    y_pred = bnb.predict(data)
    
    # Add 'prediction' column to features DataFrame:
    
    features['prediction'] = y_pred
    
    # Create correlation matrix and heatmap, with optional settings to show them:
    
    corrmat = features.corr()
    
    if show_heatmap == True:
        plt.rcParams['figure.figsize'] = 12, 12
        sns.heatmap(df.corr(), vmin=-.5, vmax=.5, square=True, cmap='RdBu_r')
        plt.show()
    else:
        print('(Set "show_heatmap=True" to show heatmap)')
              
    if print_corrmat == True:          
        print(corrmat)
    else:
        print('(Set "print_corrmat=True" to show correlation matrix)')

    if show_keywords == True:
        print(keyword_list)
    else:
        print('(Set "show_keywords=True" to show keyword list)')
    
    # Return relevant data in print statements:
    
    print('\n')
    print("Number of mislabeled points out of a total {} points : {}".format(
        data.shape[0],
        (target != y_pred).sum()
    ))    
    
    cvs = cross_val_score(bnb, data, target, cv=10)
    print("Cross Validation Scores (10 Folds):", cvs)
    
    print("Averaged Cross-Validation Score: ", cvs.mean())
    
    print("Bernoulli Naïve Bayes Score: ", bnb.score(data, target))
    print("Confusion Matrix:\n", confusion_matrix(target, y_pred))

In [17]:
test_word_list(top_good_edited)

(Set "show_heatmap=True" to show heatmap)
(Set "print_corrmat=True" to show correlation matrix)
(Set "show_keywords=True" to show keyword list)


Number of mislabeled points out of a total 1000 points : 352
Cross Validation Scores (10 Folds): [ 0.65  0.64  0.63  0.6   0.67  0.61  0.71  0.65  0.64  0.66]
Averaged Cross-Validation Score:  0.646
Bernoulli Naïve Bayes Score:  0.648
Confusion Matrix:
 [[500   0]
 [352 148]]


In [18]:
test_word_list(top_good_edited, spaces=False)

(Set "show_heatmap=True" to show heatmap)
(Set "print_corrmat=True" to show correlation matrix)
(Set "show_keywords=True" to show keyword list)


Number of mislabeled points out of a total 1000 points : 284
Cross Validation Scores (10 Folds): [ 0.68  0.7   0.7   0.7   0.75  0.7   0.76  0.73  0.72  0.71]
Averaged Cross-Validation Score:  0.715
Bernoulli Naïve Bayes Score:  0.716
Confusion Matrix:
 [[496   4]
 [280 220]]


In [27]:
test_word_list(top_bad_edited)

(Set "show_heatmap=True" to show heatmap)
(Set "print_corrmat=True" to show correlation matrix)
(Set "show_keywords=True" to show keyword list)


Number of mislabeled points out of a total 1000 points : 307
Cross Validation Scores (10 Folds): [ 0.65  0.7   0.65  0.65  0.68  0.7   0.68  0.73  0.73  0.68]
Averaged Cross-Validation Score:  0.685
Bernoulli Naïve Bayes Score:  0.693
Confusion Matrix:
 [[193 307]
 [  0 500]]


In [20]:
test_word_list(top_bad_edited, spaces=False)

(Set "show_heatmap=True" to show heatmap)
(Set "print_corrmat=True" to show correlation matrix)
(Set "show_keywords=True" to show keyword list)


Number of mislabeled points out of a total 1000 points : 243
Cross Validation Scores (10 Folds): [ 0.7   0.78  0.71  0.71  0.7   0.77  0.79  0.78  0.76  0.76]
Averaged Cross-Validation Score:  0.746
Bernoulli Naïve Bayes Score:  0.757
Confusion Matrix:
 [[268 232]
 [ 11 489]]


In [21]:
test_word_list(top_words_edited)

(Set "show_heatmap=True" to show heatmap)
(Set "print_corrmat=True" to show correlation matrix)
(Set "show_keywords=True" to show keyword list)


Number of mislabeled points out of a total 1000 points : 307
Cross Validation Scores (10 Folds): [ 0.65  0.7   0.65  0.65  0.68  0.7   0.68  0.73  0.73  0.68]
Averaged Cross-Validation Score:  0.685
Bernoulli Naïve Bayes Score:  0.693
Confusion Matrix:
 [[193 307]
 [  0 500]]


In [22]:
test_word_list(top_words_edited, spaces=False)

(Set "show_heatmap=True" to show heatmap)
(Set "print_corrmat=True" to show correlation matrix)
(Set "show_keywords=True" to show keyword list)


Number of mislabeled points out of a total 1000 points : 239
Cross Validation Scores (10 Folds): [ 0.71  0.8   0.72  0.72  0.72  0.78  0.79  0.79  0.77  0.8 ]
Averaged Cross-Validation Score:  0.76
Bernoulli Naïve Bayes Score:  0.761
Confusion Matrix:
 [[268 232]
 [  7 493]]


In [23]:
test_word_list(top_good)

(Set "show_heatmap=True" to show heatmap)
(Set "print_corrmat=True" to show correlation matrix)
(Set "show_keywords=True" to show keyword list)


Number of mislabeled points out of a total 1000 points : 320
Cross Validation Scores (10 Folds): [ 0.68  0.67  0.65  0.64  0.71  0.64  0.75  0.66  0.69  0.69]
Averaged Cross-Validation Score:  0.678
Bernoulli Naïve Bayes Score:  0.68
Confusion Matrix:
 [[500   0]
 [320 180]]


In [24]:
test_word_list(top_bad)

(Set "show_heatmap=True" to show heatmap)
(Set "print_corrmat=True" to show correlation matrix)
(Set "show_keywords=True" to show keyword list)


Number of mislabeled points out of a total 1000 points : 290
Cross Validation Scores (10 Folds): [ 0.65  0.71  0.67  0.7   0.68  0.71  0.69  0.76  0.74  0.69]
Averaged Cross-Validation Score:  0.7
Bernoulli Naïve Bayes Score:  0.71
Confusion Matrix:
 [[210 290]
 [  0 500]]


In [25]:
test_word_list(top_words)

(Set "show_heatmap=True" to show heatmap)
(Set "print_corrmat=True" to show correlation matrix)
(Set "show_keywords=True" to show keyword list)


Number of mislabeled points out of a total 1000 points : 290
Cross Validation Scores (10 Folds): [ 0.65  0.71  0.67  0.7   0.68  0.71  0.69  0.76  0.74  0.69]
Averaged Cross-Validation Score:  0.7
Bernoulli Naïve Bayes Score:  0.71
Confusion Matrix:
 [[210 290]
 [  0 500]]
