In [1]:
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import itertools as it
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix

In [2]:
# use the yelp file for training, change target to bool
file = 'data/yelp_labelled.txt'
df = pd.read_csv(file, sep='\t',header=None)
df.columns = ['review', 'target']
#df.target = np.where(df.target,True, False)
df.head(2)

Unnamed: 0,review,target
0,Wow... Loved this place.,1
1,Crust is not good.,0


In [3]:
file = 'data/stopwords.txt'
dfs = pd.read_csv(file,header=None)
dfs.columns = ['stops']

In [14]:
# clean and split reviewe strings 

def A(df): return df.apply(lambda x: x.str.strip(), axis=1) 
def B(df): return df.apply(lambda x: x.str.replace(r"\Bn't\b", ' not ', regex=True), axis=1)
def C(df): return df.apply(lambda x: x.str.replace(r'\W+', ' ', regex=True), axis=1)
def D(df): return df.apply(lambda x: x.str.replace(r'\d+', ' ', regex=True), axis=1)
def E(df): return df.apply(lambda x: x.str.lower(), axis=1) 
def F(df): return df.apply(lambda x: x.str.replace(r"\bnot\s", 'not', regex=True), axis=1)
def G(df): return df.apply(lambda x: x.str.replace(r'\s{2}', ' ', regex=True), axis=1)  
def H(df): return df.apply(lambda x: x.str.strip(), axis=1)
def I(df): return df.apply(lambda x: x.str.split(), axis=1)

switch = {char:eval(char) for char in 'ABCDEFGHI'}

def switch_clean_df(df, string):
    for char in string:
        df = switch[char](df)
    return df 

In [15]:
# create cleaned list of reviews as lists for target = true, target = false, all reviews
#crt = switch_clean_df(pd.DataFrame(df.review[df.target]),          'ABCDEGHI').review
#crf = switch_clean_df(pd.DataFrame(df.review[df.target == False]), 'ABCDEGHI').review
cra = switch_clean_df(pd.DataFrame(df.review),                     'ABDEFGHI').review
# create list of "all words for target = true, target = false, all reviews
#awt = [t for rt in crt for t in rt]
#awf = [f for rf in crf for f in rf]
awa = [a for ra in cra for a in ra]
# create list of words that are only in target = true, target = false
#owt = list(set(awt) - set(awf))
#owf = list(set(awf) - set(awt))
# counters of all words that are only in target = true, target = false, all reviews
ctt = Counter(); ctf = Counter(); cta = Counter()
#ctt.update([t for t in awt if t in owt]) 
#ctf.update([f for f in awf if f in owf]) 
cta.update([a for a in awa])

In [29]:
cra = switch_clean_df(pd.DataFrame(df.review),'ABCDEFGHI').review
awa = [a for ra in cra for a in ra]
words = [w for w in set(awa)]
nostops = [w for w in words ]#if w not in dfs.stops.tolist()
dfr = switch_clean_df(pd.DataFrame(df.review),'ABCDEFGH')
dfr['target'] = df.target
dct = {word:dfr.target.where(dfr.review.str.contains(word), other=np.nan) for word in nostops}
for k,v in dct.items():
    dfr[k] = v
dfr.head(2) 

Unnamed: 0,review,target,words,professional,street,mediterranean,end,ate,piano,perfectly,...,notmy,somewhat,touch,eat,steve,mood,grilled,return,accountant,refried
0,,1,,,,,,,,,...,,,,,,,,,,
1,,0,,,,,,,,,...,,,,,,,,,,


In [23]:
results = []
cmx_results = []
bnb = BernoulliNB()

In [30]:
per_in_review = .1; weight_fact = 0.20; 
A = [key for key, val in dfr.mean().items() if weight_fact >= val <= 1.0 - weight_fact]
B = [x for x in cta if cta[x] >= len(df) * per_in_review / 100]
C = list(set(A) & set(B))

data = pd.DataFrame({key:np.where(df.review.str.contains(key), True, False) for key in C})
target = np.where(df.target,True, False)
data = pd.DataFrame({key:np.where(df.review.str.contains(key), True, False) for key in C})

classify = bnb.fit(data, target)
score = classify.score(data, target)
print ('Score is %3.4f' % (score * 100)) 
y_pred = classify.predict(data)
tn, fp, fn, tp = confusion_matrix(target, y_pred).ravel()
print ('Specificity is %3.2f percent and Sensitivity is %3.2f percent ' % ((tn / (tn + fp) * 100),(tp / (fn +tp) * 100)))

results.append((per_in_review, weight_fact, score))
cmx_results.append((per_in_review, weight_fact,(tn / (tn + fp) * 100),(tp / (fn +tp) * 100)))

Score is 75.8000
Specificity is 52.40 percent and Sensitivity is 99.20 percent 


In [31]:
data.shape

(1000, 617)

# Best Result

filter with:

- key word appears in at least 0.1% of reviews
- key word accuracy at least 75%

Specificity is 60.40% and Sensitivity is 97.80%

Score is 79.10%

In [None]:
score = bnb.fit(data, target).score(data, target)
print ('Score is %3.4f' % (score * 100)) 
results.append((per_in_review, weight_fact, score))


In [None]:
y_pred = bnb.fit(data, target).predict(data)
tn, fp, fn, tp = confusion_matrix(target, y_pred).ravel()
print ('Specificity is %3.2f percent and Sensitivity is %3.2f percent ' % ((tn / (tn + fp) * 100),(tp / (fn +tp) * 100)))

In [None]:
#df with classifier features and must have 'target' columns
def fold_data(dfd, dft, n):
    assert(1 <= n <= len(dfd)); assert( len(dfd) == len(dft))
    n1 = len(dfd) // n; n2 = len(dfd) % n    
    dfd.index = pd.Series([x for x in range(n)] * n1 + [x for x in range(n2)])
    dft.index = pd.Series([x for x in range(n)] * n1 + [x for x in range(n2)])
    # creates list of tuples ==> [(x_train, y_train, x_test, y_test), ... n times]
    xys = [(dfd[dfd.index != i], dft[dft.index != i].target, dfd[dfd.index == i],
            dft[dft.index == i].target) for i in range(n)]
    return xys

In [38]:
crx = switch_clean_df(pd.DataFrame(df.review),'ABCDEFGHI').review
awa = [a for ra in cra for a in ra]

In [52]:
cta.most_common(10)

[('the', 581),
 ('and', 388),
 ('was', 305),
 ('i', 303),
 ('a', 230),
 ('to', 212),
 ('is', 174),
 ('this', 142),
 ('of', 125),
 ('it', 110)]

In [None]:
# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()
# Fit our model to the data.
bnb.fit(data, target)
# Classify, storing the result in a new variable.
y_pred = bnb.predict(data)
# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))