In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.mlab import PCA as mlabPCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR


pd.options.display.max_rows = 3000
pd.options.display.max_colwidth = 3000

#My functions
def my_sample_and_split(df, yCol, samplesize=0):
    #Take dataframe and create a smaller sample
    #return an X and Y
    
    if samplesize==0:
        sample = df
    else:
        sample = df.sample(n=samplesize)
        
    sample_y = sample[yCol]
    sample_x = sample.drop(labels=yCol, axis=1)
    print('------Sampling and Splitting Data------')
    print('Input Data Set: ' + str(df.shape))
    print('x feature set: ' + str(sample_x.shape))
    print('y Outcome set: ' + str(sample_y.shape) + '\n')
    return sample_x, sample_y


def my_get_correlations(df, threshold):
    
    # See which terms have high correlation and remove
    corrmat = df.corr()
    #print(corrmat)

    #Convert Matrix into stacked pairs
    corrmat['Col'] = corrmat.index
    df_corr = pd.melt(corrmat, id_vars=['Col'])
    
    #df_corr = df_corr[0:5]
    
    #Identify pairs with a correlation above a threshold
    print(df_corr.loc[(abs(df_corr['value']) > threshold) &
                      (df_corr['Col'] != df_corr['variable'])].sort_values('Col'))
    

In [2]:
#file = 'imdb_labelled.txt'
file = 'amazon_cells_labelled.txt'
#file = 'yelp_labelled.txt'

raw_df = pd.read_table(file, names=['Review','Positive'])

print(raw_df.shape)
raw_df.head()

(1000, 2)


Unnamed: 0,Review,Positive
0,So there is no way for me to plug it in here in the US unless I go by a converter.,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!,0
4,The mic is great.,1


In [3]:
def clean_series(df, col):
        df[col] = df[col].str.lower().str.strip()
        df[col] = df[col].str.replace('.','')
        df[col] = df[col].str.replace(",",'')
        df[col] = df[col].str.replace("!",'')
        df[col] = df[col].str.replace("?",'')
        df[col] = df[col].str.replace(";",'')
        df[col] = df[col].str.replace(":",'')
        df[col] = df[col].str.replace("'",'')
        df[col] = df[col].str.replace("-",'')
        df[col] = df[col].str.replace('"','')
        df[col] = df[col].str.replace('(','')
        df[col] = df[col].str.replace(')','')
        return df
    

#Parse all reviews and get counts of each word by Postive vs Negative

#split words for positive reviews and place in list
list_positivewords = []
raw_df.loc[raw_df['Positive'] ==1, 'Review'].str.split(' ').apply(list_positivewords.extend)

#Load list into dataframe so that we can get counts in next step
df_positive = pd.DataFrame(list_positivewords, columns=['Words'])
df_positive['Words'] = clean_series(df_positive, 'Words')


#Load counts into new dataframe
array_positive = [df_positive['Words'].value_counts().index.tolist(), df_positive['Words'].value_counts().values.tolist()]  
df_freq_positive = pd.DataFrame(array_positive).transpose()
df_freq_positive.columns = ['Word','Positive_Freq']



list_negativewords = []
raw_df.loc[raw_df['Positive'] ==0, 'Review'].str.split(' ').apply(list_negativewords.extend)
df_negative = pd.DataFrame(list_negativewords, columns=['Words'])
df_negative = clean_series(df_negative, 'Words')
array_negative = [df_negative['Words'].value_counts().index.tolist(), df_negative['Words'].value_counts().values.tolist()]  
df_freq_negative = pd.DataFrame(array_negative).transpose()
df_freq_negative.columns = ['Word','Negative_Freq']

merged = pd.merge(df_freq_negative, df_freq_positive, how='outer', left_on='Word', right_on='Word')
merged.head()

#Get list of words with Nan in Positive
#merged.loc[merged.Negative_Freq.isna()]

Unnamed: 0,Word,Negative_Freq,Positive_Freq
0,the,276,237
1,i,162,154
2,it,153,128
3,and,122,188
4,a,113,105


In [4]:

#removeNegative   smell, sucks, frustrat, frequently, severe
#removePositive   

negative_words = ['poor', 'bad','waste','worst','terrible',
                  'dissapo','horrible','broke','junk','didnt',
                  'useless','difficult','return','none','crap',
                  'weak','unreliable','mistake','unfortunate',
                  'nothing','hate','boring','break',
                  'tinny','drain','refund','not good','slow'
                  'dead','fail','worthless','dying','lost',
                  'complain','essentially','unusable', 'no good',
                  'stupid','flimsy','beware','wrong','neither'
                  'forget','lose','warning','joke','bother',
                  '$$$','embarass','flaw','fail','defect','cumbersom',
                  'died','forced','toilet','wobbly','crack',
                  'shouldnt','disap','gimmick','wrong','ugly',
                  'loose','ripped','jerk','mislead','lousy',
                  'excess','stuck','counterf','fake','garbage',
                  'trash','zero','whine','pain','frustr','delay',
                  'unbear','dirty','isnt','disgust','forger',
                  'weird','unsatis','bland','restock','unaccept',
                  'noise','freeze','awkward','too expen','too price',
                  'constant','infuriat','trash','garble','muffle',
                  'suck','unaccept','regret','intermit',
                  'cheap','drawback','sorry','remorse','mess ',
                  'darn', 'damn','crawl','wish',
                  'yell','piti','refuse','fool','inexcus','phony',
                  'phoney','leak', 'not happy', 'unhappy', 'not happy', ' not',
                  'not recommend', 'be careful']

positive_words = ['nice','love','perfect','definit','priced','sturdy',
                 'value','charm','great','super','phenom','wonderf',
                 'outstand','awesom','stell', 'fast',' pleased',' comfort',
                 ' satisf','feature','beaut','fantast','overall',' glad ',
                 'rocks',' reasonabl','sharp','handy',
                 'excite','amaz','exact','tremend',' slim ','wow',
                 'certain','incred','crisp','flawless','quick','surpri',
                 ' clear','solid','attract','pleasant','smooth','plenty',
                 'awsom','fabul',' ideal','simple','prompt','highly',
                 'effective',' mega','#1','best','convenient','sweet',
                 'clever','excel','magic','entertain','enjoy','perform',
                 'exceptional','detailed','exceed','winner','hands down',
                 'engineer','100%','premium','psyched', 'seamless', 'favorite',
                 'cute','classy','pretty',' fun ',' neat', ' useful', 'thumbs up',
                 ' well', 'good', ' happy', 'recommend', 'easy', 'on time',
                 'really', 'like', 'peachy', ' fine', 'compliment', ' plus',
                 'five star', 'a+', ' 10', ' happy', 'better', ' much', 'impress',
                 'must have', 'absolut', 'brilliant', 'works', 'easi',
                 'timely', ' cool', 'complete package', 'total package',
                 'holding up', 'very', 'omg', 'delicious']


In [5]:
#Create training set
df_raw_copy = raw_df

for word in negative_words:    
    
    df_raw_copy[str(word)] = df_raw_copy['Review'].str.contains(word, case=False)


for word in positive_words:    
    
    df_raw_copy[str(word)] = df_raw_copy['Review'].str.contains(word, case=False)   
    
df_raw_copy.head()

Unnamed: 0,Review,Positive,poor,bad,waste,worst,terrible,dissapo,horrible,broke,...,works,easi,timely,cool,complete package,total package,holding up,very,omg,delicious
0,So there is no way for me to plug it in here in the US unless I go by a converter.,0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,"Good case, Excellent value.",1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Great for the jawbone.,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!,0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,The mic is great.,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [6]:
my_get_correlations(df=df_raw_copy, threshold=.80)

data, target = my_sample_and_split(df_raw_copy, 'Positive')

data.drop('Review', axis=1, inplace=True)

             Col   variable  value
50075       plus  five star    1.0
44215     clever   engineer    1.0
41663   engineer     clever    1.0
21734     excess     garble    1.0
49843  five star       plus    1.0
15238     garble     excess    1.0
------Sampling and Splitting Data------
Input Data Set: (1000, 234)
x feature set: (1000, 233)
y Outcome set: (1000,)



In [7]:
from sklearn.naive_bayes import BernoulliNB


# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

#View the mismatches
df_raw_copy['y_pred'] = y_pred
df_raw_copy.loc[df_raw_copy.y_pred != df_raw_copy.Positive, ['Review','y_pred','Positive']].sort_values('y_pred', ascending=False)


Number of mislabeled points out of a total 1000 points : 105


Unnamed: 0,Review,y_pred,Positive
588,"I only used it two days, and it wasn't always easy to hear with.",1,0
697,It doesn't make you look cool.,1,0
355,"The loudspeaker option is great, the bumpers with the lights is very ... appealing.",1,0
374,Not a good item.. It worked for a while then started having problems in my auto reverse tape player.,1,0
391,The picture resolution is far below what other comparably-priced phones are offering today.,1,0
399,Still Waiting...... I'm sure this item would work well.. if I ever recieve it!,1,0
440,Very Displeased.,1,0
559,"None of it works, just don't buy it.",1,0
563,"If you are looking for a good quality Motorola Headset keep looking, this isn't it.",1,0
620,"Steer clear of this product and go with the genuine Palm replacementr pens, which come in a three-pack.",1,0
