# Challenge: Feedback analysis

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import gutenberg, stopwords
from nltk.tokenize import word_tokenize
import spacy
from collections import Counter
from textblob import TextBlob
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_selection import RFE
import re
from sklearn.metrics import confusion_matrix

In [2]:
data_path = ('yelp_labelled.txt')
sms_raw = pd.read_csv(data_path, delimiter= '\t', header=None)
sms_raw.columns = ['message','like']

In [3]:
stop = stopwords.words('english')

In [4]:
sms_raw.head()

Unnamed: 0,message,like
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [5]:
sms_raw.describe()

Unnamed: 0,like
count,1000.0
mean,0.5
std,0.50025
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [6]:
new_sms = sms_raw.copy()

In [7]:
new_sms['message'] = new_sms['message'].str.lower()

In [8]:
def GetSpecialChar(x):
    special_characters = []
    for char in x:
        if char.isalpha() == False:
            special_characters.append(char)
    return special_characters

In [9]:
new_sms['special'] =  new_sms['message'].apply(lambda x : GetSpecialChar(x))

In [10]:
special_characters = []
for row in new_sms['special']:
    for char in row:
        special_characters.append(char)

In [11]:
new_sms['message'] = new_sms['message'].str.replace('!', '')
new_sms['message'] = new_sms['message'].str.replace('"', '')
new_sms['message'] = new_sms['message'].str.replace('$', '')
new_sms['message'] = new_sms['message'].str.replace("'", '')
new_sms['message'] = new_sms['message'].str.replace('%', '')
new_sms['message'] = new_sms['message'].str.replace('&', '')
new_sms['message'] = new_sms['message'].str.replace('(', '')
new_sms['message'] = new_sms['message'].str.replace(')', '')
new_sms['message'] = new_sms['message'].str.replace('*', '')
new_sms['message'] = new_sms['message'].str.replace('+', '')
new_sms['message'] = new_sms['message'].str.replace(',', '')
new_sms['message'] = new_sms['message'].str.replace('-', '')
new_sms['message'] = new_sms['message'].str.replace('.', '')
new_sms['message'] = new_sms['message'].str.replace('/', '')
new_sms['message'] = new_sms['message'].str.replace('0', '')
new_sms['message'] = new_sms['message'].str.replace('1', '')
new_sms['message'] = new_sms['message'].str.replace('2', '')
new_sms['message'] = new_sms['message'].str.replace('3', '')
new_sms['message'] = new_sms['message'].str.replace('4', '')
new_sms['message'] = new_sms['message'].str.replace('5', '')
new_sms['message'] = new_sms['message'].str.replace('6', '')
new_sms['message'] = new_sms['message'].str.replace('7', '')
new_sms['message'] = new_sms['message'].str.replace('8', '')
new_sms['message'] = new_sms['message'].str.replace('9', '')
new_sms['message'] = new_sms['message'].str.replace(':', '')
new_sms['message'] = new_sms['message'].str.replace(';', '')
new_sms['message'] = new_sms['message'].str.replace('?', '')

In [12]:
# remove stop words
new_sms['message'] = new_sms['message'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
new_sms['message']= new_sms['message'].str.split(' ')

In [13]:
bad_reviews_df = new_sms[new_sms['like'] == 0]
good_reviews_df = new_sms[new_sms['like'] == 1]

In [14]:
def word_frequencies(text, include_stop=True):
    
    # Build a list of words.
    # Strip out punctuation and, optionally, stop words.
    words = []
    for row in text:
        for str in row:
            words.append(str)
            
    # Build and return a Counter object containing word counts.
    return Counter(words)
    
# The most frequent words:
bad_word_freq = word_frequencies(bad_reviews_df['message']).most_common(30)
good_word_freq = word_frequencies(good_reviews_df['message']).most_common(30)
print('good words:', good_word_freq)
print('bad words:', bad_word_freq)

good words: [('good', 73), ('great', 70), ('food', 60), ('place', 57), ('service', 46), ('friendly', 23), ('delicious', 23), ('back', 23), ('nice', 22), ('time', 22), ('really', 22), ('best', 22), ('amazing', 21), ('also', 18), ('like', 17), ('restaurant', 17), ('go', 17), ('love', 16), ('staff', 15), ('vegas', 15), ('first', 13), ('menu', 12), ('always', 12), ('fantastic', 12), ('experience', 12), ('awesome', 12), ('pretty', 11), ('made', 11), ('loved', 10), ('definitely', 10)]
bad words: [('food', 64), ('place', 49), ('back', 38), ('service', 37), ('like', 29), ('go', 26), ('dont', 25), ('good', 22), ('never', 22), ('would', 21), ('time', 20), ('ever', 19), ('minutes', 19), ('bad', 18), ('one', 16), ('much', 15), ('got', 15), ('wont', 15), ('really', 14), ('disappointed', 14), ('worst', 13), ('think', 13), ('going', 13), ('wasnt', 13), ('ive', 13), ('came', 12), ('eat', 12), ('us', 12), ('im', 12), ('slow', 11)]


In [15]:
keywords = ['nice', 'back', 'great', 'friendly', 'delicious', 'dont', 'never', 'would', 'good', 'nice', 'time', 'staff',
            'restaurant', 'love', 'bad', 'amazing', 'disappointed', 'never', 'best', 'one', 'minutes', 'wont', 'wasnt',
            'amazing', 'always', 'made', 'pretty', 'loved', 'menu', 'slow', 'definitely']

for key in keywords:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    sms_raw[str(key)] = sms_raw.message.str.contains(
        ' ' + str(key) + ' ',
        case=False
    )

In [16]:
data = sms_raw[keywords]
target = sms_raw['like']

In [17]:
# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 364


In [18]:
selectr = RFE(bnb)
selector = selectr.fit(data, target)
rankings = pd.DataFrame({'Features': data.columns, 'Ranking' : selector.ranking_})
rankings.sort_values('Ranking')

Unnamed: 0,Features,Ranking
15,amazing,1
28,menu,1
23,amazing,1
4,delicious,1
5,dont,1
6,never,1
7,would,1
22,wasnt,1
21,wont,1
20,minutes,1


In [19]:
y_pred = bnb.predict(data)
confusion_matrix(target, y_pred)

array([[464,  36],
       [328, 172]], dtype=int64)

This model is around 64% accurate with a lot  more false negative than I would like

In [20]:
amazon_data = ('amazon_cells_labelled.txt')
amazon_raw = pd.read_csv(amazon_data, delimiter= '\t', header=None)
amazon_raw.columns = ['message','like']

In [21]:
for key in keywords:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    amazon_raw[str(key)] = amazon_raw.message.str.contains(
        ' ' + str(key) + ' ',
        case=False
    )

In [22]:
amazon_data = amazon_raw[keywords]
amazon_target = amazon_raw['like']

In [23]:
# Instantiate our model and store it in a new variable.
amazon_bnb = BernoulliNB()

# Fit our model to the data.
amazon_bnb.fit(amazon_data, amazon_target)

# Classify, storing the result in a new variable.
amazon_y_pred = amazon_bnb.predict(amazon_data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    amazon_data.shape[0],
    (amazon_target == y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 506


In [24]:
amazon_selectr = RFE(amazon_bnb)
amazon_selector = amazon_selectr.fit(amazon_data, amazon_target)
amazon_rankings = pd.DataFrame({'Features': amazon_data.columns, 'Ranking' : amazon_selector.ranking_})
amazon_rankings.sort_values('Ranking')

Unnamed: 0,Features,Ranking
15,amazing,1
1,back,1
28,menu,1
3,friendly,1
4,delicious,1
5,dont,1
22,wasnt,1
27,loved,1
16,disappointed,1
21,wont,1


In [25]:
y_pred = bnb.predict(amazon_data)
confusion_matrix(target, y_pred)

array([[451,  49],
       [416,  84]], dtype=int64)

#### the results from Yelp did not  translate very well to Amazon, it especially had a lot of  false negatives and was not even showing 50% correct