In [114]:
#Standard Imports 

import pandas as pd
import numpy as np

In [54]:
# Open the Excel workbook of Yelp reviews with pandas.
#
# The restaurant studied is the Monkey House Cafe in Huntington Beach, CA.

sheet = 'Sheet1'
file = 'mhc_yelp_reviews.xlsx'
xlsx_file = pd.ExcelFile(file)

In [115]:
# Parse the worksheet named by `sheet` (set in the cell that opens the
# workbook) into a dataframe, rather than repeating the 'Sheet1' literal here.

yelp_df = xlsx_file.parse(sheet)

# Display the first five rows of the dataframe.

yelp_df.head()

Unnamed: 0,num_stars,review_text
0,5,Came here based on the reviews and had the Mon...
1,4,I felt it is a good place to hangout with frie...
2,4,There are a few places I eat at consistently. ...
3,5,"It's a hot Saturday afternoon, we just finishe..."
4,5,"I can't believe that I am going to say this, b..."


In [57]:
# Count the reviews at each star rating, ordered from 1 to 5 stars.
# The population is skewed toward 4- and 5-star reviews.

yelp_df['num_stars'].value_counts().sort_index()

1      8
2     14
3     37
4    105
5    112
Name: num_stars, dtype: int64

In [58]:
#Function that maps a star rating to a class label: 2 if the rating is above three stars, 1 if it is three stars or below.

def above_three(x):
    """Map a star rating to a class label.

    Parameters
    ----------
    x : number or None
        The star rating of a single review.

    Returns
    -------
    int or None
        2 if the rating is above three stars, 1 if it is three stars or
        below, and None when there is no usable rating (None, NaN, or 0).
    """
    # A missing rating must not be labelled. NaN is truthy, so a plain
    # truthiness check would silently file it under class 1; `x != x` is
    # True only for NaN. A rating of 0 is also treated as "no rating".
    if x is None or x != x or x == 0:
        return None
    return 2 if x > 3 else 1

In [128]:
# Label every review with its class: 2 when the rating is above three stars,
# 1 when it is three stars or below.
# The labels come from above_three and are stored in a new 'class_num' column.

yelp_df['class_num'] = yelp_df.num_stars.apply(above_three)

In [129]:
# Count the reviews in each class (1 = three stars or below, 2 = above three).

yelp_df.class_num.value_counts().sort_index()

1     59
2    217
Name: class_num, dtype: int64

In [132]:
#Display the first rows of the dataframe, which now includes the class_num column derived from the star ratings.

yelp_df.head()

Unnamed: 0,num_stars,review_text,class_num
0,5,Came here based on the reviews and had the Mon...,2
1,4,I felt it is a good place to hangout with frie...,2
2,4,There are a few places I eat at consistently. ...,2
3,5,"It's a hot Saturday afternoon, we just finishe...",2
4,5,"I can't believe that I am going to say this, b...",2


In [62]:
# Import CountVectorizer and create an instance with its default settings.
# The vectorizer is used later to turn the review text into token counts.

from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [133]:
# Define the model inputs: X is the raw review text, y is the class label
# (1 = three stars or below, 2 = above three stars).

X = yelp_df['review_text']
y = yelp_df['class_num']

In [134]:
# Print the shape of X and then of y, one per line; they should match.

print(X.shape, y.shape, sep='\n')

(276,)
(276,)


In [65]:
#Convert X into a document-term matrix and display the shape of the resulting matrix.
#fit_transform both learns the vocabulary from X and produces the matrix in one step.

X_dtm = vect.fit_transform(X)
print(X_dtm.shape)

(276, 2921)


In [101]:
# Import the MultinomialNB model and create an instance with its default settings.

from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [102]:
#Fit the model using the document-term matrix as features and the class labels in y (1 or 2) as targets.

nb.fit(X_dtm, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [103]:
# Collect the token names (the vocabulary learned by the vectorizer) and
# display how many there are.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer its replacement, get_feature_names_out(), and fall back to the
# old method only on releases that predate it. The result is kept as a list
# in both cases.

if hasattr(vect, 'get_feature_names_out'):
    X_tokens = list(vect.get_feature_names_out())
else:
    X_tokens = vect.get_feature_names()
len(X_tokens)

2921

In [104]:
#Display the shape of the feature count matrix held by the fitted model.
#Remember that class 1 means the review had three or fewer stars and class 2 means it had more than three stars.

nb.feature_count_.shape

(2, 2921)

In [105]:
# Number of reviews the fitted model saw in each class:
#   class 1 -> one-, two-, or three-star reviews
#   class 2 -> four- or five-star reviews

total_class_1, total_class_2 = nb.class_count_[0], nb.class_count_[1]
print(total_class_1, total_class_2, sep='\n')

59.0
217.0


In [106]:
# Pull one row per class out of the feature count matrix:
# row 0 holds the token counts for class 1, row 1 those for class 2.

class_1_features = nb.feature_count_[0]
class_2_features = nb.feature_count_[1]

In [137]:
# Build a dataframe, indexed by token, that holds each token's count in
# class 1 and in class 2.

token_df = pd.DataFrame(
    {
        'tokens': X_tokens,
        'class_1_total': class_1_features,
        'class_2_total': class_2_features,
    }
).set_index('tokens')

In [138]:
#Display the first rows of the token dataframe.

token_df.head()

Unnamed: 0_level_0,class_1_total,class_2_total
tokens,Unnamed: 1_level_1,Unnamed: 2_level_1
00,5.0,0.0
00pm,0.0,3.0
03,0.0,1.0
10,2.0,2.0
100,0.0,3.0


In [139]:
#Function that adds one to a token count so that the ratio computed later never divides by zero.

def add_one(num):
    """Return ``num`` increased by one.

    Parameters
    ----------
    num : number
        The value to increment (a token count when used in this notebook).

    Returns
    -------
    number
        ``num + 1``.
    """
    incremented = num + 1
    return incremented

In [140]:
# For each class, add one to every token count (via add_one) and divide by
# the number of reviews in that class; store the result in a new column.

token_df['class_2_percent'] = token_df.class_2_total.apply(add_one).div(total_class_2)
token_df['class_1_percent'] = token_df.class_1_total.apply(add_one).div(total_class_1)
token_df.head()

Unnamed: 0_level_0,class_1_total,class_2_total,class_2_percent,class_1_percent
tokens,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
00,5.0,0.0,0.004608,0.101695
00pm,0.0,3.0,0.018433,0.016949
03,0.0,1.0,0.009217,0.016949
10,2.0,2.0,0.013825,0.050847
100,0.0,3.0,0.018433,0.016949


In [141]:
# Calculate the three-star ratio for every token.
# The ratio is class_2_percent divided by class_1_percent, i.e. how much more
# a token is associated with reviews above three stars than with reviews of
# three stars or fewer.
# A larger ratio points toward a review with more than three stars.
# A smaller ratio points toward a review with three or fewer stars.

token_df['ratio'] = token_df.class_2_percent.div(token_df.class_1_percent)
token_df.head()

Unnamed: 0_level_0,class_1_total,class_2_total,class_2_percent,class_1_percent,ratio
tokens,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
00,5.0,0.0,0.004608,0.101695,0.045315
00pm,0.0,3.0,0.018433,0.016949,1.087558
03,0.0,1.0,0.009217,0.016949,0.543779
10,2.0,2.0,0.013825,0.050847,0.271889
100,0.0,3.0,0.018433,0.016949,1.087558


In [142]:
# Sort the dataframe by the three-star ratio, largest first, and display the
# ten tokens that contribute most to a review having more than three stars.
# DataFrame.sort(columns=...) was removed in pandas 0.20; sort_values(by=...)
# is its replacement.

token_df.sort_values(by='ratio', ascending=False).head(10)



Unnamed: 0_level_0,class_1_total,class_2_total,class_2_percent,class_1_percent,ratio
tokens,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
perfect,0.0,24.0,0.115207,0.016949,6.797235
look,0.0,17.0,0.082949,0.016949,4.894009
studying,0.0,16.0,0.078341,0.016949,4.62212
recommend,0.0,15.0,0.073733,0.016949,4.35023
mocha,0.0,15.0,0.073733,0.016949,4.35023
loved,0.0,15.0,0.073733,0.016949,4.35023
butter,0.0,14.0,0.069124,0.016949,4.078341
peanut,0.0,13.0,0.064516,0.016949,3.806452
friendly,4.0,64.0,0.299539,0.084746,3.534562
variety,0.0,12.0,0.059908,0.016949,3.534562


In [113]:
# Sort the dataframe by the three-star ratio, smallest first, and display the
# ten tokens that contribute most to a review having three or fewer stars.
# DataFrame.sort(columns=...) was removed in pandas 0.20; sort_values(by=...)
# is its replacement.

token_df.sort_values(by='ratio', ascending=True).head(10)



Unnamed: 0_level_0,class_1_total,class_2_total,class_2_percent,class_1_percent,ratio
tokens,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
00,5.0,0.0,0.004608,0.101695,0.045315
average,5.0,0.0,0.004608,0.101695,0.045315
basically,4.0,0.0,0.004608,0.084746,0.054378
concept,4.0,0.0,0.004608,0.084746,0.054378
weird,8.0,1.0,0.009217,0.152542,0.06042
against,3.0,0.0,0.004608,0.067797,0.067972
watery,3.0,0.0,0.004608,0.067797,0.067972
reg,3.0,0.0,0.004608,0.067797,0.067972
tortilla,3.0,0.0,0.004608,0.067797,0.067972
homemade,3.0,0.0,0.004608,0.067797,0.067972
