In [6]:
# Source: https://github.com/dualphase90/pycon-2016-tutorial/blob/master/exercise_solution.ipynb

# load the reviews from yelp.csv (relative path, resolved against the working directory)
import pandas as pd
path = 'yelp.csv'
yelp = pd.read_csv(filepath_or_buffer=path)

# show (number of rows, number of columns)
yelp.shape

(10000, 10)

In [7]:
# examine the first row to see which columns are available and what a review looks like
yelp.head(1)

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0


In [8]:
# count the reviews per star rating, ordered by rating instead of by frequency
yelp['stars'].value_counts().sort_index()

1     749
2     927
3    1461
4    3526
5    3337
Name: stars, dtype: int64

In [16]:
# peek at the first 10 values of the 'useful' column
yelp['useful'].head(10)

0    5
1    0
2    1
3    2
4    0
5    3
6    7
7    1
8    0
9    1
Name: useful, dtype: int64

In [17]:

# build the boolean mask once: True for 5-star or 1-star reviews (OR condition)
is_best_or_worst = (yelp['stars'] == 5) | (yelp['stars'] == 1)

# filter the DataFrame with the mask
yelp_best_worst = yelp[is_best_or_worst]

# equivalently, use the 'loc' indexer (selecting all columns)
yelp_best_worst = yelp.loc[is_best_or_worst, :]



In [18]:


# examine the shape: (number of 5-star and 1-star reviews kept, number of columns)
yelp_best_worst.shape



(4086, 10)

In [19]:
# define the feature (review text) and the response (star rating)
X = yelp_best_worst['text']
y = yelp_best_worst['stars']

In [20]:
# split X and y into training and testing sets
# NOTE: sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection (fall back only for very old installs)
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# random_state is fixed so the split is reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [21]:


# examine the object shapes, in the order X_train, X_test, y_train, y_test
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)



(3064,)
(1022,)
(3064,)
(1022,)


In [22]:
# import and instantiate CountVectorizer (default settings); it is used below to
# convert the review text into a document-term matrix
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [23]:
# fit and transform X_train into X_train_dtm
# (the vocabulary is learned from the training text only)
X_train_dtm = vect.fit_transform(X_train)
# shape is (number of training reviews, vocabulary size)
X_train_dtm.shape

(3064, 16825)

In [24]:
# transform X_test into X_test_dtm
# (transform only, no fit: the vocabulary learned from X_train is reused,
# so the column count matches X_train_dtm)
X_test_dtm = vect.transform(X_test)
X_test_dtm.shape

(1022, 16825)

In [25]:
# import and instantiate MultinomialNB with its default parameters
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [27]:
# train the model using X_train_dtm, then make class predictions for X_test_dtm
# (fit returns the fitted estimator itself, so the two calls can be chained)
y_pred_class = nb.fit(X_train_dtm, y_train).predict(X_test_dtm)

In [28]:
# calculate accuracy of class predictions (fraction of test reviews classified correctly)
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.91878669275929548

In [29]:
# print the confusion matrix
# (rows are the true classes and columns the predicted classes, in sorted label
# order: 1-star first, then 5-star)
metrics.confusion_matrix(y_test, y_pred_class)

array([[126,  58],
       [ 25, 813]])

In [30]:
# examine the class distribution of the testing set (most frequent class first)
y_test.value_counts()

5    838
1    184
Name: stars, dtype: int64

In [31]:


# calculate null accuracy: the accuracy achieved by always predicting the most frequent class
# value_counts() sorts by frequency (descending), so head(1) is the majority class;
# divide by the scalar number of test samples rather than by the shape tuple, which
# only worked through implicit length-1 list-like alignment
y_test.value_counts().head(1) / len(y_test)



5    0.819961
Name: stars, dtype: float64

In [32]:


# calculate null accuracy manually: 5-star count / (5-star count + 1-star count)
# the float literal keeps this a true division on Python 2 as well
838.0 / (838 + 184)



0.8199608610567515

In [38]:
# first 10 false positives (1-star reviews incorrectly classified as 5-star reviews)
# with only classes 1 and 5, "actual < predicted" means actual is 1 and predicted is 5
is_false_positive = y_test < y_pred_class
X_test[is_false_positive].head(10)

2175    This has to be the worst restaurant in terms o...
1781    If you like the stuck up Scottsdale vibe this ...
2674    I'm sorry to be what seems to be the lone one ...
9984    Went last night to Whore Foods to get basics t...
3392    I found Lisa G's while driving through phoenix...
8283    Don't know where I should start. Grand opening...
2765    Went last week, and ordered a dozen variety. I...
2839    Never Again,\nI brought my Mountain Bike in (w...
321     My wife and I live around the corner, hadn't e...
1919                                         D-scust-ing.
Name: text, dtype: object

In [39]:
# false positive: model is reacting to the words "good", "impressive", "nice"
# (1781 is an index label carried over from the original DataFrame, not a position)
X_test.loc[1781]

"If you like the stuck up Scottsdale vibe this is a good place for you. The food isn't impressive. Nice outdoor seating."

In [40]:
# false positive: model does not have enough data to work with
# (1919 is an index label carried over from the original DataFrame, not a position)
X_test.loc[1919]

'D-scust-ing.'

In [41]:
# first 10 false negatives (5-star reviews incorrectly classified as 1-star reviews)
# with only classes 1 and 5, "actual > predicted" means actual is 5 and predicted is 1
is_false_negative = y_test > y_pred_class
X_test[is_false_negative].head(10)

7148    I now consider myself an Arizonian. If you dri...
4963    This is by far my favourite department store, ...
6318    Since I have ranted recently on poor customer ...
380     This is a must try for any Mani Pedi fan. I us...
5565    I`ve had work done by this shop a few times th...
3448    I was there last week with my sisters and whil...
6050    I went to sears today to check on a layaway th...
2504    I've passed by prestige nails in walmart 100s ...
2475    This place is so great! I am a nanny and had t...
241     I was sad to come back to lai lai's and they n...
Name: text, dtype: object

In [44]:
# example false negative: display the full text of the review with index label 7148
X_test[7148]
# NOTE(review): the word list below does not match the review displayed by this cell;
# it presumably describes review 4963 (the commented-out lookup underneath) - confirm
# false negative: model is reacting to the words "complain", "crowds", "rushing", "pricey", "scum"
#X_test[4963]



"I now consider myself an Arizonian. If you drive a lot on the 101 or 51 like I do, you'll get your fair share of chips on your windshield. You'll also have to replace a windshield like I had to do just recently. Apparently, chips and cracking windshields  is common in Arizona. In fact, I seem to recall my insurance agent telling me that insurance companies must provide this coverage in Arizona.\n\nI had a chip repaired about a year ago near the very bottom of the windshield. Just recently a small, very fine crack started traveling north on the windshield from the repaired chip (a different vendor repaired the chip). I called these guys over to my house and they said it was too long to fix, so they replaced the whole windshield the next day.\n\nWhat great service, they come out to your residence or place of business to repair or replace your windshield."

In [45]:
# store the vocabulary of X_train
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(); prefer the new method and fall back on older installs
if hasattr(vect, 'get_feature_names_out'):
    # the new method returns an array; keep a plain list as before
    X_train_tokens = list(vect.get_feature_names_out())
else:
    X_train_tokens = vect.get_feature_names()
len(X_train_tokens)



16825

In [46]:
# first row is one-star reviews, second row is five-star reviews
# shape is (number of classes, number of tokens in the vocabulary)
nb.feature_count_.shape

(2, 16825)

In [49]:
# store the number of times each token appears across each class
# (row 0 holds the one-star counts, row 1 the five-star counts)
one_star_token_count = nb.feature_count_[0]
five_star_token_count = nb.feature_count_[1]

In [50]:
# create a DataFrame of tokens with their separate one-star and five-star counts,
# indexed by the token itself
token_columns = {
    'token': X_train_tokens,
    'one_star': one_star_token_count,
    'five_star': five_star_token_count,
}
tokens = pd.DataFrame(token_columns).set_index('token')

In [51]:
# add 1 to one-star and five-star counts to avoid dividing by 0
# derive the columns from the raw count arrays (not from the columns themselves) so that
# re-running this cell does not add 1 a second time; the arrays are in the same row
# order as tokens because tokens was built from them
tokens['one_star'] = one_star_token_count + 1
tokens['five_star'] = five_star_token_count + 1

