In [1]:
# Load the shared formatting (style) for the notebook.
# The working directory is switched here so that `formats` can be imported;
# a later cell switches back with os.chdir(path).
import os

# path : the directory the kernel is in right now, stored so that we can
# change back to it later
path = os.getcwd()
# presumably formats.py lives in ../notebook_format and is importable because
# the current directory is on the import path -- TODO confirm
os.chdir('../notebook_format')
from formats import load_style
# NOTE(review): load_style() is the last expression of the cell, so whatever it
# returns is rendered as the cell output -- presumably the styling; confirm
load_style()

In [20]:
# change back to the directory that was saved in `path` before the style was
# loaded, so that relative paths used later are resolved from there again
os.chdir(path)

import pandas as pd
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; `train_test_split` now lives in `sklearn.model_selection`.
# Fall back to the old location only for installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Tutorial Exercise: Yelp reviews

## Introduction

This exercise uses a small subset of the data from Kaggle's [Yelp Business Rating Prediction](https://www.kaggle.com/c/yelp-recsys-2013) competition.

**Description of the data:**

- **`yelp.csv`** contains the dataset. It is stored in the repository (in the **`data`** directory), so there is no need to download anything from the Kaggle website.
- Each observation (row) in this dataset is a review of a particular business by a particular user.
- The **stars** column is the number of stars (1 through 5) assigned by the reviewer to the business. (More stars is better.) In other words, it is the rating of the business by the person who wrote the review.
- The **text** column is the text of the review.

**Goal:** Our goal is to see if we can predict the star rating of a review by using **only** the review text. For simplicity, we'll reframe it as a **binary classification problem** by only considering the 5-star and 1-star reviews.

## Getting Started

In [23]:
# read the review data from the csv file (path is relative to the working directory)
yelp = pd.read_csv('data/yelp.csv')

# preview the first rows as the output of the cell
yelp.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [25]:
# keep only the 1-star and 5-star reviews, which turns the task into a
# binary classification problem; the review text is the only input
data = yelp.loc[yelp['stars'].isin([1, 5])]
X, y = data['text'], data['stars']

# baseline: the class counts show the classification accuracy that could be
# achieved by always predicting the most frequent class, i.e. the number
# our classification model has to beat to be worth the effort
print(y.value_counts())

# hold out 25% of the reviews as the testing set;
# the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)
print('training data size: {}'.format(X_train.shape))
print('testing data size: {}'.format(X_test.shape))

# convert the text to document-term matrices: the vectorizer is fitted on the
# training reviews and then applied, unchanged, to the testing reviews
vect = CountVectorizer()
train_dtm = vect.fit_transform(X_train)
test_dtm = vect.transform(X_test)

# train a naive bayes model on the training matrix and
# predict the star rating for the reviews in the testing set
nb = MultinomialNB().fit(train_dtm, y_train)
y_pred = nb.predict(test_dtm)

5    3337
1     749
Name: stars, dtype: int64
training data size: (3064,)
testing data size: (1022,)


In [33]:
# fraction of the testing reviews whose predicted star rating matches the actual one
print(metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

# table of actual versus predicted star ratings; per the scikit-learn convention
# rows are the actual classes and columns the predicted classes
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_pred))

0.918786692759
[[126  58]
 [ 25 813]]


## TODO

Browse through the review text of some of the **false positives** and **false negatives**. Based on your knowledge of how Naive Bayes works, do you have any ideas about why the model is incorrectly classifying these reviews?

- **Hint:** [Evaluating a classification model](https://github.com/justmarkham/scikit-learn-videos/blob/master/09_classification_metrics.ipynb) explains the definitions of "false positives" and "false negatives".
- **Hint:** Think about what a false positive means in this context, and what a false negative means in this context. What has scikit-learn defined as the "positive class"?


Calculate which 10 tokens are the most predictive of **5-star reviews**, and which 10 tokens are the most predictive of **1-star reviews**.

- **Hint:** Naive Bayes automatically counts the number of times each token appears in each class, as well as the number of observations in each class. You can access these counts via the `feature_count_` and `class_count_` attributes of the Naive Bayes model object.