In [1]:
#This is a practice exercise from the YouTube channel "Data School" that gives practice processing data and preparing
#data for machine learning.  Specific focus was on the cross-validation train/test split, converting text into a matrix
#of token counts, creating a document-term matrix (dtm), using a Multinomial Naive Bayes algorithm, calculating null
#accuracy, understanding a confusion matrix, and reading a classification report.

import os

import pandas as pd

# Location of the Yelp reviews CSV.  The default is the path this notebook was originally run with;
# set the YELP_CSV environment variable to point at the file on another machine instead of editing this cell.
YELP_CSV = os.environ.get(
    'YELP_CSV',
    '/home/rds/AnacondaProjects/dataschool-exercise/pycon-2016-tutorial/data/yelp.csv',
)

# Load the reviews into a DataFrame and show its (rows, columns) shape as a quick sanity check.
yelp = pd.read_csv(YELP_CSV)
yelp.shape

(10000, 10)

In [2]:
# Preview the first 10 rows to see what each column contains
yelp.head(10)

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0
5,-yxfBYGB6SEqszmxJxd97A,2007-12-13,m2CKSsepBCoRYWxiRUsxAg,4,"Quiessence is, simply put, beautiful. Full wi...",review,sqYN3lNgvPbPCTRsMFu27g,4,3,1
6,zp713qNhx8d9KCJJnrw1xA,2010-02-12,riFQ3vxNpP4rWLk_CSri2A,5,Drop what you're doing and drive here. After I...,review,wFweIWhv2fREZV_dYkz_1g,7,7,4
7,hW0Ne_HTHEAgGF1rAdmR-g,2012-07-12,JL7GXJ9u4YMx7Rzs05NfiQ,4,"Luckily, I didn't have to travel far to make m...",review,1ieuYcKS7zeAv_U15AB13A,0,1,0
8,wNUea3IXZWD63bbOQaOH-g,2012-08-17,XtnfnYmnJYi71yIuGsXIUA,4,Definitely come for Happy hour! Prices are ama...,review,Vh_DlizgGhSqQh4qfZ2h6A,0,0,0
9,nMHhuYan8e3cONo3PornJA,2010-08-11,jJAIXA46pU1swYyRCdfXtQ,5,Nobuo shows his unique talents with everything...,review,sUNkXg8-KFtCMQDV6zRzQg,0,1,0


In [3]:
# Count the reviews for each star rating, ordered by rating rather than by frequency
yelp.stars.value_counts().sort_index()

1     749
2     927
3    1461
4    3526
5    3337
Name: stars, dtype: int64

In [4]:
# Check the data type pandas assigned to each column
yelp.dtypes


business_id    object
date           object
review_id      object
stars           int64
text           object
type           object
user_id        object
cool            int64
useful          int64
funny           int64
dtype: object

In [6]:
# Feature: the raw review text.  Target: the star rating to be predicted.
X = yelp['text']
y = yelp['stars']

In [9]:
# Confirm the feature and target have the same number of entries (one shape per line)
print(X.shape, y.shape, sep='\n')

(10000,)
(10000,)


In [10]:
# Hold out a test set so the model is scored on reviews it did not see during fitting.
# train_test_split lives in sklearn.model_selection; the old sklearn.cross_validation module
# was deprecated in scikit-learn 0.18 and removed in 0.20, so importing from it fails on current versions.
from sklearn.model_selection import train_test_split

# No test_size is passed, so the library default split is used (the shapes printed below show the result);
# random_state=1 fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(7500,)
(2500,)
(7500,)
(2500,)




In [11]:
# CountVectorizer converts text into a matrix of token counts (i.e. a numerical matrix).
# It is created here with its default settings; nothing is learned until it is fit on the training text.
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [12]:
# Fit and transform the training data in one step: learn (fit) the vocabulary from X_train and
# build (transform) the document-term matrix from it
X_train_dtm = vect.fit_transform(X_train)
X_train_dtm

<7500x25797 sparse matrix of type '<class 'numpy.int64'>'
	with 622700 stored elements in Compressed Sparse Row format>

In [13]:
# The training dtm above has 7500 rows and 25797 columns, stored as a sparse matrix.
# Transform the testing data but do NOT fit again: transform reuses the vocabulary fitted on the training data
# to build a dtm from the testing data.
X_test_dtm = vect.transform(X_test)
X_test_dtm

<2500x25797 sparse matrix of type '<class 'numpy.int64'>'
	with 200729 stored elements in Compressed Sparse Row format>

In [14]:
# The testing dtm above has 2500 rows and 25797 columns (the same number of columns as the training dtm).
# Use Multinomial Naive Bayes (a favourite when working with text data because of the many attributes)
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [15]:
# Fit the model on the training data, i.e. learn the relationship between X_train_dtm (token counts)
# and y_train (star ratings)
nb.fit(X_train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
# Now predict a star rating for each review in the testing dtm (X_test_dtm);
# y_test is not used here -- it is held back to score these predictions
y_pred_class = nb.predict(X_test_dtm)

In [17]:
# Now calculate the accuracy of the Multinomial Naive Bayes predictions by comparing the
# predicted star ratings (y_pred_class) with the true ones (y_test)
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.47120000000000001

In [18]:
# Calculate the null accuracy, step 1: count how many test reviews fall in each star class
y_test.value_counts()

4    884
5    832
3    365
2    234
1    185
Name: stars, dtype: int64

In [19]:
# Null accuracy, step 2: total number of test reviews (the denominator)
len(y_test)

2500

In [20]:
# Null accuracy for a multi-class problem: the share of the test set taken up by its most frequent class
# (the first entry of value_counts), i.e. the accuracy of always predicting that class
y_test.value_counts().iloc[:1] / y_test.shape[0]

4    0.3536
Name: stars, dtype: float64

In [21]:
# Shows that always predicting the most frequent class (4-star) would be correct ~35% of the time
# So 47% using Naive Bayes is better than this null-accuracy baseline
# Display the confusion matrix comparing the actual star ratings (y_test) with the predicted ones (y_pred_class)
metrics.confusion_matrix(y_test, y_pred_class)

array([[ 55,  14,  24,  65,  27],
       [ 28,  16,  41, 122,  27],
       [  5,   7,  35, 281,  37],
       [  7,   0,  16, 629, 232],
       [  6,   4,   6, 373, 443]])

In [22]:
# The confusion matrix shows the predictions for each type of review using the Multinomial NB algorithm.  The rows
# show actual classes and the columns show predicted classes.  The first row shows 1-star reviews, the second row
# shows 2-star reviews and so on.
# Looking at the first row in detail, the sum of all the numbers in the row is the total number of actual 1-star
# reviews in the test set (see the result of y_test.value_counts() above).  In this case the sum is 185.  For row two
# (2-star reviews) the sum is 234 and so on.
# Certain metrics like precision and recall can be determined using this confusion matrix.
# For example, looking at the 4th row (actual 4-star reviews), the total for this row is 884.  4-star reviews were 
# correctly predicted 629 times.  This results in a recall of 629 / 884 = 71% .   The precision is determined using
# the fourth column (predicted 4-star reviews) by 629 / 1470 = 43% , (1470 is the sum of the column numbers).  In 
# other words, of the 1470 reviews predicted as 4-star, 629 really were 4-star, while 373 were actually 5-star,
# 281 were actually 3-star, etc.
# This matrix shows most reviews were classified as 4-star or 5-star reviews.  It looks like 1, 2, and 3-star 
# reviews are classified predominately as 4-star reviews.

In [23]:
# Look at the classification report: precision, recall, f1-score and support for each star class
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_class))

             precision    recall  f1-score   support

          1       0.54      0.30      0.38       185
          2       0.39      0.07      0.12       234
          3       0.29      0.10      0.14       365
          4       0.43      0.71      0.53       884
          5       0.58      0.53      0.55       832

avg / total       0.46      0.47      0.43      2500



In [24]:
# The precision and recall for 4-star reviews in the classification report agree with my assessment from the 
# confusion matrix above.
# The f1-score = 2 * Precision * Recall / (Precision + Recall)
# For the 4-star reviews the f1-score (using the confusion matrix values) is:
# 2 * (629/1470) * (629/884) / (629/1470 + 629/884) = 1258 / 2354 = 0.53
# (Plugging in the rounded values .43 and .71 gives 0.54; the report's 0.53 comes from the unrounded values.)

In [None]:
# Precision answers the question: "When a given class is predicted, how often are those predictions correct?"
# Recall answers the question: "When a given class is the true class, how often is that class predicted?"
# For the case of 1-star reviews the recall (0.30) is low relative to the precision (0.54), signifying that the model
# has a difficult time detecting 1-star reviews, but when a 1-star review is predicted there is a 54% chance of it
# being correct.