In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split

---
Naive Bayes Classification - Predicting Movie Quotes
=====
***

---
##Count Vectorization
###Obtaining word frequencies
---

In [2]:
text = ['Math is great', 'Math is really great', 'Exciting exciting Math']

In [3]:
for i, sentence in enumerate(text):
    print "{:d} = {:s}".format(i, sentence)

0 = Math is great
1 = Math is really great
2 = Exciting exciting Math


In [4]:
ngr = (1, 1)
myCVa = CountVectorizer(ngram_range = ngr)

#####As per usual you fit the model first

In [5]:
myCVa.fit(text)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [6]:
transf_text = myCVa.transform(text)

#####The use the transform method

In [7]:
for i, fn in enumerate(myCVa.get_feature_names()):
    print "{:2d} = {:s}".format(i, fn)

 0 = exciting
 1 = great
 2 = is
 3 = math
 4 = really


#####The transform method returns a sparse matrix

In [8]:
print transf_text
#(A, B)  C
#A refers to the text number. There were 3 sentences so you would expect 0, 1, 2
#The 0 text is "Math is great"
#B refers to the feature. There are 5 features so you would expect 0, 1, 2, 3, 4
#The 1 feature is 'great', 2 is 'is', 3 is 'math'
#C is the frequency of the word in the text
#There are 2 instances of 'exciting' in the 3rd sentence

  (0, 1)	1
  (0, 2)	1
  (0, 3)	1
  (1, 1)	1
  (1, 2)	1
  (1, 3)	1
  (1, 4)	1
  (2, 0)	2
  (2, 3)	1


#####Which you can visualize as a dense matrix
#####There are 3 sentences and 5 features

In [9]:
print transf_text.todense()
print transf_text.todense().shape

[[0 1 1 1 0]
 [0 1 1 1 1]
 [2 0 0 1 0]]
(3, 5)


---
##What is an ngram?
---

#####Set the begining n-gram length to 1
#####Set the end n-gram length to 2

In [10]:
ngr = (1, 2)
myCVb = CountVectorizer(ngram_range = ngr)
myCVb.fit(text)
transformed_text = myCVb.transform(text)

#####Now look at the features

In [11]:
for i, fn in enumerate(myCVb.get_feature_names()):
    print "{:2d} = {:s}".format(i, fn)

 0 = exciting
 1 = exciting exciting
 2 = exciting math
 3 = great
 4 = is
 5 = is great
 6 = is really
 7 = math
 8 = math is
 9 = really
10 = really great


In [12]:
for i, sentence in enumerate(text):
    print "{:d} = {:s}".format(i, sentence)

0 = Math is great
1 = Math is really great
2 = Exciting exciting Math


In [13]:
print transformed_text

  (0, 3)	1
  (0, 4)	1
  (0, 5)	1
  (0, 7)	1
  (0, 8)	1
  (1, 3)	1
  (1, 4)	1
  (1, 6)	1
  (1, 7)	1
  (1, 8)	1
  (1, 9)	1
  (1, 10)	1
  (2, 0)	2
  (2, 1)	1
  (2, 2)	1
  (2, 7)	1


In [14]:
print transformed_text.shape
print type(transformed_text)

(3, 11)
<class 'scipy.sparse.csr.csr_matrix'>


In [15]:
dense_array = transformed_text.toarray()
print dense_array.shape
print dense_array

(3, 11)
[[0 0 0 1 1 1 0 1 1 0 0]
 [0 0 0 1 1 0 1 1 1 1 1]
 [2 1 1 0 0 0 0 1 0 0 0]]


#####Understanding the sparse array

In [16]:
for i in xrange(3):
    for j in xrange(11):
        if dense_array[i][j] == 1:
            print i,j

0 3
0 4
0 5
0 7
0 8
1 3
1 4
1 6
1 7
1 8
1 9
1 10
2 1
2 2
2 7


---
Questions:
===
1. What does 1 10 represent?
 
2. What does 0 8 represent?

---
The Movie Database
=====
***

In [17]:
critics_data = pd.read_csv("/Users/mrgholt/GADS-22-NYC/Datasets/rt_critics.csv")

In [18]:
for i in critics_data.columns:
    print type(i), i

<type 'str'> critic
<type 'str'> fresh
<type 'str'> imdb
<type 'str'> publication
<type 'str'> quote
<type 'str'> review_date
<type 'str'> rtid
<type 'str'> title


In [19]:
critics_data.head(5)

Unnamed: 0,critic,fresh,imdb,publication,quote,review_date,rtid,title
0,Derek Adams,fresh,114709,Time Out,"So ingenious in concept, design and execution ...",2009-10-04,9559,Toy story
1,Richard Corliss,fresh,114709,TIME Magazine,The year's most inventive comedy.,2008-08-31,9559,Toy story
2,David Ansen,fresh,114709,Newsweek,A winning animated feature that has something ...,2008-08-18,9559,Toy story
3,Leonard Klady,fresh,114709,Variety,The film sports a provocative and appealing st...,2008-06-09,9559,Toy story
4,Jonathan Rosenbaum,fresh,114709,Chicago Reader,"An entertaining computer-generated, hyperreali...",2008-03-10,9559,Toy story


In [20]:
#TODO Determine what values 'fresh' can take in the database

In [21]:
terms = []
for t in critics_data.fresh:
    if t not in terms:
        terms.append(t)
print terms

['fresh', 'rotten', 'none']


In [22]:
critics_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14072 entries, 0 to 14071
Data columns (total 8 columns):
critic         13382 non-null object
fresh          14072 non-null object
imdb           14072 non-null float64
publication    14072 non-null object
quote          14072 non-null object
review_date    14072 non-null object
rtid           14072 non-null float64
title          14072 non-null object
dtypes: float64(2), object(6)
memory usage: 989.4+ KB


In [23]:
dc = critics_data[['fresh', 'quote']]

In [24]:
dc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14072 entries, 0 to 14071
Data columns (total 2 columns):
fresh    14072 non-null object
quote    14072 non-null object
dtypes: object(2)
memory usage: 329.8+ KB


In [25]:
terms = []
for t in dc.fresh:
    if t not in terms:
        terms.append(t)
print terms

['fresh', 'rotten', 'none']


In [26]:
#TODO: Create a new data frame that contains only records where 'fresh' has values 'fresh' and 'rotten'

In [27]:
fdc = dc[(dc.fresh == 'rotten') | (dc.fresh == 'fresh')]

In [28]:
terms = []
for t in fdc.fresh:
    if t not in terms:
        terms.append(t)
print terms

['fresh', 'rotten']


In [29]:
fdc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14049 entries, 0 to 14071
Data columns (total 2 columns):
fresh    14049 non-null object
quote    14049 non-null object
dtypes: object(2)
memory usage: 329.3+ KB


In [30]:
#TODO: Create a 'y' variable which is the factorization of the 'fresh' column, so 'fresh' gets converted to 0 (or 1)
#and 'rotten' gets converted to 1 (or 0)

In [31]:
y = pd.factorize(fdc.fresh)[0]
y

array([0, 0, 0, ..., 0, 0, 0])

In [32]:
#TODO: How many of each class ('fresh' and 'rotten') do you have?

In [33]:
print fdc.fresh[fdc.fresh=='fresh'].count()
print fdc.fresh[fdc.fresh=='rotten'].count()

8613
5436


In [34]:
print fdc.quote[fdc.fresh == 'fresh'][0:5], '\n'
print fdc.quote[fdc.fresh == 'rotten'][0:5]

0    So ingenious in concept, design and execution ...
1                    The year's most inventive comedy.
2    A winning animated feature that has something ...
3    The film sports a provocative and appealing st...
4    An entertaining computer-generated, hyperreali...
Name: quote, dtype: object 

19    A gloomy special-effects extravaganza filled w...
22                               Mediocre, regrettably.
25    The movie is too pat and practiced to really b...
27    Never escapes the queasy aura of Melrose Place...
29    You want the movie to stomp and rejoice and cr...
Name: quote, dtype: object


In [35]:
#TODO: Now using CountVectorizer fit and transform the quotes, start using a n-gram size of (1-1)
#hint: you can fit and transform in one hit using "fit_transform"

In [36]:
crV = CountVectorizer(ngram_range = (1, 1))
X = crV.fit_transform(fdc.quote)

In [37]:
print X.shape, y.shape

(14049, 21530) (14049,)


##### Here is an example of quote that is associated with the 'fresh' class

In [38]:
test_quote = fdc.quote[fdc.fresh == 'fresh'][50:51].values[0]
print fdc.fresh[fdc.fresh == 'fresh'][50:51].values[0]
print test_quote

fresh
Offers above-average pyrotechnics, a body count that steadily mounts, and plenty of hand-to-hand combat.


##### ...and here is an example of a quote that is associated with the 'rotten' class

In [39]:
test_quote = fdc.quote[fdc.fresh == 'rotten'][50:51].values[0]
fdc.fresh[fdc.fresh == 'rotten'][50:51].values[0]
print test_quote

After coming out gangbusters in its first and finest hour, the 180-minute movie loses all its chips in the remaining two.


In [40]:
#TODO: Using train test split fit a mulitnomialNB model and a BernoulliNB model to the training data
#TODO: What accuracy to do you get on the test set for each model?

In [41]:
xtrain, xtest, ytrain, ytest = train_test_split(X, y)

In [42]:
clfMN = MultinomialNB().fit(xtrain, ytrain)
print clfMN.score(xtest, ytest)

0.777113578138


In [43]:
clfB = BernoulliNB().fit(xtrain, ytrain)
print clfB.score(xtest, ytest)

0.764588670652


In [44]:
#TODO: now test some of your own review language in a very short sentence and see if your review is predicted as
#fresh or rotten
#If you want to use the helper function below
#Remember: use the count vectorizer transform method to convert your quote into a sparse array

In [45]:
def report_results(clf, my_sparse, my_review):
    #get the probabilities from the model
    res_prob = clf.predict_proba(my_sparse).ravel()
    
    #set up the correct output string
    if clf.predict(my_sparse) == 0:
        result = "fresh"
        
    else:
        result = "rotten"
    
    #print the result making sure the correct probability is used
    print "Your review:\n '", my_review, "'", "\n\nhas a {:5.2f}% chance of being classified in the '{:s}' class".\
    format(res_prob[clf.predict(my_sparse)][0] * 100.0, result)
    print "\n"

#####Here are some test quotes to start you off testing your models
#####Try individual words, try phrases - explore the space a bit!

In [46]:
test_quotes = ["This was an awesome movie", \
              "This movie was so self indulgant that it really couldn't get over itself", \
              "So ingenious in concept", \
              "A gloomy special-effects extravaganza filled"]

In [47]:
for test_quote in test_quotes:
    my_sparse_matrix = crV.transform([test_quote])
    report_results(clfMN, my_sparse_matrix, test_quote)

Your review:
 ' This was an awesome movie ' 

has a 67.55% chance of being classified in the 'fresh' class


Your review:
 ' This movie was so self indulgant that it really couldn't get over itself ' 

has a 92.19% chance of being classified in the 'rotten' class


Your review:
 ' So ingenious in concept ' 

has a 64.56% chance of being classified in the 'fresh' class


Your review:
 ' A gloomy special-effects extravaganza filled ' 

has a 77.74% chance of being classified in the 'rotten' class




In [48]:
#TODO: print out a pd crosstab for the predictors using the test set

In [49]:
pd.crosstab(ytest, clfMN.predict(xtest), rownames=["Actual"], colnames=["Predicted"])

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1861,290
1,493,869


In [50]:
pd.crosstab(ytest, clfB.predict(xtest), rownames=["Actual"], colnames=["Predicted"])

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1895,256
1,571,791


---
Questions:
===
1. n-gram settings gives the best results - 1, 2, or 3?