In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

try:
    # Current home of train_test_split; sklearn.cross_validation was removed in 0.20.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for the old scikit-learn releases this notebook was written against.
    from sklearn.cross_validation import train_test_split

---
Naive Bayes Classification - Predicting Movie Quotes
=====
***

---
## Count Vectorization
### Obtaining word frequencies
---

In [2]:
# Three tiny "documents" used to demonstrate count vectorization.
text = [
    'Math is great',
    'Math is really great',
    'Exciting exciting Math',
]

In [3]:
# Show each sentence next to its index in the corpus.
# print() with a single pre-formatted string behaves the same on Python 2 and 3.
for i, sentence in enumerate(text):
    print("{:d} = {:s}".format(i, sentence))

In [4]:
# ngram_range is (min_n, max_n); (1, 1) keeps single words (unigrams) only.
ngr = (1, 1)
myCVa = CountVectorizer(ngram_range = ngr)

##### As usual, fit the model first

In [5]:
# Fit the vectorizer on the example sentences.
myCVa.fit(text)

In [6]:
# Encode the sentences with the fitted vectorizer; the result is a sparse matrix.
transf_text = myCVa.transform(text)

##### Then use the transform method

In [7]:
# List every learned feature (token) next to its column index.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back to the old name on older releases.
try:
    feature_names = myCVa.get_feature_names_out()
except AttributeError:
    feature_names = myCVa.get_feature_names()

for i, fn in enumerate(feature_names):
    print("{:2d} = {:s}".format(i, fn))

##### The transform method returns a sparse matrix

In [8]:
print(transf_text)
# Each printed line has the form:  (A, B)  C
# A is the sentence index. There are 3 sentences, so expect 0, 1, 2;
#   sentence 0 is "Math is great".
# B is the feature index. There are 5 features, so expect 0, 1, 2, 3, 4;
#   feature 1 is 'great', 2 is 'is', 3 is 'math'.
# C is how many times that feature occurs in that sentence;
#   e.g. 'exciting' occurs 2 times in the 3rd sentence.

##### You can visualize the sparse matrix as a dense matrix
##### There are 3 sentences and 5 features

In [9]:
# Densify once and reuse the result for both printouts.
dense_counts = transf_text.todense()
print(dense_counts)
print(dense_counts.shape)

---
## What is an n-gram?
---

##### Set the beginning n-gram length to 1
##### Set the end n-gram length to 2

In [10]:
# Unigrams and bigrams: n-gram lengths from 1 up to 2.
ngr = (1, 2)
myCVb = CountVectorizer(ngram_range=ngr)
# fit() returns the vectorizer itself, so the two steps can be chained.
transformed_text = myCVb.fit(text).transform(text)

##### Now look at the features

In [11]:
# List every learned feature (unigram or bigram) next to its column index.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back to the old name on older releases.
try:
    feature_names = myCVb.get_feature_names_out()
except AttributeError:
    feature_names = myCVb.get_feature_names()

for i, fn in enumerate(feature_names):
    print("{:2d} = {:s}".format(i, fn))

In [12]:
# Re-print the sentences with their indices for reference.
# print() with a single pre-formatted string behaves the same on Python 2 and 3.
for i, sentence in enumerate(text):
    print("{:d} = {:s}".format(i, sentence))

In [13]:
# Sparse representation of the n-gram counts.
print(transformed_text)

In [14]:
# Shape and concrete class of the transformed matrix.
print(transformed_text.shape)
print(type(transformed_text))

In [15]:
# Convert the sparse counts to a regular array so they can be inspected directly.
dense_array = transformed_text.toarray()
print(dense_array.shape)
print(dense_array)

##### Understanding the sparse array

In [16]:
# Walk every (sentence, feature) cell of the dense matrix. The bounds come from
# the array itself rather than hard-coded 3 x 11 limits, so the cell keeps
# working if the corpus or the n-gram range changes.
# NOTE: only cells whose count is exactly 1 are listed; a feature that occurs
# more than once in a sentence is not printed.
for i, row in enumerate(dense_array):
    for j, count in enumerate(row):
        if count == 1:
            print("{} {}".format(i, j))

---
Questions:
===
1. What does 1 10 represent?
 
2. What does 0 8 represent?

---
The Movie Database
=====
***

In [17]:
# Default to the original author's location, but let the RT_CRITICS_CSV
# environment variable override it so the notebook can run on other machines
# without editing this cell.
critics_csv_path = os.environ.get(
    "RT_CRITICS_CSV", "/Users/mrgholt/GADS-22-NYC/Datasets/rt_critics.csv"
)
critics_data = pd.read_csv(critics_csv_path)

In [18]:
# Show the Python type of each column label followed by the label itself.
for i in critics_data.columns:
    print("{} {}".format(type(i), i))

In [19]:
# Preview the first five rows of the loaded data.
critics_data.head(5)

In [20]:
#TODO Determine what values 'fresh' can take in the database

In [26]:
#TODO: Create a new data frame that contains only records where 'fresh' has values 'fresh' and 'rotten'

In [30]:
#TODO: Create a 'y' variable which is the factorization of the 'fresh' column, so 'fresh' gets converted to 0 (or 1)
#and 'rotten' gets converted to 1 (or 0)

In [32]:
#TODO: How many of each class ('fresh' and 'rotten') do you have?

In [35]:
#TODO: Now, using CountVectorizer, fit and transform the quotes; start with an n-gram range of (1, 1)
#hint: you can fit and transform in one step using "fit_transform"

##### Here is an example of quote that is associated with the 'fresh' class

In [38]:
# Take the row at position 50 among the records labelled 'fresh' and show its
# label followed by its quote.
fresh_rows = fdc[fdc.fresh == 'fresh']
test_quote = fresh_rows['quote'].iloc[50]
print(fresh_rows['fresh'].iloc[50])
print(test_quote)

##### ...and here is an example of a quote that is associated with the 'rotten' class

In [39]:
# Take the row at position 50 among the records labelled 'rotten' and show its
# label followed by its quote. The label is printed explicitly: as a bare
# expression in the middle of a cell it would be silently discarded.
rotten_rows = fdc[fdc.fresh == 'rotten']
test_quote = rotten_rows['quote'].iloc[50]
print(rotten_rows['fresh'].iloc[50])
print(test_quote)

In [40]:
#TODO: Using train_test_split, fit a MultinomialNB model and a BernoulliNB model to the training data
#TODO: What accuracy do you get on the test set for each model?

In [44]:
#TODO: Now test some of your own review language in a very short sentence and see if your review is predicted as
#fresh or rotten
#You can use the helper function below if you want
#Remember: use the count vectorizer transform method to convert your quote into a sparse array

In [45]:
def report_results(clf, my_sparse, my_review, class_names=("fresh", "rotten")):
    """Print the class predicted for a single review and its probability.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict`` and ``predict_proba``.
    my_sparse : vectorized form of ONE review (a single row), as produced by
        the count vectorizer's ``transform`` method.
    my_review : the original review text; echoed back in the report.
    class_names : pair of display names. The first is used when the predicted
        label equals 0, the second otherwise. Pass a different pair if your
        label encoding is the other way round.

    Returns
    -------
    None. The report is written to standard output.
    """
    # Predict once and reuse the label (the original re-ran predict per use).
    predicted = int(clf.predict(my_sparse)[0])

    # Class probabilities for the single review, one entry per class.
    res_prob = clf.predict_proba(my_sparse).ravel()

    # Pick the display name that matches the predicted label.
    result = class_names[0] if predicted == 0 else class_names[1]

    # The predicted label doubles as the index into the probability vector,
    # which assumes the model was trained on the integer labels 0 and 1.
    # A single pre-formatted string keeps print() identical on Python 2 and 3.
    print("Your review:\n ' {} ' \n\nhas a {:5.2f}% chance of being classified in the '{:s}' class".format(
        my_review, res_prob[predicted] * 100.0, result))
    print("\n")

##### Here are some test quotes to start testing your models
##### Try individual words, try phrases - explore the space a bit!

In [46]:
# Starter reviews for trying out the fitted models.
# Inside brackets, line continuation is implicit, so no trailing backslashes are needed.
test_quotes = [
    "This was an awesome movie",
    "This movie was so self indulgant that it really couldn't get over itself",
    "So ingenious in concept",
    "A gloomy special-effects extravaganza filled",
]

In [48]:
#TODO: print out a pd crosstab for the predictors using the test set

---
Questions:
===
1. Which n-gram setting gives the best results - 1, 2, or 3?