In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
import os

from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report

**fresh** evaluation: 'fresh' or 'rotten'  
**quote** short version of the review

### 1 Explore and clean the data

#### 1-1 Load the data

In [2]:
tomatoes=pd.read_csv("rotten-tomatoes.csv.bz2")

#### 1-2 Check the data

In [3]:
tomatoes.shape

(13442, 9)

In [4]:
# a few lines
tomatoes.sample(3)

Unnamed: 0,critic,fresh,imdb,link,publication,quote,review_date,rtid,title
2592,Roger Ebert,fresh,107688,http://www.rogerebert.com/reviews/tim-burtons-...,Chicago Sun-Times,"Working with gifted artists and designers, [Bu...",2000-01-01 00:00:00,12490,The Nightmare Before Christmas
13054,Susan Stark,fresh,120910,http://data.detnews.com/movies/details.hbs?myr...,Detroit News,Those who get to know the spectacular IMAX for...,2000-01-01 00:00:00,9488,Fantasia 2000
9058,Pat Graham,rotten,92537,http://onfilm.chicagoreader.com/movies/capsule...,Chicago Reader,"Hard to say what this is really, except maybe ...",2000-01-01 00:00:00,12163,The Allnighter


In [7]:
#print out all the variable names
tomatoes.columns

Index(['critic', 'fresh', 'imdb', 'link', 'publication', 'quote',
       'review_date', 'rtid', 'title'],
      dtype='object')

#### 1-3 Summary 

In [8]:
# all different values for fresh/rotten evaluations;
tomatoes.fresh.unique()

array(['fresh', 'rotten', 'none'], dtype=object)

In [9]:
# counts or percentages of these values;
tomatoes.fresh.value_counts()

fresh     8389
rotten    5030
none        23
Name: fresh, dtype: int64

In [10]:
# zero-length or whitespace quotes
tomatoes[(tomatoes.quote=="") | (tomatoes.quote==" ")].shape[0]

0

There is no zero-length or only whitespace quote-s.

In [11]:
# missing values
tomatoes.replace("none", np.nan, inplace=True)
print(' Number of missings for fresh:',tomatoes.fresh.isnull().sum(),'\n',
      'Number of missings for quote:',tomatoes.quote.isnull().sum(),'\n')

 Number of missings for fresh: 23 
 Number of missings for quote: 0 



In [12]:
# minimum-maximum-average length of quotes in characters
length=tomatoes["quote"].str.len()
length.describe()[['max','min','mean']]

max     256.00000
min       4.00000
mean    121.23129
Name: quote, dtype: float64

In [13]:
# duplicate review
tomatoes['duplicated_review']=tomatoes.quote.duplicated()

In [14]:
tomatoes[tomatoes['duplicated_review']==True].duplicated_review.count()

606

There are 606 duplicated reviews in data.

#### 1-4 clean the data

In [15]:
def clean(data):
    nan_value = np.nan
    # find out missing values in quote
    data.replace("", nan_value, inplace=True)
    data.replace(" ", nan_value, inplace=True)
    
    # find out missing values in fresh
    data.replace("none", nan_value, inplace=True)

    # clean fresh and quote by dropping N/As
    data.dropna(subset=['fresh','quote'],inplace=True)
    
    # clean duplicated reviews
    data['duplicated_review']=data.quote.duplicated()
    data.drop(data[data['duplicated_review']==True].index,inplace=True)
    
    return data

In [16]:
clean(tomatoes).shape

(12814, 10)

The cleaned dataset have 12814 rows and 10 columns.

### 2 Naïve Bayes
#### 2-2 Convert the data (quotes) into bag-of-words.

In [17]:
vectorizer = CountVectorizer(binary=True)
# define vectorizer
X = vectorizer.fit_transform(tomatoes.quote.values)
X = X.toarray()
# vectorize your data. Note: this creates a sparce matrix,
# use .toarray() if you want a dense matrix.
words = vectorizer.get_feature_names()
# in case you want to see what are the actual words

#### 2-3 Split work data and target (training : validation = 80 : 20)

In [18]:
Y = tomatoes.fresh

In [19]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,test_size=0.2)
X_tr, X_val, Y_tr, Y_val = train_test_split(X_train, Y_train, test_size=0.2)

In [20]:
# verify shapes
print (' X train shape:',X_tr.shape, '\n',
       'Y train shape:',Y_tr.shape, '\n')
print(' X validation shape:',X_val.shape, '\n',
       'Y validation shape:',Y_val.shape, '\n')
print (' X test shape:',X_test.shape, '\n',
       'Y test shape:',Y_test.shape)

 X train shape: (8200, 20875) 
 Y train shape: (8200,) 

 X validation shape: (2051, 20875) 
 Y validation shape: (2051,) 

 X test shape: (2563, 20875) 
 Y test shape: (2563,)


#### 2-4 Compute unconditional log probability 𝑙𝑜𝑔𝑃𝑟(𝐹)  and 𝑙𝑜𝑔𝑃𝑟(𝑅)

**Note:** Use only training data

In [21]:
prob = Y_tr.value_counts(1)

In [22]:
lf=np.log(prob[0])
lr=np.log(prob[1])

In [23]:
print (' logPr(F):',lf, '\n',
       'logPr(R):',lr, '\n')

 logPr(F): -0.4721522774012126 
 logPr(R): -0.9772584003292383 



#### 2-5 Compute 𝑙𝑜𝑔𝑃𝑟(𝑤|𝐹) and 𝑙𝑜𝑔𝑃𝑟(𝑤|𝑅) for each word

**Note:** Use only training data

In [24]:
# create BOW_train data frame
bow_train = pd.DataFrame(data = X_tr, columns = words)
bow_train['fresh'] = Y_tr

In [25]:
# count the frequency of each words' appearance
words_sum = bow_train.groupby('fresh').sum()
words_sum

Unnamed: 0_level_0,000,0014,007,044,07,10,100,101,104,105,...,zoom,zooming,zooms,zorro,zorros,zowie,zucker,zweibel,zwick,zzzzzzzzz
fresh,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
fresh,2,0,1,0,1,6,5,0,0,0,...,0,0,1,1,0,0,1,0,2,0
rotten,1,0,2,0,0,8,3,0,0,0,...,0,0,0,1,0,1,1,0,0,0


In [33]:
import warnings
warnings.filterwarnings('ignore')

In [26]:
# compute logPr(w|F)
lwf = words_sum[words_sum.index=='fresh']/Y_tr.value_counts()['fresh']
lwf.replace(0,1,inplace=True)
lwf = np.log(lwf)
lwf

Unnamed: 0_level_0,000,0014,007,044,07,10,100,101,104,105,...,zoom,zooming,zooms,zorro,zorros,zowie,zucker,zweibel,zwick,zzzzzzzzz
fresh,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
fresh,-7.84659,0.0,-8.539737,0.0,-8.539737,-6.747978,-6.930299,0.0,0.0,0.0,...,0.0,0.0,-8.539737,-8.539737,0.0,0.0,-8.539737,0.0,-7.84659,0.0


In [27]:
# compute logPr(w|R)
lwr = words_sum[words_sum.index=='rotten']/Y_tr.value_counts()['rotten']
lwr.replace(0,1,inplace=True)
lwr = np.log(lwr)
lwr

Unnamed: 0_level_0,000,0014,007,044,07,10,100,101,104,105,...,zoom,zooming,zooms,zorro,zorros,zowie,zucker,zweibel,zwick,zzzzzzzzz
fresh,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
rotten,-8.034631,0.0,-7.341484,0.0,0.0,-5.955189,-6.936019,0.0,0.0,0.0,...,0.0,0.0,0.0,-8.034631,0.0,-8.034631,-8.034631,0.0,0.0,0.0


#### 2-6 Compute the log-likelihood

In [28]:
# define axis apply function for fresh
def loglikelihood_fresh(f):
    sums = 0
    for i in range(lwf.shape[1]):
        if f[i] !=0:
            sums += lwf.iloc[0,i]
    return sums+lf;        

In [29]:
# define axis apply function for fresh
def loglikelihood_rotten(r):
    sums = 0
    for i in range(lwr.shape[1]):
        if r[i] !=0:
            sums += lwr.iloc[0,i]
    return sums+lr;

In [30]:
# compute log likelihood for records in class fresh
fresh=np.apply_along_axis(loglikelihood_fresh,1,X_val)

In [331]:
fresh

array([ -53.60679477, -127.59789338, -101.35211214, ..., -133.17183549,
        -71.95128122,  -16.73423834])

In [31]:
# compute log likelihood for records in class rotten
rotten=np.apply_along_axis(loglikelihood_rotten,1,X_val)

In [333]:
rotten

array([ -55.16724942, -108.61816289, -104.35028338, ...,  -95.65020302,
        -54.21318299,  -15.9371485 ])

In [34]:
# create log likelihood, class and prediction table

# the loglikelihood for fresh
# and the loglikelihood for rotten
val = pd.DataFrame(X_val, columns=words)
val['fresh'] = fresh
val['rotten'] = rotten

# compute the predicted class
pred = []
for i in range(val.shape[0]):
    if val['fresh'][i] > val['rotten'][i]:
        pred.append('fresh')
    else:
        pred.append('rotten')
val['pred'] = pred

# attached the original class
val2 = val[['pred','fresh','rotten']]
Y_val_array = np.array(Y_val)
val2['real'] = Y_val_array

In [35]:
val2

Unnamed: 0,pred,fresh,rotten,real
0,rotten,-88.251133,-72.697867,rotten
1,rotten,-90.883451,-82.017162,fresh
2,rotten,-129.863535,-124.973495,rotten
3,rotten,-50.150173,-40.159201,rotten
4,rotten,-134.554721,-117.260136,fresh
...,...,...,...,...
2046,fresh,-40.018935,-48.435820,fresh
2047,rotten,-96.701809,-95.845435,rotten
2048,rotten,-95.546421,-73.834061,fresh
2049,rotten,-108.758599,-92.506033,fresh


#### 2-7 Confusion matrix & accuracy score

In [36]:
results = confusion_matrix(val2.real, val2.pred) 
print ('Confusion Matrix:')
print(results) 
print ('Accuracy Score:',accuracy_score(val2.real, val2.pred))

Confusion Matrix:
[[398 881]
 [219 553]]
Accuracy Score: 0.4636762554851292


### 3 Interpretation

#### 3-1 log Pr(w|F) and log Pr(w|R) for more frequent words

3-1-1 Method 1: Count fresh/rotten respectively

In [37]:
words_freq = words_sum.transpose()
words_freq['log Pr(w|F)'] = np.array(lwf).reshape(20874,)
words_freq['log Pr(w|R)'] = np.array(lwr).reshape(20874,)

In [38]:
words_freq_fresh = words_freq[words_freq.fresh > 30]
words_freq_rotten = words_freq[words_freq.rotten > 30]

In [39]:
words_freq_fresh

fresh,fresh.1,rotten,log Pr(w|F),log Pr(w|R)
about,215,110,-3.169099,-3.334151
acting,31,24,-5.105750,-4.856577
action,86,51,-4.085390,-4.102805
actors,39,18,-4.876176,-5.144259
after,32,21,-5.074001,-4.990109
...,...,...,...,...
years,31,14,-5.105750,-5.395574
yet,42,23,-4.802068,-4.899137
you,222,129,-3.137060,-3.174819
young,32,10,-5.074001,-5.732046


In [40]:
words_freq_rotten

fresh,fresh.1,rotten,log Pr(w|F),log Pr(w|R)
about,215,110,-3.169099,-3.334151
action,86,51,-4.085390,-4.102805
all,229,120,-3.106015,-3.247139
almost,43,34,-4.778537,-4.508271
also,64,34,-4.380854,-4.508271
...,...,...,...,...
will,78,48,-4.183028,-4.163430
with,507,274,-2.311226,-2.421503
work,77,37,-4.195932,-4.423713
would,49,41,-4.647917,-4.321059


3-1-2 Count word frequency in the whole dataset

In [41]:
frequency = X_train.sum(axis=0)
freq_index = np.where(frequency>30)[0]
freq_words = np.array(words)[freq_index]
freq_lwf = lwf.iloc[0,freq_index]
freq_lwr = lwr.iloc[0,freq_index]
freq = pd.DataFrame({'logPr(w|F)': freq_lwf,
                    'logPr(w|R)': freq_lwr})

In [42]:
freq

Unnamed: 0,logPr(w|F),logPr(w|R)
about,-3.169099,-3.334151
above,-6.141842,-6.088721
across,-6.141842,-6.425193
act,-5.831687,-8.034631
acted,-5.320861,-6.088721
...,...,...
yelling,0.000000,0.000000
yglesias,0.000000,0.000000
young,-5.074001,-5.732046
younger,-6.930299,0.000000


#### 3-2 10 best words to predict F/R

In [48]:
fresh_w = dict()
rotten_w = dict()

In [49]:
# create a dictionary linking weights and words for two classes
for i in range(len(freq_index)):
    weight = np.abs(freq_lwf[i]-freq_lwr[i])
    if freq_lwf[i] > freq_lwr[i]:
        fresh_w[weight] = freq_words[i]
    else:
        rotten_w[weight] = freq_words[i]

In [50]:
fresh_best10 = [fresh_w[weight] for weight in sorted(fresh_w.keys(), reverse=True)[0:10]]
rotten_best10 = [rotten_w[weight] for weight in sorted(rotten_w.keys(), reverse=True)[0:10]]

In [51]:
fresh_best10

['wrong',
 'strong',
 'some',
 'taste',
 'act',
 'camera',
 'think',
 'turn',
 'cut',
 'need']

In [47]:
rotten_best10

['year',
 'way',
 'wit',
 'truly',
 'young',
 'what',
 'would',
 'wonderful',
 'lot',
 'great']

'wrong' is the word that gives highest weight to fresh, and 'year' is the word that gives highest weight to rotten. 

However, there are some words that seem to be positive(negative) but has a high weight to predict rotten(fresh). For example, "worse" in fresh top 10, and "truly" in rotten top 10.

#### 3-3 Misclassified quotes

In [52]:
misindex = val2[val2['real']!=val2['pred']].index
quoteindex = Y_val.iloc[misindex].index

In [53]:
quote = tomatoes.quote[quoteindex]

In [54]:
quotes10 = pd.DataFrame({'quote':quote,'fresh':Y_val.iloc[misindex]})

In [352]:
pd.options.display.max_colwidth = 1000
quotes10.sample(10)

Unnamed: 0,quote,fresh
7710,"Refusing to condescend to us with the usual cutesy anthropomorphic qualities often foisted on animated animals, Asbury and Cook keep matters realistic.",fresh
12998,"The film works a bit better than the 2004 Punisher installment, the one starring surly, dislikable Thomas Jane as Frank Castle.",rotten
12845,"Flawless is a fictional tale, but something in director Michael Radford's conscientious, methodical presentation gives it the feeling of true history. Watching it is like watching a historical dramatization of something that never really happened.",fresh
383,Will Ferrell has become the most unlikely embodiment of wholesome family fun since Fred MacMurray gave up film noir for My Three Sons.,fresh
12786,"An impressive first feature by writer/director Courtney Hunt, Frozen River boasts considerable suspense-movie tension and a compelling emotional journey for its foreground characters.",fresh
6020,The complications of its story are found in the deep complexities of emotions and family relationships.,fresh
1681,"The Lion King, complete with jaunty songs by Elton John and Tim Rice, is undeniably and fully worthy of its glorious Disney heritage. It is a gorgeous triumph -- one lion in which the studio can take justified pride.",fresh
8338,"The Night Porter is as nasty as it is lubricious, a despicable attempt to titillate us by exploiting memories of persecution and suffering.",rotten
7571,Dream for an Insomniac is really a self-conscious modern sitcom that with its San Francisco setting suggests a pale shadow of Armistead Maupin's Tales From the City.,rotten
12770,"The Fisher King has two actors at the top of their form, and a compelling, well-directed and well-produced story.",fresh


Some possible reasons for misclassification:

(1) Use "double negative" to express positive comments, such as "refuse to condescend". 

(2) Use positive words to demonstrate an ironic criticism, such as "better than ... dislikable"

When words were split and considered independent, these expressions couldn't be interpreted based on context and can easy lead to misclassification.

### 4 NB with Smoothing

#### 4-1 & 4-2 Create two functions + smoothing

1 Fitting and add smoothing

In [55]:
def nbfitting(X_tr,Y_tr,a):
    prob = Y_tr.value_counts(1)
    lf=np.log(prob[0])
    lr=np.log(prob[1])
    
    bow_train = pd.DataFrame(data = X_tr, columns = words)
    bow_train['fresh'] = Y_tr
    words_sum = bow_train.groupby('fresh').sum()

    lwf = (words_sum[words_sum.index=='fresh']+a)/(Y_tr.value_counts()['fresh']+2*a)
    lwf = np.log(lwf)
    
    lwr = (words_sum[words_sum.index=='rotten']+a)/(Y_tr.value_counts()['rotten']+2*a)
    lwr = np.log(lwr)
    
    return lf, lr, lwf, lwr

2 Predicting

In [56]:
def nbpred(X_val,Y_val):
    fresh=np.apply_along_axis(loglikelihood_fresh,1,X_val)
    rotten=np.apply_along_axis(loglikelihood_rotten,1,X_val)
    pred = []
    for i in range(len(fresh)):
        if fresh[i] > rotten[i]:
            pred.append('fresh')
        else:
            pred.append('rotten')
            
    Y_array = np.array(Y_val)
    table = pd.DataFrame({'pred': pred,
                          'real': Y_array})

    score = accuracy_score(table.real, table.pred)
    return score

#### 4-3 Cross Validation

In [63]:
def find_optimal_a(X,Y,aList):
   
    # empty list that will hold cv scores
    cv_scores = []

    # split the train data set into cross validation train and cross validation test
    X_tr, X_cv, y_tr, y_cv = train_test_split(X, Y, test_size=0.3)

    for i in aList:
        # predict the response on the crossvalidation train
        nbfit = nbfitting(X_tr,Y_tr,i)
        lf = nbfit[0]
        lr = nbfit[1]
        lwf = nbfit[2]
        lwr = nbfit[3]
        pred = nbpred(X_cv, y_cv)

        # evaluate CV accuracy
        cv_scores.append(pred)
        
    # changing to misclassification error
    MSE = [1 - x for x in cv_scores]

    # determining best alpha
    
    print("the misclassification error for each alpha value is : ", np.round(MSE,3))
    
    optimal_alpha = aList[MSE.index(min(MSE))]
    print('\nThe optimal alpha is ', optimal_alpha)


In [58]:
aList = np.array([0.01,0.05,0.001,0.005])

In [64]:
find_optimal_a(X_train,Y_train,aList)

the misclassification error for each alpha value is :  [0.545 0.545 0.545 0.545]

The optimal alpha is  0.01
