In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
import os

**fresh** evaluation: 'fresh' or 'rotten'  
**quote** short version of the review

# Explore and clean the data

In [4]:
tomatoes=pd.read_csv("data/rotten-tomatoes.csv")

In [5]:
tomatoes.shape

(13442, 9)

In [184]:
# a few lines
tomatoes.head(3)

Unnamed: 0,critic,fresh,imdb,link,publication,quote,review_date,rtid,title,duplicated_review
0,Derek Adams,fresh,114709,http://www.timeout.com/film/reviews/87745/toy-...,Time Out,"So ingenious in concept, design and execution ...",2009-10-04 00:00:00,9559,Toy Story,False
1,Richard Corliss,fresh,114709,"http://www.time.com/time/magazine/article/0,91...",TIME Magazine,The year's most inventive comedy.,2008-08-31 00:00:00,9559,Toy Story,False
2,David Ansen,fresh,114709,http://www.newsweek.com/id/104199,Newsweek,A winning animated feature that has something ...,2008-08-18 00:00:00,9559,Toy Story,False


In [7]:
#print out all the variable names
tomatoes.columns

Index(['critic', 'fresh', 'imdb', 'link', 'publication', 'quote',
       'review_date', 'rtid', 'title'],
      dtype='object')

create a summary table (maybe more like a bullet list) where you print out the most important
summary statistics for the most interesting variables. The most interesting facts you should present
should include:   
a) number of missings for fresh and quote;   

In [8]:
tomatoes.fresh.isnull().sum()

0

In [9]:
tomatoes.quote.isnull().sum()

0

There's no missings for fresh and quote.

In [10]:
# all different values for fresh/rotten evaluations;
tomatoes.fresh.unique()

array(['fresh', 'rotten', 'none'], dtype=object)

In [11]:
# counts or percentages of these values;
tomatoes.fresh.value_counts()

fresh     8389
rotten    5030
none        23
Name: fresh, dtype: int64

In [12]:
tomatoes[(tomatoes.quote=="") | (tomatoes.quote==" ")]

Unnamed: 0,critic,fresh,imdb,link,publication,quote,review_date,rtid,title


There is no zero-length or only whitespace quote-s.

In [13]:
#minimum-maximum-average length of quotes (either in words, or in characters).
length=tomatoes["quote"].str.len()
length.describe()

count    13442.000000
mean       121.231290
std         57.358899
min          4.000000
25%         76.250000
50%        119.000000
75%        163.000000
max        256.000000
Name: quote, dtype: float64

f) how many reviews are in data multiple times. Feel free to add more
figures you consider relevant.

In [14]:
tomatoes['duplicated_review']=tomatoes.quote.duplicated()

In [15]:
tomatoes[tomatoes['duplicated_review']==True].duplicated_review.count()

606

There are 606 duplicated reviews in data.

Now when you have an overview what you have in data, clean it by removing all the inconsistencies
the table reveals. We have to ensure that the central variables, quote and fresh are not missing,
quote is not an empty string (or just contain spaces and such), and all rows are unique.  
I recommend to do it as a standalone function so you can use the same function for another similar
dataset (such as test data).

In [16]:
nan_value = float("NaN")
tomatoes.replace("", nan_value, inplace=True)
tomatoes.replace("none", nan_value, inplace=True)
tomatoes.replace(" ", nan_value, inplace=True)
tomatoes.dropna(subset=['fresh','quote'],inplace=True)
tomatoes.drop(tomatoes[tomatoes['duplicated_review']==True].index,inplace=True)

In [17]:
def clean(tomatoes,fresh,quote):
    nan_value = float("NaN")
    tomatoes.replace("", nan_value, inplace=True)
    tomatoes.replace("none", nan_value, inplace=True)
    tomatoes.replace(" ", nan_value, inplace=True)
    tomatoes.dropna(subset=[fresh,quote],inplace=True)
    tomatoes['duplicated_review']=tomatoes.quote.duplicated()
    tomatoes.drop(tomatoes[tomatoes['duplicated_review']==True].index,inplace=True)

# Naïve Bayes
Convert your data (quotes) into bag-of-words.

In [18]:
vectorizer = CountVectorizer(binary=True)
# define vectorizer
X = vectorizer.fit_transform(tomatoes.quote.values)
# vectorize your data. Note: this creates a sparce matrix,
# use .toarray() if you want a dense matrix.
words = vectorizer.get_feature_names()
# in case you want to see what are the actual words

In [19]:
X=X.toarray()

In [20]:
X.shape

(12813, 20875)

In [21]:
XT=X.T
XT.shape

(20875, 12813)

In [22]:
df=[]
for i in range(XT.shape[0]):
    df.append(sum(XT[i]))

In [23]:
bow=pd.DataFrame(XT)
bow.insert (0, 'df', df)
bow.insert (0, 'w', words)

In [24]:
bow

Unnamed: 0,w,df,0,1,2,3,4,5,6,7,...,12803,12804,12805,12806,12807,12808,12809,12810,12811,12812
0,000,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0014,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,007,10,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,044,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,07,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20870,zowie,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20871,zucker,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20872,zweibel,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20873,zwick,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


3.Split your work data and target (i.e. the variable fresh) into training and validation chunks (80/20
or so).

In [25]:
from sklearn.model_selection import train_test_split

In [197]:
Y = (tomatoes.fresh == 'fresh').values.astype(np.int)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,test_size=0.2)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train,test_size=0.2)
# verify shapes
print (X_train.shape, Y_train.shape)
print ( X_test.shape, Y_test.shape)

(8200, 20875) (8200,)
(2563, 20875) (2563,)


**Use only training data when doing the fitting below.**  
4.Compute the unconditional (log) probability that the tomato is fresh/rotten, $log Pr(F)$, and $log Pr(R)$.  
These probabilities are based on the values of fresh alone, not on the words the quotes contain.

unconditional(log) probablity of the training data

In [29]:
PrF=np.log(sum(Y_train)/len(Y_train))
PrR=np.log(1-sum(Y_train)/len(Y_train))

In [30]:
print ("logPr(F) is", end=" ")
print (PrF)
print ("logPr(R) is", end=" ")
print (PrR)

logPr(F) is -0.48150364805012397
logPr(R) is -0.9619516509007229


unconditional(log) probablity of the original dataset 

In [31]:
tomatoes.groupby('fresh').quote.count()

fresh
fresh     7946
rotten    4867
Name: quote, dtype: int64

In [32]:
lf=np.log(7946/(7946+4867))
lr=np.log(4867/(7946+4867))
print(lf)
print(lr)

-0.477791623135032
-0.967982549676122


5.For each word w, compute $log Pr(w|F)$ and $log Pr(w|R)$, the (log) probability that the word is present
in a fresh/rotten review. These probabilities can easily be calculated from counts of how many times
these words are present for each class.  
Hint: these computations are based on your BOW-s X. Look at ways to sum along columns in this
matrix.

In [33]:
len(Y_train)

10250

In [34]:
X_train.shape

(10250, 20875)

In [35]:
bow_train=pd.DataFrame(X_train)
bow_train['target']=Y_train.T

In [36]:
bow_count=bow_train.groupby('target').count()
bow_count

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,20865,20866,20867,20868,20869,20870,20871,20872,20873,20874
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3917,3917,3917,3917,3917,3917,3917,3917,3917,3917,...,3917,3917,3917,3917,3917,3917,3917,3917,3917,3917
1,6333,6333,6333,6333,6333,6333,6333,6333,6333,6333,...,6333,6333,6333,6333,6333,6333,6333,6333,6333,6333


In [37]:
bow_sum=bow_train.groupby('target').sum()
bow_sum

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,20865,20866,20867,20868,20869,20870,20871,20872,20873,20874
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3,0,2,1,0,13,3,0,0,0,...,1,0,0,0,0,0,1,1,1,1
1,2,1,7,0,1,12,6,1,0,0,...,0,1,0,3,1,1,1,0,2,0


In [38]:
bow_Train=pd.DataFrame(X_train)
bow_train=bow_Train.T

In [208]:
bow_Pr=np.log(bow_sum/bow_count)

  """Entry point for launching an IPython kernel.


In [190]:

bow_pr=bow_Pr.copy()
bow_pr.reset_index(level=0, inplace=True)
bow_pr=bow_pr.T

  """Entry point for launching an IPython kernel.


In [40]:
bow_train.insert (0, 'Pr(w|F)', bow_pr[1])
bow_train.insert (0, 'Pr(w|R)', bow_pr[0])
bow_train.insert (0, 'w', words)

In [41]:
bow_train

Unnamed: 0,w,Pr(w|R),Pr(w|F),0,1,2,3,4,5,6,...,10240,10241,10242,10243,10244,10245,10246,10247,10248,10249
0,000,-7.174469,-8.060382,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0014,-inf,-8.753529,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,007,-7.579934,-6.807619,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,044,-8.273081,-inf,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,07,-inf,-8.753529,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20870,zowie,-inf,-8.753529,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20871,zucker,-8.273081,-8.753529,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20872,zweibel,-8.273081,-inf,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20873,zwick,-8.273081,-8.060382,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Let's now turn to prediction, and pull out your validation data (not the test data!).**

6. For both destination classes, F and R, compute the log-likelihood that the quote belongs to this class.
log-likelihood is what is given inside the brackets in equation (1) on slide 28, and the equations on
Schutt "Doing Data Science", page 102. In lecture notes it is explained before the email classification
example (and in the example too). On the slides we have the log-likelihood essentially as (although
we do not write it out):  
$l_i(c)=logPr(c)+\sum_jlogPr(w_{ij}|c)$  
where $c\in{{F,R}}$ is the class, i is the review, j indexes words, and wij is the j-th word of the review
i.
Computing these likelihoods involves sums of the previously computed probabilities, $log(Pr(w|F))$,
and BOW elements $x_{ij}$. Check out np.apply_along_axis that can be used to apply a function on
matrix columns/rows so you can create a fairly good one-liner to compute log-likelihood. Loops are
fine too if apply seems too complex, just slower and less compact.
Based on the log-likelihoods, predict the class F or R for each quote in the validation set.
7. Print the resulting confusion matrix and accuracy (feel free to use existing libraries).

In [218]:
bow_Pr=bow_pr.drop('target', axis=0)

In [221]:
from numpy import inf
bow_Pr[bow_Pr.values == -inf] = 1

In [230]:
bow_Pr=bow_Pr.T

In [241]:
def loglikelihood_fresh(f):
    sum=0
    for i in range(bow_Pr.shape[1]):
        if f[i] !=0:
            sum += bow_Pr.iloc[1,i]
    return sum + lf;
def loglikelihood_rotten(f):
    sum=0
    for i in range(bow_Pr.shape[1]):
        if f[i] !=0:
            sum += bow_Pr.iloc[0,i]
    return sum + lr;

In [243]:
LL_fresh=np.apply_along_axis(loglikelihood_fresh,1,X_val)

In [242]:
LL_rotten=np.apply_along_axis(loglikelihood_rotten,1,X_val)
LL_rotten

array([ -37.58683628,  -38.11816509,  -15.12785086, ..., -114.84495759,
        -93.043824  , -111.09518067])

In [244]:
predict=[]
for i in range(len(LL_rotten)):
    if LL_fresh[i]<LL_rotten[i]:
        predict.append(0)
    else: predict.append(1)

In [245]:
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report
results = confusion_matrix(Y_val,predict) 
print ('Confusion Matrix :')
print(results) 
print ('Accuracy Score :',accuracy_score(Y_val,predict))

Confusion Matrix :
[[251 511]
 [999 289]]
Accuracy Score : 0.2634146341463415


# Interpretation
Now it is time to look at your fitted model a little bit closer. NB model probabilities are rather easy to
understand and interpret. The task here is to find the best words to predict a fresh, and a rotten review.
And we only want to look at words that are reasonably frequent, say more frequent than 30 times in the
data.  
1. Extract from your conditional probability vectors log Pr(w|F) and log Pr(w|R) the probabilities that
correspond to frequent words only.  
2. Find 10 best words to predict F and 10 best words to predict R. Hint: imagine we have a review that
contains just a single word. Which word will give the highest weight to the probability the review is
fresh? Which one to the likelihood it is rotten?  
Comment your results.   
3. Print out a few missclassified quotes. Can you understand why these are misclassified?

<font color=red>approach applied according to the instructions</font>

In [118]:
#filter frequent words from the original dataset bow
bow_fre=bow[bow.df>=30]

In [119]:
#get the index of these words 
index=bow_fre.index

In [126]:
#extract the conditional probabtility 
bow30=bow_train.iloc[index]

In [129]:
print ("the 10 best words to predict F/: ")
print (bow30.sort_values(by=['Pr(w|F)'],ascending=False).head(10).w)

the 10 best words to predict F/: 
18549     the
836       and
12650      of
9817       is
18805      to
9835       it
9272       in
18546    that
20576    with
6898     film
Name: w, dtype: object


In [130]:
print ("the 10 best words to predict R: ")
print (bow30.sort_values(by=['Pr(w|R)'],ascending=False).head(10).w)

the 10 best words to predict R: 
4751     delight
18549        the
836          and
12650         of
18805         to
9817          is
9835          it
9272          in
18546       that
2568         but
Name: w, dtype: object


<font color=red>We cannot see anything from the results above. These all are the most frequent word used in English sentences. Maybe the "but" in the 10 best words to predict rotten could have some universal meaning when people want to express something not up to their expectations.  
So let's see another approach below.</font>

In [131]:
def make_xy(critics, vectorizer=None):
    #Your code here    
    if vectorizer is None:
        vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(tomatoes.quote)
    X = X.tocsc()  # some versions of sklearn return COO format
    Y = (tomatoes.fresh == 'fresh').values.astype(np.int)
    return X, Y

X, Y = make_xy(tomatoes.quote)

In [132]:
# get top 10 "totten" words
vectorizer = CountVectorizer(min_df = .0001)       

# build bag-of-words matrix
X, Y = make_xy(tomatoes, vectorizer)


fitted_model = MultinomialNB(alpha=1, class_prior=None, fit_prior=True)
fitted_model.fit(X, Y)
probs = fitted_model.predict_proba(np.identity(X.shape[1]))

# split into rotten and fresh vectors of probabilities
rotten_probs = probs[:,0]
fresh_probs = probs[:,1]

# get the indices of the 10 words associated with rotten that have the highest probabilities
rotten_feat_idx = np.argpartition(rotten_probs, -10, axis = 0)[-10:]

rotten_names = np.array(vectorizer.get_feature_names())
rotten_names = rotten_names[rotten_feat_idx]
rotten_probs = np.array(rotten_probs[rotten_feat_idx])

# create a dataframe with top ten rotten words and probability from above arrays
rotten10 = pd.DataFrame({'Top Rotten Words': rotten_names, 'Rotten Probability': rotten_probs})
rotten10.sort_values(['Rotten Probability'], ascending = 0, inplace = True)
rotten10.reset_index(inplace = True, drop = True)

print(rotten10)


  Top Rotten Words  Rotten Probability
0             lame            0.961741
1        pointless            0.956965
2       uninspired            0.955097
3    unfortunately            0.954101
4            sadly            0.942648
5         tiresome            0.942648
6  disappointingly            0.931204
7         sluggish            0.926302
8            tepid            0.926302
9          witless            0.926302


In [135]:
# get top 10 "fresh" words
fresh_feat_idx = np.argpartition(fresh_probs, -10, axis = 0)[-10:]

fresh_names = np.array(vectorizer.get_feature_names())
fresh_names = fresh_names[fresh_feat_idx]
fresh_probs = np.array(fresh_probs[fresh_feat_idx])

# create a dataframe with top 10 fresh words and probability from above arrays
fresh10 = pd.DataFrame({'Top Fresh Words': fresh_names, 'Fresh Probability': fresh_probs})
fresh10.sort_values(['Fresh Probability'], ascending = 0, inplace = True)
fresh10.reset_index(inplace = True, drop = True)

print(fresh10)

  Top Fresh Words  Fresh Probability
0         delight           0.976400
1        intimate           0.962767
2        captures           0.959660
3       childhood           0.957903
4            myth           0.957903
5         rousing           0.953887
6       seductive           0.951578
7        expertly           0.949025
8      remarkable           0.947645
9     astonishing           0.946188


this method is much more reasonable. We can see bad words in rotten and good words in fresh.

# NB with smoothing
So, now you have your brand-new NB algorithm up and running. As a next step, we add smoothing to it.
As our task is to find the best smoothing parameter below, your first task is to mold what you did above
into two functions: one for fitting and another one for predicting.  
1. Create two functions: one for fitting NB model, and another to predict outcome based on the fitted
model.  
As mentioned above, the model is fully described with 4 probabilities, so your fitting function may
return such a list as the model; and the prediction function may take it as an input.  
2. Add smoothing to the model. See Schutt p 103 and 109. Smoothing amounts to assuming that we
have "seen" every possible word $\alpha$ > 0 times already, for both classes. (If you wish, you can also
assume you have seen the words $\alpha$ times for F and $\beta$ times for R). Note that $\alpha$ does not have to be
an integer, and typically the best $\alpha$ < 1.  
3. Cross-validate the accuracy (on the validation data) on a number of $\alpha$ values and find the $\alpha$ that
gives you the best result. You can use your own CV algorithm you created for PS4, or an existing
library.

In [None]:
from sklearn.model_selection import cross_validate

In [250]:
import sklearn.naive_bayes as nb

# fit naive bayes model on the training set
model = nb.MultinomialNB()
model.fit(X_train, Y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [251]:
predbow=(model.predict(X_test))
prebowprob=model.predict_proba(X_test)

In [254]:
print(classification_report(Y_test ,predbow))

              precision    recall  f1-score   support

           0       0.75      0.56      0.64       986
           1       0.76      0.88      0.82      1577

    accuracy                           0.76      2563
   macro avg       0.76      0.72      0.73      2563
weighted avg       0.76      0.76      0.75      2563



In [256]:
print("Accuracy is", round(accuracy_score(Y_test,predbow),3))

Accuracy is 0.76


In [246]:
def find_optimal_k(X_train,Y_train,mylist):
    cv_scores=[]
    
    X_tr, X_cv, y_tr, y_cv =train_test_split(X_train, Y_train, test_size=0.2)
    for i in mylist:
        nb = MultinomialNB(alpha=i)
        model = nb.fit(X_tr,y_tr)
        pred=model.predict(X_cv)
        acc=accuracy_score(y_cv,pred,normalize=True)
        cv_scores.append(acc)
    MSE=[1-x for x in cv_scores]
    optimal_alpha=mylist[MSE.index(min(MSE))]
    print ('\n The optimal alpha is ',end='')
    print(optimal_alpha)
    
    print("the misclassification error for each k value is: ", end="")
    print(np.round(MSE,3))
    return optimal_alpha

In [261]:
mylist = np.arange(0.00001, 0.001, 0.00005) 
optimal_alpha=find_optimal_k(X_train, Y_train, mylist)


 The optimal alpha is 0.00051
the misclassification error for each k value is: [0.312 0.311 0.312 0.312 0.312 0.311 0.311 0.311 0.311 0.31  0.31  0.31
 0.31  0.311 0.31  0.31  0.31  0.31  0.31  0.31 ]
