In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from scipy import stats

### Examining the data

In [3]:
data=pd.read_csv("rotten-tomatoes.csv.bz2")

In [271]:
#Top 10 rows in the dataset
data.head(10)

Unnamed: 0,critic,fresh,imdb,link,publication,quote,review_date,rtid,title
0,Derek Adams,fresh,114709,http://www.timeout.com/film/reviews/87745/toy-...,Time Out,"So ingenious in concept, design and execution ...",2009-10-04 00:00:00,9559,Toy Story
1,Richard Corliss,fresh,114709,"http://www.time.com/time/magazine/article/0,91...",TIME Magazine,The year's most inventive comedy.,2008-08-31 00:00:00,9559,Toy Story
2,David Ansen,fresh,114709,http://www.newsweek.com/id/104199,Newsweek,A winning animated feature that has something ...,2008-08-18 00:00:00,9559,Toy Story
3,Leonard Klady,fresh,114709,http://www.variety.com/review/VE1117941294.htm...,Variety,The film sports a provocative and appealing st...,2008-06-09 00:00:00,9559,Toy Story
4,Jonathan Rosenbaum,fresh,114709,http://onfilm.chicagoreader.com/movies/capsule...,Chicago Reader,"An entertaining computer-generated, hyperreali...",2008-03-10 00:00:00,9559,Toy Story
5,Michael Booth,fresh,114709,http://www.denverpost.com/movies/ci_5786068,Denver Post,"As Lion King did before it, Toy Story revived ...",2007-05-03 00:00:00,9559,Toy Story
6,Geoff Andrew,fresh,114709,http://www.timeout.com/film/reviews/79673/toy_...,Time Out,The film will probably be more fully appreciat...,2006-06-24 00:00:00,9559,Toy Story
7,Janet Maslin,fresh,114709,http://movies.nytimes.com/movie/review?res=990...,New York Times,Children will enjoy a new take on the irresist...,2003-05-20 00:00:00,9559,Toy Story
8,Kenneth Turan,fresh,114709,http://www.calendarlive.com/movies/reviews/cl-...,Los Angeles Times,Although its computer-generated imagery is imp...,2001-02-13 00:00:00,9559,Toy Story
9,Roger Ebert,fresh,114709,http://www.rogerebert.com/reviews/toy-story-1995,Chicago Sun-Times,The result is a visionary roller-coaster ride ...,2000-01-01 00:00:00,9559,Toy Story


In [272]:
#Bottom 10 rows in the dataset
data.tail(10)

Unnamed: 0,critic,fresh,imdb,link,publication,quote,review_date,rtid,title
13432,Rita Kempley,rotten,104389,http://www.washingtonpost.com/wp-srv/style/lon...,Washington Post,This anti-feminist parable is both a labor and...,2000-01-01 00:00:00,14013,The Hand That Rocks the Cradle
13433,Richard Corliss,fresh,88683,"http://www.time.com/time/magazine/article/0,91...",TIME Magazine,"All three stars do smart, honorable work.",2013-05-08 00:00:00,11917,Agnes of God
13434,Kevin Thomas,rotten,88683,http://articles.latimes.com/1985-09-13/enterta...,Los Angeles Times,While Agnes of God has been considerably opene...,2013-05-08 00:00:00,11917,Agnes of God
13435,Dave Kehr,rotten,88683,http://www.chicagoreader.com/chicago/agnes-of-...,Chicago Reader,"Despite all the anguished huffing and puffing,...",2013-05-08 00:00:00,11917,Agnes of God
13436,Jay Boyar,fresh,88683,http://articles.orlandosentinel.com/1985-10-30...,Orlando Sentinel,It is Meg Tilly who makes the movie live. Her ...,2013-05-08 00:00:00,11917,Agnes of God
13437,Gene Siskel,rotten,88683,http://articles.chicagotribune.com/1985-09-13/...,Chicago Tribune,Agnes of God plays with some challenging ideas...,2013-05-08 00:00:00,11917,Agnes of God
13438,Variety Staff,rotten,88683,http://www.variety.com/review/VE1117796703.htm...,Variety,"Fonda's relentless interrogating, mannered cha...",2008-10-18 00:00:00,11917,Agnes of God
13439,,fresh,88683,http://www.timeout.com/film/reviews/77605/agne...,Time Out,Splendidly shot by Sven Nykvist and with excel...,2006-06-24 00:00:00,11917,Agnes of God
13440,Janet Maslin,rotten,88683,http://movies.nytimes.com/movie/review?res=950...,New York Times,"Miss Tilly makes a radiant Agnes, and Miss Ban...",2003-05-20 00:00:00,11917,Agnes of God
13441,Roger Ebert,rotten,88683,http://www.rogerebert.com/reviews/agnes-of-god...,Chicago Sun-Times,Although the movie deals in the basic material...,2000-01-01 00:00:00,11917,Agnes of God


In [273]:
#Columns in the dataset
print(data.shape, "\n")

#Types of columns
print(data.dtypes)

(13442, 9) 

critic         object
fresh          object
imdb            int64
link           object
publication    object
quote          object
review_date    object
rtid            int64
title          object
dtype: object


In [274]:
#Variables in the dataset
print(data.columns)

Index(['critic', 'fresh', 'imdb', 'link', 'publication', 'quote',
       'review_date', 'rtid', 'title'],
      dtype='object')


### Summary Statistics

#### a) Number of missings for fresh and quote

In [275]:
#Missings in fresh
print("Missing values in fresh:", data.fresh.isna().sum())

#Missings in quote
print("Missing values in quote:", data.quote.isna().sum())


Missing values in fresh: 0
Missing values in quote: 0


#### b) Different values for fresh/rotten evaluations 

In [276]:
#Different values of fresh
data.fresh.unique()

array(['fresh', 'rotten', 'none'], dtype=object)

#### c) Counts and percentages of different fresh values

In [277]:
#Different fresh and rotten values
print("Counts and percentages of different fresh values")
percentages=pd.DataFrame(data.groupby('fresh')['fresh'].count())
percentages.columns=['count']
percentages['percentage']=percentages.apply(lambda count: 100*count/float(count.sum()))
percentages

Counts and percentages of different fresh values


Unnamed: 0_level_0,count,percentage
fresh,Unnamed: 1_level_1,Unnamed: 2_level_1
fresh,8389,62.408868
none,23,0.171105
rotten,5030,37.420027


#### d) number of zero-length or only whitespace quote-s

In [278]:
#Number of zero-length value

print('Number of zero-length values', (data.quote.apply(lambda x : len(x)) == 0).sum())


#Number of white space values
quotes=pd.DataFrame(data['quote'])
quotes['Length']=quotes['quote'].str.len()


print("Length of quotes")
print(quotes.head(5))

print("Num of white space quotes")
print(quotes['quote'].str.isspace().sum())





Number of zero-length values 0
Length of quotes
                                               quote  Length
0  So ingenious in concept, design and execution ...     137
1                  The year's most inventive comedy.      33
2  A winning animated feature that has something ...      79
3  The film sports a provocative and appealing st...     107
4  An entertaining computer-generated, hyperreali...     110
Num of white space quotes
0


#### e) Minimum-maximum-average length of quotes (either in words, or in characters)

In [279]:
#Maximum length of quotes(in characters)
print("Maximum number of characters : ",quotes['Length'].max())

#Minimum length of quotes(in characters)
print("Minimum number of characters : ", quotes['Length'].min())

#Average number of characters
avg=round(sum(quotes['Length'])/len(quotes['Length']))
print("Average number of characters : ", avg)

Maximum number of characters :  256
Minimum number of characters :  4
Average number of characters :  121


In [280]:
#Maximum length of quotes(in words)
split_quotes_max = len(max(quotes["quote"].str.split(" ").to_list(), key=len))
print("Maximum length in terms of words:", split_quotes_max)


#Minimum length of quotes(in words)
split_quotes_min = len(min(quotes["quote"].str.split(" ").to_list(), key=len))
print("Minimum length in terms of words:", split_quotes_min)



#Average number of words
print("Average length in terms of words:", 
      round(sum(map(len,quotes["quote"].str.split(" ") ))/len(quotes["quote"].str.split(" "))))

Maximum length in terms of words: 49
Minimum length in terms of words: 1
Average length in terms of words: 20


#### f)  How many reviews are in data multiple times

In [281]:
mult_reviews = (data.groupby('quote', as_index = False).size().
                sort_values(ascending = False).reset_index(name = 'count'))

In [282]:
mult_reviews.columns

Index(['quote', 'count'], dtype='object')

In [283]:
mult_reviews.groupby('count', as_index = False).size().sort_values().reset_index(name = 'counts')

Unnamed: 0,count,counts
0,4,1
1,3,45
2,2,513
3,1,12277


In [4]:
data = data[data['fresh'] != 'none']
data = data.drop_duplicates()
data = data.reset_index()
data = data.drop(columns = 'index')
data.head()

Unnamed: 0,critic,fresh,imdb,link,publication,quote,review_date,rtid,title
0,Derek Adams,fresh,114709,http://www.timeout.com/film/reviews/87745/toy-...,Time Out,"So ingenious in concept, design and execution ...",2009-10-04 00:00:00,9559,Toy Story
1,Richard Corliss,fresh,114709,"http://www.time.com/time/magazine/article/0,91...",TIME Magazine,The year's most inventive comedy.,2008-08-31 00:00:00,9559,Toy Story
2,David Ansen,fresh,114709,http://www.newsweek.com/id/104199,Newsweek,A winning animated feature that has something ...,2008-08-18 00:00:00,9559,Toy Story
3,Leonard Klady,fresh,114709,http://www.variety.com/review/VE1117941294.htm...,Variety,The film sports a provocative and appealing st...,2008-06-09 00:00:00,9559,Toy Story
4,Jonathan Rosenbaum,fresh,114709,http://onfilm.chicagoreader.com/movies/capsule...,Chicago Reader,"An entertaining computer-generated, hyperreali...",2008-03-10 00:00:00,9559,Toy Story


### Data Cleaning

In [285]:
#Dropping NA values if any in the fresh and quote column
def CheckInconsistency (dataset,variable1, variable2):
    # check if variable 1 and variable 2 are not missing.
    #print(variable1)
    if ((dataset[variable1].isna().sum()>0 ) | (dataset[variable1].isnull().sum()>0) |(dataset[variable2].isna().sum()>0 )|(dataset[variable2].isnull().sum()>0)):
        print('There are inconsistencies. hence dropping the null and NA values.')
        dataset=dataset[variable1].dropna()
        dataset=dataset[variable2].dropna()
    else:
        print('There are no inconsistencies in '+ variable1 + ' and ' + variable2 + ' columns')
    
    #check if quote is not an empty string
    if ((dataset[variable1].empty == True) | ((dataset[variable1]== '').sum()>0) | (dataset[variable1].astype(str).str.isspace().sum())):
        print('There are empty syring values ')
        
        a=dataset[dataset[variable1]==''].index
        dataset.drop(dataset.index(a))
    else:
        print('There are no empty values ')
    
    #Drop the duplicate values
    
    # keep first duplicate row
    dataset = dataset.drop_duplicates()

In [192]:
def text_preprocessing(train):
    train['quote'] = (
    train['quote'].apply(lambda x: " ".join(x.lower() for x in x.split()))) # everything to lowercase
    train['quote'] = train['quote'].str.replace('[^\w\s]','') # Removes Punctuation
    # Remove the numbers
    train['quote'] = train['quote'].str.replace('\d+', '')
    # Remove stop words
    from nltk.corpus import stopwords
    stop = stopwords.words('english')
    train['quote'] = train['quote'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
    # Stemming
#     from nltk.stem import PorterStemmer
#     def stem_sentences(sentence):
#         stemmer = PorterStemmer()
#         tokens = sentence.split()
#         stemmed_tokens = [stemmer.stem(token) for token in tokens]
#         return ' '.join(stemmed_tokens)
#   train['quote'] = train['quote'].apply(stem_sentences)
    return train

In [5]:
import re
from nltk.corpus import stopwords

def quote_to_words(raw_quote):
    #removing raw letters,numbers,punctuations
    letters = re.sub("[^a-zA-Z]"," ",raw_quote)
    #creating an array , resolving whitespaces
    words = letters.lower().split()
    #create an array of stopwords so that we don't have to access corpus to search for a stopword
    stop = set(stopwords.words("english"))
    #removing stopwords from the raw_review
    meaningful_words = [w for w in words if w not in stop]
    #return a string with only the words that are important
    return(" ".join(meaningful_words))

In [287]:
CheckInconsistency(data, 'fresh' , 'quote')

There are no inconsistencies in fresh and quote columns
There are no empty values 


In [6]:
clean_quotes = []

for i in range(data.quote.size):
    clean_quotes.append(quote_to_words(data.quote[i]))

In [7]:
quotes_series = pd.Series(clean_quotes)

#Creating work data
work_data = pd.DataFrame(quotes_series)

#Combining the fresh and quotes columns
work_data.rename(columns = {0 : 'quote'}, inplace = True)

#Appending the fresh column
work_data['fresh'] = data.fresh

In [8]:
#Number of quotes
work_data.shape

(12823, 2)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(binary = True)
X = vectorizer.fit_transform(clean_quotes).toarray()
X.shape


y = data[['fresh']] # Target
words = vectorizer.get_feature_names()

#Number of unique words
len(set(words))


20492

In [10]:
#Splitting the data into train and test models

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.2)



In [11]:
prob_fresh = y_train.fresh[y_train.fresh == 'fresh'].count()/len(y_train)
prob_rotten = 1 - prob_fresh

log_prob_fresh = np.log(prob_fresh)
log_prob_rotten = np.log(prob_rotten)
print(log_prob_fresh)
print(log_prob_rotten)

-0.48165241862569685
-0.961711165656882


In [12]:
y_train.reset_index(inplace= True)
y_train.drop(columns = 'index', inplace = True)


#Calculating the probabilities of words given fresh

index_fresh = y_train[y_train['fresh'] == 'fresh'].index
X_train_fresh = X_train[index_fresh].sum(axis = 0)
prob_w_f = X_train_fresh/len(y_train[y_train['fresh'] == 'fresh'])


#Cal
index_rotten = y_train[y_train['fresh'] == 'rotten'].index
X_train_rotten = X_train[index_rotten].sum(axis = 0)
prob_w_r = X_train_rotten/len(y_train[y_train['fresh'] == 'rotten'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [13]:
fresh = pd.DataFrame(pd.Series(words)).reset_index()
fresh.drop(columns = 'index', inplace = True)
fresh['prob_w_f'] = prob_w_f

rotten = pd.DataFrame(pd.Series(words)).reset_index()
rotten.drop(columns = 'index', inplace = True)
rotten['prob_w_r'] = prob_w_r

fresh.rename(columns = {0 : 'words', 'prob_w_f' : 'Pr(W|F)'}, inplace = True)
rotten.rename(columns = {0 : 'words', 'prob_w_r' : 'Pr(W|R)'}, inplace = True)

In [15]:
fresh.sort_values(by = 'Pr(W|F)', ascending  = False).head(10)

Unnamed: 0,words,Pr(W|F)
6621,film,0.163011
11727,movie,0.13003
12396,one,0.088685
7642,good,0.046868
1593,best,0.045763
17325,story,0.044501
10372,like,0.043869
18399,time,0.037084
19986,well,0.03661
4843,director,0.035032


In [16]:
rotten.sort_values(by = 'Pr(W|R)', ascending  = False).head(10)

Unnamed: 0,words,Pr(W|R)
11727,movie,0.145116
6621,film,0.118847
12396,one,0.071155
10372,like,0.070645
11747,much,0.053558
17325,story,0.042846
3300,comedy,0.041826
5997,even,0.041571
7642,good,0.041061
4843,director,0.03494


In [17]:
print(rotten.isna().sum())
print(fresh.isna().sum())

words      0
Pr(W|R)    0
dtype: int64
words      0
Pr(W|F)    0
dtype: int64


## Taking the columns that only interests us

## Sample Analysis

In [20]:
#Log likelihood of Pr(W|F)
fresh['log_Pr_W_F']=fresh['Pr(W|F)'].apply(lambda x: np.log(x))

#Dropping the infinite values from the log probabilities
#c['log_Pr_W_F']=c['log_Pr_W_F'].replace([np.inf, -np.inf], np.nan).dropna()


#Log likelihood of Pr(W|R)
rotten['log_Pr_W_R']=rotten['Pr(W|R)'].apply(lambda x: np.log(x))

In [22]:
fresh = fresh.replace(to_replace = [np.inf, -np.inf], value = [np.nan, np.nan]).dropna()
fresh.head()

Unnamed: 0,words,Pr(W|F),log_Pr_W_F
0,aaron,0.000473,-7.655548
1,abandon,0.000158,-8.754161
2,abandoned,0.000158,-8.754161
3,abandonment,0.000158,-8.754161
5,abbott,0.000316,-8.061014


In [23]:
rotten = rotten.replace(to_replace = [np.inf, -np.inf], value = [np.nan, np.nan]).dropna()

In [316]:
len(rotten)

11303

In [317]:
len(fresh)

14151

### Fresh Log Likelihood

In [27]:
X_test_df = pd.DataFrame(X_test, columns = words)
test_words_df = pd.DataFrame(X_test, columns = words)
test_words_df.head()

Unnamed: 0,aaron,abandon,abandoned,abandonment,abandons,abbott,abbreviated,abdominal,abduct,abe,...,zoom,zooming,zooms,zorro,zorros,zowie,zucker,zweibel,zwick,zzzzzzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
fresh_T =  fresh.T
fresh_T.columns = fresh_T.iloc[0]
fresh_T.drop(['words', 'Pr(W|F)'], inplace = True)

In [521]:
fresh_T

words,aaron,abandon,abandoned,abandonment,abbott,abbreviated,abduct,abe,abiding,abilities,...,zonca,zone,zoological,zooming,zooms,zorro,zorros,zowie,zucker,zwick
log_Pr_W_F,-7.65555,-8.75416,-8.75416,-8.75416,-8.06101,-8.75416,-8.75416,-8.06101,-8.75416,-8.06101,...,-8.75416,-7.65555,-8.75416,-8.75416,-8.75416,-8.06101,-8.75416,-8.75416,-8.06101,-8.06101


In [29]:
log_word_fresh_list = fresh_T.loc['log_Pr_W_F']

In [31]:
log_word_fresh = pd.DataFrame(columns = test_words_df.columns)

In [559]:
log_word_fresh.append(log_word_fresh_list)

Unnamed: 0,aaron,abandon,abandoned,abandonment,abandons,abbott,abbreviated,abdominal,abduct,abe,...,zoom,zooming,zooms,zorro,zorros,zowie,zucker,zweibel,zwick,zzzzzzzzz
log_Pr_W_F,-7.655548,-8.754161,-8.754161,-8.754161,,-8.061014,-8.754161,,-8.754161,-8.061014,...,,-8.754161,-8.754161,-8.061014,-8.754161,-8.754161,-8.061014,,-8.061014,


In [32]:
log_word_fresh_list = fresh_T.loc['log_Pr_W_F']
log_word_fresh = pd.DataFrame(columns = test_words_df.columns)
log_word_fresh = log_word_fresh.append(log_word_fresh_list)
log_word_fresh.replace(np.nan, 0, inplace = True)

test_words_df['log_fresh_pred'] = test_words_df.apply(lambda x : x* log_word_fresh.loc['log_Pr_W_F'] ,axis = 1).sum(axis = 1)
test_words_df.head()

Unnamed: 0,aaron,abandon,abandoned,abandonment,abandons,abbott,abbreviated,abdominal,abduct,abe,...,zooming,zooms,zorro,zorros,zowie,zucker,zweibel,zwick,zzzzzzzzz,log_fresh_pred
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-57.60707
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-84.184092
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-61.345003
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-99.249476
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-102.630409


In [33]:
rotten_T =  rotten.T
rotten_T.columns = rotten_T.iloc[0]
rotten_T.drop(['words', 'Pr(W|R)'], inplace = True)
rotten_T

words,abandon,abandons,abdominal,abets,ability,able,abominable,abomination,abortion,abounds,...,zingy,zinnemann,zipper,zippo,zombie,zone,zoom,zweibel,zwick,zzzzzzzzz
log_Pr_W_R,-7.58095,-8.2741,-8.2741,-7.58095,-6.07688,-5.7892,-8.2741,-8.2741,-8.2741,-8.2741,...,-8.2741,-8.2741,-8.2741,-8.2741,-6.32819,-6.07688,-8.2741,-8.2741,-8.2741,-8.2741


In [34]:
log_word_rotten_list = rotten_T.loc['log_Pr_W_R']
log_word_rotten = pd.DataFrame(columns = test_words_df.columns)
log_word_rotten = log_word_rotten.append(log_word_rotten_list)
log_word_rotten.replace(np.nan, 0, inplace = True)

test_words_df['log_rotten_pred'] = (test_words_df.
                                    apply(lambda x : x* log_word_rotten.loc['log_Pr_W_R'] ,axis = 1).sum(axis = 1))
test_words_df.head()

Unnamed: 0,aaron,abandon,abandoned,abandonment,abandons,abbott,abbreviated,abdominal,abduct,abe,...,zooms,zorro,zorros,zowie,zucker,zweibel,zwick,zzzzzzzzz,log_fresh_pred,log_rotten_pred
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,-57.60707,-57.186279
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,-84.184092,-80.254668
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,-61.345003,-56.089703
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,-99.249476,-104.273027
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,-102.630409,-101.680242


In [35]:
test_words_df['y_pred'] = np.where((test_words_df['log_fresh_pred'] > test_words_df['log_rotten_pred']), 'fresh', 'rotten')
test_words_df.head()

Unnamed: 0,aaron,abandon,abandoned,abandonment,abandons,abbott,abbreviated,abdominal,abduct,abe,...,zorro,zorros,zowie,zucker,zweibel,zwick,zzzzzzzzz,log_fresh_pred,log_rotten_pred,y_pred
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-57.60707,-57.186279,rotten
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-84.184092,-80.254668,rotten
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-61.345003,-56.089703,rotten
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-99.249476,-104.273027,fresh
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-102.630409,-101.680242,rotten


In [574]:
y_test.head()

Unnamed: 0,fresh
7386,fresh
10086,fresh
3037,fresh
3026,fresh
2692,rotten


In [38]:
from sklearn.metrics import confusion_matrix

conf = confusion_matrix(y_test['fresh'], test_words_df['y_pred'])
conf

array([[639, 979],
       [339, 608]], dtype=int64)

In [40]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
print("Accuracy:", round(accuracy_score(y_test['fresh'], test_words_df['y_pred']),2))

Accuracy: 0.49


In [41]:
#Precision
print("Precision:", round(608/(608+979), 2))

Precision: 0.38


In [42]:
#Recall
print("Recall:", round(608/(608+339), 2))

Recall: 0.64


### Frequent Words

In [43]:
test_words = test_words_df.drop(columns = ['log_fresh_pred', 'log_rotten_pred', 'y_pred'])
freq_df = pd.DataFrame(test_words.sum().sort_values(ascending = False), columns = ['freq']).reset_index()
freq_df.shape

(20492, 2)

In [44]:
pd.merge(freq_df, fresh, left_on = 'index', right_on = 'words').head(10)

Unnamed: 0,index,freq,words,Pr(W|F),log_Pr_W_F
0,film,398,film,0.163011,-1.813938
1,movie,354,movie,0.13003,-2.03999
2,one,217,one,0.088685,-2.422659
3,like,149,like,0.043869,-3.12654
4,story,119,story,0.044501,-3.112254
5,much,112,much,0.028089,-3.572377
6,good,104,good,0.046868,-3.060429
7,comedy,97,comedy,0.034559,-3.365089
8,even,94,even,0.034401,-3.369666
9,director,90,director,0.035032,-3.351483


In [45]:
pd.merge(freq_df, rotten, left_on = 'index', right_on = 'words').head(10)

Unnamed: 0,index,freq,words,Pr(W|R),log_Pr_W_R
0,film,398,film,0.118847,-2.129916
1,movie,354,movie,0.145116,-1.930222
2,one,217,one,0.071155,-2.64289
3,like,149,like,0.070645,-2.650084
4,story,119,story,0.042846,-3.150138
5,much,112,much,0.053558,-2.926994
6,good,104,good,0.041061,-3.192698
7,comedy,97,comedy,0.041826,-3.174236
8,even,94,even,0.041571,-3.180352
9,director,90,director,0.03494,-3.354121


In [46]:
test_fresh = (test_words_df[test_words_df['y_pred'] == 'fresh'].
              drop(columns = ['log_fresh_pred', 'log_rotten_pred', 'y_pred']))
test_fresh.sum().sort_values(ascending = False).head(10)

film      165
movie     133
one        88
like       52
much       42
comedy     40
best       40
story      40
even       39
good       36
dtype: int64

In [47]:
test_rotten = (test_words_df[test_words_df['y_pred'] == 'rotten'].
              drop(columns = ['log_fresh_pred', 'log_rotten_pred', 'y_pred']))
test_rotten.sum().sort_values(ascending = False).head(10)

film        233
movie       221
one         129
like         97
story        79
much         70
good         68
comedy       57
time         56
director     55
dtype: int64

In [48]:
test_words_df.head()

Unnamed: 0,aaron,abandon,abandoned,abandonment,abandons,abbott,abbreviated,abdominal,abduct,abe,...,zorro,zorros,zowie,zucker,zweibel,zwick,zzzzzzzzz,log_fresh_pred,log_rotten_pred,y_pred
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-57.60707,-57.186279,rotten
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-84.184092,-80.254668,rotten
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-61.345003,-56.089703,rotten
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-99.249476,-104.273027,fresh
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-102.630409,-101.680242,rotten


In [49]:
y_test1 = y_test.reset_index()

In [50]:
fresh_index = test_words_df.y_pred[(test_words_df.y_pred == 'fresh')].index

In [51]:
fresh_index.shape

(978,)

In [52]:
y_test1.drop(columns = 'index', inplace = True)

In [54]:
misclass = test_words_df[y_test1.fresh != test_words_df.y_pred]

In [55]:
misclass.head()

Unnamed: 0,aaron,abandon,abandoned,abandonment,abandons,abbott,abbreviated,abdominal,abduct,abe,...,zorro,zorros,zowie,zucker,zweibel,zwick,zzzzzzzzz,log_fresh_pred,log_rotten_pred,y_pred
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-57.60707,-57.186279,rotten
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-84.184092,-80.254668,rotten
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-61.345003,-56.089703,rotten
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-74.949268,-69.152084,rotten
11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-78.23316,-73.761263,rotten


In [673]:
y_test.head(12)

Unnamed: 0,fresh
7386,fresh
10086,fresh
3037,fresh
3026,fresh
2692,rotten
10207,fresh
11847,fresh
3404,fresh
3299,rotten
3082,fresh


In [58]:
#List of misclassified quotes
list1 = [7386, 10086, 3037, 2252, 4148]
for i in list1:
    print(i, "", clean_quotes[i])

7386  dark funny paranoid arbitrary humming tamped eroticism love things weird good news
10086  miller outstanding father utterly convinced dating first step towards instant complete moral sexual decay
3037  tandem director screenwriter build palpable suspense boy rivet raising issues forgiveness deserves
2252  splashy volatile crowd pleasing rock star melodrama makes sheer emotional wallop sometimes lacks finesse
4148  mr black screenplay mean spirited earns keep sharp sarcastic dialogue ingenious ways setting story


In [59]:
test_words_df.loc[[0,1,2,10,11]][['log_fresh_pred', 'log_rotten_pred']]

Unnamed: 0,log_fresh_pred,log_rotten_pred
0,-57.60707,-57.186279
1,-84.184092,-80.254668
2,-61.345003,-56.089703
10,-74.949268,-69.152084
11,-78.23316,-73.761263


#### On observig the log likelihoods of these misclassified words for fresh and rotten, we observe that they are more or less same and there's not much difference. Since it is observed that the log likelihood of these words is more for rotten , that's why the model has predicted it as rotten

###  NB with smoothing

####  Create two functions: one for fitting NB model, and another to predict outcome based on the tted model. As mentioned above, the model is fully described with 4 probabilities, so your fitting function may return such a list as the model; and the prediction function may take it as an input.

In [64]:
#Function to fit NB model which outputs the 4 probabilites : Fresh , rotten , word|fresh, word|rotten
def NB_fit(X_train,y_train, alpha):

    #Probabilities of fresh with alpha values
    prob_fresh = (y_train.fresh[y_train.fresh == 'fresh'].count()+ alpha)/(len(y_train)+alpha)
    #Probabilities of rotten
    prob_rotten = (y_train.fresh[y_train.fresh == 'rotten'].count()+alpha)/(len(y_train)+alpha)
    
    #Log probabilities of fresh
    log_prob_fresh = np.log(prob_fresh)
    
    #log probabilities of rotten
    log_prob_rotten = np.log(prob_rotten)
    
    X_train_fresh = X_train[index_fresh].sum(axis = 0)
    #Probability of word given fresh
    prob_w_f = (X_train_fresh+alpha)/(len(y_train[y_train['fresh'] == 'fresh'])+alpha)
    
    X_train_rotten = X_train[index_rotten].sum(axis = 0)
    #Probability of word given rotten
    prob_w_r = (X_train_rotten+alpha)/(len(y_train[y_train['fresh'] == 'rotten'])+alpha)
    
    fresh = pd.DataFrame(pd.Series(words)).reset_index()
    fresh.drop(columns = 'index', inplace = True)
    fresh['prob_w_f'] = prob_w_f

    rotten = pd.DataFrame(pd.Series(words)).reset_index()
    rotten.drop(columns = 'index', inplace = True)
    rotten['prob_w_r'] = prob_w_r

    fresh.rename(columns = {0 : 'words', 'prob_w_f' : 'Pr(W|F)'}, inplace = True)
    rotten.rename(columns = {0 : 'words', 'prob_w_r' : 'Pr(W|R)'}, inplace = True)
    
    fresh['log_Pr_W_F']=fresh['Pr(W|F)'].apply(lambda x: np.log(x))
    rotten['log_Pr_W_R']=rotten['Pr(W|R)'].apply(lambda x: np.log(x))
    fresh = fresh.replace(to_replace = [np.inf, -np.inf], value = [np.nan, np.nan]).dropna()
    rotten = rotten.replace(to_replace = [np.inf, -np.inf], value = [np.nan, np.nan]).dropna()
    fresh_T =  fresh.T
    fresh_T.columns = fresh_T.iloc[0]
    fresh_T.drop(['words', 'Pr(W|F)'], inplace = True)
    
    rotten_T =  rotten.T
    rotten_T.columns = rotten_T.iloc[0]
    rotten_T.drop(['words', 'Pr(W|R)'], inplace = True)
    
    log_word_fresh_list = fresh_T.loc['log_Pr_W_F']
    log_word_fresh = pd.DataFrame(columns = test_words_df.columns)
    log_word_fresh = log_word_fresh.append(log_word_fresh_list)
    
    
    log_word_rotten_list = rotten_T.loc['log_Pr_W_R']
    log_word_rotten = pd.DataFrame(columns = test_words_df.columns)
    log_word_rotten = log_word_rotten.append(log_word_rotten_list)
    
    log_word_rotten.replace(np.nan, 0, inplace = True)
    log_word_fresh.replace(np.nan, 0, inplace = True)
    
    
    return log_prob_fresh, log_prob_rotten, log_word_rotten, log_word_fresh

In [65]:
NB_fit(X_train, y_train, 0.01)

(-0.48165181544159524,
 -0.9617095901387539,
                 aaron  abandon  abandoned  abandonment  abandons     abbott  \
 log_Pr_W_R -12.879275 -7.57597 -12.879275   -12.879275 -8.264154 -12.879275   
 
             abbreviated  abdominal     abduct        abe  ...      zorro  \
 log_Pr_W_R   -12.879275  -8.264154 -12.879275 -12.879275  ... -12.879275   
 
                zorros      zowie     zucker   zweibel     zwick  zzzzzzzzz  \
 log_Pr_W_R -12.879275 -12.879275 -12.879275 -8.264154 -8.264154  -8.264154   
 
             log_fresh_pred  log_rotten_pred  y_pred  
 log_Pr_W_R             0.0              0.0     0.0  
 
 [1 rows x 20495 columns],
                aaron   abandon  abandoned  abandonment   abandons    abbott  \
 log_Pr_W_F -7.652222 -8.744212  -8.744212    -8.744212 -13.359333 -8.056028   
 
             abbreviated  abdominal    abduct       abe  ...     zorro  \
 log_Pr_W_F    -8.744212 -13.359333 -8.744212 -8.056028  ... -8.056028   
 
               zorros     