# Naive Bayes Classification with MR Dataset
<hr>

We will build a text classification model using Naive Bayes on the Moview Review Dataset. Since there is no standard train/test split for this dataset, we will use 10-Fold Cross Validation (CV). 

## Load the library

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import random
from nltk.corpus import stopwords, twitter_samples
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import KFold
from nltk.stem import PorterStemmer
from string import punctuation
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
import time

%config IPCompleter.greedy=True
%config IPCompleter.use_jedi=False
# nltk.download('twitter_samples')


Bad key text.latex.unicode in file C:\Users\Diardano Raihan\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle, line 112 ('text.latex.unicode : False # use "ucs" and "inputenc" LaTeX packages for handling')
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.3.2/matplotlibrc.template
or from the matplotlib source distribution

Bad key savefig.frameon in file C:\Users\Diardano Raihan\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle, line 423 ('savefig.frameon : True')
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.3.2/matplotlibrc.template
or from the matplotlib source distribution

Bad key pgf.debug in file C:\Users\Diardano Raihan\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle, line 444 ('pgf.debug           : False')
You probably need to get an updated matplotlibrc file from
h

## Load the Dataset

In [2]:
corpus = pd.read_pickle('../0_data/MR/MR.pkl')
corpus.label = corpus.label.astype(int)
print(corpus.shape)
corpus

(10662, 3)


Unnamed: 0,sentence,label,split
0,"simplistic , silly and tedious .",0,train
1,"it 's so laddish and juvenile , only teenage b...",0,train
2,exploitative and largely devoid of the depth o...,0,train
3,garbus discards the potential for pathological...,0,train
4,a visually flashy but narratively opaque and e...,0,train
...,...,...,...
10657,both exuberantly romantic and serenely melanch...,1,train
10658,mazel tov to a film about a family 's joyous l...,1,train
10659,standing in the shadows of motown is the best ...,1,train
10660,it 's nice to see piscopo again after all thes...,1,train


In [3]:
corpus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10662 entries, 0 to 10661
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  10662 non-null  object
 1   label     10662 non-null  int32 
 2   split     10662 non-null  object
dtypes: int32(1), object(2)
memory usage: 208.4+ KB


In [4]:
corpus.groupby( by='label').count()

Unnamed: 0_level_0,sentence,split
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,5331,5331
1,5331,5331


In [5]:
# Separate the sentences and the labels
sentences, labels = list(corpus.sentence), list(corpus.label)

## Raw Number of Vocabulary

In [6]:
# Build the raw vocobulary for first inspection
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
vocab_raw = tokenizer.word_index
print('\nThe vocabulary size: {}\n'.format(len(vocab_raw)))
print(vocab_raw)


The vocabulary size: 18758



<!--## Split Dataset-->

# Data Preprocessing
<hr>

## Define `clean_doc` function

In [7]:
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
stemmer = PorterStemmer()
    
def clean_doc(doc):
    # split into tokens by white space
    tokens = doc.split()
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(punctuation))
    # remove punctuation from each word
    tokens = [re_punc.sub('', w) for w in tokens]
    # remove remaining tokens that are not alphabetic
#     tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    tokens = [w for w in tokens if not w in stopwords]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) >= 1]
    # Stem the token
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens

## Define `count_docs` function

In [8]:
def count_docs(data, docs, ys):
    '''
    Input:
        data: a dictionary that will be used to map each pair to its frequency
        docs: a list of sentences
        ys: a list corresponding to the sentiment of each tweet (either 0 or 1)
    Output:
        result: a dictionary mapping each pair to its frequency
    '''
    onehot = OneHotEncoder( sparse=False)
    y_onehot = onehot.fit_transform(np.reshape(ys, (-1,1)))
    # print(y_onehot)
    count = 0                    
    for doc, y in zip(docs, y_onehot):
        # For each word
        for word in clean_doc(doc):
            # if not in the data yet
            if word not in data:
                # assign it
                data[word] = y
            # if already in the data 
            else:
                # update it
                data[word] = data.get(word) + y

    return data

## Build Frequencies Dictionary

In [9]:
# Build the freqs dictionary for later uses
freqs = count_docs({}, sentences, labels)
freqs

{'simplist': array([15.,  4.]),
 'silli': array([64., 20.]),
 'tediou': array([28.,  5.]),
 'laddish': array([1., 0.]),
 'juvenil': array([5., 2.]),
 'teenag': array([14., 11.]),
 'boy': array([36., 36.]),
 'could': array([128.,  94.]),
 'possibl': array([33., 32.]),
 'find': array([70., 74.]),
 'funni': array([110., 185.]),
 'exploit': array([30.,  9.]),
 'larg': array([25., 24.]),
 'devoid': array([16.,  2.]),
 'depth': array([19., 22.]),
 'sophist': array([11., 17.]),
 'would': array([170.,  81.]),
 'make': array([280., 338.]),
 'watch': array([122., 105.]),
 'graphic': array([9., 6.]),
 'treatment': array([ 9., 10.]),
 'crime': array([35., 25.]),
 'bearabl': array([2., 0.]),
 'garbu': array([1., 0.]),
 'discard': array([2., 1.]),
 'potenti': array([24., 16.]),
 'patholog': array([3., 4.]),
 'studi': array([18., 45.]),
 'exhum': array([1., 0.]),
 'instead': array([52., 19.]),
 'skew': array([2., 1.]),
 'melodrama': array([32., 21.]),
 'circumstanti': array([2., 0.]),
 'situat': arra

In [10]:
# convert the freqs dictionary to nested list
def freqs_to_df(freqs, train_y):
    '''
    input:
        freqs: a frequencies dictionary (ex: {'simplist': array([15.,  4.]), 
                                              'silli': array([64., 20.]), . . })
        train_y: labels for data
    output:
        a frequencies dictionary in the form of dataframe
    '''
    # initialize an empty list to store the rows for dataframe
    freqs_list = []
    
    # Define the names of the dataframe columns
    column_names = ['word']
    column_names = column_names + list(np.unique(train_y))
    
    # convert the keys from the freqs dictionary to a list
    keys = list(freqs.keys())

    # For each row
    for i in range(len(freqs)):
        
        # define the elements for each column
        row = [keys[i]] + list(freqs.get(keys[i]))
        
        # update the frequency list
        freqs_list.append(row)
    
    # Create the dataframe
    df = pd.DataFrame(freqs_list, columns=column_names)
    df.set_index('word', inplace=True)
    return df

In [21]:
freqs_df = freqs_to_df(freqs, labels)
print(freqs_df)

             0     1
word                
simplist  15.0   4.0
silli     64.0  20.0
tediou    28.0   5.0
laddish    1.0   0.0
juvenil    5.0   2.0
...        ...   ...
deplet     0.0   1.0
piscopo    0.0   1.0
chaykin    0.0   1.0
headli     0.0   1.0
porthol    0.0   1.0

[12666 rows x 2 columns]


In [22]:
freqs_df[0].sum()

57518.0

# Training and Testing the Model

## Build Training Function

In [13]:
def train_naive_bayes(freq_df, train_x, train_y):
    '''
    Input:
        freqs: a pandas dataframe with word indexing
        train_x: a list of tweets
        train_y: a list of labels correponding to the tweets (0,1)
    Output:
        logprior: the log prior. (equation 3 above)
        loglikelihood: the log likelihood of you Naive bayes equation. (equation 6 above)
    '''
    
    freqs = freq_df

    # calculate V, the number of unique words in the vocabulary
    vocab = list(freqs.index)
    V = len(vocab)
    
    ########################################################################################
    # Part 1: Calculate the log prior probability for each class
    
    # Calculate D, the number of documents
    D = len(train_y)
    
    labels = list(np.unique(train_y.astype(int)))
    count = np.zeros((len(labels),))

    for train_label in train_y:
        for unique_label in labels:
            if train_label == unique_label:
                count[unique_label]+=1
                
    # -> count = [4000, 4000]; it means perfectly balanced between each classese

    # Calculate prior probability for each class
    prior = count/D # -> prior = array([0.5, 0.5])
    
        
    # Calculate the logprior for each class
    logprior = np.log(prior) # -> prior = array([-0.69314718, -0.69314718])
    
    ########################################################################################
    # Part 2.a. Calculate the total number of word occurrences for each class
    
    columns = list(freqs.columns)
    N_classes = []
    # calculate N frequency for each class
    for column in columns:
        
        freqs[column] = (freqs[column] + 1)/(freqs[column].sum()+V)
        
    # Calculate the log likelihood of the word
    loglikelihood = np.log(freqs)
        
    ########################################################################################
        
    return logprior, loglikelihood

In [23]:
labels = np.array(labels)
logprior, loglikelihood = train_naive_bayes(freqs_df, sentences, labels)
print(logprior)
loglikelihood

[-0.69314718 -0.69314718]


Unnamed: 0_level_0,0,1
word,Unnamed: 1_level_1,Unnamed: 2_level_1
simplist,-8.386287,-9.560856
silli,-6.984488,-8.125772
tediou,-7.791580,-9.378535
laddish,-10.465728,-11.170294
juvenil,-9.367116,-10.071682
...,...,...
deplet,-11.158876,-10.477147
piscopo,-11.158876,-10.477147
chaykin,-11.158876,-10.477147
headli,-11.158876,-10.477147


## Build Testing Function

In [24]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    '''
    input:
        tweet: a string
        logprior: initial probability based on dataset
        loglikelihood: a dictionary of words mapping to numbers
    output:
        p: the sum of all the loglikelihood of each word in the tweet
        (if found in the dictionary) + logprior (a number)
    '''
    
    # process the tweet to get the list of words
    words = clean_doc(tweet)
    
    
    # Initialize probability to zero
    probs = []
    columns = list(loglikelihood.columns)
    for column in columns:
        prob = 0
        # Iterate for each word in word list
        for word in words:

            # check if the word exist in the loglikelihood dictionary
            if word in loglikelihood.index:
                prob += loglikelihood.loc[word, column]
        
        probs.append(prob)
        
    probs = logprior + probs
    y_hat = np.argmax(probs)
    
    return probs, y_hat

In [25]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    """
    input:
        test_x: A list of tweets
        test_y: the corresponding labels for the list of tweets
        logprior: the logprior
        loglikelihood: a dictionary with the loglikelihoods for each word
    output:
        accuracy: (# of tweets classified correctly)/total # of tweets
    """
    # initial accuracy
    acc = 0
    
    # initialize an empty list for storing the predictions
    y_hats = []

    for tweet in test_x:
        
        _ , y = naive_bayes_predict(tweet, logprior, loglikelihood)
        
#         y_hat = np.argmax(probs)
        
        # update the y_hats
        y_hats.append(y)
        
#     Error: the mean absolute values between y_hats and test_y
    error = np.mean(np.abs(np.array(y_hats)-np.array(test_y)))
        
#     Accuracy is 1 - error
    acc = 1-error
    return acc    

## KFold CV

In [26]:


# Separate the sentences and the labels
sentences, labels = list(corpus.sentence), np.array(list(corpus.label))

# Build the freqs dictionary for later uses
freqs = count_docs({}, sentences, labels)

# Turn the frequencies dictionary into dataframe
freqs_df = freqs_to_df(freqs, labels)
print(freqs_df.head())

# Retrieve the logprior and loglikelihood
logprior, loglikelihood = train_naive_bayes(freqs_df, sentences, labels)
print(logprior)
print(loglikelihood.head())

print("Naive Bayes accuracy = %0.4f" %
      (test_naive_bayes(sentences, labels, logprior, loglikelihood)))

             0     1
word                
simplist  15.0   4.0
silli     64.0  20.0
tediou    28.0   5.0
laddish    1.0   0.0
juvenil    5.0   2.0
[-0.69314718 -0.69314718]
                  0          1
word                          
simplist  -8.386287  -9.560856
silli     -6.984488  -8.125772
tediou    -7.791580  -9.378535
laddish  -10.465728 -11.170294
juvenil   -9.367116 -10.071682
Naive Bayes accuracy = 0.8987


In [27]:
# prepare cross validation
kfold = KFold(10, True)

# Separate the sentences and the labels
sentences, labels = list(corpus.sentence), list(corpus.label)
# count = 0
acc_list = []
# kfold.split() will return set indices for each split
for train, test in kfold.split(sentences):
    train_x, train_y = [], []
    test_x, test_y = [], []
    for i in train:
        train_x.append(sentences[i])
        train_y.append(labels[i])
        
    for i in test:
        test_x.append(sentences[i])
        test_y.append(labels[i])
    
    train_y = np.array(train_y)
    test_y = np.array(test_y)
    
    # Build the freqs dictionary for later uses
    freqs = count_docs({}, train_x, train_y)

    # Turn the frequencies dictionary into dataframe
    freqs_df = freqs_to_df(freqs, train_y)
#     print(freqs_df.head())

    # Retrieve the logprior and loglikelihood
    logprior, loglikelihood = train_naive_bayes(freqs_df, train_x, train_y)
    print('loglikelihood:\n{}'.format(loglikelihood.head()))
    print('logprior: {}'.format(logprior))
    
    acc = test_naive_bayes(test_x, test_y, logprior, loglikelihood)
    print("Naive Bayes test accuracy = %0.4f\n" %(acc))
    
    acc_list.append(acc)

acc_list = np.array(acc_list)
print()
print('The test ccuracy for each training:\n{}'.format(acc_list))
print('The mean of the test accuracy: ', acc_list.mean())



loglikelihood:
                  0          1
word                          
simplist  -8.290324  -9.690078
silli     -6.952039  -8.080640
tediou    -7.730708  -9.284613
laddish  -10.369766 -11.076372
juvenil   -9.271153  -9.977760
logprior: [-0.69554714 -0.69075297]
Naive Bayes test accuracy = 0.7844

loglikelihood:
                  0          1
word                          
simplist  -8.296406  -9.465308
silli     -7.043643  -8.184374
tediou    -7.850119  -9.282987
laddish  -10.375848 -11.074746
juvenil   -9.277235 -10.381599
logprior: [-0.69012933 -0.69617416]
Naive Bayes test accuracy = 0.7919

loglikelihood:
                 0         1
word                        
simplist -8.425548 -9.976227
silli    -6.953731 -8.184467
tediou   -7.806509 -9.465401
exploit  -7.663408 -8.772254
larg     -7.886551 -8.030317
logprior: [-0.69356411 -0.69273043]
Naive Bayes test accuracy = 0.7580

loglikelihood:
                  0          1
word                          
simplist  -8.505058  -9.4