In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelBinarizer, normalize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import scipy.sparse as sp

# Data Preparation

In [2]:
IMDB_review_df = pd.read_csv("IMDB Dataset_V1.csv")
IMDB_review_df.head(2)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive


In [3]:
def text_clean(text_series):
    text_series = text_series.str.lower()
    clean_2 = text_series.str.replace(r"<.*>|[^a-zA-Z\s]","")
    clean_3 = clean_2.str.replace(r"\s+", " ")
    return clean_3

In [4]:
IMDB_review_df["Text_Clean"] = text_clean(IMDB_review_df["review"])

In [5]:
train_x, test_x, train_y, test_y = train_test_split(IMDB_review_df["Text_Clean"], 
                                                    IMDB_review_df["sentiment"], 
                                                    test_size = 0.3, 
                                                    random_state = 8)

# SkLearn Implementation - BOW

In [6]:
BOW_Model = CountVectorizer(stop_words="english")
BOW_train_arr = BOW_Model.fit_transform(train_x)
BOW_test_arr = BOW_Model.transform(test_x)

In [7]:
Alpha = 1
MNB_Model = MultinomialNB(alpha=Alpha)
MNB_Model.fit(BOW_train_arr, train_y)
prediction = MNB_Model.predict(BOW_test_arr)

In [8]:
pd.crosstab(prediction, test_y)

sentiment,negative,positive
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
negative,365,105
positive,82,348


# SkLearn Implementation - Tf Idf

In [9]:
Tf_Idf_Model = TfidfVectorizer(stop_words="english")
Tf_Idf_train_arr = Tf_Idf_Model.fit_transform(train_x)
Tf_Idf_test_arr = Tf_Idf_Model.transform(test_x)

In [10]:
Alpha = 1
MNB_Model = MultinomialNB(alpha=Alpha)
MNB_Model.fit(Tf_Idf_train_arr, train_y)
predict_arr = MNB_Model.predict(Tf_Idf_test_arr)

In [11]:
pd.crosstab(predict_arr, test_y)

sentiment,negative,positive
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
negative,374,115
positive,73,338


# Scratch Implementation - Tf Idf

In [12]:
n_samples,n_features = BOW_train_arr.shape
doc_count_arr = np.bincount(BOW_train_arr.indices)
doc_count_arr = doc_count_arr +1
n_samples = n_samples+1
idf_arr = np.log(n_samples/doc_count_arr)+1

In [15]:
tf_idf_train = np.multiply(BOW_train_arr.toarray(),idf_arr)
tf_idf_train = normalize(tf_idf_train)

In [16]:
tf_idf_test = np.multiply(BOW_test_arr.toarray(),idf_arr)
tf_idf_test = normalize(tf_idf_test)

# Improvement Tf Idf

In [23]:
lb_model = LabelBinarizer()
Y_arr = lb_model.fit_transform(train_y)
if Y_arr.shape[1] == 1:
    Y_arr = np.concatenate([1-Y_arr,Y_arr],axis = 1)
Y_arr.shape

(2100, 2)

In [29]:
BOW_Doc_Count_arr = BOW_train_arr.copy()
BOW_Doc_Count_arr[BOW_Doc_Count_arr>0] = 1
BOW_Doc_Count_arr.shape

(2100, 22087)

In [46]:
Cat_Doc_Count = np.dot(Y_arr.transpose(), BOW_Doc_Count_arr.toarray())
Cat_Total_Doc_Count = Y_arr.sum(axis=0)
Cat_Total_Doc_Count = Cat_Total_Doc_Count.reshape(-1,1)

In [49]:
idf_arr_train = np.log((Cat_Total_Doc_Count+1)/(Cat_Doc_Count+1))

In [61]:
improved_idf_dict = {}
for i in range(idf_arr_train.shape[0]):
    improved_idf_dict[i] = np.delete(idf_arr_train,i,0).sum(axis = 0)/idf_arr_train[i]
for i in range(idf_arr_train.shape[0]):
    idf_arr_train[i] = improved_idf_dict[i]
idf_arr_train_improved = np.dot(Y_arr,idf_arr_train)

In [67]:
tf_idf_arr_train = np.multiply(BOW_train_arr.toarray(), idf_arr_train_improved)
tf_idf_arr_train = normalize(tf_idf_arr_train)

In [70]:
lb_model = LabelBinarizer()
Y_arr_test = lb_model.fit_transform(test_y)
if Y_arr_test.shape[1] == 1:
    Y_arr_test = np.concatenate([1-Y_arr_test,Y_arr_test],axis = 1)
Y_arr_test.shape

(900, 2)

In [71]:
idf_arr_test_improved = np.dot(Y_arr_test,idf_arr_train)

In [72]:
tf_idf_arr_test = np.multiply(BOW_test_arr.toarray(), idf_arr_test_improved)
tf_idf_arr_test = normalize(tf_idf_arr_test)

In [78]:
lb_model = LabelBinarizer()
Y_arr = lb_model.fit_transform(train_y)
if Y_arr.shape[1] == 1:
    Y_arr = np.concatenate([1-Y_arr,Y_arr],axis = 1)
cat_count_arr = np.log(np.sum(Y_arr,axis = 0)/np.sum(Y_arr))
classes = lb_model.classes_
consolidated_train_df = np.dot(np.transpose(Y_arr),tf_idf_arr_train)
prob_table_numer = consolidated_train_df + 1
prob_table_denom = np.sum(prob_table_numer,axis=1)
prob_table = np.log(prob_table_numer) - np.log(prob_table_denom.reshape(-1,1))

In [79]:
predict_arr = classes[np.argmax(np.dot(tf_idf_arr_test,np.transpose(prob_table))+cat_count_arr,axis=1)]
pd.crosstab(predict_arr, test_y)

sentiment,negative,positive
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
negative,415,52
positive,32,401
