## Importing the libraries

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import clean_text as ct
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB,BernoulliNB,GaussianNB

## Importing the dataset

In [5]:
def _read_reviews(path):
    """Load a text file that stores one review per line.

    pd.read_csv(..., sep='\n') is rejected by current pandas (the line
    terminator cannot be used as a separator), and the default comma separator
    would split reviews on every comma, so the file is read line by line.

    Returns an object ndarray of shape (n_reviews, 1), i.e. the same layout the
    previous single-column read_csv(...).values call produced.
    """
    with open(path, encoding='utf-8') as handle:
        # Blank lines are skipped, mirroring read_csv's default behaviour.
        reviews = [line.rstrip('\n') for line in handle if line.strip()]
    return np.array(reviews, dtype=object).reshape((-1, 1))


def _read_labels(path):
    """Load a text file with one label per line as a flat 1-D array."""
    # Each line holds a single value, so the default separator is sufficient.
    return pd.read_csv(path, header=None).values.reshape((-1,))


X_train = _read_reviews('dataset/imdb_trainX.txt')
Y_train = _read_labels('dataset/imdb_trainY.txt')
X_test = _read_reviews('dataset/imdb_testX.txt')
Y_test = _read_labels('dataset/imdb_testY.txt')

In [6]:
# Sanity check: container type of the reviews, then the shape of every split.
print(type(X_train))
for split in (X_train, Y_train, X_test, Y_test):
    print(split.shape)

<class 'numpy.ndarray'>
(25000, 1)
(25000,)
(25000, 1)
(25000,)


In [7]:
# Peek at the first three raw training reviews (each row is a 1-element array).
for row in range(3):
    print(X_train[row])

["I loved this movie since I was 7 and I saw it on the opening day. It was so touching and beautiful. I strongly recommend seeing for all. It's a movie to watch with your family by far.<br /><br />My MPAA rating: PG-13 for thematic elements, prolonged scenes of disastor, nudity/sexuality and some language."]
["First things first, Edison Chen did a fantastic, believable job as a Cambodian hit-man, born and bred in the dumps and a gladiatorial ring, where he honed his craft of savage battery in order to survive, living on the mantra of kill or be killed. In a role that had little dialogue, or at least a few lines in Cambodian/Thai, his performance is compelling, probably what should have been in the Jet Li vehicle Danny the Dog, where a man is bred for the sole purpose of fighting, and on someone else's leash.<br /><br />Like Danny the Dog, the much talked about bare knuckle fight sequences are not choreographed stylistically, but rather designed as normal, brutal fisticuffs, where every

In [8]:
# Labels belonging to the three training reviews shown above.
head_train_labels = Y_train[0:3]
print(head_train_labels)

[10  8  7]


In [9]:
# Peek at the first three raw test reviews (each row is a 1-element array).
for row in range(3):
    print(X_test[row])

["not really sure what to make of this movie. very weird, very artsy. not the kind of movie you watch because it has a compelling plot or characters. more like the kind of movie that you can't stop watching because of the horrifically fascinating things happening on screen. although, the first time my wife watched this she couldn't make it all the way through... too disturbing for her. runs a bit long, but nonetheless a worthwhile viewing for those interested in very dark movies."]
["If you enjoyed films like Pulp Fiction, Reservoir Dogs, and Lock, Stock, and Two Smoking Barrels, you are going to LOVE Two Hands. It has the same type of black humor beat to it and will keep you entertained through the whole film. Like Pulp Fiction, it has the wacky scenarios that the characters get into and how they deal with them. Along with Gallipoli and Picnic at Hanging Rock, this has to be one of the best Australian films I've seen. It also stars a young Heath Ledger before he got real big in the st

In [10]:
# Labels belonging to the three test reviews shown above.
head_test_labels = Y_test[0:3]
print(head_test_labels)

[ 7 10 10]


## 1. Cleaning the dataset

In [11]:
# Run every review row through the project's cleaning helper; results are plain lists.
# NOTE(review): getCleanReview lives in the local clean_text module — judging by the
# printed samples it lower-cases, strips markup/stop-words and stems; confirm there.
X_train_clean = list(map(ct.getCleanReview, X_train))
X_test_clean = list(map(ct.getCleanReview, X_test))

In [12]:
# NOTE(review): disabled debugging snippet kept as a bare string literal, not a comment.
# Because the string is the cell's last expression, the notebook echoes it as output.
# It inspected how a single X_train row looks after str() conversion.
'''
for review in X_train:
    review = str(review)
    print(type(review))
    print(review)
    break
'''

'\nfor review in X_train:\n    review = str(review)\n    print(type(review))\n    print(review)\n    break\n'

In [13]:
# The cleaned corpus is held in a built-in container rather than an ndarray.
clean_container_type = type(X_train_clean)
print(clean_container_type)

<class 'list'>


In [14]:
# First three cleaned training reviews, each followed by an empty line.
for idx in range(3):
    print(X_train_clean[idx])
    print()

love movi sinc 7 saw open day touch beauti strongli recommend see movi watch famili far mpaa rate pg 13 themat element prolong scene disastor nuditi sexual languag

first thing first edison chen fantast believ job cambodian hit man born bred dump gladiatori ring hone craft savag batteri order surviv live mantra kill kill role littl dialogu least line cambodian thai perform compel probabl jet li vehicl danni dog man bred sole purpos fight someon els leash like danni dog much talk bare knuckl fight sequenc not choreograph stylist rather design normal brutal fisticuff everyth goe probabl brought sens realism grit see charact slug throat defend live take away other grim gritti dark movi liter figur set apart usual run mill cop thriller product edison play hire gun cambodia becom fugit hong kong run cop pickup gone awri lead chase team led cheung siu fai contend maverick member inspector ti sam lee inclus accept team sin father begin cat mous game dark shade shadow seedier look side hong ko

In [15]:
# First three cleaned test reviews, each followed by an empty line.
for idx in range(3):
    print(X_test_clean[idx])
    print()

not realli sure make movi veri weird veri artsi not kind movi watch compel plot charact like kind movi stop watch horrif fascin thing happen screen although first time wife watch make way disturb run bit long nonetheless worthwhil view interest veri dark movi

enjoy film like pulp fiction reservoir dog lock stock two smoke barrel go love two hand type black humor beat keep entertain whole film like pulp fiction wacki scenario charact get deal along gallipoli picnic hang rock one best australian film seen also star young heath ledger got real big state terribl underr movi believ good pulp fiction great see

okay deal american pilot fli along mind busi suddenli outnumb evil cowardli non american fighter plane middl eastern type suffic say like appl pie elvi presley proceed shoot american pilot noth wrong evil non american care know bang foreign jail sentenc death would normal happen us militari would carpet bomb coupl nearbi town pilot releas not time evil peac lovin type probabl got inv

## 2. Vectorization

In [50]:
# Bag-of-words model; binary=True records token presence instead of counts.
# NOTE(review): the recorded output reports 10000 columns although no vocabulary cap
# is set here — presumably the cell was last run with one (e.g. max_features); confirm.
cv = CountVectorizer(binary=True)

# Fit the vocabulary on the training corpus, then convert the sparse result to a dense array.
train_sparse = cv.fit_transform(X_train_clean)
X_train_vec = train_sparse.toarray()
print(X_train_vec, X_train_vec.shape, sep='\n')

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(25000, 10000)


In [51]:
# Encode the test set with the vocabulary already learned by `cv` (transform only, no refit).
test_sparse = cv.transform(X_test_clean)
X_test_vec = test_sparse.toarray()
print(X_test_vec, X_test_vec.shape, sep='\n')

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(25000, 10000)


## 3. Multinomial Naive Bayes

In [52]:
# Multinomial Naive Bayes with its default settings spelled out explicitly
# (these are the values shown in the estimator's recorded repr).
mnb = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)
print(mnb)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


In [53]:
# Fit the classifier on the vectorized training reviews and their labels.
# fit() returns the estimator, which the notebook shows as the cell output.
mnb.fit(X=X_train_vec, y=Y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [54]:
# Predict a label for every vectorized test review.
y_test_pred = mnb.predict(X=X_test_vec)

In [55]:
# Mean accuracy on the training split, then on the held-out test split.
# The recorded run shows a large gap between the two (0.60716 vs 0.37428).
train_accuracy = mnb.score(X_train_vec, Y_train)
test_accuracy = mnb.score(X_test_vec, Y_test)
print(train_accuracy)
print(test_accuracy)

0.60716
0.37428
