#### Importing the training dataset

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the labelled training reviews; the path is relative to the notebook's
# working directory.
data = pd.read_csv('movie_Train.csv')
# Preview the first rows to check that the columns loaded as expected.
data.head()

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http://video.google.com/videoplay?docid=211772...,pos
2,Title: Opera (1987) Director: Dario Argento Ca...,pos
3,I think a lot of people just wrote this off as...,pos
4,This is a story of two dogs and a cat looking ...,pos


In [3]:
# (rows, columns) of the training frame.
data.shape

(40000, 2)

In [4]:
# Count missing values per column.
data.isnull().sum()

review    0
label     0
dtype: int64

In [5]:
# Class balance: number of reviews per label.
data['label'].value_counts()

pos    20011
neg    19989
Name: label, dtype: int64

#### Importing the test data

In [6]:
# Load the unlabelled test reviews; the path is relative to the notebook's
# working directory.
test_data = pd.read_csv('movie_Test.csv')
# Preview the first rows.
test_data.head()

Unnamed: 0,review
0,Remember those old kung fu movies we used to w...
1,This movie is another one on my List of Movies...
2,How in the world does a thing like this get in...
3,"""Queen of the Damned"" is one of the best vampi..."
4,The Caprica episode (S01E01) is well done as a...


In [8]:
# (rows, columns) of the test frame.
test_data.shape

(10000, 1)

In [9]:
# Pull the raw test review texts out of the frame as a plain Python list.
x_test = test_data['review'].tolist()
# Show the first raw review.
x_test[0]

'Remember those old kung fu movies we used to watch on Friday and Saturday late nights when our babysitters THOUGHT we were in charge? Well, this movie plays exactly like one of those movies. Patsy Kensit\'s biggest claim to fame was the love interest to Mel Gibson\'s character in "Lethal Weapon 2," and this performance was one of the reasons why she\'s never made it big: she\'s a terrible actress.<br /><br />In "Lethal Weapon 2," I thought she was cute. Cute enough to check out some of the other movies she\'d been in, including "Loves Music, Loves to Dance" another big let down, which I, obviously, was not impressed with, either. But, as attractive as she is to my eyes, my soul screamed at me to turn it off because she played another cheap, predictable role, and done it very badly.<br /><br />In this movie, Kensit stars as a comedienne (and not a good one, either) who\'s working the clubs of France (couldn\'t cut it in her own homeland, so she\'s making THEIR ears bleed), who\'s down 

In [10]:
# Container type and number of test reviews, one per line.
print(type(x_test), len(x_test), sep='\n')

<class 'list'>
10000


#### Data Preprocessing and Cleaning

In [11]:
# Raw training texts as a plain list; labels stay a pandas Series.
x_train = data['review'].tolist()
y_train = data['label']

In [12]:
# Texts and labels must have the same length.
print(len(x_train), len(y_train), sep='\n')

40000
40000


In [13]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [14]:
# Shared text-processing helpers; `ps` and `sw` are read by cleanReview.
ps = PorterStemmer()
# NOTE(review): `wn` is not referenced by any cell visible in this notebook —
# confirm it is still needed.
wn = WordNetLemmatizer()
# Stored as a set so the per-token stopword membership test is fast.
sw = set(stopwords.words('english'))

In [15]:
def cleanReview(review):
    """Normalise one raw review into a space-separated string of stemmed tokens.

    Steps: lower-case, replace HTML line-break tags with a space, replace
    every non-letter character with a space, tokenize, drop English
    stopwords and Porter-stem what is left.

    Relies on the notebook-level names ``ps`` (stemmer) and ``sw``
    (stopword set) being defined before the call.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing ``<br />`` markup.

    Returns
    -------
    str
        The cleaned review; an empty string if no token survives filtering.
    """
    review = review.lower()
    # Strip every <br> variant, not only the doubled '<br /><br />': a lone
    # tag would otherwise survive the letter filter as the junk token 'br'.
    review = re.sub(r'<br\s*/?>', ' ', review)
    # The text is already lower-case, so a-z covers every remaining letter.
    review = re.sub('[^a-z]', ' ', review)

    tokens = word_tokenize(review)
    return ' '.join(ps.stem(word) for word in tokens if word not in sw)

In [16]:
# Sanity-check the cleaner on the first test review.
cleanReview(x_test[0])

'rememb old kung fu movi use watch friday saturday late night babysitt thought charg well movi play exactli like one movi patsi kensit biggest claim fame love interest mel gibson charact lethal weapon perform one reason never made big terribl actress lethal weapon thought cute cute enough check movi includ love music love danc anoth big let obvious impress either attract eye soul scream turn play anoth cheap predict role done badli movi kensit star comedienn good one either work club franc cut homeland make ear bleed luck even wors french govern want throw expir visa mayb caught act get marri casanova freiss luck predict begin terribl way give movi neg rate star rate'

In [17]:
# Raw text of the same review, for comparison with its cleaned form.
x_test[0]

'Remember those old kung fu movies we used to watch on Friday and Saturday late nights when our babysitters THOUGHT we were in charge? Well, this movie plays exactly like one of those movies. Patsy Kensit\'s biggest claim to fame was the love interest to Mel Gibson\'s character in "Lethal Weapon 2," and this performance was one of the reasons why she\'s never made it big: she\'s a terrible actress.<br /><br />In "Lethal Weapon 2," I thought she was cute. Cute enough to check out some of the other movies she\'d been in, including "Loves Music, Loves to Dance" another big let down, which I, obviously, was not impressed with, either. But, as attractive as she is to my eyes, my soul screamed at me to turn it off because she played another cheap, predictable role, and done it very badly.<br /><br />In this movie, Kensit stars as a comedienne (and not a good one, either) who\'s working the clubs of France (couldn\'t cut it in her own homeland, so she\'s making THEIR ears bleed), who\'s down 

In [18]:
# Clean every review in both corpora.
# NOTE(review): both names are rebound to their cleaned versions, so running
# this cell a second time would clean already-cleaned text.
x_train = list(map(cleanReview, x_train))
x_test = list(map(cleanReview, x_test))

In [19]:
# Cleaning must not change the number of reviews in either corpus.
print(len(x_train), len(x_test), sep='\n')

40000
10000


#### Building vocab and Vectorisation

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
# Bag-of-words counts over unigrams and bigrams.
# NOTE(review): 1787840 equals the column count reported after fitting, so
# this cap may simply be the full vocabulary size — confirm, and consider
# naming it as a constant or dropping it.
cv = CountVectorizer(
    ngram_range=(1, 2),
    max_features=1787840,
)

In [22]:
# Learn the n-gram vocabulary from the cleaned training reviews and build the
# training count matrix in one call.
x_vect = cv.fit_transform(x_train)
# (documents, features)
x_vect.shape

(40000, 1787840)

In [23]:
# Vocabulary size. `vocabulary_` (term -> column index) is available on every
# scikit-learn version, whereas `get_feature_names()` was deprecated in 1.0
# and removed in 1.2.
len(cv.vocabulary_)

1787840

In [24]:
# Vectorise the test reviews with the vocabulary fitted on the training set
# (transform only, no refit).
x_test_vect = cv.transform(x_test)

In [25]:
# The column count should match the training matrix.
print(x_test_vect.shape)

(10000, 1787840)


#### Multinomial Naive Bayes

In [26]:
from sklearn.naive_bayes import MultinomialNB

In [27]:
# Multinomial Naive Bayes, constructed with no arguments (library defaults).
mnb = MultinomialNB()
print(mnb)

MultinomialNB()


In [28]:
# Fit on the training count matrix and the 'pos'/'neg' labels.
mnb.fit(x_vect , y_train)

MultinomialNB()

In [30]:
# Predict a label for every vectorised test review.
predictions = mnb.predict(x_test_vect)

In [31]:
# Inspect the predicted labels.
predictions

array(['neg', 'neg', 'neg', ..., 'pos', 'pos', 'neg'], dtype='<U3')

In [33]:
# One-column frame of predicted labels on the default integer index.
df = pd.DataFrame({'label': predictions})
df.head()

Unnamed: 0,label
0,neg
1,neg
2,neg
3,pos
4,pos


In [34]:
# Write the predictions to CSV; the row index is written as a column named 'Id'.
df.to_csv('Movie-Rating.csv' , index_label = 'Id')

#### Using Multivariate Bernoulli Event Model Naive Bayes

In [36]:
from sklearn.naive_bayes import BernoulliNB
# Bernoulli Naive Bayes, constructed with no arguments (library defaults).
# NOTE(review): BernoulliNB's documented default is binarize=0.0, so the counts
# would be treated as presence/absence — confirm this is intended.
bnb = BernoulliNB()

In [37]:
# Fit on `x_vect`, the training count matrix produced by cv.fit_transform —
# the same features used for MultinomialNB.
bnb.fit(x_vect , y_train)

BernoulliNB()

In [38]:
# Predict on `x_test_vect`, the test matrix produced by cv.transform.
prediction = bnb.predict(x_test_vect)
prediction

array(['neg', 'neg', 'neg', ..., 'pos', 'pos', 'neg'], dtype='<U3')

In [40]:
# Wrap the Bernoulli predictions in a single 'label' column (rebinding `df`)
# and export them, writing the row index as the 'Id' column.
df = pd.DataFrame(prediction , columns = ['label'])
df.to_csv('Movie-Rating-2.csv' , index_label = 'Id')