In [1]:
import pandas as pd

# Import Files

In [2]:
df = pd.read_csv('train.csv')

In [3]:
df.head()

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http://video.google.com/videoplay?docid=211772...,pos
2,Title: Opera (1987) Director: Dario Argento Ca...,pos
3,I think a lot of people just wrote this off as...,pos
4,This is a story of two dogs and a cat looking ...,pos


In [4]:
df.shape

(40000, 2)

In [5]:
df['label'].value_counts()

pos    20011
neg    19989
Name: label, dtype: int64

In [6]:
x_train = df['review']

In [7]:
y_train = df['label']

In [8]:
df_test = pd.read_csv('test.csv')

In [9]:
x_test = list(df_test.review)

In [10]:
x_train

0        mature intelligent and highly charged melodram...
1        http://video.google.com/videoplay?docid=211772...
2        Title: Opera (1987) Director: Dario Argento Ca...
3        I think a lot of people just wrote this off as...
4        This is a story of two dogs and a cat looking ...
                               ...                        
39995    There are similarities between Ray Lawrence's ...
39996    For starters, I once met the director when he ...
39997    Much of "Over Her Dead Body" is so painfully u...
39998    "Lifeforce" is a truly bizarre adaptation of t...
39999    I saw this movie at a screener and its the bes...
Name: review, Length: 40000, dtype: object

In [11]:
x_train = list(x_train)

In [12]:
# Preview only the first few raw reviews; echoing the whole list would write
# every test review into the notebook output.
x_test[:3]

['Remember those old kung fu movies we used to watch on Friday and Saturday late nights when our babysitters THOUGHT we were in charge? Well, this movie plays exactly like one of those movies. Patsy Kensit\'s biggest claim to fame was the love interest to Mel Gibson\'s character in "Lethal Weapon 2," and this performance was one of the reasons why she\'s never made it big: she\'s a terrible actress.<br /><br />In "Lethal Weapon 2," I thought she was cute. Cute enough to check out some of the other movies she\'d been in, including "Loves Music, Loves to Dance" another big let down, which I, obviously, was not impressed with, either. But, as attractive as she is to my eyes, my soul screamed at me to turn it off because she played another cheap, predictable role, and done it very badly.<br /><br />In this movie, Kensit stars as a comedienne (and not a good one, either) who\'s working the clubs of France (couldn\'t cut it in her own homeland, so she\'s making THEIR ears bleed), who\'s down

# Cleaning

In [13]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer

In [14]:
# Shared preprocessing objects, created once and reused by getCleanReview.
# NOTE(review): stopwords.words('english') presumably needs the NLTK 'stopwords'
# corpus to be present locally (nltk.download('stopwords')) — confirm on a fresh environment.
en_stopwords = set(stopwords.words('english'))  # a set, so membership checks don't scan a list
tokenizer = RegexpTokenizer(r'\w+')  # each token is a run of word characters
ps = PorterStemmer()  # Porter stemmer applied to every kept token

In [15]:
def getCleanReview(review):
    """Normalize one raw review into a space-joined string of stemmed tokens.

    Steps: lowercase the text, replace every ``<br />`` tag with a space,
    tokenize with the module-level ``tokenizer``, drop tokens present in
    ``en_stopwords``, and stem the remaining tokens with ``ps``.

    Args:
        review: Raw review text (str).

    Returns:
        str: The cleaned tokens joined by single spaces; an empty string when
        no token survives the filtering.
    """
    review = review.lower()
    # Replace each tag on its own. Matching only the doubled form
    # '<br /><br />' lets a lone '<br />' through, and the tokenizer then
    # emits a meaningless 'br' token that pollutes the vocabulary.
    review = review.replace('<br />', ' ')

    # Tokenize, filter stop words and stem in a single pass.
    return ' '.join(
        ps.stem(token)
        for token in tokenizer.tokenize(review)
        if token not in en_stopwords
    )
    

In [16]:
getCleanReview("mature intelligent and highly charged melodrama unbelivebly filmed in China in 1948. wei wei's stunning performance as the catylast in a love triangle is simply stunning if you have the oppurunity to see this magnificent film take it.")

'matur intellig highli charg melodrama unbelivebl film china 1948 wei wei stun perform catylast love triangl simpli stun oppurun see magnific film take'

In [17]:
# Run every training review through the cleaning pipeline, keeping order.
x_train = list(map(getCleanReview, x_train))

In [18]:
# Spot-check the first few cleaned reviews instead of echoing the whole list.
x_train[:3]

['matur intellig highli charg melodrama unbelivebl film china 1948 wei wei stun perform catylast love triangl simpli stun oppurun see magnific film take',
 'http video googl com videoplay docid 211772166650071408 hl en distribut tri opt mass appeal want best possibl view rang forgo profit continu manual labor job gladli entertain work view texa tale pleas write like like alex like stuie texa texa tale write opinion rule',
 'titl opera 1987 director dario argento cast cristina masillach ian charleson urbano barberini daria nicolodi review argento movi seen suspiria one blew away style color spooki stori line next decid go opera told one best man think discov ultim one favorit horror director opera young opera singer get big break main star creepi modern opera take mc beth get hit car betti understudi get part bad psycho make watch brutal murder friend co worker wow id heard good thing flick prepar level great film would take yeah movi shortcom ill get later part movi blew away first mov

In [19]:
# Apply the same cleaning pipeline to the test reviews, keeping order.
x_test = list(map(getCleanReview, x_test))

In [20]:
# Spot-check the first few cleaned reviews instead of echoing the whole list.
x_test[:3]

['rememb old kung fu movi use watch friday saturday late night babysitt thought charg well movi play exactli like one movi patsi kensit biggest claim fame love interest mel gibson charact lethal weapon 2 perform one reason never made big terribl actress lethal weapon 2 thought cute cute enough check movi includ love music love danc anoth big let obvious impress either attract eye soul scream turn play anoth cheap predict role done badli movi kensit star comedienn good one either work club franc cut homeland make ear bleed luck even wors french govern want throw expir visa mayb caught act get marri casanova freiss luck predict begin terribl way give movi neg rate 1 10 star rate',
 'movi anoth one list movi bother saw 40 year ago adolesc stay late annoy find 95 romanc 4 everyth els 1 histori call bait switch movi one interest titl actual movi scam subject deserv good cinemat treatment movi almost insult serv actual member lafayett escadril run law product abus home realiti idealist want 

In [21]:
print(len(x_train))
print(len(x_test))

40000
10000


# Vectorization

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

In [23]:
# Bag-of-words counts over unigrams and bigrams (ngram_range=(1, 2)).
# The vocabulary is learned from the training reviews only; the test reviews are
# later encoded with cv.transform so both matrices share the same columns.
cv = CountVectorizer(ngram_range=(1,2))
x_vec = cv.fit_transform(x_train)

In [24]:
print(x_vec)

  (0, 1248488)	1
  (0, 1029431)	1
  (0, 941511)	1
  (0, 339466)	1
  (0, 1263578)	1
  (0, 2099263)	1
  (0, 743520)	2
  (0, 353383)	1
  (0, 7162)	1
  (0, 2191716)	2
  (0, 1932922)	2
  (0, 1477767)	1
  (0, 320321)	1
  (0, 1194723)	1
  (0, 2072109)	1
  (0, 1819408)	1
  (0, 1423833)	1
  (0, 1752554)	1
  (0, 1213598)	1
  (0, 1973513)	1
  (0, 1248604)	1
  (0, 1029728)	1
  (0, 941565)	1
  (0, 339670)	1
  (0, 1263816)	1
  :	:
  (39999, 1813539)	1
  (39999, 1813544)	1
  (39999, 1551937)	1
  (39999, 912814)	1
  (39999, 2211394)	1
  (39999, 908913)	1
  (39999, 1446628)	1
  (39999, 1476321)	1
  (39999, 854818)	1
  (39999, 464350)	1
  (39999, 1762344)	1
  (39999, 1187352)	1
  (39999, 1325729)	1
  (39999, 1742460)	1
  (39999, 796784)	1
  (39999, 1863717)	1
  (39999, 967952)	1
  (39999, 1283096)	1
  (39999, 1510213)	1
  (39999, 1367643)	1
  (39999, 958793)	1
  (39999, 1707148)	1
  (39999, 803296)	1
  (39999, 464364)	1
  (39999, 1862858)	1


In [25]:
x_vec.shape

(40000, 2270363)

In [26]:
# Vocabulary size. cv.vocabulary_ is the fitted term -> column-index mapping, so
# its length is the feature count. This avoids get_feature_names(), which newer
# scikit-learn releases removed and which builds the full term list just to count it.
print(len(cv.vocabulary_))

2270363


In [27]:
x_test_vec = cv.transform(x_test)

In [28]:
x_test_vec.shape

(10000, 2270363)

# Multinomial Naive Bayes

In [29]:
from sklearn.naive_bayes import MultinomialNB
import numpy as np

In [30]:
mnb = MultinomialNB()

In [31]:
mnb.fit(x_vec, y_train)

MultinomialNB()

In [32]:
pred = mnb.predict(x_test_vec)

In [33]:
pred

array(['neg', 'neg', 'neg', ..., 'pos', 'pos', 'neg'], dtype='<U3')

In [34]:
mnb.predict_proba(x_test_vec)

array([[1.00000000e+00, 3.55972836e-15],
       [9.99927183e-01, 7.28173795e-05],
       [1.00000000e+00, 3.70712389e-35],
       ...,
       [1.14834531e-26, 1.00000000e+00],
       [3.20368799e-13, 1.00000000e+00],
       [1.00000000e+00, 1.44002209e-43]])

In [35]:
# One sequential id per prediction; derived from the prediction count so the
# submission index stays correct if the test set size changes.
Id = np.arange(len(pred))

In [36]:
f = pd.DataFrame(pred, columns=['label'])

In [37]:
# Sequential ids sized to the frame itself rather than a hard-coded row count,
# so the assignment cannot fail with a length mismatch.
f['Id'] = np.arange(len(f))

In [38]:
f = f[['Id', 'label']]

In [39]:
f.to_csv('MovieReview.csv', index=False)

# Multivariate Bernoulli Event Model Naive Bayes

In [40]:
from sklearn.naive_bayes import BernoulliNB

In [41]:
bnb = BernoulliNB()

In [42]:
bnb.fit(x_vec, y_train)

BernoulliNB()

In [43]:
bnb.predict_proba(x_test_vec)

array([[1.00000000e+00, 5.35610832e-14],
       [9.99989907e-01, 1.00928668e-05],
       [1.00000000e+00, 5.36571705e-34],
       ...,
       [3.62523096e-20, 1.00000000e+00],
       [2.65968120e-10, 1.00000000e+00],
       [1.00000000e+00, 4.52581913e-37]])

In [44]:
pred2 = bnb.predict(x_test_vec)

In [45]:
pred2

array(['neg', 'neg', 'neg', ..., 'pos', 'pos', 'neg'], dtype='<U3')

In [46]:
f2 = pd.DataFrame(pred2, columns=['label'], index=Id)

In [47]:
f2.to_csv('MovieReview2.csv', index_label='Id')

# Decision Trees

In [48]:
from sklearn.tree import DecisionTreeClassifier

In [49]:
clf = DecisionTreeClassifier()

# Random Forest

In [50]:
from sklearn.ensemble import RandomForestClassifier

In [51]:
# Fix the seed so the forest is reproducible across runs, and train the trees on
# all available cores; the feature matrix is very wide, so a single-threaded fit is slow.
rfc = RandomForestClassifier(n_jobs=-1, random_state=42)

In [None]:
rfc.fit(x_vec, y_train)