In [5]:
import pandas as pd
import numpy as np

In [6]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from sklearn.preprocessing import LabelEncoder

In [7]:
original_review = '"http://video.google.com/videoplay?docid=211772166650071408&hl=en Distribution was tried.<br /><br />We opted for mass appeal.<br /><br />We want the best possible viewing range so, we forgo profit and continue our manual labor jobs gladly to entertain you for working yours.<br /><br />View Texas tale, please write about it... If you like it or not, if you like Alex or not, if you like Stuie, Texas or Texas tale... Just write about it.<br /><br />Your opinion rules."'


In [8]:
tokenizer = RegexpTokenizer('[a-z]+')
sw = set(stopwords.words('English'))
sw.add('br')
sw.remove('not')
sw.remove("didn't")
sw.remove("don't")
ls = LancasterStemmer()

In [9]:
def review_process(review):
    review = review.lower()
    review = tokenizer.tokenize(review)
    review = [x for x in review if x not in sw]
    review = [ls.stem(x) for x in review]
    review = ' '.join(review)
    return review

In [10]:
review = review_process(original_review)
review

'http video googl com videoplay docid hl en distribut tri opt mass ap want best poss view rang forgo profit continu man lab job glad entertain work view texa tal pleas writ lik not lik alex not lik stuy texa texa tal writ opin rul'

# Importing data

In [11]:
df = pd.read_csv('reviews/Train.csv')
df.head()

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http://video.google.com/videoplay?docid=211772...,pos
2,Title: Opera (1987) Director: Dario Argento Ca...,pos
3,I think a lot of people just wrote this off as...,pos
4,This is a story of two dogs and a cat looking ...,pos


In [12]:
doc = df.iloc[:,0].values

In [13]:
doc = [review_process(x) for x in doc]

In [14]:
y = df.iloc[:,1]
le = LabelEncoder()
le.fit(y)
y =le.transform(y)
y

array([1, 1, 1, ..., 0, 1, 1])

# Vectorization

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
cv = CountVectorizer()

In [17]:
vec_rev = cv.fit_transform(doc).toarray()

In [18]:
print(vec_rev)
print(vec_rev.shape)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(40000, 51094)


In [19]:
# to get vector feature names
#print(cv.get_feature_names())

# Implementing Model

In [20]:
from sklearn.naive_bayes import MultinomialNB

In [21]:
mnb = MultinomialNB()

In [22]:
mnb.fit(vec_rev,y)

In [33]:
test = pd.read_csv('reviews/Test/Test.csv')
test.head()
test.shape

(10000, 1)

In [24]:
test = test.iloc[:,0].values
test = [review_process(x) for x in test]
test = cv.transform(test)

In [31]:
y_test = mnb.predict(test)

In [26]:
y_test = le.inverse_transform(y_test)

In [27]:
final = pd.DataFrame(np.arange(10000),columns = ['Id'])

In [28]:
final['label'] = y_test

In [29]:
final.head()

Unnamed: 0,Id,label
0,0,neg
1,1,pos
2,2,neg
3,3,pos
4,4,pos


In [30]:
final.to_csv('Final.csv',index = False)