In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,TfidfVectorizer

In [2]:
df = pd.read_csv('Train/Train.csv')
df.head(n=10)

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http://video.google.com/videoplay?docid=211772...,pos
2,Title: Opera (1987) Director: Dario Argento Ca...,pos
3,I think a lot of people just wrote this off as...,pos
4,This is a story of two dogs and a cat looking ...,pos
5,Steve Carell comes into his own in his first s...,pos
6,I'm only going to write more because it's requ...,neg
7,"OK, it was a ""risky"" move to rent this flick, ...",neg
8,"Cannibalism, a pair of cinematic references to...",pos
9,This is one of the great modern kung fu films....,pos


In [3]:
data = df.values
x_data = data[:,0]
y_data = data[:,1]
del(data)

In [4]:
y_data

array(['pos', 'pos', 'pos', ..., 'neg', 'pos', 'pos'], dtype=object)

In [5]:
x_data.shape

(40000,)

In [6]:
tokenizer = RegexpTokenizer(r'\w+')
ps = PorterStemmer()
en_stopwords = set(stopwords.words('english'))
en_stopwords.remove('not')

In [7]:
def clean_text(review):
    review = review.lower()
    review = review.replace("<br /><br />"," ")
    tokens = tokenizer.tokenize(review)
    useful = [t for t in tokens if t not in en_stopwords]
    stemmed_tokens = [ps.stem(x) for x in useful]
    cleaned = ' '.join(stemmed_tokens)
    return cleaned

In [8]:
x_clean = [clean_text(x) for x in x_data]

In [9]:
cv = CountVectorizer(ngram_range=(1,2),max_df=0.5)

In [38]:
tfidf = TfidfTransformer()

In [11]:
x_vect = cv.fit_transform(x_clean,y_data)

In [39]:
x_tfidf = tfidf.fit_transform(x_vect)

In [40]:
print(x_clean[0])
print(x_tfidf[0])
print(x_tfidf.shape)

matur intellig highli charg melodrama unbelivebl film china 1948 wei wei stun perform catylast love triangl simpli stun oppurun see magnific film take
  (0, 2187159)	0.18184579634521478
  (0, 2187157)	0.18886920959102335
  (0, 2187147)	0.3219814757289457
  (0, 2095104)	0.18886920959102335
  (0, 2095103)	0.18886920959102335
  (0, 2068160)	0.18886920959102335
  (0, 2068073)	0.1173795308800621
  (0, 1969981)	0.04511437963596305
  (0, 1929890)	0.1489841336169357
  (0, 1929876)	0.18886920959102335
  (0, 1929607)	0.17519593297336655
  (0, 1817548)	0.15179926512715383
  (0, 1816563)	0.06469601195387824
  (0, 1752617)	0.164856001095949
  (0, 1749995)	0.03424179727701697
  (0, 1476819)	0.18886920959102335
  (0, 1476403)	0.04925380180036575
  (0, 1422696)	0.18886920959102335
  (0, 1422695)	0.18886920959102335
  (0, 1258120)	0.18886920959102335
  (0, 1257883)	0.1023081561888416
  (0, 1243134)	0.16716901430703932
  (0, 1243021)	0.09915976222568558
  (0, 1208367)	0.15179926512715383
  (0, 1208260)	

In [41]:
len(cv.get_feature_names())

2265303

In [43]:
from sklearn.naive_bayes import MultinomialNB

In [44]:
mnb = MultinomialNB()

In [45]:
mnb.fit(x_tfidf,y_data)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [24]:
dft = pd.read_csv('Test/Test.csv')
dft.shape

(10000, 1)

In [25]:
X_test = dft.values[:,0]

In [26]:
print(type(X_test))

<class 'numpy.ndarray'>


In [27]:
xt_clean = [clean_text(x) for x in X_test]

In [28]:
xt_vect = cv.transform(xt_clean)

In [47]:
xt_tfidf = tfidf.transform(xt_vect)

In [48]:
predtfidf = mnb.predict(xt_tfidf)

In [49]:
predtfidf = predtfidf.reshape((10000,1))

In [50]:
df2 = pd.DataFrame(data=predtfidf,columns=["label"])

In [52]:
df2.to_csv('movie2.csv',index_label="id")