In [46]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
# Suppress every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings("ignore")

import os
# List the files available in the input directory.
print(os.listdir("../input"))

['testData.tsv', 'sampleSubmission.csv', 'labeledTrainData.tsv', 'unlabeledTrainData.tsv']


In [47]:
# Read the labelled training set and the unlabelled test set.
# Both files are tab-separated and carry their column names on the first row.
train = pd.read_csv("../input/labeledTrainData.tsv", sep='\t', header=0)
test = pd.read_csv("../input/testData.tsv", sep='\t', header=0)

In [48]:
# Show the (rows, columns) shape of each frame.
print("Train set: ", train.shape, "Test set: ", test.shape)

Train set:  (25000, 3) Test set:  (25000, 2)


In [49]:
# Exploratory feature: number of characters in each raw review.
train['length'] = train['review'].map(len)
# Preview the first rows, including the new column.
train.head()

Unnamed: 0,id,sentiment,review,length
0,5814_8,1,With all this stuff going down at the moment w...,2302
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",946
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,2449
3,3630_4,0,It must be assumed that those who praised this...,2245
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,2231


## Sentiment labels
1 = positive, 0 = negative

In [50]:
# Summary statistics (count, mean, std, quartiles, min/max) computed separately per sentiment class.
train.groupby('sentiment').describe()

Unnamed: 0_level_0,length,length,length,length,length,length,length,length
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
sentiment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,12500.0,1305.72192,959.142634,52.0,711.0,978.0,1569.25,8999.0
1,12500.0,1349.6992,1048.890394,70.0,695.0,984.0,1653.0,13708.0


## Review length shows no strong relationship with sentiment (both classes have similar means and quartiles)

In [51]:
# The exploratory 'length' column is not used as a feature, so remove it again.
train.drop(columns=['length'], inplace=True)

In [52]:
# Preview the first rows of the test set.
test.head()

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [53]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

In [54]:
# Walk through the cleaning steps on the first training review.
# The parser is named explicitly so the result does not depend on which
# optional parsers happen to be installed in the environment.
bs_review1 = BeautifulSoup(train["review"][0], "html.parser")

# Replace every non-alphabetic character with a space
alph_only = re.sub("[^a-zA-Z]", " ", bs_review1.get_text())
words = alph_only.lower().split()

# Build the stop-word set once. Evaluating stopwords.words("english") inside the
# comprehension repeats the call for every token and scans a list each time;
# a set gives constant-time membership checks.
stop_words = set(stopwords.words("english"))
words = [w for w in words if w not in stop_words]  # Remove "stop words"
print(words)

['stuff', 'going', 'moment', 'mj', 'started', 'listening', 'music', 'watching', 'odd', 'documentary', 'watched', 'wiz', 'watched', 'moonwalker', 'maybe', 'want', 'get', 'certain', 'insight', 'guy', 'thought', 'really', 'cool', 'eighties', 'maybe', 'make', 'mind', 'whether', 'guilty', 'innocent', 'moonwalker', 'part', 'biography', 'part', 'feature', 'film', 'remember', 'going', 'see', 'cinema', 'originally', 'released', 'subtle', 'messages', 'mj', 'feeling', 'towards', 'press', 'also', 'obvious', 'message', 'drugs', 'bad', 'kay', 'visually', 'impressive', 'course', 'michael', 'jackson', 'unless', 'remotely', 'like', 'mj', 'anyway', 'going', 'hate', 'find', 'boring', 'may', 'call', 'mj', 'egotist', 'consenting', 'making', 'movie', 'mj', 'fans', 'would', 'say', 'made', 'fans', 'true', 'really', 'nice', 'actual', 'feature', 'film', 'bit', 'finally', 'starts', 'minutes', 'excluding', 'smooth', 'criminal', 'sequence', 'joe', 'pesci', 'convincing', 'psychopathic', 'powerful', 'drug', 'lord', 

In [55]:
_ENGLISH_STOP_WORDS = None  # filled in lazily by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, building it only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOP_WORDS


def review_to_words(reviews):
    """Clean a single raw review and return it as one space-separated string.

    Steps: strip HTML markup, replace every non-alphabetic character with a
    space, lower-case, split on whitespace, and drop English stop words.

    Parameters
    ----------
    reviews : str
        Raw text of ONE review (despite the plural name), possibly with HTML.

    Returns
    -------
    str
        The remaining lower-case words joined by single spaces; an empty
        string if nothing is left.
    """
    # Remove HTML. The parser is named explicitly so the output does not depend
    # on which optional parsers are installed.
    text = BeautifulSoup(reviews, "html.parser").get_text()

    # Remove non-alphabetical characters
    alpha_only = re.sub("[^a-zA-Z]", " ", text)

    # Stop words are fetched once and cached as a set: the original evaluated
    # stopwords.words("english") for every token, which dominated the runtime
    # when this function is applied to tens of thousands of reviews.
    stop_words = _english_stop_words()

    # Convert to lower case, split into individual words, remove "stop words"
    words = [w for w in alpha_only.lower().split() if w not in stop_words]

    # Join words separated by spaces
    return ' '.join(words)

In [56]:
# Sanity-check the cleaning function on the first training review.
review = review_to_words(train.loc[0, "review"])
print(review)

stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors hate working

In [57]:
# Run every training review through review_to_words, replacing the raw text in place.
train['review'] = train['review'].map(review_to_words)

In [58]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer. The reviews are already cleaned, so no extra
# tokenizer, preprocessor or stop-word list is supplied; the vocabulary is
# capped at 5000 features.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

# Learn the vocabulary from the training reviews and build the word-count matrix.
train_features = vectorizer.fit_transform(train['review'])

In [59]:
# Convert the vectorizer's output to a dense NumPy array.
# The guard keeps this cell safe to re-run: after the first run train_features
# is already an ndarray, which has no .toarray() method.
if hasattr(train_features, "toarray"):
    train_features = train_features.toarray()

In [60]:
# (number of reviews, vocabulary size)
train_features.shape

(25000, 5000)

In [61]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back to the old name
# so the cell runs on both old and new versions. list() keeps vocab a list.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

# Sum up the counts of each vocabulary word
dist = np.sum(train_features, axis=0)

# For each, print the vocabulary word and the number of times it 
# appears in the training set
for tag, count in zip(vocab, dist):
    print(tag, ": ", count)

abandoned :  187
abc :  125
abilities :  108
ability :  454
able :  1259
abraham :  85
absence :  116
absent :  83
absolute :  352
absolutely :  1485
absurd :  306
abuse :  192
abusive :  91
abysmal :  98
academy :  297
accent :  485
accents :  203
accept :  300
acceptable :  130
accepted :  144
access :  92
accident :  318
accidentally :  200
accompanied :  88
accomplished :  124
according :  296
account :  186
accuracy :  81
accurate :  284
accused :  123
achieve :  179
achieved :  139
achievement :  124
acid :  90
across :  971
act :  1251
acted :  658
acting :  6490
action :  3354
actions :  311
activities :  83
actor :  2389
actors :  4486
actress :  1219
actresses :  369
acts :  394
actual :  793
actually :  4237
ad :  148
adam :  302
adams :  98
adaptation :  453
adaptations :  80
adapted :  154
add :  810
added :  439
adding :  166
addition :  347
adds :  337
adequate :  113
admire :  124
admit :  621
admittedly :  134
adorable :  101
adult :  510
adults :  376
advance :  100
a

centers :  91
central :  411
century :  528
certain :  764
certainly :  1462
cg :  96
cgi :  325
chain :  122
chair :  157
challenge :  165
challenging :  88
championship :  81
chan :  207
chance :  1067
chances :  133
change :  959
changed :  484
changes :  386
changing :  194
channel :  442
channels :  86
chaos :  105
chaplin :  150
chapter :  88
character :  7023
characterization :  123
characters :  7154
charge :  168
charisma :  138
charismatic :  135
charles :  408
charlie :  439
charlotte :  98
charm :  407
charming :  471
chase :  438
chased :  98
chases :  143
chasing :  145
che :  217
cheap :  892
cheated :  92
cheating :  103
check :  762
checked :  80
checking :  139
cheek :  114
cheese :  158
cheesy :  634
chemistry :  490
chess :  96
chest :  93
chicago :  92
chick :  233
chicken :  80
chicks :  89
chief :  229
child :  1320
childhood :  356
childish :  117
children :  1510
chilling :  169
china :  188
chinese :  337
choice :  528
choices :  171
choose :  227
chooses :  8

ears :  99
earth :  928
ease :  110
easier :  132
easily :  892
east :  170
eastern :  83
eastwood :  138
easy :  802
eat :  275
eaten :  90
eating :  278
eccentric :  109
ed :  341
eddie :  310
edgar :  95
edge :  441
edgy :  82
edie :  107
edited :  262
editing :  774
edition :  89
editor :  119
education :  97
educational :  83
edward :  204
eerie :  141
effect :  633
effective :  512
effectively :  187
effects :  2204
effort :  792
efforts :  254
ego :  128
eight :  221
eighties :  101
either :  1866
elaborate :  118
elderly :  119
elegant :  93
element :  392
elements :  783
elephant :  99
elizabeth :  175
ellen :  122
elm :  84
else :  1998
elsewhere :  139
elvira :  153
elvis :  154
em :  158
embarrassed :  163
embarrassing :  226
embarrassment :  96
emily :  122
emma :  202
emotion :  396
emotional :  657
emotionally :  241
emotions :  389
empathy :  84
emperor :  97
emphasis :  101
empire :  124
empty :  274
en :  90
encounter :  175
encounters :  140
end :  5648
endearing :  

hatred :  121
haunted :  217
haunting :  229
hawke :  79
hbo :  108
head :  1541
headed :  169
heads :  291
health :  137
hear :  733
heard :  1111
hearing :  231
heart :  1328
hearted :  225
hearts :  135
heat :  128
heaven :  320
heavily :  180
heavy :  492
heck :  222
heights :  85
held :  391
helen :  152
helicopter :  97
hell :  1025
hello :  90
help :  1895
helped :  324
helping :  176
helps :  360
hence :  155
henry :  407
hero :  1056
heroes :  318
heroic :  115
heroine :  291
heston :  136
hey :  409
hidden :  342
hide :  210
hideous :  103
hiding :  144
high :  2161
higher :  289
highest :  106
highlight :  202
highlights :  125
highly :  1147
hilarious :  973
hilariously :  86
hill :  243
hills :  152
hint :  147
hints :  103
hip :  181
hippie :  84
hire :  130
hired :  188
historical :  407
historically :  86
history :  1332
hit :  1088
hitchcock :  209
hitler :  305
hits :  272
hitting :  137
ho :  126
hoffman :  188
hold :  545
holding :  209
holds :  300
hole :  167
hole

minutes :  2952
miracle :  94
mirror :  168
miscast :  143
miserable :  100
miserably :  124
misery :  92
miss :  883
missed :  565
misses :  118
missing :  594
mission :  265
mistake :  426
mistaken :  108
mistakes :  200
mistress :  88
mitchell :  125
mix :  367
mixed :  287
mixture :  103
miyazaki :  83
mm :  83
mob :  157
model :  240
models :  105
modern :  929
modesty :  122
molly :  89
mom :  367
moment :  1112
moments :  1663
mon :  84
money :  2362
monk :  85
monkey :  132
monkeys :  98
monster :  655
monsters :  277
montage :  103
montana :  95
month :  148
months :  272
mood :  432
moody :  98
moon :  296
moore :  227
moral :  366
morality :  122
morgan :  275
morning :  266
moronic :  83
morris :  98
mostly :  941
mother :  1524
motion :  449
motivation :  122
motivations :  95
motives :  103
mountain :  196
mountains :  110
mouse :  168
mouth :  332
move :  727
moved :  322
movement :  206
movements :  115
moves :  530
movie :  44031
movies :  7663
moving :  854
mr :  1448

reduced :  118
reed :  163
reel :  88
reference :  166
references :  249
reflect :  92
reflection :  81
refreshing :  206
refused :  78
refuses :  154
regard :  166
regarding :  174
regardless :  125
regret :  189
regular :  266
reid :  86
relate :  235
related :  202
relation :  94
relations :  102
relationship :  966
relationships :  361
relative :  126
relatively :  213
relatives :  89
release :  807
released :  986
relevant :  132
relief :  242
relies :  106
religion :  238
religious :  310
remain :  209
remained :  78
remaining :  120
remains :  439
remake :  583
remarkable :  309
remarkably :  105
remarks :  91
remember :  1702
remembered :  258
remind :  157
reminded :  347
reminds :  297
reminiscent :  175
remote :  163
remotely :  189
removed :  108
rendition :  90
rent :  719
rental :  214
rented :  337
renting :  177
repeat :  143
repeated :  204
repeatedly :  119
repetitive :  123
replaced :  162
report :  96
reporter :  213
represent :  103
represented :  99
represents :  

surreal :  208
surrounded :  134
surrounding :  134
survival :  106
survive :  260
survived :  91
surviving :  104
survivor :  85
survivors :  101
susan :  186
suspect :  301
suspects :  156
suspend :  84
suspense :  739
suspenseful :  192
suspicious :  88
sutherland :  160
swear :  113
swedish :  112
sweet :  572
swim :  83
swimming :  97
switch :  97
sword :  195
symbolism :  114
sympathetic :  229
sympathy :  199
synopsis :  111
system :  370
table :  181
tacky :  79
tad :  97
tag :  99
take :  3507
taken :  986
takes :  2192
taking :  955
tale :  790
talent :  933
talented :  586
talents :  268
tales :  166
talk :  842
talked :  126
talking :  954
talks :  220
tall :  121
tame :  111
tank :  80
tap :  115
tape :  234
tarantino :  82
target :  211
tarzan :  291
task :  174
taste :  437
taught :  95
taxi :  91
taylor :  315
tea :  136
teach :  137
teacher :  314
teaching :  82
team :  823
tear :  138
tears :  324
tech :  82
technical :  305
technically :  195
technicolor :  80
techni

## Modelling

In [78]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 30% of the labelled data for validation; the fixed seed makes the
# split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    train_features,
    train['sentiment'],
    test_size=0.3,
    random_state=42,
)

In [82]:
# Train a multinomial Naive Bayes classifier on the training split.
model = MultinomialNB()
model = model.fit(X_train, y_train)

In [83]:
# Predicted sentiment labels for the held-out validation split.
pre = model.predict(X_val)

In [84]:
# accuracy_score's signature is (y_true, y_pred). Accuracy is symmetric, so the
# printed value is unchanged, but the documented order avoids surprises if this
# call is later swapped for an asymmetric metric such as precision or recall.
print(accuracy_score(y_val, pre))

0.8510666666666666


## Final Prediction

In [87]:
# Preview the raw test reviews before preparing them for prediction.
test.head()

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [88]:
# Preparing final input data
# Clean the test reviews with the same function used for the training reviews.
test_reviews = test['review'].apply(review_to_words)

# Use transform(), NOT fit_transform(): refitting on the test set learns a new
# vocabulary, so column j would no longer be the word the model saw in column j
# during training and the predictions would be meaningless.
test_reviews = vectorizer.transform(test_reviews)
test_reviews = test_reviews.toarray()

result = model.predict(test_reviews)

ids = test['id']

# The column is named 'sentiment' (singular) to match the label column of the
# training data -- NOTE(review): confirm against the header of sampleSubmission.csv.
result_df = pd.DataFrame({'id':ids, 'sentiment':result})

In [91]:
# Write the submission file without the index; quoting=3 is csv.QUOTE_NONE (no field is quoted).
result_df.to_csv( "result.csv", index=False, quoting=3 )