In [41]:
# load data sources
import pandas as pd       
train = pd.read_csv("data/labeledTrainData.tsv", header=0, \
                    delimiter="\t", quoting=3)
print(train.head())
print(train.describe())
print(train.dtypes)

         id  sentiment                                             review
0  "5814_8"          1  "With all this stuff going down at the moment ...
1  "2381_9"          1  "\"The Classic War of the Worlds\" by Timothy ...
2  "7759_3"          0  "The film starts with a manager (Nicholas Bell...
3  "3630_4"          0  "It must be assumed that those who praised thi...
4  "9495_8"          1  "Superbly trashy and wondrously unpretentious ...
         sentiment
count  25000.00000
mean       0.50000
std        0.50001
min        0.00000
25%        0.00000
50%        0.50000
75%        1.00000
max        1.00000
id           object
sentiment     int64
review       object
dtype: object


In [42]:
# clean reviews text(remove html markup, number and punctuation signs)
from bs4 import BeautifulSoup
import re

pattern = '[^a-zA-Z ]'

train['review'] = train.apply(lambda row: re.sub( pattern, '', BeautifulSoup(row['review']).get_text().lower()), axis=1)
print(train.head())

         id  sentiment                                             review
0  "5814_8"          1  with all this stuff going down at the moment w...
1  "2381_9"          1  the classic war of the worlds by timothy hines...
2  "7759_3"          0  the film starts with a manager nicholas bell g...
3  "3630_4"          0  it must be assumed that those who praised this...
4  "9495_8"          1  superbly trashy and wondrously unpretentious s...


In [52]:
# split every review into words tokens
# remove stop words as connectors and pronouns

import nltk
# nltk.download()

from nltk.corpus import stopwords # Import the stop word list
english_stop_words = stopwords.words("english")

# apply stemmer to remove common words
from nltk.stem import PorterStemmer

ps = PorterStemmer()
def preprocess_reviews(reviews):
    processed_reviews = []
    num_reviews = len(reviews)
    for i in range (0, num_reviews):
        r = reviews[i]
        tokens = list(
            filter(
                (lambda w: w not in english_stop_words), 
                r.split(' ')
            )
        )
        processed_reviews.append(' '.join(tokens))
        
    return processed_reviews

processed_reviews = preprocess_reviews(train['review'].tolist())

In [50]:
# bag of words implementation
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 

train_data_features = vectorizer.fit_transform(processed_reviews)
train_data_features = train_data_features.toarray()
dist = np.sum(train_data_features, axis=0))

vocab = vectorizer.get_feature_names()

(25000, 5000)
184 abandoned
105 abc
105 abilities
445 ability
1240 able
79 abraham
115 absence
347 absolute
1475 absolutely
288 absurd
186 abuse
76 abused
89 abusive
91 abysmal
293 academy
458 accent
197 accents
297 accept
124 acceptable
141 accepted
91 access
299 accident
200 accidentally
86 accompanied
118 accomplished
277 according
179 account
79 accuracy
279 accurate
122 accused
172 achieve
138 achieved
118 achievement
81 acid
962 across
1190 act
560 acted
6256 acting
3055 action
295 actions
81 activities
2187 actor
4373 actors
1164 actress
325 actresses
383 acts
790 actual
4188 actually
119 ad
269 adam
117 adams
443 adaptation
78 adaptations
147 adapted
776 add
427 added
160 adding
347 addition
335 adds
106 adequate
124 admire
618 admit
122 admittedly
75 adopted
99 adorable
494 adult
355 adults
100 advance
89 advanced
147 advantage
443 adventure
197 adventures
88 advertising
252 advice
90 advise
329 affair
92 affect
113 affected
103 afford
132 aforementioned
334 afraid
200 africa


112 distance
117 distant
83 distinct
101 distracting
78 distribution
104 disturbed
458 disturbing
99 divorce
76 dixon
121 doc
547 doctor
115 doctors
119 documentaries
891 documentary
4561 doesnt
631 dog
209 dogs
98 doll
155 dollar
186 dollars
95 dolls
78 dolph
84 domestic
79 domino
183 donald
2924 done
95 donna
8328 dont
79 doom
102 doomed
390 door
125 doors
132 dorothy
75 dose
309 double
739 doubt
83 doubts
297 douglas
88 downhill
191 downright
184 dozen
103 dozens
646 dr
103 dracula
204 drag
136 dragged
167 dragon
129 drags
78 drake
1278 drama
128 dramas
635 dramatic
195 draw
110 drawing
378 drawn
118 draws
233 dreadful
592 dream
422 dreams
86 dreary
76 dreck
164 dress
270 dressed
75 dresses
220 drew
141 drink
167 drinking
356 drive
111 drivel
176 driven
175 driver
154 drives
261 driving
182 drop
129 dropped
100 drops
346 drug
305 drugs
262 drunk
118 drunken
215 dry
90 dub
208 dubbed
145 dubbing
173 dude
898 due
118 duke
780 dull
577 dumb
111 duo
82 dust
101 dutch
94 duty
2154 dvd
12

167 initially
190 inner
145 innocence
404 innocent
107 innovative
235 insane
587 inside
187 insight
152 inspector
164 inspiration
319 inspired
120 inspiring
119 installment
283 instance
100 instant
127 instantly
2108 instead
82 instinct
210 insult
118 insulting
79 integrity
164 intellectual
316 intelligence
522 intelligent
379 intended
335 intense
152 intensity
116 intent
132 intention
90 intentionally
151 intentions
97 interaction
76 interactions
983 interest
640 interested
3046 interesting
79 interests
257 international
148 internet
159 interpretation
173 interview
158 interviews
87 intimate
106 intrigue
118 intrigued
294 intriguing
83 introduce
308 introduced
99 introduces
165 introduction
84 invasion
79 invented
95 inventive
109 investigate
122 investigation
191 invisible
98 involve
1042 involved
109 involvement
224 involves
461 involving
81 iran
81 iraq
108 ireland
189 irish
91 iron
160 ironic
109 ironically
138 irony
81 irrelevant
219 irritating
507 island
3163 isnt
96 isolated
7

621 plenty
89 plight
6244 plot
254 plots
611 plus
82 poem
90 poetic
85 poetry
151 poignant
3069 point
135 pointed
478 pointless
789 points
88 pokemon
77 polanski
1069 police
83 polished
581 political
98 politically
201 politics
143 pool
1833 poor
686 poorly
266 pop
109 popcorn
87 pops
535 popular
80 popularity
105 population
332 porn
89 porno
104 portion
143 portrait
261 portray
498 portrayal
589 portrayed
226 portraying
229 portrays
175 position
505 positive
100 possessed
95 possibilities
106 possibility
939 possible
699 possibly
225 post
111 poster
81 pot
595 potential
91 potentially
118 poverty
182 powell
889 power
589 powerful
298 powers
232 practically
98 practice
162 praise
120 precious
76 precisely
824 predictable
172 prefer
171 pregnant
695 premise
168 prepared
86 prequel
402 presence
581 present
150 presentation
404 presented
76 presenting
204 presents
228 president
124 press
79 pressure
125 presumably
112 pretend
91 pretending
263 pretentious
3624 pretty
123 prevent
95 previe

136 tribute
156 trick
130 tricks
761 tried
1266 tries
204 trilogy
109 trio
465 trip
146 trite
111 triumph
89 troops
504 trouble
143 troubled
81 troubles
163 truck
2228 true
1721 truly
304 trust
667 truth
1795 try
2458 trying
136 tune
114 tunes
160 turkey
1295 turn
887 turned
113 turner
331 turning
1244 turns
2587 tv
104 twelve
226 twenty
373 twice
85 twilight
120 twin
80 twins
579 twist
192 twisted
422 twists
6587 two
997 type
237 types
768 typical
125 typically
337 ugly
225 uk
246 ultimate
491 ultimately
245 unable
82 unaware
107 unbearable
406 unbelievable
115 unbelievably
313 uncle
147 uncomfortable
178 unconvincing
167 underground
82 underlying
76 underneath
244 underrated
1627 understand
91 understandable
271 understanding
89 understated
172 understood
102 undoubtedly
107 uneven
245 unexpected
78 unexpectedly
77 unfair
98 unfolds
140 unforgettable
205 unfortunate
1231 unfortunately
266 unfunny
93 unhappy
120 uninspired
103 unintentional
132 unintentionally
193 uninteresting
121 un

In [51]:
# train random forrest classifier
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 100) 
forest = forest.fit( train_data_features, train["sentiment"] )

In [None]:
# Read the test data
test = pd.read_csv("testData.tsv", header=0, delimiter="\t", \
                   quoting=3 )
# Create an empty list and append the clean reviews one by one
test_reviews = test['review'].tolist()
num_reviews = len(test["review"])
clean_test_reviews = preprocess_reviews(test_reviews).toarray()
result = forest.predict(clean_test_reviews)