In [12]:
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from os.path import join
from tqdm import tqdm

In [4]:
# Directory that holds the dataset files; `src` is reused when writing output.
src = 'data'

# The labelled training set is a tab-separated file.
# NOTE(review): the row-1 preview shows stray backslashes around the quoted
# title, which suggests escaped quotes in the file are not being unescaped by
# the default quoting rules - confirm whether quoting/escapechar should be set.
df = pd.read_csv(
    join(src, 'labeledTrainData.tsv'),
    sep='\t',
)
df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [15]:
from utils import clean_sentence

# Smoke-test the cleaning helper on the review stored under index label 0.
clean_sentence(df.loc[0, 'review'])

'stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighty maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle message mj feeling towards press also obvious message drug bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fan would say made fan true really nice actual feature film bit finally start minute excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord want mj dead bad beyond mj overheard plan nah joe pesci character ranted wanted people know supplying drug etc dunno maybe hate mj music lot cool thing like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually director hate working one kid let a

In [7]:
# Apply clean_sentence to every raw review, wrapping the column in tqdm for a
# progress bar. The recorded run below took roughly 20 minutes for 25000 rows.
df['review_cleaned'] = list(map(clean_sentence, tqdm(df['review'])))
df.head()

100%|████████████████████████████████████████████████████████████████████████████| 25000/25000 [20:28<00:00, 20.35it/s]


Unnamed: 0,id,sentiment,review,review_cleaned
0,5814_8,1,With all this stuff going down at the moment w...,stuff going moment mj started listening music ...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",classic war world timothy hines entertaining f...
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,film start manager nicholas bell giving welcom...
3,3630_4,0,It must be assumed that those who praised this...,must assumed praised film greatest filmed oper...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,superbly trashy wondrously unpretentious explo...


In [16]:
# Write the frame to disk without the index column. The file is tab-separated
# even though its name carries a .csv suffix.
df.to_csv(
    join(src, 'cleanedTrainData.csv'),
    index=False,
    sep='\t',
    encoding='UTF-8',
)

In [10]:
# Word-level bag-of-words model capped at 5000 features.
vectorizer = CountVectorizer(max_features=5000, analyzer='word')

# fit_transform returns a scipy sparse CSR matrix; densify it straight away so
# train_features ends up as a regular NumPy array.
train_features = vectorizer.fit_transform(df['review_cleaned']).toarray()
train_features.shape

(25000, 5000)

In [11]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() and only fall back to the
# legacy method on older releases that lack the new one.
if hasattr(vectorizer, 'get_feature_names_out'):
    # The new API returns an ndarray; convert it so vocab stays a plain list
    # of strings, as the legacy method returned.
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
vocab

['abandoned',
 'abc',
 'ability',
 'able',
 'abraham',
 'abrupt',
 'absence',
 'absent',
 'absolute',
 'absolutely',
 'absurd',
 'absurdity',
 'abuse',
 'abused',
 'abusive',
 'abysmal',
 'academy',
 'accent',
 'accept',
 'acceptable',
 'accepted',
 'accepts',
 'access',
 'accident',
 'accidentally',
 'acclaimed',
 'accompanied',
 'accomplish',
 'accomplished',
 'according',
 'account',
 'accuracy',
 'accurate',
 'accused',
 'ace',
 'achieve',
 'achieved',
 'achievement',
 'acid',
 'across',
 'act',
 'acted',
 'acting',
 'action',
 'active',
 'activity',
 'actor',
 'actress',
 'actual',
 'actually',
 'ad',
 'adam',
 'adaptation',
 'adapted',
 'add',
 'added',
 'addict',
 'addicted',
 'addiction',
 'adding',
 'addition',
 'additional',
 'address',
 'adequate',
 'admirable',
 'admire',
 'admit',
 'admittedly',
 'adolescent',
 'adopted',
 'adorable',
 'adult',
 'advance',
 'advanced',
 'advantage',
 'adventure',
 'advertising',
 'advice',
 'advise',
 'affair',
 'affect',
 'affected',
 'af

In [13]:
# Sum the count matrix down its rows (axis 0) to get one total per column,
# then print each vocabulary entry next to its total.
dist = train_features.sum(axis=0)
for word, total in zip(vocab, dist):
    print(word, total)

abandoned 187
abc 125
ability 562
able 1259
abraham 92
abrupt 73
absence 118
absent 83
absolute 352
absolutely 1485
absurd 306
absurdity 86
abuse 202
abused 77
abusive 91
abysmal 98
academy 298
accent 688
accept 300
acceptable 130
accepted 144
accepts 74
access 92
accident 337
accidentally 200
acclaimed 75
accompanied 88
accomplish 77
accomplished 124
according 296
account 243
accuracy 82
accurate 284
accused 123
ace 73
achieve 179
achieved 139
achievement 169
acid 91
across 971
act 1645
acted 658
acting 6491
action 3665
active 76
activity 146
actor 6875
actress 1588
actual 793
actually 4237
ad 188
adam 400
adaptation 533
adapted 154
add 1147
added 439
addict 82
addicted 78
addiction 72
adding 166
addition 371
additional 74
address 103
adequate 113
admirable 73
admire 124
admit 621
admittedly 134
adolescent 94
adopted 75
adorable 101
adult 886
advance 140
advanced 90
advantage 170
adventure 714
advertising 91
advice 262
advise 90
affair 417
affect 155
affected 113
affection 105
afford 

evidently 72
evil 1463
ex 468
exact 189
exactly 995
exaggerated 120
examination 80
example 1554
excellent 2070
except 1129
exception 471
exceptional 148
exceptionally 86
excess 96
excessive 87
exchange 105
excited 230
excitement 224
exciting 515
excuse 473
executed 241
execution 204
executive 183
exercise 140
exist 300
existed 114
existence 264
existent 161
exists 161
exotic 104
expect 1176
expectation 462
expected 704
expecting 588
expedition 82
expensive 140
experience 1259
experienced 192
experiment 260
experimental 79
expert 218
explain 451
explained 285
explaining 107
explains 193
explanation 327
explicit 119
exploit 120
exploitation 234
exploration 100
explore 118
explored 106
explosion 221
expose 88
exposed 117
exposition 79
exposure 88
express 227
expressed 83
expression 324
extended 115
extent 172
exterior 72
extra 543
extraordinary 173
extreme 382
extremely 1069
eye 2065
eyed 133
eyre 115
fabulous 178
face 1990
faced 204
facial 178
facing 96
fact 3747
factor 279
factory 151
f

straight 868
straightforward 76
strange 926
strangely 166
stranger 236
streep 103
street 968
streisand 152
strength 306
stress 109
stretch 180
stretched 83
strictly 129
strike 266
striking 137
string 191
strip 169
stroke 74
strong 1096
stronger 133
strongly 222
struck 128
structure 216
struggle 482
struggling 198
stuart 80
stuck 350
student 752
studio 698
study 320
stuff 1181
stumble 111
stumbled 78
stunned 82
stunning 405
stunt 275
stupid 1701
stupidity 166
style 1715
stylish 158
sub 385
subject 816
subjected 79
subplot 121
subplots 87
subsequent 121
substance 230
subtitle 207
subtle 434
subtlety 136
succeed 150
succeeded 106
succeeds 167
success 613
successful 524
successfully 161
suck 457
sucked 251
sudden 248
suddenly 538
sue 97
suffer 181
suffered 150
suffering 257
suffers 201
suffice 84
sugar 78
suggest 377
suggested 81
suggestion 76
suggests 145
suicide 331
suit 432
suitable 94
suited 111
sullivan 193
sum 235
summary 185
summer 388
sun 199
sunday 195
sung 77
sunny 77
sunshine 11