In [7]:
from __future__ import print_function
import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.initializers import Constant
from matplotlib import pyplot
from keras import backend as K
import pandas as pd
from sklearn.utils import shuffle


In [8]:

BASE_DIR = '/Volumes/My Passport for Mac/data'
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')
TEXT_DATA_DIR = os.path.join(BASE_DIR, 'imdb_movie_reviews')
MOVIE_REVIEW_FILE_NAME = "imdb_master.csv"
MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2




In [9]:
def read_csv(filepath):
     if os.path.splitext(filepath)[1] != '.csv':
          return  # or whatever
     seps = [',', ';', '\t']                    # ',' is default
     encodings = [None, 'utf-8', 'ISO-8859-1']  # None is default
     for sep in seps:
         for encoding in encodings:
              try:
                  return pd.read_csv(filepath, encoding=encoding, sep=sep)
              except Exception:  # should really be more specific 
                  pass
     raise ValueError("{!r} is has no encoding in {} or seperator in {}"
                      .format(filepath, encodings, seps))

In [10]:

# second, prepare text samples and their labels
print('Processing text dataset')
index_to_label_dict = {}
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids

input_df = read_csv(os.path.join(TEXT_DATA_DIR, MOVIE_REVIEW_FILE_NAME))
review_df1 = input_df[['review','label']]
review_df = review_df1[review_df1['label']!='unsup']


review_df = shuffle(review_df)
print("Here are Few Samples in data")
print(review_df.head)

print("Here total number of posistive and negative samples")
print(review_df.groupby(['label']).count())

print("Converting pandas dataframe into lists")
texts = review_df['review'].values.tolist()
labels = []
labels_text = []
labels_text_unique = review_df.label.unique().tolist()
labels_text = review_df['label'].values.tolist()

idxCounter = 0
for label in labels_text_unique:
    labels_index[label] = idxCounter
    index_to_label_dict[idxCounter] = label
    idxCounter = idxCounter + 1;

idxCounter = 0    
for label in labels_text:
    print("processing row " + str(idxCounter))
    labels.append(labels_index[label])
    idxCounter = idxCounter + 1;
    

print("Labels Array")
print(len(labels))
print("Labels Dictionary")
print(labels_index)
print("Done")



Processing text dataset
Here are Few Samples in data
<bound method NDFrame.head of                                                   review label
4280   Ya know what? Family Guy started out as someth...   neg
1692   I really should have learned more about this m...   neg
4078   Awful film. Terrible acting, cheesy, totally u...   neg
11286  The film quickly gets to a major chase scene w...   neg
30868  Really it's a dreadful cheat of a film. Its 70...   neg
...                                                  ...   ...
26470  This movie will promote the improvement of the...   neg
14126  Night of the Demons is a great movie and an ex...   pos
17142  This was another great episode from season 11 ...   pos
37644  Stargate SG-1 is a spin off of sorts from the ...   pos
23594  I discovered this late one night on Turner Cla...   pos

[50000 rows x 2 columns]>
Here total number of posistive and negative samples
       review
label        
neg     25000
pos     25000
Converting pandas datafram

processing row 3491
processing row 3492
processing row 3493
processing row 3494
processing row 3495
processing row 3496
processing row 3497
processing row 3498
processing row 3499
processing row 3500
processing row 3501
processing row 3502
processing row 3503
processing row 3504
processing row 3505
processing row 3506
processing row 3507
processing row 3508
processing row 3509
processing row 3510
processing row 3511
processing row 3512
processing row 3513
processing row 3514
processing row 3515
processing row 3516
processing row 3517
processing row 3518
processing row 3519
processing row 3520
processing row 3521
processing row 3522
processing row 3523
processing row 3524
processing row 3525
processing row 3526
processing row 3527
processing row 3528
processing row 3529
processing row 3530
processing row 3531
processing row 3532
processing row 3533
processing row 3534
processing row 3535
processing row 3536
processing row 3537
processing row 3538
processing row 3539
processing row 3540


processing row 7411
processing row 7412
processing row 7413
processing row 7414
processing row 7415
processing row 7416
processing row 7417
processing row 7418
processing row 7419
processing row 7420
processing row 7421
processing row 7422
processing row 7423
processing row 7424
processing row 7425
processing row 7426
processing row 7427
processing row 7428
processing row 7429
processing row 7430
processing row 7431
processing row 7432
processing row 7433
processing row 7434
processing row 7435
processing row 7436
processing row 7437
processing row 7438
processing row 7439
processing row 7440
processing row 7441
processing row 7442
processing row 7443
processing row 7444
processing row 7445
processing row 7446
processing row 7447
processing row 7448
processing row 7449
processing row 7450
processing row 7451
processing row 7452
processing row 7453
processing row 7454
processing row 7455
processing row 7456
processing row 7457
processing row 7458
processing row 7459
processing row 7460


processing row 11176
processing row 11177
processing row 11178
processing row 11179
processing row 11180
processing row 11181
processing row 11182
processing row 11183
processing row 11184
processing row 11185
processing row 11186
processing row 11187
processing row 11188
processing row 11189
processing row 11190
processing row 11191
processing row 11192
processing row 11193
processing row 11194
processing row 11195
processing row 11196
processing row 11197
processing row 11198
processing row 11199
processing row 11200
processing row 11201
processing row 11202
processing row 11203
processing row 11204
processing row 11205
processing row 11206
processing row 11207
processing row 11208
processing row 11209
processing row 11210
processing row 11211
processing row 11212
processing row 11213
processing row 11214
processing row 11215
processing row 11216
processing row 11217
processing row 11218
processing row 11219
processing row 11220
processing row 11221
processing row 11222
processing ro

processing row 14978
processing row 14979
processing row 14980
processing row 14981
processing row 14982
processing row 14983
processing row 14984
processing row 14985
processing row 14986
processing row 14987
processing row 14988
processing row 14989
processing row 14990
processing row 14991
processing row 14992
processing row 14993
processing row 14994
processing row 14995
processing row 14996
processing row 14997
processing row 14998
processing row 14999
processing row 15000
processing row 15001
processing row 15002
processing row 15003
processing row 15004
processing row 15005
processing row 15006
processing row 15007
processing row 15008
processing row 15009
processing row 15010
processing row 15011
processing row 15012
processing row 15013
processing row 15014
processing row 15015
processing row 15016
processing row 15017
processing row 15018
processing row 15019
processing row 15020
processing row 15021
processing row 15022
processing row 15023
processing row 15024
processing ro

processing row 18877
processing row 18878
processing row 18879
processing row 18880
processing row 18881
processing row 18882
processing row 18883
processing row 18884
processing row 18885
processing row 18886
processing row 18887
processing row 18888
processing row 18889
processing row 18890
processing row 18891
processing row 18892
processing row 18893
processing row 18894
processing row 18895
processing row 18896
processing row 18897
processing row 18898
processing row 18899
processing row 18900
processing row 18901
processing row 18902
processing row 18903
processing row 18904
processing row 18905
processing row 18906
processing row 18907
processing row 18908
processing row 18909
processing row 18910
processing row 18911
processing row 18912
processing row 18913
processing row 18914
processing row 18915
processing row 18916
processing row 18917
processing row 18918
processing row 18919
processing row 18920
processing row 18921
processing row 18922
processing row 18923
processing ro

processing row 22649
processing row 22650
processing row 22651
processing row 22652
processing row 22653
processing row 22654
processing row 22655
processing row 22656
processing row 22657
processing row 22658
processing row 22659
processing row 22660
processing row 22661
processing row 22662
processing row 22663
processing row 22664
processing row 22665
processing row 22666
processing row 22667
processing row 22668
processing row 22669
processing row 22670
processing row 22671
processing row 22672
processing row 22673
processing row 22674
processing row 22675
processing row 22676
processing row 22677
processing row 22678
processing row 22679
processing row 22680
processing row 22681
processing row 22682
processing row 22683
processing row 22684
processing row 22685
processing row 22686
processing row 22687
processing row 22688
processing row 22689
processing row 22690
processing row 22691
processing row 22692
processing row 22693
processing row 22694
processing row 22695
processing ro

processing row 26436
processing row 26437
processing row 26438
processing row 26439
processing row 26440
processing row 26441
processing row 26442
processing row 26443
processing row 26444
processing row 26445
processing row 26446
processing row 26447
processing row 26448
processing row 26449
processing row 26450
processing row 26451
processing row 26452
processing row 26453
processing row 26454
processing row 26455
processing row 26456
processing row 26457
processing row 26458
processing row 26459
processing row 26460
processing row 26461
processing row 26462
processing row 26463
processing row 26464
processing row 26465
processing row 26466
processing row 26467
processing row 26468
processing row 26469
processing row 26470
processing row 26471
processing row 26472
processing row 26473
processing row 26474
processing row 26475
processing row 26476
processing row 26477
processing row 26478
processing row 26479
processing row 26480
processing row 26481
processing row 26482
processing ro

processing row 30461
processing row 30462
processing row 30463
processing row 30464
processing row 30465
processing row 30466
processing row 30467
processing row 30468
processing row 30469
processing row 30470
processing row 30471
processing row 30472
processing row 30473
processing row 30474
processing row 30475
processing row 30476
processing row 30477
processing row 30478
processing row 30479
processing row 30480
processing row 30481
processing row 30482
processing row 30483
processing row 30484
processing row 30485
processing row 30486
processing row 30487
processing row 30488
processing row 30489
processing row 30490
processing row 30491
processing row 30492
processing row 30493
processing row 30494
processing row 30495
processing row 30496
processing row 30497
processing row 30498
processing row 30499
processing row 30500
processing row 30501
processing row 30502
processing row 30503
processing row 30504
processing row 30505
processing row 30506
processing row 30507
processing ro

processing row 34267
processing row 34268
processing row 34269
processing row 34270
processing row 34271
processing row 34272
processing row 34273
processing row 34274
processing row 34275
processing row 34276
processing row 34277
processing row 34278
processing row 34279
processing row 34280
processing row 34281
processing row 34282
processing row 34283
processing row 34284
processing row 34285
processing row 34286
processing row 34287
processing row 34288
processing row 34289
processing row 34290
processing row 34291
processing row 34292
processing row 34293
processing row 34294
processing row 34295
processing row 34296
processing row 34297
processing row 34298
processing row 34299
processing row 34300
processing row 34301
processing row 34302
processing row 34303
processing row 34304
processing row 34305
processing row 34306
processing row 34307
processing row 34308
processing row 34309
processing row 34310
processing row 34311
processing row 34312
processing row 34313
processing ro

processing row 38165
processing row 38166
processing row 38167
processing row 38168
processing row 38169
processing row 38170
processing row 38171
processing row 38172
processing row 38173
processing row 38174
processing row 38175
processing row 38176
processing row 38177
processing row 38178
processing row 38179
processing row 38180
processing row 38181
processing row 38182
processing row 38183
processing row 38184
processing row 38185
processing row 38186
processing row 38187
processing row 38188
processing row 38189
processing row 38190
processing row 38191
processing row 38192
processing row 38193
processing row 38194
processing row 38195
processing row 38196
processing row 38197
processing row 38198
processing row 38199
processing row 38200
processing row 38201
processing row 38202
processing row 38203
processing row 38204
processing row 38205
processing row 38206
processing row 38207
processing row 38208
processing row 38209
processing row 38210
processing row 38211
processing ro

processing row 41916
processing row 41917
processing row 41918
processing row 41919
processing row 41920
processing row 41921
processing row 41922
processing row 41923
processing row 41924
processing row 41925
processing row 41926
processing row 41927
processing row 41928
processing row 41929
processing row 41930
processing row 41931
processing row 41932
processing row 41933
processing row 41934
processing row 41935
processing row 41936
processing row 41937
processing row 41938
processing row 41939
processing row 41940
processing row 41941
processing row 41942
processing row 41943
processing row 41944
processing row 41945
processing row 41946
processing row 41947
processing row 41948
processing row 41949
processing row 41950
processing row 41951
processing row 41952
processing row 41953
processing row 41954
processing row 41955
processing row 41956
processing row 41957
processing row 41958
processing row 41959
processing row 41960
processing row 41961
processing row 41962
processing ro

processing row 45885
processing row 45886
processing row 45887
processing row 45888
processing row 45889
processing row 45890
processing row 45891
processing row 45892
processing row 45893
processing row 45894
processing row 45895
processing row 45896
processing row 45897
processing row 45898
processing row 45899
processing row 45900
processing row 45901
processing row 45902
processing row 45903
processing row 45904
processing row 45905
processing row 45906
processing row 45907
processing row 45908
processing row 45909
processing row 45910
processing row 45911
processing row 45912
processing row 45913
processing row 45914
processing row 45915
processing row 45916
processing row 45917
processing row 45918
processing row 45919
processing row 45920
processing row 45921
processing row 45922
processing row 45923
processing row 45924
processing row 45925
processing row 45926
processing row 45927
processing row 45928
processing row 45929
processing row 45930
processing row 45931
processing ro

processing row 49837
processing row 49838
processing row 49839
processing row 49840
processing row 49841
processing row 49842
processing row 49843
processing row 49844
processing row 49845
processing row 49846
processing row 49847
processing row 49848
processing row 49849
processing row 49850
processing row 49851
processing row 49852
processing row 49853
processing row 49854
processing row 49855
processing row 49856
processing row 49857
processing row 49858
processing row 49859
processing row 49860
processing row 49861
processing row 49862
processing row 49863
processing row 49864
processing row 49865
processing row 49866
processing row 49867
processing row 49868
processing row 49869
processing row 49870
processing row 49871
processing row 49872
processing row 49873
processing row 49874
processing row 49875
processing row 49876
processing row 49877
processing row 49878
processing row 49879
processing row 49880
processing row 49881
processing row 49882
processing row 49883
processing ro

In [11]:
print("loading model .....")
# load json and create model
json_file = open('/Volumes/My Passport for Mac/model/sentiment_analysis/model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("/Volumes/My Passport for Mac/model/sentiment_analysis/model.h5")
print("Loaded model from disk")
 
# evaluate loaded model on test data
loaded_model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
print("done")


loading model .....
Loaded model from disk
done


In [12]:
#score = loaded_model.evaluate(X, Y, verbose=0)
#print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))

In [13]:
# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [31]:
test_texts = ["'Parasite' (2019 release from South Korea; 132 min.) brings the story of the Kim family. As the movie opens, we get to know the family: they live in a semi-basement apartment,  and mom and dad are out of work, and their teenage son and daughter aren't in much better shape. Then one day, the son's friend Min informs him that he is leaving the country, and that it would \
               be good if the son takes over for him as the private English tutor of a HS sophomore girl, whose family is well-off, if not rich. The son agrees, and before we know it he is now the English tutor... At this point we are 10 min. into the movie but to tell you more of the plot would spoil your viewing experience, you'll just have to see for yourself how it all plays out." ,
              
               "...After all, it's already wasted enough of my time, so I'll be brief. First, this is an R-rated movie...about tag? Maybe if this was naked tag...If that was the case, this would be a very different review. Once again, I should have read the rating before buying it. I didn't do so because it had been marked down in price (for obvious reasons) and bought it impulsively. And at what point did dropping f-bombs into every sentence constitute good writing and movie-making? Assuming that was the objective. There was more swearing in this film than I remember hearing in my last view of 'The Rock' which was a much better film, even if Nic Cage was in it. Decent comedy films apparently can't be made anymore unless there is an inordinate amount of crude language and cuss words. But...this film wasn't even remotely close to decent. I have a massive film collection (everyone needs a hobby) and I've only trashed two films after purchasing and watching them. This one is number three. Don't waste your time on this one; don't rent it, and definitely don't buy it"
             ]
test_sequences = tokenizer.texts_to_sequences(test_texts)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)


In [32]:
nn_output = loaded_model.predict(test_data)
print(nn_output)

[[3.9897978e-01 6.0102022e-01]
 [9.9989164e-01 1.0829856e-04]]


In [33]:
i=0
for idx in np.argmax(nn_output, axis=1):
    print("Category: ", index_to_label_dict[idx])
    print("text: " , test_texts[i])
    print("=====================================")
    i = i + 1

Category:  pos
text:  'Parasite' (2019 release from South Korea; 132 min.) brings the story of the Kim family. As the movie opens, we get to know the family: they live in a semi-basement apartment,  and mom and dad are out of work, and their teenage son and daughter aren't in much better shape. Then one day, the son's friend Min informs him that he is leaving the country, and that it would                be good if the son takes over for him as the private English tutor of a HS sophomore girl, whose family is well-off, if not rich. The son agrees, and before we know it he is now the English tutor... At this point we are 10 min. into the movie but to tell you more of the plot would spoil your viewing experience, you'll just have to see for yourself how it all plays out.
Category:  neg
text:  ...After all, it's already wasted enough of my time, so I'll be brief. First, this is an R-rated movie...about tag? Maybe if this was naked tag...If that was the case, this would be a very different