# 自然語言處理&RNN預測情緒 on IMDb影評

## 載入IMDb資料集

In [2]:
import urllib.request
import os
import tarfile

In [3]:
url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
path = "aclImdb_v1.tar.gz"
if not os.path.isfile(path):
    result = urllib.request.urlretrieve(url, path)
    print("downloaded")


downloaded


In [4]:
if not os.path.exists("aclImdb"):
    tfile = tarfile.open("aclImdb_v1.tar.gz", "r:gz")
    result = tfile.extractall('')


## data preprocessing

In [7]:
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

In [8]:
import re

def remove_tag(text):
    re_tag = re.compile(r'(<[^>]+>|\.|\,)')
    return re_tag.sub('', text)


In [10]:
import os

def read_files(filetype):
    path = "aclImdb/"
    file_list = []
    positive_path = path + filetype + "/pos/"
    for f in os.listdir(positive_path):
        file_list += [positive_path + f]

    negative_path = path + filetype + "/neg/"
    for f in os.listdir(negative_path):
        file_list += [negative_path + f]

    all_labels = ([1] * 12500 + [0] * 12500)
    all_texts = []

    for fi in file_list:
        with open(fi, encoding='utf-8') as file_input:
            all_texts += [remove_tag(" ".join(file_input.readlines()))]

    return all_labels, all_texts


y_train, train_text = read_files("train")
y_test, test_text = read_files("test")
o_train_text = train_text
o_test_text = test_text


## check data and data type

In [11]:
print("training data size:%d" % (len(train_text)))
print("testing data size:%d" % (len(test_text)))
print("feature :%s" % (train_text[0]))
print("label :%s" % (y_train[0]))


training data size:25000
testing data size:25000
feature :Bromwell High is a cartoon comedy It ran at the same time as some other programs about school life such as "Teachers" My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers" The scramble to survive financially the insightful students who can see right through their pathetic teachers' pomp the pettiness of the whole situation all remind me of the schools I knew and their students When I saw the episode in which a student repeatedly tried to burn down the school I immediately recalled  at  High A classic line: INSPECTOR: I'm here to sack one of your teachers STUDENT: Welcome to Bromwell High I expect that many adults of my age think that Bromwell High is far fetched What a pity that it isn't!
label :1


### 建立token 
對於每個word建立一個index

In [15]:
token = Tokenizer(num_words=2000)
token.fit_on_texts(train_text)


In [20]:
# display word index table
print(token.word_index)


In [23]:
# convert text to vector
x_train_seq = token.texts_to_sequences(train_text)
x_test_seq = token.texts_to_sequences(test_text)

# compare the text and vector
print(train_text[0])
print('\n')
print(x_train_seq[0])
print('\n')

# padding 
x_train = sequence.pad_sequences(x_train_seq, maxlen=100)
x_test = sequence.pad_sequences(x_test_seq, maxlen=100)

# show padding result
print("before length: %d" % (len(x_train_seq[0])))
print("before sequence: %s" % (x_train_seq[0]))
print("\n")
print("after length: %d" % (len(x_train[0])))
print("after sequence: %s" % (x_train[0]))


Bromwell High is a cartoon comedy It ran at the same time as some other programs about school life such as "Teachers" My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers" The scramble to survive financially the insightful students who can see right through their pathetic teachers' pomp the pettiness of the whole situation all remind me of the schools I knew and their students When I saw the episode in which a student repeatedly tried to burn down the school I immediately recalled  at  High A classic line: INSPECTOR: I'm here to sack one of your teachers STUDENT: Welcome to Bromwell High I expect that many adults of my age think that Bromwell High is far fetched What a pity that it isn't!


[301, 6, 3, 1070, 212, 8, 29, 1, 168, 56, 13, 45, 81, 40, 388, 113, 134, 13, 58, 149, 7, 1, 472, 68, 5, 256, 11, 1984, 6, 72, 5, 636, 70, 6, 1, 5, 1, 1515, 33, 66, 64, 203, 140, 63, 1248, 1, 4, 1, 218, 915, 28, 68, 4, 1, 10

before length: 107
before sequence: [301, 6, 3, 1070, 212, 8, 29, 1, 168, 56, 13, 45, 81, 40, 388, 113, 134, 13, 58, 149, 7, 1, 472, 68, 5, 256, 11, 1984, 6, 72, 5, 636, 70, 6, 1, 5, 1, 1515, 33, 66, 64, 203, 140, 63, 1248, 1, 4, 1, 218, 915, 28, 68, 4, 1, 10, 683, 2, 63, 1515, 51, 10, 209, 1, 391, 7, 59, 3, 1463, 789, 5, 178, 1, 388, 10, 1223, 29, 301, 3, 354, 341, 146, 132, 5, 27, 4, 124, 1463, 5, 301, 10, 525, 11, 106, 1487, 4, 58, 555, 100, 11, 301, 6, 225, 46, 3, 11, 8, 210]


after length: 100
after sequence: [   1  168   56   13   45   81   40  388  113  134   13   58  149    7
    1  472   68    5  256   11 1984    6   72    5  636   70    6    1
    5    1 1515   33   66   64  203  140   63 1248    1    4    1  218
  915   28   68    4    1   10  683    2   63 1515   51   10  209    1
  391    7   59    3 1463  789    5  178    1  388   10 1223   29  301
    3  354  341  146  132    5   27    4  124 1463    5  301   10  525
   11  106 1487    4   58  555  100   11  301    6  2

## build pure RNN model

In [24]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN, LSTM


In [25]:
model = Sequential()
model.add(Embedding(output_dim=32,
                    input_dim=2000,
                    input_length=100))
model.add(Dropout(0.35))

# Fully-connected RNN where the output is to be fed back to input.
model.add(SimpleRNN(units=16))  # 32*16 + 16 +16*16
model.add(Dense(units=256, activation='relu'))
model.add(Dropout(0.35))
model.add(Dense(units=1, activation='sigmoid'))


In [26]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 32)           64000     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 32)           0         
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 16)                784       
_________________________________________________________________
dense_1 (Dense)              (None, 256)               4352      
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 69,393
Trainable params: 69,393
Non-trainable params: 0
_________________________________________________________________


In [27]:
model.compile(loss='binary_crossentropy',
              optimizer="adam",
              metrics=['accuracy'])


## train model

In [28]:
train_history = model.fit(x_train, y_train, batch_size=100,
                          epochs=10, verbose=2,
                          validation_split=0.2)


Train on 20000 samples, validate on 5000 samples
Epoch 1/10


 - 38s - loss: 0.5487 - acc: 0.7169 - val_loss: 0.4495 - val_acc: 0.8188


Epoch 2/10


 - 15s - loss: 0.3755 - acc: 0.8394 - val_loss: 0.5606 - val_acc: 0.7502


Epoch 3/10


 - 15s - loss: 0.3280 - acc: 0.8635 - val_loss: 0.4177 - val_acc: 0.8174


Epoch 4/10


 - 16s - loss: 0.2867 - acc: 0.8827 - val_loss: 0.4529 - val_acc: 0.8180


Epoch 5/10


 - 15s - loss: 0.2566 - acc: 0.8958 - val_loss: 0.5137 - val_acc: 0.8096


Epoch 6/10


 - 15s - loss: 0.2302 - acc: 0.9080 - val_loss: 0.6379 - val_acc: 0.7386


Epoch 7/10


 - 15s - loss: 0.2027 - acc: 0.9195 - val_loss: 0.8288 - val_acc: 0.7586


Epoch 8/10


 - 15s - loss: 0.1843 - acc: 0.9279 - val_loss: 0.5798 - val_acc: 0.8348


Epoch 9/10


 - 15s - loss: 0.1704 - acc: 0.9339 - val_loss: 0.6810 - val_acc: 0.7698


Epoch 10/10


 - 15s - loss: 0.1478 - acc: 0.9424 - val_loss: 0.8034 - val_acc: 0.7438


## evaluate & check result

In [29]:
scores = model.evaluate(x_test, y_test, verbose=1)
scores[1]


   32/25000 [..............................] - ETA: 41s

   64/25000 [..............................] - ETA: 1:37

   96/25000 [..............................] - ETA: 1:19

  128/25000 [..............................] - ETA: 1:19

  160/25000 [..............................] - ETA: 1:14

  192/25000 [..............................] - ETA: 1:11

  224/25000 [..............................] - ETA: 1:09

  256/25000 [..............................] - ETA: 1:06

  288/25000 [..............................] - ETA: 1:04

  320/25000 [..............................] - ETA: 1:01

  352/25000 [..............................] - ETA: 59s 



  416/25000 [..............................] - ETA: 56s

  480/25000 [..............................] - ETA: 53s

  544/25000 [..............................] - ETA: 51s

  608/25000 [..............................] - ETA: 48s

  672/25000 [..............................] - ETA: 47s

  704/25000 [..............................] - ETA: 46s

  768/25000 [..............................] - ETA: 45s

  832/25000 [..............................] - ETA: 44s

  896/25000 [>.............................] - ETA: 42s

  960/25000 [>.............................] - ETA: 42s

 1024/25000 [>.............................] - ETA: 41s

 1088/25000 [>.............................] - ETA: 40s

 1152/25000 [>.............................] - ETA: 39s

 1216/25000 [>.............................] - ETA: 39s

 1280/25000 [>.............................] - ETA: 38s

 1344/25000 [>.............................] - ETA: 38s

 1408/25000 [>.............................] - ETA: 37s

 1472/25000 [>.............................] - ETA: 37s

 1536/25000 [>.............................] - ETA: 37s

 1600/25000 [>.............................] - ETA: 36s

 1664/25000 [>.............................] - ETA: 36s

 1728/25000 [=>............................] - ETA: 36s

 1792/25000 [=>............................] - ETA: 35s

 1856/25000 [=>............................] - ETA: 35s

 1920/25000 [=>............................] - ETA: 35s

 1952/25000 [=>............................] - ETA: 35s

 2016/25000 [=>............................] - ETA: 34s

 2080/25000 [=>............................] - ETA: 34s

 2144/25000 [=>............................] - ETA: 34s

 2208/25000 [=>............................] - ETA: 34s

 2272/25000 [=>............................] - ETA: 33s

 2336/25000 [=>............................] - ETA: 33s

 2400/25000 [=>............................] - ETA: 33s

 2464/25000 [=>............................] - ETA: 33s

 2528/25000 [==>...........................] - ETA: 33s

 2592/25000 [==>...........................] - ETA: 32s

 2656/25000 [==>...........................] - ETA: 32s

 2720/25000 [==>...........................] - ETA: 32s

 2784/25000 [==>...........................] - ETA: 32s

 2848/25000 [==>...........................] - ETA: 32s

 2912/25000 [==>...........................] - ETA: 31s

 2976/25000 [==>...........................] - ETA: 31s

 3040/25000 [==>...........................] - ETA: 31s

 3104/25000 [==>...........................] - ETA: 31s

 3168/25000 [==>...........................] - ETA: 31s

 3232/25000 [==>...........................] - ETA: 31s

 3264/25000 [==>...........................] - ETA: 31s

 3328/25000 [==>...........................] - ETA: 31s

 3392/25000 [===>..........................] - ETA: 30s

 3456/25000 [===>..........................] - ETA: 30s

 3520/25000 [===>..........................] - ETA: 30s

 3584/25000 [===>..........................] - ETA: 30s

 3648/25000 [===>..........................] - ETA: 30s

 3680/25000 [===>..........................] - ETA: 30s

 3744/25000 [===>..........................] - ETA: 30s

 3808/25000 [===>..........................] - ETA: 30s

 3840/25000 [===>..........................] - ETA: 30s

 3904/25000 [===>..........................] - ETA: 30s

 3968/25000 [===>..........................] - ETA: 29s

 4032/25000 [===>..........................] - ETA: 29s

 4096/25000 [===>..........................] - ETA: 29s

 4160/25000 [===>..........................] - ETA: 29s

 4224/25000 [====>.........................] - ETA: 29s

 4288/25000 [====>.........................] - ETA: 29s

 4352/25000 [====>.........................] - ETA: 29s

 4416/25000 [====>.........................] - ETA: 28s

 4480/25000 [====>.........................] - ETA: 28s

 4512/25000 [====>.........................] - ETA: 28s

 4576/25000 [====>.........................] - ETA: 28s

 4640/25000 [====>.........................] - ETA: 28s

 4704/25000 [====>.........................] - ETA: 28s

 4768/25000 [====>.........................] - ETA: 28s

 4832/25000 [====>.........................] - ETA: 28s

 4896/25000 [====>.........................] - ETA: 27s

 4960/25000 [====>.........................] - ETA: 27s

 5024/25000 [=====>........................] - ETA: 27s

 5088/25000 [=====>........................] - ETA: 27s

 5152/25000 [=====>........................] - ETA: 27s

 5216/25000 [=====>........................] - ETA: 27s

 5280/25000 [=====>........................] - ETA: 27s

 5344/25000 [=====>........................] - ETA: 27s

 5408/25000 [=====>........................] - ETA: 27s

 5472/25000 [=====>........................] - ETA: 26s

 5536/25000 [=====>........................] - ETA: 26s

 5600/25000 [=====>........................] - ETA: 26s

 5664/25000 [=====>........................] - ETA: 26s

 5728/25000 [=====>........................] - ETA: 26s

 5792/25000 [=====>........................] - ETA: 26s



















































































































































































































































































































































































































































































































































































































































0.80476

In [30]:
predict = model.predict_classes(x_test).reshape(-1)


In [31]:
SentimentDict = {1: "positive", 0: "negative"}
def display_test_Sentiment(i):
    print(test_text[i])
    print("truth:", SentimentDict[y_test[i]], "predict:", SentimentDict[predict[i]])


In [32]:
display_test_Sentiment(1)

Actor turned director Bill Paxton follows up his promising debut the Gothic-horror "Frailty" with this family friendly sports drama about the 1913 US Open where a young American caddy rises from his humble background to play against his Bristish idol in what was dubbed as "The Greatest Game Ever Played" I'm no fan of golf and these scrappy underdog sports flicks are a dime a dozen (most recently done to grand effect with "Miracle" and "Cinderella Man") but some how this film was enthralling all the sameThe film starts with some creative opening credits (imagine a Disneyfied version of the animated opening credits of HBO's "Carnivale" and "Rome") but lumbers along slowly for its first by-the-numbers hour Once the action moves to the US Open things pick up very well Paxton does a nice job and shows a knack for effective directorial flourishes (I loved the rain-soaked montage of the action on day two of the open) that propel the plot further or add some unexpected psychological depth to t

## predict on real data

In [33]:
review = (
        "This film has been saved by its stars: Ryan Reynolds first and Kevin Costner few minutes later will make you forget most of plot's unbelievable logical holes, awful editing (what's Alice Eve's end?), repetitive scenes and complete lack of bad guy's motive (Jordi Mollà)." +

        "And Tommy Lee Jones, Gal Gadot, Gary Oldman and Michael Pitt (Alice Eve here is little more than an extra) strive to fill with workmanship and dignity their otherwise gaunt supporting roles." +

        "They all succeed in saving the day and in the end you won't fall asleep or leave theater in contempt. But on your way home you could probably comment that this is one of the most useless (or unconvincing) films you ever watched.")


In [34]:
def preprocessor(text):
    x_test_1_seq = token.texts_to_sequences([text])
    x_test_1_seq_pad = sequence.pad_sequences(x_test_1_seq, maxlen=100)
    return x_test_1_seq_pad


In [35]:
x_test_1 = preprocessor(review)

In [36]:
predict = model.predict_classes(x_test_1)

In [37]:
print(review)
print("predict:",SentimentDict[predict[0][0]])

This film has been saved by its stars: Ryan Reynolds first and Kevin Costner few minutes later will make you forget most of plot's unbelievable logical holes, awful editing (what's Alice Eve's end?), repetitive scenes and complete lack of bad guy's motive (Jordi Mollà).And Tommy Lee Jones, Gal Gadot, Gary Oldman and Michael Pitt (Alice Eve here is little more than an extra) strive to fill with workmanship and dignity their otherwise gaunt supporting roles.They all succeed in saving the day and in the end you won't fall asleep or leave theater in contempt. But on your way home you could probably comment that this is one of the most useless (or unconvincing) films you ever watched.
predict: negative
