In [1]:
import glob
import pandas as pd
cname = ["article", "target"]
train_df = pd.DataFrame(columns=cname)
neg = glob.glob("aclImdb/train/neg/*.txt")
for n in neg:
    f = open(n, "r", encoding="utf-8")
    article = f.read()
    f.close()
    s = pd.Series([article, 0], index=cname)
    train_df = train_df.append(s, ignore_index=True)
pos = glob.glob("aclImdb/train/pos/*.txt")
for p in pos:
    f = open(p, "r", encoding="utf-8")
    article = f.read()
    f.close()
    s = pd.Series([article, 1], index=cname)
    train_df = train_df.append(s, ignore_index=True)
train_df

Unnamed: 0,article,target
0,Working with one of the best Shakespeare sourc...,0
1,"Well...tremors I, the original started off in ...",0
2,Ouch! This one was a bit painful to sit throug...,0
3,"I've seen some crappy movies in my life, but t...",0
4,"""Carriers"" follows the exploits of two guys an...",0
5,I had been looking forward to seeing this film...,0
6,Effect(s) without cause is generally not possi...,0
7,"This picture started out with good intentions,...",0
8,I chose to see this movie because it got a goo...,0
9,This film has to be the worst I have ever seen...,0


In [2]:
cname = ["article", "target"]
test_df = pd.DataFrame(columns=cname)
neg = glob.glob("aclImdb/test/neg/*.txt")
for n in neg:
    f = open(n, "r", encoding="utf-8")
    article = f.read()
    f.close()
    s = pd.Series([article, 0], index=cname)
    test_df = test_df.append(s, ignore_index=True)
pos = glob.glob("aclImdb/test/pos/*.txt")
for p in pos:
    f = open(p, "r", encoding="utf-8")
    article = f.read()
    f.close()
    s = pd.Series([article, 1], index=cname)
    test_df = test_df.append(s, ignore_index=True)
test_df

Unnamed: 0,article,target
0,Alan Rickman & Emma Thompson give good perform...,0
1,I have seen this movie and I did not care for ...,0
2,"In Los Angeles, the alcoholic and lazy Hank Ch...",0
3,"This film is bundled along with ""Gli fumavano ...",0
4,I only comment on really very good films and o...,0
5,When you look at the cover and read stuff abou...,0
6,Rollerskating vampires?! I'm sorry but even fo...,0
7,"Technically abominable (with audible ""pops"" be...",0
8,"When Hollywood is trying to grasp what an ""int...",0
9,Respected western auteur Budd Boetticher is wo...,0


In [9]:
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Dense, Flatten, Dropout
model = Sequential()
model.add(Embedding(2000, 32, input_length=100))
model.add(Flatten())
model.add(Dense(256, activation="relu"))
model.add(Dropout(0.25))
model.add(Dense(1, activation="sigmoid"))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 32)           64000     
_________________________________________________________________
flatten_2 (Flatten)          (None, 3200)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 256)               819456    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 257       
Total params: 883,713
Trainable params: 883,713
Non-trainable params: 0
_________________________________________________________________


In [10]:
from keras.preprocessing.text import Tokenizer
tok = Tokenizer(num_words=2000)
tok.fit_on_texts(train_df["article"])
# print(tok.word_index)

In [11]:
train_df_seq = tok.texts_to_sequences(train_df["article"])
test_df_seq = tok.texts_to_sequences(test_df["article"])
pd.DataFrame(train_df_seq)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1701,1702,1703,1704,1705,1706,1707,1708,1709,1710
0,777,16,28,4,1,115,11,19,1025.0,5.0,...,,,,,,,,,,
1,70,10,1,201,642,122,8,2,10.0,255.0,...,,,,,,,,,,
2,11,28,13,3,224,1347,5,866,140.0,9.0,...,,,,,,,,,,
3,204,107,46,99,8,58,110,18,11.0,28.0,...,,,,,,,,,,
4,1157,1,4,104,490,2,104,8,3.0,16.0,...,,,,,,,,,,
5,10,66,74,264,926,5,316,11,19.0,15.0,...,,,,,,,,,,
6,959,587,206,1200,6,1224,21,611,8.0,1.0,...,,,,,,,,,,
7,11,428,642,43,16,49,1,1651,43.0,5.0,...,,,,,,,,,,
8,10,5,64,11,17,85,9,185,3.0,49.0,...,,,,,,,,,,
9,11,19,44,5,27,1,246,10,25.0,123.0,...,,,,,,,,,,


In [12]:
from keras.preprocessing.sequence import pad_sequences
train_df_pad = pad_sequences(train_df_seq, maxlen=100)
test_df_pad = pad_sequences(test_df_seq, maxlen=100)
pd.DataFrame(train_df_pad)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0,0,0,0,0,0,0,0,0,0,...,36,463,2,222,3,1016,174,20,49,808
1,3,75,17,8,189,9,562,27,57,1189,...,10,380,47,23,768,5,166,180,12,70
2,866,140,9,44,3,1033,2,1135,860,18,...,518,242,1927,36,11,17,45,30,29,611
3,257,709,177,47,35,26,6,1,333,4,...,1816,95,924,899,20,22,7,7,297,155
4,651,137,229,5,229,7,7,1,104,904,...,31,43,22,63,89,456,48,568,5,1810
5,7,7,20,1,1120,496,2,68,774,8,...,63,456,48,571,5,95,7,7,339,155
6,19,1838,163,32,586,5,758,139,5,1,...,4,3,49,62,302,4,40,15,65,202
7,0,0,0,0,0,0,0,0,0,0,...,12,13,49,2,6,70,1022,16,11,944
8,199,3,4,1,111,18,130,268,9,418,...,28,4,1,246,99,204,123,107,785,242
9,19,1,308,80,533,47,276,437,1,62,...,5,816,11,19,92,78,35,92,1,1036


In [13]:
model.compile(loss="binary_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])

In [14]:
model.fit(train_df_pad, train_df["target"],
          batch_size=200, epochs=3,
          validation_split=0.1, verbose=2)

Train on 22500 samples, validate on 2500 samples
Epoch 1/3
 - 3s - loss: 0.5185 - acc: 0.7188 - val_loss: 0.3575 - val_acc: 0.8480
Epoch 2/3
 - 3s - loss: 0.2768 - acc: 0.8853 - val_loss: 0.4461 - val_acc: 0.8020
Epoch 3/3
 - 2s - loss: 0.1614 - acc: 0.9449 - val_loss: 0.5388 - val_acc: 0.7932


<keras.callbacks.History at 0x126463cc0>

In [15]:
model.evaluate(test_df_pad, test_df["target"])



[0.4463807354068756, 0.8224]

In [19]:
from keras.models import Model
partial = Model(inputs=model.input, outputs=model.layers[0].output)
partial.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2_input (InputLaye (None, 100)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 100, 32)           64000     
Total params: 64,000
Trainable params: 64,000
Non-trainable params: 0
_________________________________________________________________


In [40]:
reverse_dict = {tok.word_index[k]:k for k in tok.word_index}
print("Embedding前:", reverse_dict[test_df_pad[0][-1]])
print("Embedding後:", partial.predict(test_df_pad)[0][-1])

Embedding前: and
Embedding後: [-0.01033766  0.03140927  0.02222989  0.02868232  0.02750265 -0.02478197
 -0.01922186 -0.05038073 -0.00903559 -0.02837285  0.04578851  0.0227078
 -0.00707238 -0.00525953 -0.06136559  0.02909667  0.02389028 -0.01980509
  0.03031231 -0.02337675  0.00760312 -0.03395703 -0.03945349 -0.02929432
  0.02693814 -0.04774382 -0.00255563  0.0154931  -0.02970378  0.00523549
 -0.00476184 -0.04773061]


In [43]:
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN
from keras.layers import Dense, Flatten, Dropout
model = Sequential()
model.add(Embedding(2000, 32, input_length=100))
model.add(SimpleRNN(16))
model.add(Dense(256, activation="relu"))
model.add(Dropout(0.25))
model.add(Dense(1, activation="sigmoid"))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 100, 32)           64000     
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (None, 16)                784       
_________________________________________________________________
dense_7 (Dense)              (None, 256)               4352      
_________________________________________________________________
dropout_4 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 257       
Total params: 69,393
Trainable params: 69,393
Non-trainable params: 0
_________________________________________________________________


In [46]:
model.compile(loss="binary_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])

In [47]:
model.fit(train_df_pad, train_df["target"],
          batch_size=200, epochs=3,
          validation_split=0.1, verbose=2)

Train on 22500 samples, validate on 2500 samples
Epoch 1/3
 - 4s - loss: 0.2687 - acc: 0.8919 - val_loss: 0.4714 - val_acc: 0.7984
Epoch 2/3
 - 4s - loss: 0.2234 - acc: 0.9143 - val_loss: 0.4698 - val_acc: 0.8116
Epoch 3/3
 - 4s - loss: 0.1895 - acc: 0.9311 - val_loss: 0.5094 - val_acc: 0.8040


<keras.callbacks.History at 0x127957fd0>

In [48]:
model.evaluate(test_df_pad, test_df["target"])



[0.4443703394317627, 0.82376]