In [1]:
import glob
import pandas as pd
cname = ["article", "target"]
train_df = pd.DataFrame(columns=cname)
neg = glob.glob("aclImdb/train/neg/*.txt")
for n in neg:
    f = open(n, "r", encoding="utf-8")
    article = f.read()
    f.close()
    s = pd.Series([article, 0], index=cname)
    train_df = train_df.append(s, ignore_index=True)
pos = glob.glob("aclImdb/train/pos/*.txt")
for p in pos:
    f = open(p, "r", encoding="utf-8")
    article = f.read()
    f.close()
    s = pd.Series([article, 1], index=cname)
    train_df = train_df.append(s, ignore_index=True)
train_df

Unnamed: 0,article,target
0,Story of a man who has unnatural feelings for ...,0
1,Airport '77 starts as a brand new luxury 747 p...,0
2,This film lacked something I couldn't put my f...,0
3,"Sorry everyone,,, I know this is supposed to b...",0
4,When I was little my parents took me along to ...,0
5,"""It appears that many critics find the idea of...",0
6,The second attempt by a New York intellectual ...,0
7,"I don't know who to blame, the timid writers o...",0
8,This film is mediocre at best. Angie Harmon is...,0
9,The film is bad. There is no other way to say ...,0


In [2]:
cname = ["article", "target"]
test_df = pd.DataFrame(columns=cname)
neg = glob.glob("aclImdb/test/neg/*.txt")
for n in neg:
    f = open(n, "r", encoding="utf-8")
    article = f.read()
    f.close()
    s = pd.Series([article, 0], index=cname)
    test_df = test_df.append(s, ignore_index=True)
pos = glob.glob("aclImdb/test/pos/*.txt")
for p in pos:
    f = open(p, "r", encoding="utf-8")
    article = f.read()
    f.close()
    s = pd.Series([article, 1], index=cname)
    test_df = test_df.append(s, ignore_index=True)
test_df

Unnamed: 0,article,target
0,Once again Mr. Costner has dragged out a movie...,0
1,This is an example of why the majority of acti...,0
2,"First of all I hate those moronic rappers, who...",0
3,Not even the Beatles could write songs everyon...,0
4,Brass pictures (movies is not a fitting word f...,0
5,"A funny thing happened to me while watching ""M...",0
6,This German horror film has to be one of the w...,0
7,"Being a long-time fan of Japanese film, I expe...",0
8,"""Tokyo Eyes"" tells of a 17 year old Japanese g...",0
9,Wealthy horse ranchers in Buenos Aires have a ...,0


In [3]:
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Dense, Flatten, Dropout
model = Sequential()
model.add(Embedding(2000, 32, input_length=100))
model.add(Flatten())
model.add(Dense(256, activation="relu"))
model.add(Dropout(0.25))
model.add(Dense(1, activation="sigmoid"))
model.summary()

Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 32)           64000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 3200)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               819456    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 883,713
Trainable params: 883,713
Non-trainable params: 0
_________________________________________________________________


In [4]:
from keras.preprocessing.text import Tokenizer
tok = Tokenizer(num_words=2000)
tok.fit_on_texts(train_df["article"])
# print(tok.word_index)

In [5]:
train_df_seq = tok.texts_to_sequences(train_df["article"])
test_df_seq = tok.texts_to_sequences(test_df["article"])
pd.DataFrame(train_df_seq)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1701,1702,1703,1704,1705,1706,1707,1708,1709,1710
0,62,4,3,129,34,44,1414,15,3.0,514.0,...,,,,,,,,,,
1,514,14,3,159,1702,6,53,16,138.0,5.0,...,,,,,,,,,,
2,11,19,139,10,423,273,58,20,30.0,83.0,...,,,,,,,,,,
3,803,313,10,121,11,6,421,5,27.0,32.0,...,,,,,,,,,,
4,51,10,13,114,58,843,559,69,364.0,5.0,...,,,,,,,,,,
5,9,735,12,108,1415,166,1,323,4.0,3.0,...,,,,,,,,,,
6,1,330,586,31,3,159,779,8,326.0,71.0,...,,,,,,,,,,
7,10,89,121,34,5,1817,1,924,39.0,1.0,...,,,,,,,,,,
8,11,19,6,1495,30,115,6,14,160.0,14.0,...,,,,,,,,,,
9,1,19,6,75,47,6,54,82,93.0,5.0,...,,,,,,,,,,


In [6]:
from keras.preprocessing.sequence import pad_sequences
train_df_pad = pad_sequences(train_df_seq, maxlen=100)
test_df_pad = pad_sequences(test_df_seq, maxlen=100)
pd.DataFrame(train_df_pad)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0,0,0,0,0,0,0,62,4,3,...,624,31,702,84,702,378,2,67,27,107
1,161,1,113,215,84,104,55,731,714,44,...,1702,40,1609,27,11,354,39,1474,31,1
2,38,968,129,57,1,726,136,382,635,14,...,852,9,13,15,32,731,35,1906,15,620
3,6,96,1347,5,103,1,4,3,478,1402,...,33,507,297,297,238,631,4,11,6,70
4,5,1,210,118,22,40,178,5,29,4,...,22,4,338,895,900,40,11,19,112,572
5,0,0,0,0,0,0,0,0,0,0,...,677,808,4,10,59,25,5,1037,16,12
6,535,36,905,105,61,237,523,24,153,5,...,79,227,326,438,634,190,1706,90,3,17
7,5,63,94,3,17,11,97,25,74,52,...,40,15,1,794,18,15,1,153,14,70
8,0,0,0,0,0,0,0,0,0,0,...,11,944,6,1438,2,1341,12,778,125,1899
9,36,41,48,13,26,533,10,101,9,13,...,9,9,6,1347,891,22,23,3,280,334


In [7]:
model.compile(loss="binary_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])

In [8]:
model.fit(train_df_pad, train_df["target"],
          batch_size=200, epochs=3,
          validation_split=0.1, verbose=2)

Train on 22500 samples, validate on 2500 samples
Epoch 1/3
 - 5s - loss: 0.5328 - acc: 0.7084 - val_loss: 0.3557 - val_acc: 0.8468
Epoch 2/3
 - 3s - loss: 0.2809 - acc: 0.8826 - val_loss: 0.3581 - val_acc: 0.8492
Epoch 3/3
 - 3s - loss: 0.1652 - acc: 0.9430 - val_loss: 0.4505 - val_acc: 0.8264


<keras.callbacks.History at 0x2210291f400>

In [9]:
model.evaluate(test_df_pad, test_df["target"])



[0.4257326912593842, 0.82576]

In [10]:
from keras.models import Model
partial = Model(inputs=model.input, outputs=model.layers[0].output)
partial.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1_input (InputLaye (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 32)           64000     
Total params: 64,000
Trainable params: 64,000
Non-trainable params: 0
_________________________________________________________________


In [11]:
reverse_dict = {tok.word_index[k]:k for k in tok.word_index}
print("Embedding前:", reverse_dict[test_df_pad[0][-1]])
print("Embedding後:", partial.predict(test_df_pad)[0][-1])

Embedding前: in
Embedding後: [ 0.00641045 -0.00366396 -0.03669203 -0.01787174  0.02796779  0.02570538
  0.00708964  0.00762938 -0.02474386 -0.00783962  0.04447895 -0.03434133
  0.02453271  0.01493178  0.00871584  0.04981093  0.00612995  0.00976229
  0.01852541  0.0133808  -0.04844473  0.02841581  0.01140567 -0.03559555
 -0.00398138  0.00759745  0.0193195   0.01934365 -0.04921232 -0.01349904
 -0.04270388 -0.00440304]


In [12]:
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN
from keras.layers import Dense, Flatten, Dropout
model = Sequential()
model.add(Embedding(2000, 32, input_length=100))
model.add(SimpleRNN(16))
model.add(Dense(256, activation="relu"))
model.add(Dropout(0.25))
model.add(Dense(1, activation="sigmoid"))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 32)           64000     
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 16)                784       
_________________________________________________________________
dense_3 (Dense)              (None, 256)               4352      
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 257       
Total params: 69,393
Trainable params: 69,393
Non-trainable params: 0
_________________________________________________________________


In [13]:
model.compile(loss="binary_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])

In [14]:
model.fit(train_df_pad, train_df["target"],
          batch_size=200, epochs=3,
          validation_split=0.1, verbose=2)

Train on 22500 samples, validate on 2500 samples
Epoch 1/3
 - 4s - loss: 0.5128 - acc: 0.7334 - val_loss: 0.3885 - val_acc: 0.8316
Epoch 2/3
 - 3s - loss: 0.3337 - acc: 0.8589 - val_loss: 0.5068 - val_acc: 0.7628
Epoch 3/3
 - 4s - loss: 0.2935 - acc: 0.8780 - val_loss: 0.3736 - val_acc: 0.8344


<keras.callbacks.History at 0x22135720d30>

In [15]:
model.evaluate(test_df_pad, test_df["target"])



[0.35817610693454743, 0.84096]