In [30]:
import re
import numpy as np
import random

from collections.abc import Sequence

from cached_property import cached_property
from gensim.models import KeyedVectors

from sklearn.model_selection import train_test_split

from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Activation, LSTM, Embedding, Dropout, Convolution1D

In [2]:
# Pretrained word2vec vectors, read from the gzipped binary-format file.
# Every embedding helper below looks tokens up in this object.
w2v = KeyedVectors.load_word2vec_format(
    fname='../data/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [3]:
def tokenize(text):
    """Split `text` into word tokens.

    A token is a maximal run of word characters (letters, digits and
    underscore); punctuation and whitespace act as separators and are dropped.

    Args:
        text (str): The text to split.

    Returns:
        list of str: The tokens in order of appearance; empty if there are none.
    """
    # Raw string: a backslash-w inside a normal string literal is an invalid
    # escape sequence and triggers a warning on current Python versions.
    return re.findall(r'\w+', text)

In [4]:
def sent_embed_iter(text):
    """Yield the `w2v` entry of each token of `text`, in token order.

    Tokens that are not contained in `w2v` are skipped, so the number of
    yielded items can be smaller than the number of tokens.
    """
    known_tokens = (tok for tok in tokenize(text) if tok in w2v)
    for tok in known_tokens:
        yield w2v[tok]

In [5]:
def plot_embed_iter(sents):
    """Yield the embeddings of every sentence in `sents` as one flat stream.

    Sentences are processed in the given order; within a sentence the items
    come out in the order `sent_embed_iter` produces them.
    """
    for sentence in sents:
        for vector in sent_embed_iter(sentence):
            yield vector

In [6]:
def parse_plots(path):
    """Generate plots, each as the list of its lines (one sentence per line).

    A line consisting solely of '<EOS>' closes the current plot. The marker
    itself is never part of a yielded plot.

    Args:
        path: Path of the plots file to read.

    Yields:
        list of str: The lines of one plot, in file order. Two consecutive
        '<EOS>' lines produce an empty list.
    """
    with open(path) as fh:

        plot = []
        for line in fh.read().splitlines():

            if line != '<EOS>':
                plot.append(line)

            else:
                yield plot
                plot = []

        # If the file does not end with '<EOS>', the lines collected after the
        # last marker would otherwise be dropped silently.
        if plot:
            yield plot

In [7]:
# Read the whole corpus into memory: one list of sentence lines per plot.
plots = [*parse_plots('../data/plots/plots')]

In [9]:
# Build a balanced dataset from the first 2000 plots: each usable plot adds its
# sentences in the original order (label True) and in a shuffled order
# (label False).
x, y = [], []

# Dedicated, seeded generator so the negative samples are the same on every
# run and the global `random` state is left alone.
shuffle_rng = random.Random(0)

for plot in plots[:2000]:

    # Without at least two distinct sentences every ordering is identical, so
    # the "shuffled" sample would be a copy of the positive one with the
    # opposite label. Such plots are skipped, which can make the dataset
    # slightly smaller than 2 * 2000 samples.
    if len(set(plot)) < 2:
        continue

    # random.sample may return the original order; redraw until it differs.
    shuffled = shuffle_rng.sample(plot, len(plot))
    while shuffled == plot:
        shuffled = shuffle_rng.sample(plot, len(plot))

    x.append(list(plot_embed_iter(plot)))
    y.append(True)

    x.append(list(plot_embed_iter(shuffled)))
    y.append(False)

In [10]:
# Bring every plot to exactly 1000 token vectors: shorter plots are zero-padded
# at the end, longer ones are truncated (pad_sequences cuts from the front by
# default). float32 halves the memory of this (samples, 1000, 300) tensor
# compared to Python `float` (float64); the embeddings are presumably stored
# as float32 anyway -- TODO confirm against w2v.vectors.dtype.
x = pad_sequences(x, maxlen=1000, padding='post', dtype='float32')

# Hold out a test set (train_test_split's default share). The model cells below
# read x_train / y_train / x_test / y_test, which no other visible cell
# defines. A fixed random_state keeps the split stable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    np.asarray(y, dtype='float32'),
    random_state=0,
)

In [11]:
# Sanity check of the padded tensor: (samples, padded length, embedding size).
x.shape

(4000, 1000, 300)

In [45]:
# Binary classifier over a plot's sequence of token embeddings: one 128-unit
# LSTM, dropout, then a single sigmoid output.
#
# Variants that were tried and left disabled:
#   * a Convolution1D(filters=64, kernel_size=2) front-end followed by
#     Dropout(0.5), placed before the LSTM;
#   * two stacked 128-unit LSTMs (the first with return_sequences=True).
model = Sequential([
    LSTM(128, input_shape=x_train[0].shape),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

In [46]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# alongside the loss.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [47]:
# The recorded run below trained for 10 epochs. State that explicitly instead
# of relying on the library default, which is not the same in every Keras
# release (newer versions document a default of 1).
model.fit(x_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x439b13710>

In [124]:
# Score the model on the held-out split. The returned values are ordered like
# model.metrics_names. The recorded run reports an accuracy of about 0.546,
# barely above the 0.5 chance level for a dataset built with one True and one
# False sample per plot.
model.evaluate(x_test, y_test)



[0.73442685270309449, 0.54599999976158142]

In [117]:
# Labels for the values returned by model.evaluate, in the same order.
model.metrics_names

['loss', 'acc']