- parse corpus
- story class, list of sentences
- shuffled_sentences method
- step through all words, map to embedding, make variable length tensor for whole plot
- predict correct / shuffled

In [74]:
import re
import numpy as np
import random

from collections.abc import Sequence

from cached_property import cached_property
from gensim.models import KeyedVectors

from sklearn.model_selection import train_test_split

from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Activation, LSTM, Embedding, Dropout

In [2]:
w2v = KeyedVectors.load_word2vec_format(
    '../data/GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

In [3]:
def tokenize(text):
    return re.findall('\w+', text)

In [4]:
def sent_embed_iter(text):
    for token in tokenize(text):
        if token in w2v:
            yield w2v[token]

In [5]:
def plot_embed_iter(sents):
    for sent in sents:
        yield from sent_embed_iter(sent)

In [6]:
def parse_plots(path):
    """Generate plot sentences.
    """
    with open(path) as fh:
        
        plot = []
        for line in fh.read().splitlines():
            
            if line != '<EOS>':
                plot.append(line)
                
            else:
                yield plot
                plot = []

In [7]:
plots = list(parse_plots('../data/plots/plots'))

In [107]:
x, y = [], []

for plot in plots[:1000]:
    
    x.append(list(plot_embed_iter(plot)))
    y.append(True)
    
    shuffled = random.sample(plot, len(plot))
    
    x.append(list(plot_embed_iter(shuffled)))
    y.append(False)

In [108]:
x = pad_sequences(x, 1000, padding='post', dtype=float)

In [109]:
x.shape

(2000, 1000, 300)

In [110]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

In [111]:
x_train.shape

(1500, 1000, 300)

In [112]:
x_test.shape

(500, 1000, 300)

In [113]:
model = Sequential()
model.add(LSTM(128, input_shape=x_train[0].shape))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

In [114]:
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [115]:
model.fit(x_train, y_train)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2864e37b8>

In [116]:
model.evaluate(x_test, y_test)



[0.7075846290588379, 0.48199999952316286]

In [117]:
model.metrics_names

['loss', 'acc']