In [1]:
from __future__ import print_function
#import Keras library
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM, Input, Bidirectional
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.metrics import categorical_accuracy

# spacy is used to work on text
import spacy

#import other libraries
import numpy as np
import pandas as pd
import random
import sys
import os
import time
import codecs
import collections
import string
import re
from six.moves import cPickle

Using TensorFlow backend.


In [2]:
df = pd.read_csv('wiki_movie_plots_deduped.csv')

In [25]:
df.Plot[0]#.split(' ')

"A bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1]"

In [3]:
def create_wordlist(doc):
    wl = []
    for word in re.split(' |\r\n',doc):
        word = word.lower()
        punc = string.punctuation.replace("'", "")
        if '[' in word and ']' in word:
            start = word.index('[')
            end = word.index(']')
            word = word[:start] + word[end+1:]
        for i in punc:
            word = word.replace(i, "")
        wl.append(word.lower())
    return wl

In [17]:
df['Plot'].shape

(34886,)

In [5]:
len(df.Plot[1].split(' '))

86

In [6]:
start = time.time()
wordlist = []

for i in df['Plot'][:100]:
    wl = create_wordlist(i)
    wordlist = wordlist + wl
print("Wall time: {} seconds".format(time.time() - start))

Wall time: 0.08117413520812988 seconds


In [7]:
len(wordlist)

15339

In [110]:
len(wordlist)

12993831

In [19]:
with open('word_list.txt', 'rb') as f:
    wordlist = cPickle.load(f)

In [20]:
wordlist = wordlist[:10000]

In [37]:
# count the number of words
word_counts = collections.Counter(wordlist)

# Mapping from index to word : that's the vocabulary
vocabulary_inv = [x[0] for x in word_counts.most_common()]
vocabulary_inv = list(sorted(vocabulary_inv))

# Mapping from word to index
vocab = {x: i for i, x in enumerate(vocabulary_inv)}
words = [x[0] for x in word_counts.most_common()]

#size of the vocabulary
vocab_size = len(words)
print("vocab size: ", vocab_size)

# #save the words and vocabulary
# with open('word_dict.txt', 'wb') as f:
#     cPickle.dump((words, vocab, vocabulary_inv), f)

vocab size:  2712


In [23]:
rnn_size = 256 # size of RNN
seq_length = 30 # sequence length
learning_rate = 0.001 #learning rate
sequences_step = 1 #step to create sequences

In [7]:
# with open('word_dict.txt', 'rb') as f:
#     w = cPickle.load(f)

In [24]:
#create sequences
sequences = []
next_words = []
for i in range(0, len(wordlist) - seq_length, sequences_step):
    sequences.append(wordlist[i: i + seq_length])
    next_words.append(wordlist[i + seq_length])

print('nb sequences:', len(sequences))

nb sequences: 9970


In [25]:
X = np.zeros((len(sequences), seq_length, vocab_size), dtype=np.bool)
y = np.zeros((len(sequences), vocab_size), dtype=np.bool)
for i, sentence in enumerate(sequences):
    for t, word in enumerate(sentence):
        X[i, t, vocab[word]] = 1
    y[i, vocab[next_words[i]]] = 1

In [26]:
def bidirectional_lstm_model(seq_length, vocab_size):
    print('Build LSTM model.')
    model = Sequential()
    model.add(Bidirectional(LSTM(rnn_size, activation="relu"),input_shape=(seq_length, vocab_size)))
    model.add(Dropout(0.6))
    model.add(Dense(vocab_size))
    model.add(Activation('softmax'))
    
    optimizer = Adam(lr=learning_rate)
    callbacks=[EarlyStopping(patience=2, monitor='val_loss')]
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=[categorical_accuracy])
    print("model built!")
    return model

In [28]:
rnn_size = 256 # size of RNN
seq_length = 30 # sequence length
learning_rate = 0.001 #learning rate

md = bidirectional_lstm_model(seq_length, vocab_size)
md.summary()

Build LSTM model.
model built!
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_2 (Bidirection (None, 512)               6080512   
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 2712)              1391256   
_________________________________________________________________
activation_2 (Activation)    (None, 2712)              0         
Total params: 7,471,768
Trainable params: 7,471,768
Non-trainable params: 0
_________________________________________________________________


In [29]:
batch_size = 32 # minibatch size
num_epochs = 50 # number of epochs

# callbacks=[EarlyStopping(patience=4, monitor='val_loss'),
#            ModelCheckpoint(filepath=save_dir + "/" + 'my_model_gen_sentences.{epoch:02d}-{val_loss:.2f}.hdf5',\
#                            monitor='val_loss', verbose=0, mode='auto', period=2)]
#fit the model
history = md.fit(X, y,
                 batch_size=batch_size,
                 shuffle=True,
                 epochs=num_epochs,
#                  callbacks=callbacks,
                 validation_split=0.1)


Train on 8973 samples, validate on 997 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50


Epoch 48/50
Epoch 49/50
Epoch 50/50


In [30]:
#save the model
md.save('my_model_generate_sentences.h5')

In [31]:
#load vocabulary
print("loading vocabulary...")
# vocab_file = os.path.join(save_dir, "words_vocab.pkl")

 
with open('word_dictt.txt', 'wb') as f:
    cPickle.dump((words, vocab, vocabulary_inv), f)

with open('word_dict.txt', 'rb') as f:
    words, vocab, vocabulary_inv = cPickle.load(f)

vocab_size = len(words)

from keras.models import load_model
# load the model
print("loading model...")
model = load_model('my_model_generate_sentences.h5')

loading vocabulary...
loading model...


In [32]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [36]:
len(vocab)

193201

In [58]:
df.Plot[1]

"The moon, painted with a smiling face hangs over a park at night. A young couple walking past a fence learn on a railing and look up. The moon smiles. They embrace, and the moon's smile gets bigger. They then sit down on a bench by a tree. The moon's view is blocked, causing him to frown. In the last scene, the man fans the woman with his hat because the moon has left the sky and is perched over her shoulder to see everything better."

In [60]:
words_number = 30 # number of words to generate
seed_sentences = "he is walking past " #seed sentence to start the generating.

#initiate sentences
generated = ''
sentence = []

#we shate the seed accordingly to the neural netwrok needs:
for i in range (seq_length):
    sentence.append("a")

seed = seed_sentences.split()

for i in range(len(seed)):
    sentence[seq_length-i-1]=seed[len(seed)-i-1]

generated += ' '.join(sentence)

#the, we generate the text
for i in range(words_number):
    #create the vector
    x = np.zeros((1, seq_length, vocab_size))
    for t, word in enumerate(sentence):
        x[0, t, vocab[word]] = 1.

    #calculate next word
    preds = model.predict(x, verbose=0)[0]
    next_index = sample(preds, 0.33)
    next_word = vocabulary_inv[next_index]

    #add the next word to the text
    generated += " " + next_word
    # shift the sentence by one, and and the next word at its end
    sentence = sentence[1:] + [next_word]

#print the whole text
print(generated)

a a a a a a a a a a a a a a a a a a a a a a a a a a he is walking past and a as his bob himself from a time and they enter the husband of a herself and her to his rival goldberg in the henry of a santa claus


In [53]:
generated[52:]

"he is walking past and a fence learn on a passing and an trusting boone and a thus when he has to onward and the into the moon's as he is a summer that"

In [56]:
words_number = 30 # number of words to generate
seed_sentences = "john would always go up the stairs only to see" #seed sentence to start the generating.

#initiate sentences
generated = ''
sentence = []

#we shate the seed accordingly to the neural netwrok needs:
for i in range (seq_length):
    sentence.append("a")

seed = seed_sentences.split()

for i in range(len(seed)):
    sentence[seq_length-i-1]=seed[len(seed)-i-1]

generated += ' '.join(sentence)

#the, we generate the text
for i in range(words_number):
    #create the vector
    x = np.zeros((1, seq_length, vocab_size))
    for t, word in enumerate(sentence):
        x[0, t, vocab[word]] = 1.

    #calculate next word
    preds = model.predict(x, verbose=0)[0]
    next_index = sample(preds, 0.33)
    next_word = vocabulary_inv[next_index]

    #add the next word to the text
    generated += " " + next_word
    # shift the sentence by one, and and the next word at its end
    sentence = sentence[1:] + [next_word]

#print the whole text
print(generated)

a a a a a a a a a a a a a a a a a a a a john would always go up the stairs only to see and fight on the santa claus she jack home to found the tramp and time and an she is has to the spread mollie he be be a new york


In [59]:
generated[40:]

'john would always go up the stairs only to see and fight on the santa claus she jack home to found the tramp and time and an she is has to the spread mollie he be be a new york'