__1 - Importing needed dependencies__

In [1]:
import tensorflow as tf
import keras
import os
import sys
import re
import numpy as np
import pandas as pd
import random
import gensim
from gensim.models import Word2Vec
from sklearn.utils import shuffle
from keras_transformer import get_model, decode
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from keras.models import load_model

Using TensorFlow backend.


__2 - Declaring global variables__

In [2]:
EPOCHS = 3
SENTENCES_MAX_LENGTH = 30
BATCH_SIZE = 1
EMBEDDING_DIM = 128
HIDDEN_DIM = 1024
NUM_TRAIN_PLOTS = 2

__3 - Try read data__

In [3]:
def clean(text):
    '''
    '''
    text = text.strip()
    text = text.replace("ain't", "am not")
    text = text.replace("aren't", "are not")
    text = text.replace("can't", "cannot")
    text = text.replace("can't've", "cannot have")
    text = text.replace("'cause", "because")
    text = text.replace("could've", "could have")
    text = text.replace("couldn't", "could not")
    text = text.replace("couldn't've", "could not have")
    text = text.replace("didn't", "did not")
    text = text.replace("doesn't", "does not")
    text = text.replace("don't", "do not")
    text = text.replace("hadn't", "had not")
    text = text.replace("hadn't've", "had not have")
    text = text.replace("hasn't", "has not")
    text = text.replace("haven't", "have not")
    text = text.replace("haven't", "have not")
    text = text.replace("haven't", "have not")
    text = text.replace("haven't", "have not")
    text = text.replace("he'd", "he would")
    text = text.replace("haven't", "have not")
    text = text.replace("he'd've", "he would have")
    text = text.replace("'s", "")
    text = text.replace(".", " . ")
    text = text.replace("!", " ! ")
    text = text.replace("?", " ? ")
    text = text.replace(";", " ; ")
    text = text.replace(":", " : ")
    text = text.replace("\'", "")
    text = text.replace("\"", "")
    text = text.replace(",", "")
    text = text.replace("[", "")
    text = text.replace("]","")
    text = text.replace("{","")
    text = text.replace("}", "")
    text = text.replace("/", "")
    text = text.replace("|", "")
    text = text.replace("-", "")
    text = text.replace("(", "")
    text = text.replace(")", "")
    text = text.replace("$", "")
    text = text.replace("+", "")
    text = text.replace("*", "")
    text = text.replace("%", "")
    text = text.replace("#", "")
    text = text.lower()
    text = ''.join([i for i in text if not i.isdigit()])

    return text

try:
    dirname = os.path.abspath('')
    filepath = os.path.join(dirname, 'input_data/wiki_movie_plots.csv')
    dataframe = pd.read_csv(filepath, sep=',')
    plotsList = dataframe['Plot'].values
    print('{} plots imported.'.format(len(plotsList)))
    for idx, p in enumerate(plotsList):
        plotsList[idx] = clean(p)
    trainPlotsList = plotsList[:NUM_TRAIN_PLOTS] 
    print('Plots cleaned and training set ready.')

except IOError:
    sys.exit('Cannot find data!')

34886 plots imported.
Plots cleaned and training set ready.


__4 - Extract vocabulary__

In [4]:
# CREATE VOCABULARY OF WORDS
idx2word = []
word2idx = {'<PAD>' : 0, '<START>' : 1 , '<END>': 2}

for plot in trainPlotsList:
    words = plot.split(' ')

    b=True
    while b:
        if('' in words): 
            words.remove('')
        else: b = False

    for word in words:
        if word not in word2idx:
            word2idx[word] = len(word2idx)

for word in idx2word:
    word2idx[word] = len(word2idx)

idx2word = list(word2idx.keys())
print(idx2word[:3])

vocabLength = len(idx2word)
print('Vocabulary Size: {}'.format(vocabLength))

['<PAD>', '<START>', '<END>']
Vocabulary Size: 107


__Train embeddings__

In [5]:
if os.path.exists('./embeddings/plots_word2vec_skipgram_128.bin'):
    print('Plots Embeddings have already been trained')
else :
    sentences = []

    for plot in trainPlotsList:
        words = plot.split(' ')
        sentences.append(words)

    model = Word2Vec(sentences, min_count=1, sg=1, size=128)
    words = list(model.wv.vocab)
    print('{} WORDS '.format(len(words)))
    print('Printing first 100:')
    print(words[:100])

    model.save('embeddings/plots_word2vec_skipgram_128.bin')

Plots Embeddings have already been trained


__Extract embeddings matrix__

In [6]:
# Recreating embeddings index based on Tokenizer vocabulary
word2vecModel = gensim.models.Word2Vec.load('embeddings/plots_word2vec_skipgram_128.bin')
word2vec_vocabulary = word2vecModel.wv.vocab
embeddingIndex = dict()
counter = 0
for i, word in enumerate(idx2word):
    if word in word2vec_vocabulary :
        embeddingIndex[word] = word2vecModel[word]
    else:
        counter += 1

print("{} words without pre-trained embedding!".format(counter))
    
# Prepare embeddings matrix
embeddingMatrix = np.random.random((len(word2idx), EMBEDDING_DIM))
for i, word in enumerate(idx2word):
    embeddingVector = embeddingIndex.get(word)
    if embeddingVector is not None:
        embeddingMatrix[i] = embeddingVector

3 words without pre-trained embedding!


  


__Or use random weights__

In [None]:
embeddingMatrix = np.random.random((len(word2idx), EMBEDDING_DIM))

__4 - Preprocess__

In [7]:
# EXTRACT ENCODER & DECODER INPUT SENTENCES
inputSentences = []
targetSentences = []
outputSentences = []

for plot in trainPlotsList:
    words = plot.split(' ')

    b=True
    while b:
        if('' in words): 
            words.remove('')
        else: b = False

    sentences = [words[i:i+SENTENCES_MAX_LENGTH] for i in range(0, len(words), SENTENCES_MAX_LENGTH)]
    for s in sentences:
        for i in range(1, len(s)):
            encode_tokens, decode_tokens = s[:i], s[i:]
            encode_tokens = ' '.join(['<START>'] + encode_tokens + ['<END>'])
            output_tokens = ' '.join(decode_tokens + ['<END>'])
            decode_tokens = ' '.join(['<START>'] + decode_tokens + ['<END>'])
            inputSentences.append(encode_tokens)
            targetSentences.append(decode_tokens)
            outputSentences.append(output_tokens)


numSamples = len(inputSentences)
print('Num samples: {}'.format(numSamples))
stepsPerEpoch = numSamples//BATCH_SIZE
print('StepsPerEpoch: {}'.format(stepsPerEpoch))

# WRITE DATASET TO CSV
train_dataset = []

print("Creating dataset to feed Model . . . ")
dirname = os.path.abspath('')
filePath = os.path.join(dirname, os.path.join(dirname, 'preprocessed/dataset_dense_{}_{}_{}_{}_{}_{}.csv'.format(
EPOCHS, 
SENTENCES_MAX_LENGTH, 
BATCH_SIZE, 
EMBEDDING_DIM,
HIDDEN_DIM,
NUM_TRAIN_PLOTS)))
if os.path.exists(filePath):
    os.remove(filePath) 

d= {'input_encoder' : inputSentences, 'input_decoder' :targetSentences, 'output_decoder':outputSentences }
df = pd.DataFrame(data=d) 
df = shuffle(df)
df.to_csv(filePath, index=False)

print("Done.")

Num samples: 174
StepsPerEpoch: 174
Creating dataset to feed Model . . . 
Done.


__5 - Data generator__

In [None]:
def data_generator(word_2_idx, num_samples, max_length, vocab_length, batch_size=BATCH_SIZE):
    '''
    '''
    dirname = os.path.abspath('')
    filePath = os.path.join(dirname, 'preprocessed/dataset_dense_{}_{}_{}_{}_{}_{}.csv'.format(
    EPOCHS, 
    SENTENCES_MAX_LENGTH, 
    BATCH_SIZE, 
    EMBEDDING_DIM,
    HIDDEN_DIM,
    NUM_TRAIN_PLOTS))
    df = pd.read_csv(filePath)
    
    counter = 0

    while True:

        index = 0
        for idx, row in df.iterrows():
        
            if index >= batch_size:
                break

            if counter >= numSamples:
                break

            encoderTokens = row['input_encoder'].split(' ')
            decoderTokens = row['input_decoder'].split(' ')
            outputTokens = row['output_decoder'].split(' ')
            
            b = True
            while b:
                if('' in encoderTokens): encoderTokens.remove('')
                else: b = False

            b = True
            while b:
                if('' in decoderTokens): decoderTokens.remove('')
                else: b = False

            encoderInputData = np.zeros((1, max_length + 2), dtype='int')
            decoderInputData = np.zeros((1, max_length + 2), dtype='int')
            decoderTargetData = np.zeros((1, max_length + 2, 1),dtype='int')
            
            for t, word in enumerate(encoderTokens):
                encoderInputData[0, t] = word_2_idx[word]
            for t, word in enumerate(decoderTokens):
                decoderInputData[0, t] = word_2_idx[word]
            for t, word in enumerate(outputTokens):
                # decoderTargetData is ahead of decoderInputData by one timestep
                decoderTargetData[0, t, 0] = word_2_idx[word]
                
            df.drop(df.index[[idx]])
            index = index + 1
            counter = counter + 1
            
            yield([encoderInputData,decoderInputData], decoderTargetData)

__6 - Train the model__

In [None]:
dirname = os.path.abspath('')

transformerModelPath = os.path.join(dirname, 'models/tr_dense_{}_{}_{}_{}_{}_{}.h5'.format(
    EPOCHS, 
    SENTENCES_MAX_LENGTH, 
    BATCH_SIZE, 
    EMBEDDING_DIM,
    HIDDEN_DIM,
    NUM_TRAIN_PLOTS)
)

# Build the model
model = get_model(
    token_num=len(word2idx),
    embed_dim=EMBEDDING_DIM,
    encoder_num=1,
    decoder_num=1,
    head_num=16,
    hidden_dim=HIDDEN_DIM,
    attention_activation='relu',
    feed_forward_activation='relu',
    dropout_rate=0.05,
    embed_weights=embeddingMatrix,
)

model.compile(
    optimizer= keras.optimizers.Adam(),
    loss= keras.losses.sparse_categorical_crossentropy,
    metrics={},
    # Note: There is a bug in keras versions 2.2.3 and 2.2.4 which causes "Incompatible shapes" error, if any type of accuracy metric is used along with sparse_categorical_crossentropy. Use keras<=2.2.2 to use get validation accuracy.
)

model.summary()

if not os.path.exists(transformerModelPath):

    trainGen = data_generator(
            word_2_idx=word2idx,
            num_samples=numSamples,
            max_length=SENTENCES_MAX_LENGTH, 
            vocab_length=vocabLength
        )

    # Train the model
    model.fit_generator(
            trainGen,
            steps_per_epoch=stepsPerEpoch,
            epochs=EPOCHS
            )

    model.save_weights(transformerModelPath) 

else : 
    print('Model already trained')



Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Decoder-Input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Encoder-Input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Token-Embedding (EmbeddingRet)  [(None, None, 128),  13696       Encoder-Input[0][0]              
                                                                 Decoder-Input[0][0]              
_____________________

  1/174 [..............................] - ETA: 7:52 - loss: 9.7155['<START>', 'mirrors', 'and', 'breaking', '<END>']
['<START>', 'the', 'cash', 'register', '.', 'the', 'bartender', 'then', 'sprays', 'seltzer', 'water', 'in', 'nation', 'face', 'before', 'a', 'group', 'of', 'policemen', 'appear', 'and', 'order', 'everybody', 'to', 'leave', '.', '<END>']
['<START>', 'mirrors', 'and', 'breaking', '<END>']
['<START>', 'the', 'cash', 'register', '.', 'the', 'bartender', 'then', 'sprays', 'seltzer', 'water', 'in', 'nation', 'face', 'before', 'a', 'group', 'of', 'policemen', 'appear', 'and', 'order', 'everybody', 'to', 'leave', '.', '<END>']
['<START>', 'mirrors', 'and', 'breaking', '<END>']
['<START>', 'the', 'cash', 'register', '.', 'the', 'bartender', 'then', 'sprays', 'seltzer', 'water', 'in', 'nation', 'face', 'before', 'a', 'group', 'of', 'policemen', 'appear', 'and', 'order', 'everybody', 'to', 'leave', '.', '<END>']
  4/174 [..............................] - ETA: 1:58 - loss: 6.9423['

 28/174 [===>..........................] - ETA: 17s - loss: 2.6761['<START>', 'mirrors', 'and', 'breaking', '<END>']
['<START>', 'the', 'cash', 'register', '.', 'the', 'bartender', 'then', 'sprays', 'seltzer', 'water', 'in', 'nation', 'face', 'before', 'a', 'group', 'of', 'policemen', 'appear', 'and', 'order', 'everybody', 'to', 'leave', '.', '<END>']
['<START>', 'mirrors', 'and', 'breaking', '<END>']
['<START>', 'the', 'cash', 'register', '.', 'the', 'bartender', 'then', 'sprays', 'seltzer', 'water', 'in', 'nation', 'face', 'before', 'a', 'group', 'of', 'policemen', 'appear', 'and', 'order', 'everybody', 'to', 'leave', '.', '<END>']
['<START>', 'mirrors', 'and', 'breaking', '<END>']
['<START>', 'the', 'cash', 'register', '.', 'the', 'bartender', 'then', 'sprays', 'seltzer', 'water', 'in', 'nation', 'face', 'before', 'a', 'group', 'of', 'policemen', 'appear', 'and', 'order', 'everybody', 'to', 'leave', '.', '<END>']
 31/174 [====>.........................] - ETA: 15s - loss: 2.4475['<S

['<START>', 'the', 'cash', 'register', '.', 'the', 'bartender', 'then', 'sprays', 'seltzer', 'water', 'in', 'nation', 'face', 'before', 'a', 'group', 'of', 'policemen', 'appear', 'and', 'order', 'everybody', 'to', 'leave', '.', '<END>']
['<START>', 'mirrors', 'and', 'breaking', '<END>']
['<START>', 'the', 'cash', 'register', '.', 'the', 'bartender', 'then', 'sprays', 'seltzer', 'water', 'in', 'nation', 'face', 'before', 'a', 'group', 'of', 'policemen', 'appear', 'and', 'order', 'everybody', 'to', 'leave', '.', '<END>']
['<START>', 'mirrors', 'and', 'breaking', '<END>']
['<START>', 'the', 'cash', 'register', '.', 'the', 'bartender', 'then', 'sprays', 'seltzer', 'water', 'in', 'nation', 'face', 'before', 'a', 'group', 'of', 'policemen', 'appear', 'and', 'order', 'everybody', 'to', 'leave', '.', '<END>']
['<START>', 'the', 'cash', 'register', '.', 'the', 'bartender', 'then', 'sprays', 'seltzer', 'water', 'in', 'nation', 'face', 'before', 'a', 'group', 'of', 'policemen', 'appear', 'and', '

['<START>', 'mirrors', 'and', 'breaking', '<END>']
['<START>', 'the', 'cash', 'register', '.', 'the', 'bartender', 'then', 'sprays', 'seltzer', 'water', 'in', 'nation', 'face', 'before', 'a', 'group', 'of', 'policemen', 'appear', 'and', 'order', 'everybody', 'to', 'leave', '.', '<END>']
['<START>', 'mirrors', 'and', 'breaking', '<END>']
['<START>', 'the', 'cash', 'register', '.', 'the', 'bartender', 'then', 'sprays', 'seltzer', 'water', 'in', 'nation', 'face', 'before', 'a', 'group', 'of', 'policemen', 'appear', 'and', 'order', 'everybody', 'to', 'leave', '.', '<END>']
['<START>', 'the', 'cash', 'register', '.', 'the', 'bartender', 'then', 'sprays', 'seltzer', 'water', 'in', 'nation', 'face', 'before', 'a', 'group', 'of', 'policemen', 'appear', 'and', 'order', 'everybody', 'to', 'leave', '.', '<END>']
['<START>', 'mirrors', 'and', 'breaking', '<END>']
['<START>', 'the', 'cash', 'register', '.', 'the', 'bartender', 'then', 'sprays', 'seltzer', 'water', 'in', 'nation', 'face', 'before', 

['<START>', 'mirrors', 'and', 'breaking', '<END>']
['<START>', 'the', 'cash', 'register', '.', 'the', 'bartender', 'then', 'sprays', 'seltzer', 'water', 'in', 'nation', 'face', 'before', 'a', 'group', 'of', 'policemen', 'appear', 'and', 'order', 'everybody', 'to', 'leave', '.', '<END>']
['<START>', 'the', 'cash', 'register', '.', 'the', 'bartender', 'then', 'sprays', 'seltzer', 'water', 'in', 'nation', 'face', 'before', 'a', 'group', 'of', 'policemen', 'appear', 'and', 'order', 'everybody', 'to', 'leave', '.', '<END>']
['<START>', 'mirrors', 'and', 'breaking', '<END>']
['<START>', 'the', 'cash', 'register', '.', 'the', 'bartender', 'then', 'sprays', 'seltzer', 'water', 'in', 'nation', 'face', 'before', 'a', 'group', 'of', 'policemen', 'appear', 'and', 'order', 'everybody', 'to', 'leave', '.', '<END>']
['<START>', 'mirrors', 'and', 'breaking', '<END>']
['<START>', 'the', 'cash', 'register', '.', 'the', 'bartender', 'then', 'sprays', 'seltzer', 'water', 'in', 'nation', 'face', 'before', 

['<START>', 'mirrors', 'and', 'breaking', '<END>']
['<START>', 'the', 'cash', 'register', '.', 'the', 'bartender', 'then', 'sprays', 'seltzer', 'water', 'in', 'nation', 'face', 'before', 'a', 'group', 'of', 'policemen', 'appear', 'and', 'order', 'everybody', 'to', 'leave', '.', '<END>']
['<START>', 'the', 'cash', 'register', '.', 'the', 'bartender', 'then', 'sprays', 'seltzer', 'water', 'in', 'nation', 'face', 'before', 'a', 'group', 'of', 'policemen', 'appear', 'and', 'order', 'everybody', 'to', 'leave', '.', '<END>']
['<START>', 'mirrors', 'and', 'breaking', '<END>']
['<START>', 'the', 'cash', 'register', '.', 'the', 'bartender', 'then', 'sprays', 'seltzer', 'water', 'in', 'nation', 'face', 'before', 'a', 'group', 'of', 'policemen', 'appear', 'and', 'order', 'everybody', 'to', 'leave', '.', '<END>']
['<START>', 'mirrors', 'and', 'breaking', '<END>']
['<START>', 'the', 'cash', 'register', '.', 'the', 'bartender', 'then', 'sprays', 'seltzer', 'water', 'in', 'nation', 'face', 'before', 

Epoch 2/3


__7 - Generate sentences__

In [None]:
dirname = os.path.abspath('')

transformerModelPath = os.path.join(dirname, 'models/tr_dense_{}_{}_{}_{}_{}_{}.h5'.format(
    EPOCHS, 
    SENTENCES_MAX_LENGTH, 
    BATCH_SIZE, 
    EMBEDDING_DIM,
    HIDDEN_DIM,
    NUM_TRAIN_PLOTS)
)

# Build the model
model = get_model(
    token_num=len(word2idx),
    embed_dim=EMBEDDING_DIM,
    encoder_num=1,
    decoder_num=1,
    head_num=16,
    hidden_dim=HIDDEN_DIM,
    attention_activation='relu',
    feed_forward_activation='relu',
    dropout_rate=0.05,
    embed_weights=embeddingMatrix,
)

model.compile(
    optimizer= keras.optimizers.Adam(),
    loss= keras.losses.sparse_categorical_crossentropy,
    metrics={},
    # Note: There is a bug in keras versions 2.2.3 and 2.2.4 which causes "Incompatible shapes" error, if any type of accuracy metric is used along with sparse_categorical_crossentropy. Use keras<=2.2.2 to use get validation accuracy.
)

model.load_weights(transformerModelPath)

sentences = [
    'On a beautiful day three people',
    'On a rainy day', 
    'On a rainy day a strange man' , 
    'On a rainy day a strange man was walking',
    'On a rainy day a strange man was walking along the road' ,
    'During the election, in which he is elected governor, she is watching him',
    'Emily, who has no real job, is attracted to Ned'
]

decoded_sentences = []
    
for s in sentences:

    print('Generating from: {}'.format(s))
    encoderTokens = []
    s = clean(s)
    encoderwords = s.split(' ')
    for w in encoderwords:
        encoderTokens.append(word2idx[w])
    encoderTokens = [word2idx['<START>']] + encoderTokens + [word2idx['<END>']]
    encoderInputData = np.zeros((1, SENTENCES_MAX_LENGTH + 2), dtype='int64')

    decoded = decode(
    model,
    encoderTokens,
    start_token=word2idx['<START>'],
    end_token=word2idx['<END>'],
    pad_token=word2idx['<PAD>'],
    max_len=SENTENCES_MAX_LENGTH,
    )

    decodedPhrase = ''
    for x in decoded:
        decodedPhrase = decodedPhrase + ' ' + idx2word[x]

    decoded_sentences.append(decodedPhrase)
    print('Generated: {}'.format(decodedPhrase))

resultsModelPatht = os.path.join(dirname, 'output_data/out_tr_dense_{}_{}_{}_{}_{}_{}.csv'.format(
    EPOCHS, 
    SENTENCES_MAX_LENGTH, 
    BATCH_SIZE, 
    EMBEDDING_DIM,
    HIDDEN_DIM,
    NUM_TRAIN_PLOTS)
)

dict ={
    'phrase' : sentences,
    'generated' : decoded_sentences
}
sentiment_df = pd.DataFrame.from_dict(dict)
sentiment_df.to_csv(resultsModelPatht, index=False)