__1 - Importing needed dependencies__

In [1]:
import tensorflow as tf
import keras
import os
import sys
import re
import numpy as np
import pandas as pd
import random
from sklearn.utils import shuffle
from keras_transformer import get_model, decode
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from keras.models import load_model

Using TensorFlow backend.


__2 - Declaring global variables__

In [2]:
EPOCHS = 1
SENTENCES_MAX_LENGTH = 30
BATCH_SIZE = 128
EMBEDDING_DIM = 128
HIDDEN_DIM = 1024
NUM_TRAIN_PLOTS = 3000

__3 - Text cleaning__

In [3]:
def clean(text):

    text = re.sub("[@#$+%*:()\"-]", ' ', text)
    text = text.replace("ain't", "am not")
    text = text.replace("aren't", "are not")
    text = text.replace("can't", "cannot")
    text = text.replace("can't've", "cannot have")
    text = text.replace("'cause", "because")
    text = text.replace("could've", "could have")
    text = text.replace("couldn't", "could not")
    text = text.replace("couldn't've", "could not have")
    text = text.replace("didn't", "did not")
    text = text.replace("doesn't", "does not")
    text = text.replace("don't", "do not")
    text = text.replace("hadn't", "had not")
    text = text.replace("hadn't've", "had not have")
    text = text.replace("hasn't", "has not")
    text = text.replace("haven't", "have not")
    text = text.replace("haven't", "have not")
    text = text.replace("haven't", "have not")
    text = text.replace("haven't", "have not")
    text = text.replace("he'd", "he would")
    text = text.replace("haven't", "have not")
    text = text.replace("he'd've", "he would have")
    text = text.replace("'s", "")
    text = text.replace("\'", "")
    text = text.replace(".", " .")
    text = text.replace(",", " ,")
    text = text.replace("!", " !")
    text = text.replace("?", " ?")
    text = text.lower()

    return text

__3 - Try read data__

In [4]:
def try_read_data():
    '''
    '''  
    try:
        dirname = os.path.abspath('')
        filepath = os.path.join(dirname, 'input_data/wiki_movie_plots.csv')
        print('1 - READING FILM PLOTS ...')
        dataframe = pd.read_csv(filepath, sep=',')
        plotsList = dataframe['Plot']
        print('{} plots imported.'.format(len(plotsList)))
        plotsList = plotsList[:NUM_TRAIN_PLOTS]
        print('2 - CLEANING TEXT ...')
        for idx, p in enumerate(plotsList):
            plotsList[idx] = clean(p)
        trainPlotsList = plotsList[:NUM_TRAIN_PLOTS]    

    except IOError:
        sys.exit('Cannot find data!')
    
    return plotsList, trainPlotsList

__4 - Preprocess__

In [5]:
def preprocess(plots_list, train_plots_list, batch_size=BATCH_SIZE): 
    '''
    '''
    print('3 - PREPROCESSING SENTECES')
 
    # EXTRACT ENCODER & DECODER INPUT SENTENCES
    maxLength = SENTENCES_MAX_LENGTH
    inputSentences = []
    targetSentences = []
    outputSentences = []

    for plot in train_plots_list:
        words = plot.split(' ')

        b=True
        while b:
            if('' in words): 
                words.remove('')
            else: b = False

        sentences = [words[i:i+maxLength] for i in range(0, len(words), maxLength)]
        for s in sentences:
            for i in range(1, len(s)):
                encode_tokens, decode_tokens = s[:i], s[i:]
                encode_tokens = ' '.join(['<START>'] + encode_tokens + ['<END>'])
                output_tokens = ' '.join(decode_tokens + ['<END>'])
                decode_tokens = ' '.join(['<START>'] + decode_tokens + ['<END>'])
                inputSentences.append(encode_tokens)
                targetSentences.append(decode_tokens)
                outputSentences.append(output_tokens)
                
    
    numSamples = len(inputSentences)
    print('Num samples: {}'.format(numSamples))
    stepsPerEpoch = numSamples//batch_size
    print('StepsPerEpoch: {}'.format(stepsPerEpoch))

    # CREATE VOCABULARY OF WORDS
    idx2word = ['<PAD>','<START>', '<END>']
    for plot in plotsList:

        words = plot.split(' ')

        b=True
        while b:
            if('' in words): 
                words.remove('')
            else: b = False

        for word in words:
            if word not in idx2word:
                idx2word.append(word)

    word2idx = {}
    for word in idx2word:
        word2idx[word] = len(word2idx)
    
    vocabLength = len(idx2word)
    print('Vocabulary Size: {}'.format(vocabLength))

    # WRITE DATASET TO TXT  
    train_dataset = []

    print("Creating dataset to feed Model . . . ")
    dirname = os.path.abspath('')
    filePath = os.path.join(dirname, os.path.join(dirname, 'preprocessed/dataset_{}_{}_{}_{}_{}_{}.csv'.format(
    EPOCHS, 
    SENTENCES_MAX_LENGTH, 
    BATCH_SIZE, 
    EMBEDDING_DIM,
    HIDDEN_DIM,
    NUM_TRAIN_PLOTS))  
    if os.path.exists(filePath):
        os.remove(filePath) 
    
    d= {'input_encoder' : inputSentences, 'input_decoder' :targetSentences, 'output_decoder':outputSentences }
    df = pd.DataFrame(data=d) 
    df = shuffle(df)
    df.to_csv(filePath, index=False)

    print("Done.")
    
    return idx2word, vocabLength, word2idx, stepsPerEpoch, numSamples, maxLength

__5 - Data generator__

In [6]:
def data_generator(word_2_idx, num_samples, max_length, vocab_length, batch_size=BATCH_SIZE):
    '''
    '''
    dirname = os.path.abspath('')
    filePath = os.path.join(dirname, 'preprocessed/dataset_{}_{}_{}_{}_{}_{}.csv'.format(
    EPOCHS, 
    SENTENCES_MAX_LENGTH, 
    BATCH_SIZE, 
    EMBEDDING_DIM,
    HIDDEN_DIM,
    NUM_TRAIN_PLOTS))
    df = pd.read_csv(filePath)
    
    counter = 0

    while True:

        index = 0
        for idx, row in df.iterrows():
        
            if index >= batch_size:
                break

            if counter >= numSamples:
                break

            encoderTokens = row['input_encoder'].split(' ')
            decoderTokens = row['input_decoder'].split(' ')
            outputTokens = row['output_decoder'].split(' ')
            
            b = True
            while b:
                if('' in encoderTokens): encoderTokens.remove('')
                else: b = False

            b = True
            while b:
                if('' in decoderTokens): decoderTokens.remove('')
                else: b = False

            encoderInputData = np.zeros((1, max_length + 2), dtype='int')
            decoderInputData = np.zeros((1, max_length + 2), dtype='int')
            decoderTargetData = np.zeros((1, max_length + 2, 1),dtype='int')
            
            for t, word in enumerate(encoderTokens):
                encoderInputData[0, t] = word_2_idx[word]
            for t, word in enumerate(decoderTokens):
                decoderInputData[0, t] = word_2_idx[word]
            for t, word in enumerate(outputTokens):
                # decoderTargetData is ahead of decoderInputData by one timestep
                decoderTargetData[0, t, 0] = word_2_idx[word]
                
            df.drop(df.index[[idx]])
            index = index + 1
            counter = counter + 1
            
            yield([encoderInputData,decoderInputData], decoderTargetData)

__6 - Generation function__

In [7]:
def generate_text(sentences, model, max_length, word_2_idx, idx_2_word):
    '''
    '''
    
    decoded_sentences = []
    
    for s in sentences:

        print('Generating from: {}'.format(s))
        encoderTokens = []
        s = clean(s)
        encoderwords = s.split(' ')
        for w in encoderwords:
            encoderTokens.append(word2idx[w])
        encoderTokens = [word2idx['<START>']] + encoderTokens + [word2idx['<END>']]
        encoderInputData = np.zeros((1, max_length + 2), dtype='int64')
        
        decoded = decode(
        model,
        encoderTokens,
        start_token=word2idx['<START>'],
        end_token=word2idx['<END>'],
        pad_token=word2idx['<PAD>'],
        max_len=maxLength,
        )

        decodedPhrase = ''
        for x in decoded:
            decodedPhrase = decodedPhrase + ' ' + idx2word[x]
        
        decoded_sentences.append(decodedPhrase)
        print('Generated: {}'.format(decodedPhrase))
    
    return decoded_sentences

__7 - Logic__

In [8]:
dirname = os.path.abspath('')
transformerModelPath = os.path.join(dirname, 'models/tr_{}_{}_{}_{}_{}_{}.h5'.format(
    EPOCHS, 
    SENTENCES_MAX_LENGTH, 
    BATCH_SIZE, 
    EMBEDDING_DIM,
    HIDDEN_DIM,
    NUM_TRAIN_PLOTS)
)

plotsList, trainPlotsList = try_read_data()
idx2word, vocabLength, word2idx, stepsPerEpoch, numSamples, maxLength = preprocess(
    plots_list=plotsList,
    train_plots_list=trainPlotsList
)

# Build the model
model = get_model(
    token_num=len(word2idx),
    embed_dim=EMBEDDING_DIM,
    encoder_num=1,
    decoder_num=1,
    head_num=16,
    hidden_dim=HIDDEN_DIM,
    attention_activation='relu',
    feed_forward_activation='relu',
    dropout_rate=0.05,
    embed_weights=np.random.random((len(word2idx), EMBEDDING_DIM)),
)

model.compile(
    optimizer= keras.optimizers.Adam(),
    loss= keras.losses.sparse_categorical_crossentropy,
    metrics={},
    # Note: There is a bug in keras versions 2.2.3 and 2.2.4 which causes "Incompatible shapes" error, if any type of accuracy metric is used along with sparse_categorical_crossentropy. Use keras<=2.2.2 to use get validation accuracy.
)

model.summary()

if not os.path.exists(transformerModelPath):

    trainGen = data_generator(
            word_2_idx=word2idx,
            num_samples=numSamples,
            max_length=maxLength, 
            vocab_length=vocabLength
        )

    # Train the model
    model.fit_generator(
            trainGen,
            steps_per_epoch=stepsPerEpoch,
            epochs=EPOCHS
            )

    decoded_sentences = generate_text(
        sentences = ['on a beautiful summer day three people'], 
        model = model, 
        max_length = maxLength, 
        word_2_idx = word2idx,
        idx_2_word = idx2word
    )
    
    print(decoded_sentences[0])

    model.save_weights(transformerModelPath) 

else : 
    print('Model already trained')



1 - READING FILM PLOTS ...
34886 plots imported.
2 - CLEANING TEXT ...
3 - PREPROCESSING SENTECES
Num samples: 849258
StepsPerEpoch: 6634
Vocabulary Size: 32588
Creating dataset to feed Model . . . 
Done.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Decoder-Input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Encoder-Input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Token-Embeddin

In [9]:
dirname = os.path.abspath('')
transformerModelPath = os.path.join(dirname, 'models/tr_{}_{}_{}_{}_{}_{}.h5'.format(
    EPOCHS, 
    SENTENCES_MAX_LENGTH, 
    BATCH_SIZE, 
    EMBEDDING_DIM,
    HIDDEN_DIM,
    NUM_TRAIN_PLOTS)
)

resultsModelPatht = os.path.join(dirname, 'output_data/tr_{}_{}_{}_{}_{}_{}.csv'.format(
    EPOCHS, 
    SENTENCES_MAX_LENGTH, 
    BATCH_SIZE, 
    EMBEDDING_DIM,
    HIDDEN_DIM,
    NUM_TRAIN_PLOTS)
)

model.load_weights(transformerModelPath)

sentences = ['On a beautiful summer day three people', 
    'The terrified servant leaves the rifle' , 
    'The president is in trouble']

decoded_sentences = generate_text(
    sentences = sentences,
    model = model, 
    max_length = maxLength,
    word_2_idx = word2idx, 
    idx_2_word = idx2word, 
)

dict ={
    'phrase' : sentences,
    'generated' : decoded_sentences
}
sentiment_df = pd.DataFrame.from_dict(dict)
sentiment_df.to_csv(resultsModelPatht, index=False)

Generating from: On a beautiful summer day three people
Generated:  <START> the kidnapping , while the ship with his granddaughter sally . the hope of finding a train owned by traveling <END>
Generating from: The terrified servant leaves the rifle
Generated:  <START> the kidnapping , while the ship with his granddaughter sally . the hope of finding a train owned by traveling <END>
Generating from: The president is in trouble
Generated:  <START> the kidnapping , while the ship with his granddaughter sally . the hope of finding a surprised and never revealed the cover of her father <END>
