__1 - Importing needed dependencies__

In [11]:
import tensorflow as tf
import keras
import os
import sys
import re
import numpy as np
import pandas as pd
import random
from sklearn.utils import shuffle
from keras_transformer import get_model, decode
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from keras.models import load_model

__2 - Declaring global variables__

In [23]:
EPOCHS = 1
SENTENCES_MAX_LENGTH = 10
BATCH_SIZE = 128
EMBEDDING_DIM = 128
HIDDEN_DIM = 1024
NUM_TRAIN_PLOTS = 2691

__3 - Read and clean the data__

In [24]:
def clean(text):
    '''
    Normalise a raw plot string before tokenisation.

    Surrounding whitespace is stripped, the punctuation marks . ! ? ; : ( ) ,
    are padded with one space on each side, quotes, brackets and a handful of
    symbols are deleted, and the result is lower-cased with every digit
    character removed.

    Args:
        text: the string to normalise.

    Returns:
        The normalised string. Runs of spaces are not collapsed.
    '''
    # Single-pass translation table: punctuation to pad, then characters to drop.
    table = {ord(mark): ' {} '.format(mark) for mark in '.!?;:(),'}
    table.update({ord(symbol): None for symbol in '\'"[]{}/|-$+*%#'})

    lowered = text.strip().translate(table).lower()
    return ''.join(ch for ch in lowered if not ch.isdigit())

# Load the raw plots. Only the file read is guarded: a missing or unreadable
# CSV stops the notebook with a short message, any other error surfaces as-is.
try:
    dirname = os.path.abspath('')
    filepath = os.path.join(dirname, 'input_data/dati_stamperia.csv')
    dataframe = pd.read_csv(filepath, sep=',')
except IOError:
    sys.exit('Cannot find data!')

plotsList = dataframe['text']
print('{} plots imported.'.format(len(plotsList)))

# Keep at most NUM_TRAIN_PLOTS plots (positional slice) and clean them into a
# new Series, so the column inside `dataframe` is never written through a slice.
plotsList = plotsList.iloc[:NUM_TRAIN_PLOTS].map(clean)
trainPlotsList = plotsList.iloc[:NUM_TRAIN_PLOTS]

2619 plots imported.


__4 - Extract vocabulary__

In [25]:
# CREATE VOCABULARY OF WORDS
# The three reserved tokens take ids 0, 1 and 2; every other word gets the next
# free id in order of first appearance. `word2idx` is filled alongside
# `idx2word` so the "already seen?" check is a dict lookup instead of a scan of
# the whole list for every token.
idx2word = ['<PAD>','<START>', '<END>']
word2idx = {word: index for index, word in enumerate(idx2word)}

for plot in plotsList:
    for word in plot.split(' '):
        # Consecutive spaces produce empty strings when splitting on ' ': skip them.
        if word and word not in word2idx:
            word2idx[word] = len(idx2word)
            idx2word.append(word)

vocabLength = len(idx2word)
print('Vocabulary Size: {}'.format(vocabLength))

Vocabulary Size: 74655


__4 - Preprocess__

In [26]:
# EXTRACT ENCODER & DECODER INPUT SENTENCES
# Every plot is cut into consecutive chunks of at most SENTENCES_MAX_LENGTH
# words. Each split point inside a chunk gives one sample: the words before the
# split feed the encoder, the words after it are what the decoder must produce.
inputSentences = []
targetSentences = []
outputSentences = []

for plot in trainPlotsList:
    # Split on single spaces and drop the empty strings left by repeated spaces.
    words = [w for w in plot.split(' ') if w]

    sentences = [words[i:i+SENTENCES_MAX_LENGTH] for i in range(0, len(words), SENTENCES_MAX_LENGTH)]
    for s in sentences:
        for i in range(1, len(s)):
            encode_tokens, decode_tokens = s[:i], s[i:]
            inputSentences.append(' '.join(['<START>'] + encode_tokens + ['<END>']))
            targetSentences.append(' '.join(['<START>'] + decode_tokens + ['<END>']))
            # Decoder target: the decoder input without <START>, i.e. one step ahead.
            outputSentences.append(' '.join(decode_tokens + ['<END>']))


numSamples = len(inputSentences)
print('Num samples: {}'.format(numSamples))
stepsPerEpoch = numSamples//BATCH_SIZE
print('StepsPerEpoch: {}'.format(stepsPerEpoch))

# WRITE DATASET TO CSV
print("Creating dataset to feed Model . . . ")
dirname = os.path.abspath('')
filePath = os.path.join(dirname, 'preprocessed/dataset_italian_{}_{}_{}_{}_{}_{}.csv'.format(
    EPOCHS,
    SENTENCES_MAX_LENGTH,
    BATCH_SIZE,
    EMBEDDING_DIM,
    HIDDEN_DIM,
    NUM_TRAIN_PLOTS))
# Create the output folder on a fresh checkout. An existing file needs no
# explicit removal because to_csv writes in overwrite mode by default.
os.makedirs(os.path.dirname(filePath), exist_ok=True)

d = {'input_encoder': inputSentences, 'input_decoder': targetSentences, 'output_decoder': outputSentences}
df = pd.DataFrame(data=d)
df = shuffle(df)
df.to_csv(filePath, index=False)

print("Done.")

Num samples: 357696
StepsPerEpoch: 5589
Creating dataset to feed Model . . . 
Done.


__5 - Data generator__

In [27]:
def data_generator(word_2_idx, num_samples, max_length, vocab_length, batch_size=BATCH_SIZE):
    '''
    Endlessly yield padded training batches read from the preprocessed CSV.

    The CSV path is derived from the module-level hyper-parameters (EPOCHS,
    SENTENCES_MAX_LENGTH, BATCH_SIZE, EMBEDDING_DIM, HIDDEN_DIM,
    NUM_TRAIN_PLOTS). Only the first `num_samples` rows are used. They are
    served in file order, `batch_size` rows per yield, and the walk restarts
    from the first row once the last one has been served.

    Args:
        word_2_idx: mapping from token string to integer id.
        num_samples: number of CSV rows to cycle through.
        max_length: maximum number of words per sentence; arrays are
            `max_length + 2` wide to leave room for <START> and <END>.
        vocab_length: unused; kept so existing calls keep working.
        batch_size: number of rows per yielded batch (the last batch of a pass
            may be smaller).

    Yields:
        ([encoder_input, decoder_input], decoder_target): the two inputs have
        shape (batch, max_length + 2), the target has shape
        (batch, max_length + 2, 1). Positions past the end of a sentence are 0.

    Raises:
        ValueError: if there are no rows to serve or `batch_size` is below 1.
        KeyError: if a token in the CSV is missing from `word_2_idx`.
    '''
    dirname = os.path.abspath('')
    filePath = os.path.join(dirname, 'preprocessed/dataset_italian_{}_{}_{}_{}_{}_{}.csv'.format(
        EPOCHS,
        SENTENCES_MAX_LENGTH,
        BATCH_SIZE,
        EMBEDDING_DIM,
        HIDDEN_DIM,
        NUM_TRAIN_PLOTS))
    df = pd.read_csv(filePath).iloc[:num_samples]
    samples = list(zip(df['input_encoder'], df['input_decoder'], df['output_decoder']))

    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')
    if not samples:
        # An empty dataset would otherwise make the loop below spin forever.
        raise ValueError('No samples available in {}'.format(filePath))

    def to_ids(sentence):
        # Split on single spaces and ignore empty strings from repeated spaces.
        return [word_2_idx[token] for token in sentence.split(' ') if token]

    width = max_length + 2

    while True:
        for start in range(0, len(samples), batch_size):
            batch = samples[start:start + batch_size]

            encoderInputData = np.zeros((len(batch), width), dtype='int')
            decoderInputData = np.zeros((len(batch), width), dtype='int')
            decoderTargetData = np.zeros((len(batch), width, 1), dtype='int')

            for row, (encoderText, decoderText, outputText) in enumerate(batch):
                encoderIds = to_ids(encoderText)
                decoderIds = to_ids(decoderText)
                # The stored target is the decoder input without <START>,
                # so it is ahead of decoderInputData by one timestep.
                outputIds = to_ids(outputText)

                encoderInputData[row, :len(encoderIds)] = encoderIds
                decoderInputData[row, :len(decoderIds)] = decoderIds
                decoderTargetData[row, :len(outputIds), 0] = outputIds

            yield([encoderInputData, decoderInputData], decoderTargetData)

__6 - Train the model__

In [28]:
dirname = os.path.abspath('')

# The weights file name encodes the hyper-parameters, so each configuration
# gets its own file and is trained only once.
transformerModelPath = os.path.join(dirname, 'models/tr_italian_{}_{}_{}_{}_{}_{}.h5'.format(
    EPOCHS,
    SENTENCES_MAX_LENGTH,
    BATCH_SIZE,
    EMBEDDING_DIM,
    HIDDEN_DIM,
    NUM_TRAIN_PLOTS)
)

# Build the model
model = get_model(
    token_num=len(word2idx),
    embed_dim=EMBEDDING_DIM,
    encoder_num=1,
    decoder_num=1,
    head_num=16,
    hidden_dim=HIDDEN_DIM,
    attention_activation='relu',
    feed_forward_activation='relu',
    dropout_rate=0.05,
    embed_weights=np.random.random((len(word2idx), EMBEDDING_DIM)),
)

model.compile(
    optimizer= keras.optimizers.Adam(),
    loss= keras.losses.sparse_categorical_crossentropy,
    metrics={},
    # Note: There is a bug in keras versions 2.2.3 and 2.2.4 which causes an "Incompatible shapes" error if any type of accuracy metric is used along with sparse_categorical_crossentropy. Use keras<=2.2.2 to get validation accuracy.
)

model.summary()

if not os.path.exists(transformerModelPath):

    # Create the target folder before the training run, so the weights are not
    # lost to a missing directory when they are saved at the end.
    os.makedirs(os.path.dirname(transformerModelPath), exist_ok=True)

    trainGen = data_generator(
        word_2_idx=word2idx,
        num_samples=numSamples,
        max_length=SENTENCES_MAX_LENGTH,
        vocab_length=vocabLength
    )

    # Train the model
    model.fit_generator(
        trainGen,
        steps_per_epoch=stepsPerEpoch,
        epochs=EPOCHS
    )

    model.save_weights(transformerModelPath)

else:
    # Weights for this configuration already exist on disk; they are not
    # loaded into `model` by this cell.
    print('Model already trained')



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Decoder-Input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Encoder-Input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Token-Embedding (EmbeddingRet)  [(None, None, 128),  9555840     Encoder-Input[0][0]              
                                                                 Decoder-Input[0][0]              
__________________________________________________________________________________________________
Encoder-Embedding (TrigPosEmbed (None, None, 128)    0           Token-Embedding[0][0]            
__________

__7 - Generate sentences__

In [22]:
dirname = os.path.abspath('')

transformerModelPath = os.path.join(dirname, 'models/tr_italian_{}_{}_{}_{}_{}_{}.h5'.format(
    EPOCHS,
    SENTENCES_MAX_LENGTH,
    BATCH_SIZE,
    EMBEDDING_DIM,
    HIDDEN_DIM,
    NUM_TRAIN_PLOTS)
)

# Build the model with the same architecture used for training, then load the
# saved weights into it.
model = get_model(
    token_num=len(word2idx),
    embed_dim=EMBEDDING_DIM,
    encoder_num=1,
    decoder_num=1,
    head_num=16,
    hidden_dim=HIDDEN_DIM,
    attention_activation='relu',
    feed_forward_activation='relu',
    dropout_rate=0.05,
    embed_weights=np.random.random((len(word2idx), EMBEDDING_DIM)),
)

model.compile(
    optimizer= keras.optimizers.Adam(),
    loss= keras.losses.sparse_categorical_crossentropy,
    metrics={},
    # Note: There is a bug in keras versions 2.2.3 and 2.2.4 which causes an "Incompatible shapes" error if any type of accuracy metric is used along with sparse_categorical_crossentropy. Use keras<=2.2.2 to get validation accuracy.
)

model.load_weights(transformerModelPath)

# Prompts to continue.
sentences = [
    'Il cacciatore',
    'Il cacciatore di taglie' ,
    'Il giovane Bobby (Dorfman) lascia il negozio della madre',
    'Una storia d\' amore' ,
    'Una storia d\' amore di una ballerina',
    'Sul treno verso'
]

decoded_sentences = []

for s in sentences:

    print('Generating from: {}'.format(s))

    # Same normalisation and tokenisation as the training data: clean, split on
    # single spaces, drop the empty strings left by repeated spaces.
    encoderwords = [w for w in clean(s).split(' ') if w]

    # The vocabulary has no unknown-word token, so words never seen during
    # training are reported and left out instead of raising KeyError.
    unknownWords = [w for w in encoderwords if w not in word2idx]
    if unknownWords:
        print('Skipping out-of-vocabulary words: {}'.format(' '.join(unknownWords)))

    encoderTokens = [word2idx[w] for w in encoderwords if w in word2idx]
    encoderTokens = [word2idx['<START>']] + encoderTokens + [word2idx['<END>']]

    decoded = decode(
        model,
        encoderTokens,
        start_token=word2idx['<START>'],
        end_token=word2idx['<END>'],
        pad_token=word2idx['<PAD>'],
        max_len=SENTENCES_MAX_LENGTH,
    )

    # Every token is prefixed with a space, so the phrase starts with one.
    decodedPhrase = ''.join(' ' + idx2word[x] for x in decoded)

    decoded_sentences.append(decodedPhrase)
    print('Generated: {}'.format(decodedPhrase))

resultsModelPatht = os.path.join(dirname, 'output_data/out_italian_{}_{}_{}_{}_{}_{}.csv'.format(
    EPOCHS,
    SENTENCES_MAX_LENGTH,
    BATCH_SIZE,
    EMBEDDING_DIM,
    HIDDEN_DIM,
    NUM_TRAIN_PLOTS)
)
# Create the output folder if it does not exist yet.
os.makedirs(os.path.dirname(resultsModelPatht), exist_ok=True)

# Named `results` rather than `dict`, which would shadow the builtin for every
# later cell in the kernel session.
results = {
    'phrase' : sentences,
    'generated' : decoded_sentences
}
sentiment_df = pd.DataFrame.from_dict(results)
sentiment_df.to_csv(resultsModelPatht, index=False)

Generating from: Il cacciatore
Generated:  <START> fratelli susan e peter sono negli states , 
lucy ( henley ) ed edmund pevensie ( keynes ) si piacciono
ma non vogliono ammetterlo . grazie agli abitanti
del <END>
Generating from: Il cacciatore di taglie
Generated:  <START> . e se mostra tante
persone ragionevoli e pacifiche , non nasconde i fomentatori
del terrorismo e i predicatori di odio , che
ottengono consensi in situazioni di disagio <END>
Generating from: Il giovane Bobby (Dorfman) lascia il negozio della madre
Generated:  <START> davvero marius
 ( fresnay ) . per un equivoco lo scambia per un poco di
buono ma poi tutto si chiarisce <END>
Generating from: Una storia d' amore
Generated:  <START> «non dare la palma d’oro a un film del
partito comunista» ) . <END>
Generating from: Una storia d' amore di una ballerina
Generated:  <START> la
quale iniziano a immaginare una nuova vita , abbordando
turiste americane . inedito in sala , il film e` reperibile
in dvd con <END>
Generating