## 1. Import dependencies 
Importing needed dependencies.
In this first step we also define all global variables that will help managing redundancy:
- __*PREPROCESS*__: preprocessing type (continuous or split on dots).
- __*EPOCHS*__: number of epochs in which the training is divided.
- __*SENTENCES_MAX_LENGTH*__: maximum length (in words) of the variable-length sequences.
- __*BATCH_SIZE*__: number of samples after which update the weights.
- __*EMBEDDING_DIM*__: number of neurons in the Embeddings layer.
- __*HIDDEN_DIM*__: number of hidden units in the feed-forward layers of the Transformer (passed to the model as `hidden_dim`).
- __*ENCODERS*__: number of encoders in the architecture.
- __*DECODERS*__: number of decoders in the architecture.
- __*DROPOUT_RATE*__: Dropout value.
- __*HEADS_ATTENTION*__: number of attention heads used by the multi-head attention mechanism.
- __*ACTIVATION_FUNCTION*__: Used by the feedforward layers in the transformer model.

In [1]:
import tensorflow as tf
import keras
import os
import sys
import re
import numpy as np
import pandas as pd
import random
import gensim
from gensim.models import Word2Vec
from sklearn.utils import shuffle
from keras_transformer import get_model, decode
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from keras.models import load_model

# Hyper-parameters shared by the preprocessing, training and generation cells.
# The first five are also embedded in the dataset / weights / output file names.
EPOCHS = 100  # passed to model.fit(epochs=...)
SENTENCES_MAX_LENGTH = 65  # chunk size, in words, used when splitting news into sequences
BATCH_SIZE = 64  # passed to model.fit(batch_size=...)
EMBEDDING_DIM = 128  # embed_dim of the model and width of the embeddings matrix
HIDDEN_DIM = 1024  # hidden_dim argument of get_model
ENCODERS = 6  # encoder_num argument of get_model
DECODERS = 6  # decoder_num argument of get_model
DROPOUT_RATE = 0.1  # dropout_rate argument of get_model
HEADS_ATTENTION = 8  # head_num argument of get_model
ACTIVATION_FUNCTION = 'relu'  # used for both attention_activation and feed_forward_activation

Using TensorFlow backend.


## 2. Import international news
The data available on Kaggle was prepared in January 2019, so it is very recent and contains many new technology-related terms; take care to handle these new terms properly while preprocessing.

To get more information click on <a href="https://www.kaggle.com/kevintoms/news-data">link to the Kaggle website</a> .

 

In [2]:
# Contractions expanded by clean(). Longer forms are listed first so that,
# for example, "can't've" is expanded before the shorter "can't" can consume
# its prefix and leave a dangling "'ve" behind.
_CONTRACTIONS = (
    ("can't've", "cannot have"),
    ("couldn't've", "could not have"),
    ("shouldn't've", "should not have"),
    ("wouldn't've", "would not have"),
    ("hadn't've", "had not have"),
    ("he'd've", "he would have"),
    ("ain't", "am not"),
    ("aren't", "are not"),
    ("can't", "cannot"),
    ("'cause", "because"),
    ("could've", "could have"),
    ("couldn't", "could not"),
    ("should've", "should have"),
    ("shouldn't", "should not"),
    ("would've", "would have"),
    ("wouldn't", "would not"),
    ("didn't", "did not"),
    ("doesn't", "does not"),
    ("don't", "do not"),
    ("hadn't", "had not"),
    ("hasn't", "has not"),
    ("haven't", "have not"),
    ("he'd", "he would"),
)

# Apostrophe suffixes that are dropped once the contractions above are expanded.
_DROPPED_SUFFIXES = ("'s", "'t", "'ve")

# Single-character rewrites, applied in one pass: sentence punctuation is
# padded with spaces so that it becomes a token of its own, every other
# listed symbol is deleted.
_PADDED_PUNCTUATION = ".!?;:,"
_REMOVED_CHARACTERS = "´‘’“”'\"-–—[]{}/|()$+*%#"
_CHARACTER_REWRITES = {ch: " {} ".format(ch) for ch in _PADDED_PUNCTUATION}
_CHARACTER_REWRITES.update({ch: None for ch in _REMOVED_CHARACTERS})
_CHARACTER_TABLE = str.maketrans(_CHARACTER_REWRITES)


def clean(text):
    '''Normalise a raw text so it can be split into word tokens on spaces.

    Steps, in order:
      1. lower-case the text;
      2. expand the known English contractions (longest form first);
      3. drop the remaining "'s", "'t" and "'ve" suffixes;
      4. surround . ! ? ; : , with spaces and delete quotes, dashes,
         brackets and other symbols;
      5. delete every digit character.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string. Runs of spaces are NOT collapsed, so callers
        splitting on ' ' must discard empty tokens.
    '''
    text = text.lower()

    for contraction, expansion in _CONTRACTIONS:
        text = text.replace(contraction, expansion)

    for suffix in _DROPPED_SUFFIXES:
        text = text.replace(suffix, "")

    text = text.translate(_CHARACTER_TABLE)

    return ''.join(ch for ch in text if not ch.isdigit())

# Load the raw news and keep only the cleaned articles that are short enough
# to be used as a single training sequence.
dirname = os.path.abspath('')
filepath = os.path.join(dirname, 'input_data/international_news.csv')

# Only the file read is guarded: a missing/unreadable file aborts the run,
# any other error surfaces with its own traceback.
try:
    dataframe = pd.read_csv(filepath, sep=',')
except IOError:
    sys.exit('Cannot find data!')

# Empty cells are read by pandas as NaN (a float), which clean() cannot
# lower-case, so they are dropped here.
newsList = dataframe['content'].dropna()

trainNewsList = []
for news in newsList:
    cleaned = clean(news)
    # Length is measured on a plain split(' '), so empty strings produced by
    # repeated spaces count as words. The bound is the shared sequence-length
    # constant rather than a second hard-coded 65.
    words = cleaned.split(' ')
    if len(words) < SENTENCES_MAX_LENGTH:
        trainNewsList.append(cleaned)

print('{} news imported and cleaned.'.format(len(trainNewsList)))

940 news imported and cleaned.


We need to investigate the maximum and average length of the news to better decide the preprocessing hyperparameters.

In [3]:
# Word count of every retained article, measured by splitting on single
# spaces (empty strings produced by repeated spaces are counted too).
lengths = [len(article.split(' ')) for article in trainNewsList]

sumLen = sum(lengths)
maxLen = max(lengths, default=0)
avgLen = sumLen / len(trainNewsList)

print('Number of reviews: {}'.format(len(trainNewsList)))
print('Max length: {}'.format(maxLen))
print('Avg length: {}'.format(avgLen))

Number of reviews: 940
Max length: 64
Avg length: 61.87978723404255


## 3. Extract Vocabulary
The vocabulary is saved as: 
- a __numpy array__ to map each encoding to the right word
- a __dictionary__ to map each word to its encoding number 

In [4]:
# CREATE VOCABULARY OF WORDS
# Ids 0-2 are reserved for the special tokens; real words are numbered in
# order of first appearance in the training news.
word2idx = {'<PAD>' : 0, '<START>' : 1 , '<END>': 2}

for news in trainNewsList:
    for word in news.split(' '):
        # Repeated spaces yield empty strings, which are not words.
        if word and word not in word2idx:
            word2idx[word] = len(word2idx)

# Keys come back in insertion order, so position i holds the word with id i.
idx2word = list(word2idx.keys())
print(idx2word[:3])

vocabLength = len(idx2word)
print('Vocabulary Size: {}'.format(vocabLength))

['<PAD>', '<START>', '<END>']
Vocabulary Size: 8805


## 4. Preprocess text

The Transformer model has an Encoder-Decoder architecture, so we can train it to generate variable-length sequences: the model itself decides how many words to generate for a given input sequence.
However, to achieve this result the text has to be preprocessed in a way that lets the model understand where a sequence starts and where it ends.
In fact, in the previous code cell we added these three tokens to the vocabulary:

```python
word2idx = {'<PAD>' : 0, '<START>' : 1 , '<END>': 2}
```
Each news item is divided into sequences of words, respecting a maximum length decided a priori. A sequence of *n* words generates one sample for every split point between two of its words, i.e. *n* − 1 samples.

For example, say SENTENCES_MAX_LENGTH is 4 and our text is "Hello my name is Dario and I love to code". 
- Sequences: "Hello my name is ", "Dario and I love", "to code"

Then with the first sequence:
- __EncoderInput__: "START Hello END" <br/>
  __DecoderInput__: "START my name is END" <br/>
  __Target__: "my name is END" <br/>
  
  
- __EncoderInput__: "START Hello my END" <br/>
  __DecoderInput__: "START name is END"<br/>
  __Target__: "name is END"<br/>
  
  
- __EncoderInput__:  "START Hello my name END"<br/>
  __DecoderInput__: "START is END"<br/>
  __Target__: "is END"<br/>
  
  
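- __EncoderInput__: "START Hello my name is END" <br/>
  __DecoderInput__: "START END"<br/>
  __Target__: "END"<br/>
  (Note: the code below iterates over `range(1, len(words))`, so this last split, where the decoder side is empty, is not actually generated.)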

In [5]:
def createInputTarget(words) :
    '''Turn a list of words into encoder / decoder / target training strings.

    The list is cut at every position from 1 to len(words) - 1. For each cut,
    the words before it form the encoder input and the words after it form
    the decoder side:
      - encoder input: '<START> head <END>'
      - decoder input: '<START> tail <END>'
      - target:        'tail <END>'

    Returns three parallel lists (encoder, decoder, output) of
    space-joined strings; all are empty when fewer than two words are given.
    '''
    start, end = ['<START>'], ['<END>']
    splits = [(words[:cut], words[cut:]) for cut in range(1, len(words))]

    encoder = [' '.join(start + head + end) for head, _ in splits]
    decoder = [' '.join(start + tail + end) for _, tail in splits]
    output = [' '.join(tail + end) for _, tail in splits]

    return encoder, decoder, output

def getWordTokens(sentence):
    '''Split a sentence on single spaces and terminate it with a '.' token.

    Empty strings produced by leading, trailing or repeated spaces are
    discarded; the returned list always ends with '.'.
    '''
    tokens = [piece for piece in sentence.split(' ') if piece != '']
    tokens.append('.')
    return tokens

def checkMaxLength(words, max_length=None):
    '''Split a token list into consecutive chunks of at most max_length tokens.

    Args:
        words: list of tokens to split.
        max_length: maximum number of tokens per chunk. When omitted, the
            module-level SENTENCES_MAX_LENGTH is used, which is the
            behaviour existing callers get.

    Returns:
        A list of chunks in the original order. A list that already fits is
        returned as the only element (the same object, not a copy), so an
        empty input yields [[]].

    Raises:
        ValueError: if the chunk size is not positive (slicing by such a
            size can never shorten the remainder).
    '''
    if max_length is None:
        max_length = SENTENCES_MAX_LENGTH
    if max_length <= 0:
        raise ValueError('max_length must be positive, got {}'.format(max_length))

    if len(words) <= max_length:
        return [words]

    return [words[i:i + max_length] for i in range(0, len(words), max_length)]

# EXTRACT ENCODER & DECODER INPUT SENTENCES
# NOTE: despite its name, targetSentences holds the decoder *inputs*
# ('<START> ... <END>'); the training targets are in outputSentences.
inputSentences = []
targetSentences = []
outputSentences = []

for news in trainNewsList :
    # Repeated spaces yield empty strings, which are not words.
    words = [word for word in news.split(' ') if word]

    # Cut the article into chunks of at most SENTENCES_MAX_LENGTH words and
    # let createInputTarget build one sample per split point of each chunk.
    for start in range(0, len(words), SENTENCES_MAX_LENGTH):
        chunk = words[start:start + SENTENCES_MAX_LENGTH]
        encoder, decoder, output = createInputTarget(chunk)
        inputSentences.extend(encoder)
        targetSentences.extend(decoder)
        outputSentences.extend(output)


numSamples = len(inputSentences)
print('Num samples: {}'.format(numSamples))
# Whole batches per epoch; a final partial batch is not counted.
stepsPerEpoch = numSamples//BATCH_SIZE
print('StepsPerEpoch: {}'.format(stepsPerEpoch))

# WRITE DATASET TO CSV
train_dataset = []  # not read by any of the visible cells

print("Creating dataset to feed Model . . . ")
dirname = os.path.abspath('')
# The file name encodes the hyper-parameters of the run.
filePath = os.path.join(dirname, 'preprocessed/dataset_news_{}_{}_{}_{}_{}.csv'.format(
    EPOCHS,
    SENTENCES_MAX_LENGTH,
    BATCH_SIZE,
    EMBEDDING_DIM,
    HIDDEN_DIM))

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(filePath), exist_ok=True)

d = {
    'input_encoder': inputSentences,
    'input_decoder': targetSentences,
    'output_decoder': outputSentences,
}
df = pd.DataFrame(data=d)
# Shuffle the rows so that samples from the same article are not contiguous.
df = shuffle(df)
# to_csv overwrites an existing file, so no explicit removal is needed.
df.to_csv(filePath, index=False)

print("Dataset printed on CSV.")

Num samples: 52913
StepsPerEpoch: 826
Creating dataset to feed Model . . . 
Dataset printed on CSV.


But what is the purpose of the padding token?
```python
'<PAD>' : 0
```

In order to be able to feed the model we need to create inputs of the same length.
This is why I defined a function to generate the final data with padding.

In [6]:
def generate_data(word_2_idx, num_samples, max_length, vocab_length, batch_size=BATCH_SIZE):
    '''Load the preprocessed CSV and encode it as padded integer arrays.

    The CSV written by the preprocessing cell is read back and its first
    num_samples rows are converted, word by word, to vocabulary ids.

    Args:
        word_2_idx: mapping from token to integer id.
        num_samples: number of rows of the CSV to encode.
        max_length: maximum number of words per sequence; arrays are
            max_length + 2 wide to leave room for the start/end tokens.
        vocab_length: unused, kept for backward compatibility.
        batch_size: unused, kept for backward compatibility.

    Returns:
        (encoderInputData, decoderInputData, decoderTargetData) with shapes
        (num_samples, max_length + 2), (num_samples, max_length + 2) and
        (num_samples, max_length + 2, 1). Positions past the end of a
        sequence keep the initial value 0.

    Raises:
        KeyError: if a token of the CSV is missing from word_2_idx.
        IndexError: if num_samples exceeds the number of rows in the CSV.
    '''
    dirname = os.path.abspath('')
    filePath = os.path.join(dirname, 'preprocessed/dataset_news_{}_{}_{}_{}_{}.csv'.format(
        EPOCHS,
        SENTENCES_MAX_LENGTH,
        BATCH_SIZE,
        EMBEDDING_DIM,
        HIDDEN_DIM))
    df = pd.read_csv(filePath)

    width = max_length + 2
    encoderInputData = np.zeros((num_samples, width), dtype='int')
    decoderInputData = np.zeros((num_samples, width), dtype='int')
    decoderTargetData = np.zeros((num_samples, width, 1), dtype='int')

    # Pull the columns out once instead of building a one-row frame per access.
    encoderRows = df['input_encoder'].values
    decoderRows = df['input_decoder'].values
    outputRows = df['output_decoder'].values

    for i in range(num_samples):
        if i % 10000 == 0:
            print('Generating feeding data... {}/{}'.format(i, num_samples))

        for t, word in enumerate(encoderRows[i].split(' ')):
            encoderInputData[i, t] = word_2_idx[word]
        for t, word in enumerate(decoderRows[i].split(' ')):
            decoderInputData[i, t] = word_2_idx[word]
        for t, word in enumerate(outputRows[i].split(' ')):
            # decoderTargetData is ahead of decoderInputData by one timestep
            decoderTargetData[i, t, 0] = word_2_idx[word]

    return encoderInputData, decoderInputData, decoderTargetData

## 5. Extract embeddings matrix
Loading pre-trained embeddings is good practice; in this case I calculated them with Google's Word2Vec model on the famous text8 dataset.
- *More details in the __train_embeddings.ipynb__ notebook* (to be executed if the .bin file does not exist)

The embeddings are simply the 128 (or whatever the dimensionality was during training) weights from a single neuron in the input layer to the 128 neurons in the hidden layer, trained to understand which words appear in the same context in a given text.

So we simply extract these weights for every single word in our vocabulary and build a matrix with them.

In [7]:
# Recreating embeddings index based on the vocabulary extracted above
word2vecModel = gensim.models.Word2Vec.load('embeddings/news_word2vec_skipgram_128.bin')
# Lookups go through the KeyedVectors object (`.wv`): indexing the model
# itself and reading `wv.vocab` are deprecated gensim 3.x APIs that newer
# releases no longer provide.
wordVectors = word2vecModel.wv

# word -> pre-trained vector, for the vocabulary words the model knows.
embeddingIndex = {word: wordVectors[word] for word in idx2word if word in wordVectors}
# idx2word holds unique words, so the difference is the number of misses.
counter = len(idx2word) - len(embeddingIndex)

print("{} words without pre-trained embedding!".format(counter))

# Prepare embeddings matrix
# Rows start out random; every word with a pre-trained vector then gets its
# row overwritten, so only the missing words keep random weights.
embeddingMatrix = np.random.random((len(word2idx), EMBEDDING_DIM))
for i, word in enumerate(idx2word):
    embeddingVector = embeddingIndex.get(word)
    if embeddingVector is not None:
        embeddingMatrix[i] = embeddingVector

3 words without pre-trained embedding!


  


### _Or it is possible to use random weights_
Do not execute this cell to use pre-trained embeddings.

In [None]:
# Alternative initialisation: one row per vocabulary entry and EMBEDDING_DIM
# columns, filled with random values instead of pre-trained vectors.
embeddingMatrix = np.random.random((len(word2idx), EMBEDDING_DIM))

## 7. Train the model
To build the transformer model I use an external library available on <a href="https://github.com/kpot/keras-transformer">this GitHub repository</a>.
Then the model is trained and its weights are saved in a .h5 file.

In [None]:
dirname = os.path.abspath('')

# The weights file name encodes the hyper-parameters of the run, so changing
# any of them triggers a fresh training.
runTag = (EPOCHS, SENTENCES_MAX_LENGTH, BATCH_SIZE, EMBEDDING_DIM, HIDDEN_DIM)
transformerModelPath = os.path.join(
    dirname, 'models/tr_news_{}_{}_{}_{}_{}.h5'.format(*runTag)
)

# Build the model
transformerConfig = {
    'token_num': len(word2idx),
    'embed_dim': EMBEDDING_DIM,
    'encoder_num': ENCODERS,
    'decoder_num': DECODERS,
    'head_num': HEADS_ATTENTION,
    'hidden_dim': HIDDEN_DIM,
    'attention_activation': ACTIVATION_FUNCTION,
    'feed_forward_activation': ACTIVATION_FUNCTION,
    'dropout_rate': DROPOUT_RATE,
    'embed_weights': embeddingMatrix,
}
model = get_model(**transformerConfig)

# Note: There is a bug in keras versions 2.2.3 and 2.2.4 which causes "Incompatible shapes" error, if any type of accuracy metric is used along with sparse_categorical_crossentropy. Use keras<=2.2.2 to use get validation accuracy.
model.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.sparse_categorical_crossentropy,
    metrics={},
)

model.summary()

if os.path.exists(transformerModelPath):
    # Weights for this configuration are already on disk: skip training.
    print('Model already trained')
else:
    encoderInputData, decoderInputData, decoderTargetData = generate_data(
        word2idx,
        numSamples,
        SENTENCES_MAX_LENGTH,
        vocabLength,
    )

    # Train the model
    model.fit(
        [encoderInputData, decoderInputData],
        decoderTargetData,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
    )

    model.save_weights(transformerModelPath)


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Decoder-Input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Encoder-Input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Token-Embedding (EmbeddingRet)  [(None, None, 128),  1127040     Encoder-Input[0][0]              
                                                                 Decoder-Input[0][0]              
_____________________

Generating feeding data... 0/52913
Generating feeding data... 10000/52913
Generating feeding data... 20000/52913
Generating feeding data... 30000/52913


## 8. Generate text
To conclude, here is the prediction script, which uses the `decode` function from the open-source library to predict the next word again and again.



In [None]:
dirname = os.path.abspath('')

# Same file name scheme as the training cell, so the weights loaded here are
# the ones trained with the current hyper-parameters.
transformerModelPath = os.path.join(dirname, 'models/tr_news_{}_{}_{}_{}_{}.h5'.format(
    EPOCHS, 
    SENTENCES_MAX_LENGTH, 
    BATCH_SIZE, 
    EMBEDDING_DIM,
    HIDDEN_DIM)
)

# Build the model
model = get_model(
    token_num=len(word2idx),
    embed_dim=EMBEDDING_DIM,
    encoder_num=ENCODERS,
    decoder_num=DECODERS,
    head_num=HEADS_ATTENTION,
    hidden_dim=HIDDEN_DIM,
    attention_activation=ACTIVATION_FUNCTION,
    feed_forward_activation=ACTIVATION_FUNCTION,
    dropout_rate=DROPOUT_RATE,
    embed_weights=embeddingMatrix,
)

model.compile(
    optimizer= keras.optimizers.Adam(),
    loss= keras.losses.sparse_categorical_crossentropy,
    metrics={},
    # Note: There is a bug in keras versions 2.2.3 and 2.2.4 which causes "Incompatible shapes" error, if any type of accuracy metric is used along with sparse_categorical_crossentropy. Use keras<=2.2.2 to use get validation accuracy.
)

model.load_weights(transformerModelPath)

# Prompts the model is asked to continue.
sentences = [
    'Facebook CEO said privacy is not so important',
    'Whatsapp users have complained that their privacy is not important for the company after', 
    'Police arrested a man after a woman claimed he had been following her since Friday. ' , 
    'China developed new social network and the app is ready for release. Citizens will identify others with a smartphone camera.',
]

decoded_sentences = []

for s in sentences:

    print('Generating from: {}'.format(s))

    # Same cleaning and tokenisation as the training data.
    encoderwords = [w for w in clean(s).split(' ') if w]

    # The vocabulary has no unknown-word token, so words never seen during
    # training cannot be encoded: report and skip them instead of failing
    # with a KeyError.
    unknownWords = [w for w in encoderwords if w not in word2idx]
    if unknownWords:
        print('Skipping words missing from the vocabulary: {}'.format(', '.join(unknownWords)))

    encoderTokens = [word2idx['<START>']]
    encoderTokens += [word2idx[w] for w in encoderwords if w in word2idx]
    encoderTokens.append(word2idx['<END>'])

    decoded = decode(
        model,
        encoderTokens,
        start_token=word2idx['<START>'],
        end_token=word2idx['<END>'],
        pad_token=word2idx['<PAD>'],
        max_len=SENTENCES_MAX_LENGTH,
    )

    # Every decoded id is mapped back to its word, each preceded by a space.
    decodedPhrase = ''.join(' ' + idx2word[x] for x in decoded)

    decoded_sentences.append(decodedPhrase)
    print('Generated: {}'.format(decodedPhrase))

resultsModelPath = os.path.join(dirname, 'output_data/out_fable_{}_{}_{}_{}_{}.csv'.format(
    EPOCHS, 
    SENTENCES_MAX_LENGTH, 
    BATCH_SIZE, 
    EMBEDDING_DIM,
    HIDDEN_DIM)
)

# Named `results` rather than `dict`: rebinding the builtin would break any
# later `dict()` call in the session (e.g. re-running the embeddings cell).
results = {
    'phrase' : sentences,
    'generated' : decoded_sentences
}
sentiment_df = pd.DataFrame.from_dict(results)
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(resultsModelPath), exist_ok=True)
sentiment_df.to_csv(resultsModelPath, index=False)