In [16]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.model_selection import train_test_split

## Load Data

In [2]:
# Read the training corpus, one sentence per line. Each entry keeps its
# trailing newline, exactly as returned by readlines().
# `sentences` is deliberately NOT pre-initialised to []: if the read fails, the
# name must stay undefined instead of leaving an empty corpus in the kernel for
# later cells to silently train on.
with open("../data/train_data.txt", "r", encoding="utf-8") as file:
    sentences = file.readlines()

# Fail fast on an empty file rather than much later with an unrelated error.
if not sentences:
    raise ValueError("../data/train_data.txt contains no lines")

## Tokenize

In [5]:
# Learn the word -> integer index mapping from the whole corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

# Keras assigns word indices starting at 1; index 0 is reserved and is also the
# value pad_sequences pads with. The extra slot therefore accounts for index 0
# (no oov_token is configured on this tokenizer).
vocab_size = 1 + len(tokenizer.word_index)
print("Vocab size:", vocab_size)

Vocab size: 4040


## Convert sentences to sequences

In [8]:
# Encode every line as a list of integer word indices using the fitted
# tokenizer (one list per entry of `sentences`, in the same order).
sequences = tokenizer.texts_to_sequences(sentences)

## Generate dataset

In [20]:
# Build the training samples: for every tokenized line, emit each prefix made of
# its first 2, 3, ..., n tokens. A prefix needs at least one context token plus
# the token to predict, so lines with fewer than two tokens contribute nothing.
# The corpus is tokenized in a single batch call instead of once per line.
input_sequences = [
    tokens[:end]
    for tokens in tokenizer.texts_to_sequences(sentences)
    for end in range(2, len(tokens) + 1)
]

# Without this guard an unusable corpus would only surface as an opaque
# "max() arg is an empty sequence" error on the next statement.
if not input_sequences:
    raise ValueError("No training samples: every line has fewer than two tokens")

# Length of the longest prefix; every sample is padded up to it.
max_length = max(len(seq) for seq in input_sequences)

# Left-pad with zeros so the token to predict is always the last column.
# pad_sequences already returns a NumPy array, so no extra np.array() copy.
input_sequences = pad_sequences(input_sequences, maxlen=max_length, padding='pre')

# The last token of each padded row is the label; everything before it is the sample.
labels = input_sequences[:, -1]
X = input_sequences[:, :-1]


## One-hot encode labels

In [21]:
# One-hot encode the target token of every sample (required by the
# categorical_crossentropy loss used at compile time).
# The targets are read from the last column of the padded `input_sequences`
# rather than from `labels` itself, so re-running this cell yields the same
# result instead of one-hot encoding the already-encoded labels a second time.
labels = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=vocab_size)

## Generate training, testing and validation datasets

In [22]:
# Fixed seed so the train/validation/test partitions are identical on every run.
SPLIT_SEED = 42

# Hold out 20% of the samples for testing, then carve 20% of the remainder out
# for validation: roughly 64% train / 16% validation / 20% test overall.
# NOTE(review): samples are prefixes of the same sentences, so a random
# sample-level split puts near-duplicate prefixes in different partitions;
# validation/test scores are likely optimistic. Consider splitting by sentence.
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.2, shuffle=True, random_state=SPLIT_SEED
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=SPLIT_SEED
)

## Create model

In [26]:
# Next-word prediction network, assembled layer by layer.
model = tf.keras.models.Sequential()

# Token embedding. Inputs are max_length - 1 tokens long because the last token
# of every padded sequence was split off as the label.
model.add(tf.keras.layers.Embedding(vocab_size, 64, input_length=max_length - 1))

# Convolutional feature extractor followed by 2x temporal down-sampling.
# NOTE(review): no activation is passed to Conv1D, so the convolution is linear
# - confirm this is intended.
model.add(tf.keras.layers.Conv1D(filters=32, kernel_size=5))
model.add(tf.keras.layers.MaxPooling1D(pool_size=2))

# Stacked bidirectional LSTMs: the first returns the full sequence for the
# second to consume, the second returns only its final state.
# NOTE(review): 62 units here vs. 64 everywhere else - possibly a typo; kept as is.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(62, return_sequences=True)))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))

# Dense head ending in a softmax over the whole vocabulary.
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dense(vocab_size, activation='softmax'))

# Labels are one-hot vectors, hence categorical (not sparse) cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 24, 64)            258560    
                                                                 
 conv1d_2 (Conv1D)           (None, 20, 32)            10272     
                                                                 
 max_pooling1d_2 (MaxPoolin  (None, 10, 32)            0         
 g1D)                                                            
                                                                 
 bidirectional_4 (Bidirecti  (None, 10, 124)           47120     
 onal)                                                           
                                                                 
 bidirectional_5 (Bidirecti  (None, 128)               96768     
 onal)                                                           
                                                      

## Fit model

In [27]:
# Stop automatically once validation loss has not improved for 10 consecutive
# epochs and roll the model back to its best weights. This replaces interrupting
# the run by hand, which aborts fit() before `history` is ever assigned.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=10,
    restore_best_weights=True,
)

# epochs is now only an upper bound; early stopping normally ends training sooner.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=500,
    callbacks=[early_stopping],
)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500

KeyboardInterrupt: 