# Imports

In [1]:
import numpy as np

import pandas as pd
from pandas.io.json import json_normalize

import matplotlib.pyplot as plt

from collections import Counter

import os

import json

from sklearn.model_selection import train_test_split

from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow.keras.utils as ku 
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import string

import re

import pickle

# Constants

In [2]:
BASE_DATA_PATH = os.path.join('..', 'app', 'model')

MODEL_LOSS = 'categorical_crossentropy'
MODEL_OPTIMIZER = 'adam'
MODEL_DENSE_ACTIVATION = 'softmax'
MODEL_EMBEDDING_SIZE = 8
MODEL_LSTM_OUTPUT_DIM = 32
MODEL_DROPOUT_RATE = 0.2
MODEL_NAME = 'text_gen_model.h5'

VOCAB_FILE_NAME = 'anecedotes_vocab.json'

EPOCHS = 300

# Seeds

In [3]:
def seed_everything(seed_value = 0):
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    np.random.seed(seed_value)
    
seed_everything()

# Read dataset

In [14]:
def read_json(directory, filename):
    path = os.path.join(directory, filename)
    return pd.read_json(path)['quizz']

dataframes = [read_json(BASE_DATA_PATH, dataset) for dataset in os.listdir(BASE_DATA_PATH)]

In [16]:
def get_anecdotes(dataframes):
    anecdotes = []
    for i in range(0, len(dataframes)):
        df = dataframes[i]
        for level in ['debutant', 'confirme', 'expert']:
            for j in range(0, len(df[level])):
                anecdotes.append(df[level][j]['anecdote'])
    return anecdotes

anecdotes = get_anecdotes(dataframes)

# Preprocess

In [17]:
def get_most_frequents(anecdotes):
    word_list = [sentence.split(" ") for sentence in anecdotes]
    most_freq_words = Counter([item for sub_item in word_list for item in sub_item]).most_common()[:200]
    return [most_freq_word[0] for most_freq_word in most_freq_words]

def remove_most_freq_words(anecdote, most_freq_words):
    return " ".join([word for word in anecdote.split(" ") if word not in most_freq_words])
    

def preprocess(anecdote):
    anecdote = re.sub("'|«|»", " ", anecdote)
    translator = str.maketrans("", "", string.punctuation)
    anecdote = anecdote.translate(translator)
    anecdote = re.sub(" +", " ", anecdote)
    return anecdote.lower()

anecdotes = [preprocess(anecdote) for anecdote in anecdotes]
most_freq_words = get_most_frequents(anecdotes)
anecdotes = [remove_most_freq_words(anecdote, most_freq_words) for anecdote in anecdotes]

# Generate sequence of N-grams tokens

In [18]:
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    return total_words

total_words = get_sequence_of_tokens(anecdotes)
print(f"There are {total_words} tokens")

There are 5816 tokens


In [19]:
token_list = [tokenizer.texts_to_sequences([anecdote])[0] for anecdote in anecdotes]

In [20]:
tokenizer = Tokenizer()

def get_n_grams_sequence(corpus):
    
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
     
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words
    
inp_sequences, total_words = get_n_grams_sequence(anecdotes)

# Padding sequences and feature-target dataset

In [21]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen = max_sequence_len, padding = 'pre'))
    
    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
    label = ku.to_categorical(label, num_classes = total_words)
    return predictors, label, max_sequence_len

predictors, labels, max_sequence_len = generate_padded_sequences(inp_sequences)

In [22]:
print(f"Datas have shape, features : {predictors.shape}, targets: {labels.shape}")

Datas have shape, features : (8158, 17), targets: (8158, 5816)


# Split in train-validation set

In [23]:
x_train, x_val, y_train, y_val = train_test_split(predictors, labels, test_size = 0.2, shuffle = True)

# Create model

In [24]:
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    
    model = Sequential()
    
    model.add(Embedding(input_dim = total_words, 
                        output_dim = MODEL_EMBEDDING_SIZE, 
                        input_length = input_len))
    model.add(LSTM(MODEL_LSTM_OUTPUT_DIM))
    model.add(Dropout(MODEL_DROPOUT_RATE))
    model.add(Dense(total_words, activation = MODEL_DENSE_ACTIVATION))

    model.compile(loss = MODEL_LOSS, optimizer = MODEL_OPTIMIZER)
    
    return model

model = create_model(max_sequence_len, total_words)

model.summary()

W0903 10:54:03.690346 26948 deprecation.py:506] From C:\Users\jonas.freiburg\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\keras\initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0903 10:54:03.831347 26948 deprecation.py:506] From C:\Users\jonas.freiburg\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\ops\init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 17, 8)             46528     
_________________________________________________________________
lstm (LSTM)                  (None, 32)                5248      
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 5816)              191928    
Total params: 243,704
Trainable params: 243,704
Non-trainable params: 0
_________________________________________________________________


# Model callbacks

In [25]:
model_ckpt_cb = ModelCheckpoint(MODEL_NAME, monitor = 'val_loss', mode = 'min', save_best_only = True)
es_cb = EarlyStopping(monitor = 'val_loss', mode = 'min', verbose = 1, patience = 10)
callbacks = [model_ckpt_cb, es_cb]

# Fit model

In [26]:
hist = model.fit(predictors, 
                 labels,
                 validation_data = (x_val, y_val),
                 epochs = EPOCHS, 
                 verbose = 1, 
                 callbacks = callbacks, 
                 shuffle = True)

Train on 8158 samples, validate on 1632 samples


W0903 10:54:10.961126 26948 deprecation.py:323] From C:\Users\jonas.freiburg\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\ops\math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300


Epoch 78/300
Epoch 79/300
Epoch 80/300
Epoch 81/300
Epoch 82/300
Epoch 83/300
Epoch 84/300
Epoch 85/300
Epoch 86/300
Epoch 87/300
Epoch 88/300
Epoch 89/300
Epoch 90/300
Epoch 91/300
Epoch 92/300
Epoch 93/300
Epoch 94/300
Epoch 95/300
Epoch 96/300
Epoch 97/300
Epoch 98/300
Epoch 99/300
Epoch 100/300
Epoch 101/300
Epoch 102/300
Epoch 103/300
Epoch 104/300
Epoch 105/300
Epoch 106/300
Epoch 107/300
Epoch 108/300
Epoch 109/300
Epoch 110/300
Epoch 111/300
Epoch 112/300
Epoch 113/300
Epoch 114/300
Epoch 115/300
Epoch 116/300
Epoch 117/300
Epoch 118/300
Epoch 119/300
Epoch 120/300
Epoch 121/300
Epoch 122/300
Epoch 123/300
Epoch 124/300
Epoch 125/300
Epoch 126/300
Epoch 127/300
Epoch 128/300
Epoch 129/300
Epoch 130/300
Epoch 131/300
Epoch 132/300
Epoch 133/300
Epoch 134/300
Epoch 135/300
Epoch 136/300
Epoch 137/300
Epoch 138/300
Epoch 139/300
Epoch 140/300
Epoch 141/300
Epoch 142/300
Epoch 143/300
Epoch 144/300
Epoch 145/300
Epoch 146/300
Epoch 147/300
Epoch 148/300
Epoch 149/300
Epoch 150/300


Epoch 154/300
Epoch 155/300
Epoch 156/300
Epoch 157/300
Epoch 158/300
Epoch 159/300
Epoch 160/300
Epoch 161/300
Epoch 162/300
Epoch 163/300
Epoch 164/300
Epoch 165/300
Epoch 166/300
Epoch 167/300
Epoch 168/300
Epoch 169/300
Epoch 170/300
Epoch 171/300
Epoch 172/300
Epoch 173/300
Epoch 174/300
Epoch 175/300
Epoch 176/300
Epoch 177/300
Epoch 178/300
Epoch 179/300
Epoch 180/300
Epoch 181/300
Epoch 182/300
Epoch 183/300
Epoch 184/300
Epoch 185/300
Epoch 186/300
Epoch 187/300
Epoch 188/300
Epoch 189/300
Epoch 190/300
Epoch 191/300
Epoch 192/300
Epoch 193/300
Epoch 194/300
Epoch 195/300
Epoch 196/300
Epoch 197/300
Epoch 198/300
Epoch 199/300
Epoch 200/300
Epoch 201/300
Epoch 202/300
Epoch 203/300
Epoch 204/300
Epoch 205/300
Epoch 206/300
Epoch 207/300
Epoch 208/300
Epoch 209/300
Epoch 210/300
Epoch 211/300
Epoch 212/300
Epoch 213/300
Epoch 214/300
Epoch 215/300
Epoch 216/300
Epoch 217/300
Epoch 218/300
Epoch 219/300
Epoch 220/300
Epoch 221/300
Epoch 222/300
Epoch 223/300
Epoch 224/300
Epoch 

Epoch 229/300
Epoch 230/300
Epoch 231/300
Epoch 232/300
Epoch 233/300
Epoch 234/300
Epoch 235/300
Epoch 236/300
Epoch 237/300
Epoch 238/300
Epoch 239/300
Epoch 240/300
Epoch 241/300
Epoch 242/300
Epoch 243/300
Epoch 244/300
Epoch 245/300
Epoch 246/300
Epoch 247/300
Epoch 248/300
Epoch 249/300
Epoch 250/300
Epoch 251/300
Epoch 252/300
Epoch 253/300
Epoch 254/300
Epoch 255/300
Epoch 256/300
Epoch 257/300
Epoch 258/300
Epoch 259/300
Epoch 260/300
Epoch 261/300
Epoch 262/300
Epoch 263/300
Epoch 264/300
Epoch 265/300
Epoch 266/300
Epoch 267/300
Epoch 268/300
Epoch 269/300
Epoch 270/300
Epoch 271/300
Epoch 272/300
Epoch 273/300
Epoch 274/300
Epoch 275/300
Epoch 276/300
Epoch 277/300
Epoch 278/300
Epoch 279/300
Epoch 280/300
Epoch 281/300
Epoch 282/300
Epoch 283/300
Epoch 284/300
Epoch 285/300
Epoch 286/300
Epoch 287/300
Epoch 288/300
Epoch 289/300
Epoch 290/300
Epoch 291/300
Epoch 292/300
Epoch 293/300
Epoch 294/300
Epoch 295/300
Epoch 296/300
Epoch 297/300
Epoch 298/300
Epoch 299/300
Epoch 

# Plot training history

In [27]:
%matplotlib notebook 

def plot_training_hist(hist): 
    plt.plot(hist.history['loss'])  
    plt.plot(hist.history['val_loss'])
    plt.title('model loss')  
    plt.ylabel('loss')  
    plt.xlabel('epoch')  
    plt.legend(['train', 'valid'])
    
plot_training_hist(hist)

<IPython.core.display.Javascript object>

# Generate text

In [28]:
def generate_text(seed_text, model, max_sequence_len):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen = max_sequence_len - 1, padding = 'pre')
    predicted = model.predict(token_list, verbose = 0)
    predicted = predicted.flatten()
    predicted = np.random.choice(len(predicted), 3, p = predicted)
    output_words = []
    for word, index in tokenizer.word_index.items():
        if index in predicted:
            output_words.append(word)
    return output_words

generate_text("Le c est un language", model, 18)

['accueillir', 'ludique', 'poser']

# Save vocabulary as json

In [29]:
def save_vocab_as_json(filename, words):
    with open(filename, 'w') as json_file:
          json.dump(words, json_file)
            
save_vocab_as_json(VOCAB_FILE_NAME, tokenizer.word_docs)

# Save both tokenizers

In [30]:
with open('first_tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol = pickle.HIGHEST_PROTOCOL)
    
with open('second_tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol = pickle.HIGHEST_PROTOCOL)