# Imports

In [66]:
import numpy as np

import pandas as pd
from pandas.io.json import json_normalize

import matplotlib.pyplot as plt

import os

import json

from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow.keras.utils as ku 
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import string

import re

# Constants

In [64]:
# Directory whose entries are each loaded as one quiz dataset.
BASE_DATA_PATH = os.path.join('fumseck', 'app', 'model')

# Loss and optimizer handed to model.compile(); activation of the output Dense layer.
MODEL_LOSS = 'categorical_crossentropy'
MODEL_OPTIMIZER = 'adam'
MODEL_DENSE_ACTIVATION = 'softmax'
# Passed as output_dim of the Embedding layer.
MODEL_EMBEDDING_SIZE = 10 
# Passed as the first argument of the LSTM layer.
MODEL_LSTM_OUTPUT_DIM = 100
# Passed to the Dropout layer that follows the LSTM.
MODEL_DROPOUT_RATE = 0.2

# Number of epochs requested from model.fit(); early stopping may end training sooner.
EPOCHS = 100

# Read dataset

In [3]:
def read_json(directory, filename):
    """Load one quiz JSON file and return its ``'quizz'`` column.

    Args:
        directory: Folder that contains the file.
        filename: Name of the JSON file inside ``directory``.

    Returns:
        The ``'quizz'`` column of the frame parsed by ``pandas.read_json``.
    """
    quiz_frame = pd.read_json(os.path.join(directory, filename))
    return quiz_frame['quizz']

# os.listdir() yields entries in arbitrary order; sorting keeps the corpus order
# (and therefore the token ids and training sequences built from it) reproducible.
dataframes = [read_json(BASE_DATA_PATH, dataset) for dataset in sorted(os.listdir(BASE_DATA_PATH))]

In [22]:
def get_anecdotes(dataframes):
    """Collect the ``'anecdote'`` field of every question in every dataset.

    Args:
        dataframes: Sequence of objects indexable by the level names
            ``'debutant'``, ``'confirme'`` and ``'expert'``; each level holds
            an indexable collection of question records.

    Returns:
        A flat list of anecdotes, ordered by dataset, then by level (in the
        order listed above), then by question position.
    """
    levels = ('debutant', 'confirme', 'expert')
    return [
        dataset[level][position]['anecdote']
        for dataset in dataframes
        for level in levels
        for position in range(len(dataset[level]))
    ]

# Flatten all loaded datasets into one list of raw anecdotes.
anecdotes = get_anecdotes(dataframes)

# Preprocess

In [23]:
def preprocess(anecdote):
    """Normalise one anecdote before tokenisation.

    Steps, in order:
      1. apostrophes (straight ``'`` and typographic ``’`` / ``‘``) and
         guillemets become spaces, so elisions such as "l’homme" split
         into two words;
      2. the remaining ASCII punctuation (``string.punctuation``) is deleted;
      3. every run of whitespace (including non-breaking spaces and newlines)
         collapses to one space and the ends are stripped;
      4. the text is lower-cased.

    Args:
        anecdote: Raw text.

    Returns:
        The cleaned, lower-cased text.
    """
    anecdote = re.sub("['’‘«»]", " ", anecdote)
    # NOTE: punctuation is removed, not replaced by a space, so hyphenated
    # words end up joined ("dit-il" -> "ditil").
    anecdote = anecdote.translate(str.maketrans("", "", string.punctuation))
    anecdote = re.sub(r"\s+", " ", anecdote).strip()
    return anecdote.lower()

# Clean every anecdote; the cleaned list replaces the raw one under the same name.
anecdotes = list(map(preprocess, anecdotes))

# Generate sequences of n-gram tokens

In [28]:
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    """Fit the module-level ``tokenizer`` on ``corpus`` and size the vocabulary.

    Despite its name, this function does not return token sequences.

    Args:
        corpus: Texts handed to ``tokenizer.fit_on_texts``.

    Returns:
        ``len(tokenizer.word_index) + 1``.
    """
    tokenizer.fit_on_texts(corpus)
    vocabulary = tokenizer.word_index
    return 1 + len(vocabulary)

total_words = get_sequence_of_tokens(anecdotes)
print(f"There are {total_words} tokens")

There are 2208 tokens


In [39]:
# Encode the whole corpus in one call: one list of token ids per anecdote, in corpus order.
token_list = tokenizer.texts_to_sequences(anecdotes)

[[6,
  5,
  243,
  1,
  2,
  244,
  1,
  5,
  502,
  1,
  503,
  245,
  504,
  15,
  142,
  26,
  143,
  1,
  505,
  19,
  100,
  506,
  3,
  507,
  508,
  101],
 [246,
  10,
  247,
  4,
  248,
  144,
  27,
  12,
  509,
  13,
  41,
  3,
  510,
  60,
  15,
  511,
  512,
  28,
  3,
  102,
  4,
  7,
  8,
  145,
  146,
  9,
  513],
 [10,
  249,
  1,
  2,
  514,
  7,
  9,
  74,
  515,
  23,
  15,
  516,
  24,
  147,
  29,
  3,
  41,
  12,
  517,
  250],
 [6,
  251,
  12,
  252,
  518,
  519,
  7,
  148,
  520,
  18,
  14,
  38,
  149,
  18,
  150,
  26,
  253,
  32,
  521,
  522],
 [523,
  4,
  524,
  7,
  3,
  151,
  525,
  526,
  1,
  103,
  527,
  17,
  9,
  254,
  528,
  4,
  529,
  17,
  530,
  531],
 [532,
  17,
  533,
  534,
  535,
  7,
  147,
  29,
  5,
  9,
  12,
  75,
  255,
  103,
  6,
  256,
  13,
  536,
  537],
 [538, 34, 539, 43, 540, 10, 541, 257, 4, 542, 42, 35, 5, 258, 1, 76, 543],
 [14, 544, 545, 7, 244, 8, 546, 28, 152, 77, 547, 1, 259, 153, 4, 548, 1, 549],
 [550,
  551,

In [37]:
tokenizer = Tokenizer()

def get_n_grams_sequence(corpus):
    """Fit the module-level ``tokenizer`` and expand each text into n-gram prefixes.

    Each text is encoded as a list of token ids and then turned into all of
    its leading sub-sequences of length 2 up to its full length. Texts with
    fewer than two tokens contribute nothing.

    Args:
        corpus: Texts to fit the tokenizer on and to encode.

    Returns:
        A tuple ``(input_sequences, total_words)``: the list of prefixes and
        ``len(tokenizer.word_index) + 1``.
    """
    tokenizer.fit_on_texts(corpus)
    vocabulary_size = len(tokenizer.word_index) + 1

    prefixes = [
        encoded[:end]
        for encoded in tokenizer.texts_to_sequences(corpus)
        for end in range(2, len(encoded) + 1)
    ]
    return prefixes, vocabulary_size

inp_sequences, total_words = get_n_grams_sequence(anecdotes)

# Padding sequences and feature-target dataset

In [40]:
def generate_padded_sequences(input_sequences, num_classes = None):
    """Left-pad the n-gram sequences and split them into features and targets.

    All sequences are padded with leading zeros to the length of the longest
    one. The last token of each padded row is the target, one-hot encoded;
    every token before it is a feature.

    Args:
        input_sequences: Non-empty list of token-id lists.
        num_classes: Width of the one-hot targets. When omitted, the
            module-level ``total_words`` is used, as before.

    Returns:
        A tuple ``(predictors, label, max_sequence_len)``.

    Raises:
        ValueError: If ``input_sequences`` is empty.
    """
    if len(input_sequences) == 0:
        raise ValueError("input_sequences is empty: there is nothing to pad")
    if num_classes is None:
        num_classes = total_words

    max_sequence_len = max(len(sequence) for sequence in input_sequences)
    # pad_sequences already returns a NumPy array, so no extra np.array() wrap is needed.
    padded = pad_sequences(input_sequences, maxlen = max_sequence_len, padding = 'pre')

    predictors, label = padded[:, :-1], padded[:, -1]
    label = ku.to_categorical(label, num_classes = num_classes)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences, num_classes = total_words)

In [44]:
# Sanity check: features and targets must have the same number of rows.
print(f"Datas have shape, features : {predictors.shape}, targets: {label.shape}")

Datas have shape, features : (5023, 29), targets: (5023, 2208)


# Create model

In [55]:
def create_model(max_sequence_len, total_words):
    """Build and compile the next-word prediction network.

    Architecture: Embedding -> LSTM -> Dropout -> Dense (one unit per word).
    Accuracy is tracked alongside the loss so the training history holds an
    accuracy curve to plot.

    Args:
        max_sequence_len: Length of the padded sequences, target token
            included; the network input is one token shorter.
        total_words: Embedding input dimension and number of output units.

    Returns:
        The compiled ``Sequential`` model.
    """
    # The last token of every padded sequence is the label, not an input.
    input_len = max_sequence_len - 1

    model = Sequential([
        Embedding(input_dim = total_words,
                  output_dim = MODEL_EMBEDDING_SIZE,
                  input_length = input_len),
        LSTM(MODEL_LSTM_OUTPUT_DIM),
        Dropout(MODEL_DROPOUT_RATE),
        Dense(total_words, activation = MODEL_DENSE_ACTIVATION),
    ])

    model.compile(loss = MODEL_LOSS, optimizer = MODEL_OPTIMIZER, metrics = ['accuracy'])

    return model

model = create_model(max_sequence_len, total_words)

model.summary()

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 29, 10)            22080     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 2208)              223008    
Total params: 289,488
Trainable params: 289,488
Non-trainable params: 0
_________________________________________________________________


# Model callbacks

In [72]:
# Save to 'text_gen_model.h5' only when the monitored training loss reaches a new minimum.
model_ckpt_cb = ModelCheckpoint('text_gen_model.h5', monitor = 'loss', mode = 'min', save_best_only = True)
# Stop training after 10 epochs without a decrease of the monitored training loss.
es_cb = EarlyStopping(monitor = 'loss', mode = 'min', verbose = 1, patience = 10)
callbacks = [model_ckpt_cb, es_cb]

# Fit model

In [73]:
# NOTE(review): no validation data is passed here, so the returned history only
# contains training metrics. `hist` is bound only once fit() returns; if the run is
# interrupted the name stays undefined for the cells below.
hist = model.fit(predictors, label, epochs = EPOCHS, verbose = 1, callbacks = callbacks, shuffle = True)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100

KeyboardInterrupt: 

# Plot training history

In [None]:
def plot_training_hist(hist):
    """Plot the accuracy and loss curves recorded during training.

    Two stacked panels are drawn: accuracy on top, loss below. Only the
    series actually present in ``hist.history`` are plotted, so a run
    without validation data or without an accuracy metric no longer raises
    ``KeyError``. The accuracy metric is looked up under both ``'acc'`` and
    ``'accuracy'`` because the key name depends on the Keras version.

    Args:
        hist: Object exposing a ``history`` dict that maps metric names to
            per-epoch values, as returned by ``model.fit``.
    """
    history = hist.history
    # (subplot position, axis label, candidate keys for the training series)
    panels = (
        (211, 'accuracy', ('acc', 'accuracy')),
        (212, 'loss', ('loss',)),
    )

    for position, label, candidate_keys in panels:
        plt.subplot(position)
        legend = []

        train_key = next((key for key in candidate_keys if key in history), None)
        if train_key is not None:
            plt.plot(history[train_key])
            legend.append('train')
            valid_key = 'val_' + train_key
            if valid_key in history:
                plt.plot(history[valid_key])
                legend.append('valid')

        plt.title('model ' + label)
        plt.ylabel(label)
        plt.xlabel('epoch')
        if legend:
            plt.legend(legend)

plot_training_hist(hist)

# Generate text

In [57]:
def generate_text(seed_text, next_words = 1, model = None, max_sequence_len = None):
    """Extend ``seed_text`` with words predicted by the model.

    At each step the current text is encoded with the module-level
    ``tokenizer``, left-padded to the model input length, and the most
    probable next word is appended.

    Both calling conventions are accepted:
      * ``generate_text(seed_text, next_words, model, max_sequence_len)``
      * ``generate_text(seed_text, model, max_sequence_len)`` (legacy form,
        which generates a single word)

    Args:
        seed_text: Text to start from.
        next_words: Number of words to append.
        model: Trained model exposing ``predict``.
        max_sequence_len: Padded sequence length used at training time,
            target token included.

    Returns:
        The seed followed by the generated words, title-cased.
    """
    # Legacy positional form: the second and third arguments are really
    # (model, max_sequence_len), so shift them into place.
    if max_sequence_len is None:
        next_words, model, max_sequence_len = 1, next_words, model

    # Reverse lookup built once instead of scanning word_index on every step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen = max_sequence_len - 1, padding = 'pre')
        # predict_classes() no longer exists in recent TensorFlow releases;
        # taking the argmax of the softmax output is the equivalent.
        probabilities = model.predict(token_list, verbose = 0)
        predicted = int(np.argmax(probabilities, axis = -1)[0])

        output_word = index_to_word.get(predicted, "")
        if not output_word:
            # Index 0 is the padding slot and has no word; predicting again
            # from the same text would give the same result, so stop here.
            break
        seed_text += " " + output_word

    return seed_text.title()

In [None]:
# NOTE(review): the tokenizer is created without an oov_token, so seed words it never
# saw are presumably dropped before prediction — confirm the seed suits the corpus.
print(generate_text("united states", 5, model, max_sequence_len))