# 1. Text cleaning

### Import libraries and create utilitary functions

In [4]:
#import sys

In [9]:
#sys.executable

In [10]:
#!~/anaconda3/bin/python -m pip install spacy

In [13]:
#!python -m spacy download fr_core_news_sm

In [18]:
#!~/anaconda3/bin/python -m pip install tensorflow

In [1]:
import pandas as pd
import re
import string
import os
import random
import spacy

In [2]:
import tensorflow as tf

In [3]:
# Set up path for the text files

path_parent = os.path.dirname(os.getcwd())
source_dir = os.path.join(path_parent, "text_files")

In [4]:
def remove_space(text):
    text = text.strip()
    text = text.split()
    return " ".join(text)

def show_random_text(sourceDir):
    rand_text = random.choice(os.listdir(sourceDir))
    
    while rand_text[0] == ".":
        rand_text = random.choice(os.listdir(sourceDir))

    with open(os.path.join(source_dir, rand_text), "r") as f:
            data = f.read()
    print(data)
    
def read_single_text(sourceDir):
    rand_text = random.choice(os.listdir(sourceDir))
    
    while rand_text[0] == ".":
        rand_text = random.choice(os.listdir(sourceDir))

    with open(os.path.join(source_dir, rand_text), "r") as f:
            data = f.read()
    return data

In [5]:
data = read_single_text(source_dir)

In [6]:
data

"APPEL D’OFFRES NO DOCUMENT D’APPEL D’OFFRES (Sans contrat intégré) Page 2 de 3 Contrat Biens (Fournitures) Version détaillée 2019-12-20 «Denom_Soc_D_O» APPEL D’OFFRES NO «AO_No» «AO_Titre» Documentation d’appel d’offres APPEL D’OFFRES NO DOCUMENTATION D’APPELS D’OFFRES BIENS (FOURNITURES) CONTRAT Version détaillée (2019-12-20) . TABLE DES MATIÈRES PAGE PRÉAMBULE 9 0.00 INTERPRÉTATION 10 0.01 Terminologie 10 0.01.01 Appel d'Offres [Essentielle] 10 0.01.02 Avis d'Adjudication [Essentielle] 10 0.01.03 Bien [Importante] 10 0.01.04 Bien en Consignation [Facultative] 10 0.01.05 Bon de Commande [Essentielle] 10 0.01.06 Bordereau de Prix [Essentielle] 10 0.01.07 Changement de Contrôle [Essentielle] 10 0.01.08 Charge [Essentielle] 11 0.01.09 Contrat [Essentielle] 11 0.01.10 Devis [Essentielle] 11 0.01.11 Documents d'Appel d'Offres [Essentielle] 11 0.01.12 Établissement Participant [Essentielle] 11 0.01.13 Formulaire de Soumission [Essentielle] 11 0.01.14 Institution Financière [Essentielle] 12

### Clean text and transform into tokens

In [8]:
def clean_text(doc):
    tokens = doc.split()
    table = str.maketrans('','',string.punctuation)        # make a translation table
    tokens = [(w.translate(table)) for w in tokens]        # remove punctuation
    tokens = [word for word in tokens if word.isalpha()]   # remove non-alphanumeric special characters
    tokens = [word.lower() for word in tokens]             # convert into lowercase letters
    tokens = [word for word in tokens if word != 'essentielle']
    tokens = [word for word in tokens if word != 'importante']
    tokens = [word for word in tokens if word != 'facultative']
    return tokens

def clean_text_properly(doc):
    tokens = doc.split()
    tokens = [word for word in tokens if word != '[Essentielle]']
    tokens = [word for word in tokens if word != '[Importante]']
    tokens = [word for word in tokens if word != '[Facultative]']
    table = str.maketrans('','',string.punctuation)        # make a translation table
    tokens = [(w.translate(table)) for w in tokens]        # remove punctuation
    tokens = [word for word in tokens if word.isalpha()]   # remove non-alphanumeric special characters
    tokens = [word.lower() for word in tokens]             # convert into lowercase letters
    return tokens

In [9]:
tokens = clean_text_properly(data)

In [10]:
print(tokens[:500])

['appel', 'no', 'document', 'sans', 'contrat', 'intégré', 'page', 'de', 'contrat', 'biens', 'fournitures', 'version', 'détaillée', 'appel', 'no', 'documentation', 'appel', 'no', 'documentation', 'biens', 'fournitures', 'contrat', 'version', 'détaillée', 'table', 'des', 'matières', 'page', 'préambule', 'interprétation', 'terminologie', 'appel', 'doffres', 'avis', 'dadjudication', 'bien', 'bien', 'en', 'consignation', 'bon', 'de', 'commande', 'bordereau', 'de', 'prix', 'changement', 'de', 'contrôle', 'charge', 'contrat', 'devis', 'documents', 'dappel', 'doffres', 'établissement', 'participant', 'formulaire', 'de', 'soumission', 'institution', 'financière', 'organisme', 'public', 'partie', 'personne', 'personne', 'liée', 'renseignement', 'confidentiel', 'renseignement', 'personnel', 'représentants', 'légaux', 'soumission', 'primauté', 'droit', 'applicable', 'généralités', 'dates', 'et', 'délais', 'a', 'de', 'rigueur', 'b', 'calcul', 'références', 'financières', 'consentement', 'objet', 'c

In [11]:
len(tokens)

13093

In [12]:
len(set(tokens))  # number of unique words

1810

# 2. Create text sequences 

The number of tokens in a sequence depends on the nature of the document. For legal documents, we'll start with 25 words that should be enough to capture the context of the sentence. 

In [13]:
train_len = 25+1    # 25 training words and one target word

# Make an empty list of sequences
lines = []

for i in range(train_len, len(tokens)):   # from 0 to 25
    seq = tokens[i-train_len:i]           # create a sequence of 26 tokens 
    line = ' '.join(seq)                  # join tokens to create a line
    lines.append(line)
    if i > 100000:                        # take 100k words to train the model, to reduce the resources
        break

In [14]:
len(lines)

13067

In [15]:
lines[0]

'appel no document sans contrat intégré page de contrat biens fournitures version détaillée appel no documentation appel no documentation biens fournitures contrat version détaillée table des'

In [16]:
tokens[0], tokens[25]

('appel', 'des')

In [17]:
lines[1]

'no document sans contrat intégré page de contrat biens fournitures version détaillée appel no documentation appel no documentation biens fournitures contrat version détaillée table des matières'

In [18]:
tokens[1], tokens[26]

('no', 'matières')

In [19]:
import random
random.seed(101)
random_pick = random.randint(0,len(lines))

In [20]:
lines[random_pick]

'en vertu des présentes est suffisant est consigné dans un écrit et expédié par un mode de communication qui permet à la partie expéditrice de prouver'

# 3. Tokenization: convert text sequences to number sequences

In [21]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [22]:
# Create a Tokenizer object, fit it on the text, and transform to numerical sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines) 
sequences = tokenizer.texts_to_sequences(lines) 

In [23]:
# Each of these numbers is an id for a particular word

sequences[0]

[257,
 361,
 168,
 46,
 11,
 1810,
 694,
 1,
 11,
 20,
 1052,
 1051,
 695,
 257,
 361,
 295,
 257,
 361,
 295,
 20,
 1052,
 11,
 1051,
 695,
 1808,
 13]

In [24]:
tokenizer.index_word[25]

'qui'

In [25]:
for i in sequences[25]:
    print(f'{i} : {tokenizer.index_word[i]}')

13 : des
426 : matières
694 : page
696 : préambule
697 : interprétation
698 : terminologie
257 : appel
87 : doffres
47 : avis
427 : dadjudication
30 : bien
30 : bien
10 : en
40 : consignation
121 : bon
1 : de
78 : commande
227 : bordereau
1 : de
33 : prix
143 : changement
1 : de
151 : contrôle
296 : charge
11 : contrat
170 : devis


In [45]:
# Tokenizer can give a dictionary of all word counts

#tokenizer.word_counts

In [26]:
# Vocabulary size

vocabulary_size = len(tokenizer.word_counts)
vocabulary_size

1810

### Convert to Numpy Matrix

In [27]:
# 'sequences' is a list of lists, we can convert it to numpy matrix
sequences = np.array(sequences)

In [28]:
sequences

array([[ 257,  361,  168, ...,  695, 1808,   13],
       [ 361,  168,   46, ..., 1808,   13,  426],
       [ 168,   46,   11, ...,   13,  426,  694],
       ...,
       [ 600,   70,   28, ...,    1,   73, 1809],
       [  70,   28,    6, ...,   73, 1809,   11],
       [  28,    6,  256, ..., 1809,   11,   39]])

# 4. Feature / label split (create x and y)

Predict the last word in a sequence

In [29]:
# First 25 words (compare to 'sequences' : it's everything without the last index)
sequences[:,:-1]

array([[ 257,  361,  168, ..., 1051,  695, 1808],
       [ 361,  168,   46, ...,  695, 1808,   13],
       [ 168,   46,   11, ..., 1808,   13,  426],
       ...,
       [ 600,   70,   28, ...,  694,    1,   73],
       [  70,   28,    6, ...,    1,   73, 1809],
       [  28,    6,  256, ...,   73, 1809,   11]])

In [30]:
# last word
sequences[:,-1]

array([  13,  426,  694, ..., 1809,   11,   39])

In [31]:
x, y = sequences[:,:-1], sequences[:,-1]

In [32]:
x[0].shape

(25,)

In [33]:
x.shape # the number of sequences for the sliding window of 25 words

(13067, 25)

In [34]:
y.shape # a column of last words for each sequence

(13067,)

In [35]:
# convert those words to one-hot matrix
y = to_categorical(y, num_classes=vocabulary_size+1)
y.shape

(13067, 1811)

In [36]:
seq_len = x.shape[1]
seq_len

25

# 5. Building an LSTM model

In [37]:
# PARAMETERS CHOICE

# Activation = RELU
# The size of the output layer is 'vocabulary_size'
# Loss = 'categorical_crossentropy'

def create_model(vocabulary_size, seq_len):
    model = Sequential()
    # Embedding turns positive integers(indexes) into dense vectors of fixed size (see docs).
    
    # First LSTM layer
    model.add(Embedding(input_dim=vocabulary_size, output_dim=25, input_length=seq_len)) 
    
    # First LSTM layer
    model.add(LSTM(units=100, return_sequences=True)) # better to take multiples of seq_len; smalle batches => faster
    
    # Second LSTM layer
    model.add(LSTM(100))
    
    # Dense layer
    model.add(Dense(100, activation='relu'))
    
    # Final layer
    model.add(Dense(vocabulary_size, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
   
    model.summary()
    
    return model

In [38]:
# define model
model = create_model(vocabulary_size+1, seq_len) # +1 for Embeddings

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 25)            45275     
_________________________________________________________________
lstm (LSTM)                  (None, 25, 100)           50400     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 100)               10100     
_________________________________________________________________
dense_1 (Dense)              (None, 1811)              182911    
Total params: 369,086
Trainable params: 369,086
Non-trainable params: 0
_________________________________________________________________


In [39]:
# Train the model

model.fit(x, y, batch_size=256, epochs=200,verbose=1) # epochs: at least > 200

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/2

Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<tensorflow.python.keras.callbacks.History at 0x7f29b0d4d590>

# 6. Generating New Text

In [40]:
from random import randint
from pickle import dump, load
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [41]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    INPUTS:
    model : the model that was trained on text data
    tokenizer : tokenizer that was fit on text data
    seq_len : length of training sequence
    seed_text : raw string text to serve as the seed
    num_gen_words : number of words to be generated by the model
    '''
    
    # Final Output
    output_text = []
    
    # Intial Seed Sequence
    # this is a sentence to add a word and chop off the first word
    # and repet as many times as the number of words the user wants to generate
    input_text = seed_text  
    
    # Create num_gen_words
    for i in range(num_gen_words):
        
        # Take the first token (number)
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        
        # Pad sequences to our trained rate 
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        
        # Predict Class Probabilities for each word
        pred_word_ind = model.predict_classes(pad_encoded, verbose=0)[0] # [0] returns index 
        
        # Grab word
        pred_word = tokenizer.index_word[pred_word_ind] 
        
        # Update the sequence of input text (shifting one over with the new word)
        input_text += ' ' + pred_word
        
        output_text.append(pred_word)
        
    # Make it look like a sentence.
    return ' '.join(output_text)

### Get a random seed sequence

In [67]:
#lines[500]

In [42]:
import random
random.seed(101)
random_pick = random.randint(0,len(lines))

In [43]:
random_seed_text = lines[random_pick]
random_seed_text

'en vertu des présentes est suffisant est consigné dans un écrit et expédié par un mode de communication qui permet à la partie expéditrice de prouver'

### Generate new text

In [44]:
generate_text(model,tokenizer,seq_len,seed_text=random_seed_text,num_gen_words=20)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


'défaut naissance de commande et individuel facultative juridiction que le fournisseur ne peut être conforme aux dispositions du modèle de'

### Explore generated sequence

In [47]:
for i,word in enumerate(data.split()):
    if word == 'naissance':
        print(' '.join(data.split()[i-10:i+10]))
        print('\n')

54 CONTRAT D’APPROVISIONNEMENT intervenu en la ville de Lieu de naissance de l'Appel d'Offres (Ville), province de Québec, Canada. (Individuel)




### How to save and reuse the model

In [48]:
model.save('MVP1.0_LSTM_model.h5')

In [49]:
# Save the tokenizer object, as well (vocabulary, word counts)

dump(tokenizer, open('my_simple_tokenizer', 'wb'))

In [52]:
from tensorflow.keras.models import load_model
model = load_model('MVP1.0_LSTM_model.h5')
tokenizer =load(open('my_simple_tokenizer', 'rb'))

In [55]:
generate_text(model,tokenizer,seq_len,seed_text=random_seed_text,num_gen_words=20)

'défaut naissance de commande et individuel facultative juridiction que le fournisseur ne peut être conforme aux dispositions du modèle de'