# 1. Text cleaning

### Import libraries and create utilitary functions

In [9]:
import pandas as pd
import re
import string
import os
import random
import spacy

In [10]:
#!python -m spacy download fr_core_news_sm

In [11]:
import tensorflow as tf

In [12]:
# Set up path for the text files

path_parent = os.path.dirname(os.getcwd())
source_dir = os.path.join(path_parent, "text_files")

In [13]:
def remove_space(text):
    text = text.strip()
    text = text.split()
    return " ".join(text)

def show_random_text(sourceDir):
    rand_text = random.choice(os.listdir(sourceDir))
    
    while rand_text[0] == ".":
        rand_text = random.choice(os.listdir(sourceDir))

    with open(os.path.join(source_dir, rand_text), "r") as f:
            data = f.read()
    print(data)
    
def read_single_text(sourceDir):
    rand_text = random.choice(os.listdir(sourceDir))
    
    while rand_text[0] == ".":
        rand_text = random.choice(os.listdir(sourceDir))

    with open(os.path.join(source_dir, rand_text), "r") as f:
            data = f.read()
    return data

In [14]:
data = read_single_text(source_dir)

In [15]:
data

"APPEL D’OFFRES NO DOCUMENT D’APPEL D’OFFRES (Sans contrat intégré) Page 2 de 3 Contrat Biens techniques (Équipement) Version détaillée 2019-12-20 «Denom_Soc_D_O» APPEL D’OFFRES NO «AO_No» «AO_Titre» Documentation d’appel d’offres APPEL D’OFFRES NO DOCUMENTATION D’APPELS D’OFFRES BIENS TECHNIQUES (ÉQUIPEMENT) CONTRAT Version détaillée (2019-12-20) . TABLE DES MATIÈRES PAGE PRÉAMBULE 11 0.00 INTERPRÉTATION 12 0.01 Terminologie 12 0.01.01 Appel d'Offres [Essentielle] 12 0.01.02 Avis d'Adjudication [Essentielle] 12 0.01.03 Bien [Importante] 12 0.01.04 Bon de Commande [Essentielle] 12 0.01.05 Bordereau de Prix [Essentielle] 12 0.01.06 Changement de Contrôle [Essentielle] 12 0.01.07 Charge [Essentielle] 13 0.01.08 Consommable [Importante] 13 0.01.09 Consommable en Consignation [Importante] 13 0.01.10 Contrat [Essentielle] 13 0.01.11 Devis [Essentielle] 13 0.01.12 Documentation d'Usager [Essentielle] 13 0.01.13 Documents d'Appel d'Offres [Essentielle] 14 0.01.14 Échéancier [Importante] 14 0.

### Clean text and transform into tokens

In [17]:
def clean_text(doc):
    tokens = doc.split()
    table = str.maketrans('','',string.punctuation)        # make a translation table
    tokens = [(w.translate(table)) for w in tokens]        # remove punctuation
    tokens = [word for word in tokens if word.isalpha()]   # remove non-alphanumeric special characters
    tokens = [word.lower() for word in tokens]             # convert into lowercase letters
    tokens = [word for word in tokens if word != 'essentielle']
    tokens = [word for word in tokens if word != 'importante']
    tokens = [word for word in tokens if word != 'facultative']
    return tokens

def clean_text_properly(doc):
    tokens = doc.split()
    tokens = [word for word in tokens if word != '[Essentielle]']
    tokens = [word for word in tokens if word != '[Importante]']
    tokens = [word for word in tokens if word != '[Facultative]']
    table = str.maketrans('','',string.punctuation)        # make a translation table
    tokens = [(w.translate(table)) for w in tokens]        # remove punctuation
    tokens = [word for word in tokens if word.isalpha()]   # remove non-alphanumeric special characters
    tokens = [word.lower() for word in tokens]             # convert into lowercase letters
    return tokens

In [18]:
tokens = clean_text_properly(data)

In [20]:
print(tokens[:500])

['appel', 'no', 'document', 'sans', 'contrat', 'intégré', 'page', 'de', 'contrat', 'biens', 'techniques', 'équipement', 'version', 'détaillée', 'appel', 'no', 'documentation', 'appel', 'no', 'documentation', 'biens', 'techniques', 'équipement', 'contrat', 'version', 'détaillée', 'table', 'des', 'matières', 'page', 'préambule', 'interprétation', 'terminologie', 'appel', 'doffres', 'avis', 'dadjudication', 'bien', 'bon', 'de', 'commande', 'bordereau', 'de', 'prix', 'changement', 'de', 'contrôle', 'charge', 'consommable', 'consommable', 'en', 'consignation', 'contrat', 'devis', 'documentation', 'dusager', 'documents', 'dappel', 'doffres', 'échéancier', 'établissement', 'participant', 'formulaire', 'de', 'soumission', 'institution', 'financière', 'option', 'organisme', 'public', 'partie', 'personne', 'personne', 'liée', 'renseignement', 'confidentiel', 'renseignement', 'personnel', 'représentants', 'légaux', 'service', 'soumission', 'travaux', 'primauté', 'droit', 'applicable', 'généralité

In [21]:
len(tokens)

16294

In [22]:
len(set(tokens))  # number of unique words

2023

# 2. Create text sequences 

The number of tokens in a sequence depends on the nature of the document. For legal documents, we'll start with 25 words that should be enough to capture the context of the sentence. 

In [23]:
train_len = 25+1    # 25 training words and one target word

# Make an empty list of sequences
lines = []

for i in range(train_len, len(tokens)):   # from 0 to 25
    seq = tokens[i-train_len:i]           # create a sequence of 26 tokens 
    line = ' '.join(seq)                  # join tokens to create a line
    lines.append(line)
    if i > 100000:                        # take 100k words to train the model, to reduce the resources
        break

In [24]:
len(lines)

16268

In [25]:
lines[0]

'appel no document sans contrat intégré page de contrat biens techniques équipement version détaillée appel no documentation appel no documentation biens techniques équipement contrat version détaillée'

In [26]:
tokens[0], tokens[25]

('appel', 'détaillée')

In [27]:
lines[1]

'no document sans contrat intégré page de contrat biens techniques équipement version détaillée appel no documentation appel no documentation biens techniques équipement contrat version détaillée table'

In [28]:
tokens[1], tokens[26]

('no', 'table')

In [40]:
import random
random.seed(101)
random_pick = random.randint(0,len(lines))

In [42]:
lines[random_pick]

'consignation vérification le fournisseur doit procéder à une vérificationinspection périodique de des consommables en consignation et remplacer tout consommable périmé ou sur le point de le'

# 3. Tokenization: convert text sequences to number sequences

In [29]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [30]:
# Create a Tokenizer object, fit it on the text, and transform to numerical sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines) 
sequences = tokenizer.texts_to_sequences(lines) 

In [31]:
# Each of these numbers is an id for a particular word

sequences[0]

[324,
 443,
 205,
 43,
 13,
 2023,
 811,
 1,
 13,
 25,
 325,
 1201,
 812,
 810,
 324,
 443,
 157,
 324,
 443,
 157,
 25,
 325,
 1201,
 13,
 812,
 810]

In [32]:
tokenizer.index_word[25]

'biens'

In [33]:
for i in sequences[25]:
    print(f'{i} : {tokenizer.index_word[i]}')

810 : détaillée
1203 : table
7 : des
518 : matières
811 : page
813 : préambule
814 : interprétation
815 : terminologie
324 : appel
111 : doffres
62 : avis
630 : dadjudication
31 : bien
131 : bon
1 : de
90 : commande
122 : bordereau
1 : de
23 : prix
177 : changement
1 : de
178 : contrôle
286 : charge
85 : consommable
85 : consommable
12 : en


In [34]:
# Tokenizer can give a dictionary of all word counts

#tokenizer.word_counts

In [35]:
# Vocabulary size

vocabulary_size = len(tokenizer.word_counts)
vocabulary_size

2023

### Convert to Numpy Matrix

In [38]:
# 'sequences' is a list of lists, we can convert it to numpy matrix
sequences = np.array(sequences)

In [39]:
sequences

array([[ 324,  443,  205, ...,   13,  812,  810],
       [ 443,  205,   43, ...,  812,  810, 1203],
       [ 205,   43,   13, ...,  810, 1203,    7],
       ...,
       [ 698,   84,   28, ...,    1,   64, 2022],
       [  84,   28,    6, ...,   64, 2022,   13],
       [  28,    6,  323, ..., 2022,   13,   37]])

# 4. Feature / label split (create x and y)

Predict the last word in a sequence

In [32]:
# First 25 words (compare to 'sequences' : it's everything without the last index)
sequences[:,:-1]

array([[ 230,  202,  143, ...,  130,  117,  396],
       [ 202,  143,   37, ...,  117,  396,   27],
       [ 143,   37,    9, ...,  396,   27, 1001],
       ...,
       [ 438,   62,   29, ...,  654,    1,   85],
       [  62,   29,    6, ...,    1,   85, 1697],
       [  29,    6,  178, ...,   85, 1697,    9]])

In [33]:
# last word
sequences[:,-1]

array([  27, 1001, 1000, ..., 1697,    9,   34])

In [39]:
x, y = sequences[:,:-1], sequences[:,-1]

In [40]:
x[0].shape

(25,)

In [42]:
x.shape # the number of sequences for the sliding window of 25 words

(11356, 25)

In [41]:
y.shape # a column of last words for each sequence

(11356,)

In [43]:
# convert those words to one-hot matrix
y = to_categorical(y, num_classes=vocabulary_size+1)
y.shape

(11356, 1699)

In [44]:
seq_len = x.shape[1]
seq_len

25

# 5. Building an LSTM model

In [45]:
# PARAMETERS CHOICE

# Activation = RELU
# The size of the output layer is 'vocabulary_size'
# Loss = 'categorical_crossentropy'

def create_model(vocabulary_size, seq_len):
    model = Sequential()
    # Embedding turns positive integers(indexes) into dense vectors of fixed size (see docs).
    
    # First LSTM layer
    model.add(Embedding(input_dim=vocabulary_size, output_dim=25, input_length=seq_len)) 
    
    # First LSTM layer
    model.add(LSTM(units=100, return_sequences=True)) # better to take multiples of seq_len; smalle batches => faster
    
    # Second LSTM layer
    model.add(LSTM(100))
    
    # Dense layer
    model.add(Dense(100, activation='relu'))
    
    # Final layer
    model.add(Dense(vocabulary_size, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
   
    model.summary()
    
    return model

In [46]:
# define model
model = create_model(vocabulary_size+1, seq_len) # +1 for Embeddings

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 25)            42475     
_________________________________________________________________
lstm (LSTM)                  (None, 25, 100)           50400     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 100)               10100     
_________________________________________________________________
dense_1 (Dense)              (None, 1699)              171599    
Total params: 354,974
Trainable params: 354,974
Non-trainable params: 0
_________________________________________________________________


In [49]:
# Train the model

model.fit(x, y, batch_size=256, epochs=100,verbose=1) # epochs: at least > 200

Train on 11356 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/10

Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x64ab41650>

# 6. Generating New Text

In [52]:
from random import randint
from pickle import dump, load
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [53]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    INPUTS:
    model : the model that was trained on text data
    tokenizer : tokenizer that was fit on text data
    seq_len : length of training sequence
    seed_text : raw string text to serve as the seed
    num_gen_words : number of words to be generated by the model
    '''
    
    # Final Output
    output_text = []
    
    # Intial Seed Sequence
    # this is a sentence to add a word and chop off the first word
    # and repet as many times as the number of words the user wants to generate
    input_text = seed_text  
    
    # Create num_gen_words
    for i in range(num_gen_words):
        
        # Take the first token (number)
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        
        # Pad sequences to our trained rate 
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        
        # Predict Class Probabilities for each word
        pred_word_ind = model.predict_classes(pad_encoded, verbose=0)[0] # [0] returns index 
        
        # Grab word
        pred_word = tokenizer.index_word[pred_word_ind] 
        
        # Update the sequence of input text (shifting one over with the new word)
        input_text += ' ' + pred_word
        
        output_text.append(pred_word)
        
    # Make it look like a sentence.
    return ' '.join(output_text)

### Get a random seed sequence

In [57]:
lines[500]

'municipalité province de québec code postal ciaprès dénommée public et le fournisseur dûment identifié dans émis conformément aux modalités de portant le numéro numéro de lappel'

In [58]:
import random
random.seed(101)
random_pick = random.randint(0,len(lines))

In [59]:
random_seed_text = lines[random_pick]
random_seed_text

'montant forfaitaire de la garantie dexécution la caution consent à ce que public et le fournisseur puissent en tout temps faire des modifications au contrat sous'

### Generate new text

In [61]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=20)

'due résulte avantageuses dans un délai de ct personne morale dans le contrat et à la période de ceuxci fiduciaire'

### Explore generated sequence

In [65]:
for i,word in enumerate(data.split()):
    if word == 'avantageuses':
        print(' '.join(data.split()[i-10:i+10]))
        print('\n')

les conditions ou politiques de vente du FOURNISSEUR soient plus avantageuses pour l’ORGANISME PUBLIC. Droit applicable [Essentielle] Le Contrat s’interprète




### How to save and reuse the model

In [54]:
model.save('Legal_LSTM_model.h5')

In [55]:
# Save the tokenizer object, as well (vocabulary, word counts)

dump(tokenizer, open('my_simple_tokenizer', 'wb'))

In [68]:
from tensorflow.keras.models import load_model
model = load_model('Legal_LSTM_model.h5')
tokenizer =load(open('my_simple_tokenizer', 'rb'))

In [69]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=20)

'due résulte avantageuses dans un délai de ct personne morale dans le contrat et à la période de ceuxci fiduciaire'