In [1]:
# CAREFUL ! THE model.fit() RUNS FOR ABOUT 2-3 HOURS ON CPU ! CHANGE TO GPU ! (3+ times faster)

# Load text

In [2]:
#!pip install docx2txt

In [3]:
import os
import re
import string

import docx2txt
import pandas as pd

In [4]:
def compress(text):
  '''
  Collapses redundant whitespace in a block of text.

  - every run of spaces and/or tabs becomes a single space
  - blank lines (empty, or holding only spaces/tabs) are removed

  INPUTS:
  text : raw string to clean

  Returns the cleaned string.
  '''
  # Collapse horizontal whitespace first, so that a whitespace-only line is
  # reduced to at most one space and is caught by the next substitution.
  text = re.sub(r'[ \t]+', ' ', text)
  # A newline followed by one or more (possibly single-space) empty lines
  # collapses to a single newline.
  return re.sub(r'\n(?: ?\n)+', '\n', text)

In [5]:
# Extract the raw text of the training document from a .docx file.
# NOTE(review): the path is hard-coded to a Google Drive location; presumably
# Drive must be mounted at /content/drive before running this cell - confirm.
text = docx2txt.process ('/content/drive/My Drive/Colab Notebooks/Self-learning chatbot/texts/document16.docx')

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/My Drive/Colab Notebooks/Self-learning chatbot/texts/document16.docx'

In [None]:
text

In [None]:
text = text.replace(u'\xa0', u' ')

In [None]:
text

In [None]:
text = compress(text)

In [None]:
text

## Tokenize and Clean Text

In [None]:
import spacy

In [None]:
#!python -m spacy download fr_core_news_sm

In [None]:
# Load the small French spaCy pipeline with the 'parser', 'tagger' and 'ner'
# components disabled.
# NOTE: the runtime has to be restarted before the French model can be loaded
# (presumably after downloading it with the command in the previous cell).

nlp = spacy.load('fr_core_news_sm',disable=['parser', 'tagger','ner'])

In [None]:
# Characters treated as punctuation. This is the same character set as the
# original filter string; the apostrophe is not part of it, so tokens such as
# "l'" are kept.
_PUNCT_CHARS = frozenset('!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~')

def separate_punc(doc_text):
    '''
    Tokenizes text with the spaCy pipeline `nlp` and returns lowercased tokens.

    Tokens made up only of whitespace (any number of spaces, tabs or newlines)
    or only of punctuation characters are dropped, whatever their length.

    INPUTS:
    doc_text : raw string to tokenize

    Returns a list of lowercased token strings.
    '''
    kept = []
    for token in nlp(doc_text):
        core = token.text.strip()
        # Whitespace-only token (e.g. a run of blank lines of any length).
        if not core:
            continue
        # Punctuation-only token (e.g. '.', '...', '--').
        if all(ch in _PUNCT_CHARS for ch in core):
            continue
        kept.append(token.text.lower())
    return kept

In [None]:
tokens = separate_punc(text)

In [None]:
#tokens

In [None]:
len(tokens)

## Create Sequences of Tokens

In [None]:
# Organize the tokens into overlapping sequences of words.
# Each sequence holds 20 training words followed by the 21st word to predict.

train_len = 20+1 # training words , then one target word

# Slide a window of train_len words over the token list. The upper bound is
# len(tokens) + 1 so that the last window, ending on the final token, is kept.
# If there are fewer than train_len tokens the list is simply empty.
text_sequences = [
    tokens[end - train_len:end]
    for end in range(train_len, len(tokens) + 1)
]

In [None]:
# Given 20 words, can you predict the 21st (the last one) ?

' '.join(text_sequences[100])

In [None]:
' '.join(text_sequences[220])

In [None]:
' '.join(text_sequences[400])

In [None]:
len(text_sequences)

## Keras Tokenization

In [None]:
from keras.preprocessing.text import Tokenizer

In [None]:
# Integer-encode sequences of words
# Tokenizer() has many options, including punctuation filtering and the number of words to keep.

tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)

In [None]:
# Each of these numbers is an id for a particular word

sequences[0]

In [None]:
tokenizer.index_word[50]

In [None]:
for i in sequences[50]:
    print(f'{i} : {tokenizer.index_word[i]}')

In [None]:
# Word counts

#tokenizer.word_counts

In [None]:
# Vocabulary size

vocabulary_size = len(tokenizer.word_counts)
vocabulary_size

## Convert to Numpy Matrix

In [None]:
import numpy as np

In [None]:
sequences = np.array(sequences)

In [None]:
sequences

# Creating an LSTM-based model

Predict the last word in a sequence

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding # Embedding layer deals with vocabulary

In [None]:
# PARAMETERS CHOICE

# Activation = RELU
# The size of the output layer is 'vocabulary_size'
# Loss = 'categorical_crossentropy'

def create_model(vocabulary_size, seq_len, embedding_dim=20, lstm_units=150, dense_units=150):
    '''
    Builds and compiles the next-word prediction network:
    Embedding -> LSTM -> LSTM -> Dense(relu) -> Dense(softmax).

    INPUTS:
    vocabulary_size : number of integer ids; used as the Embedding input size
                      and as the width of the softmax output layer
    seq_len : number of words in each input sequence
    embedding_dim : size of each embedding vector (default 20)
    lstm_units : number of units in each of the two LSTM layers (default 150)
    dense_units : number of units in the hidden Dense layer (default 150)

    Returns the compiled model. Its summary is printed as a side effect.
    '''
    model = Sequential()
    # Embedding turns positive integers (indexes) into dense vectors of fixed size (see docs).
    model.add(Embedding(vocabulary_size, embedding_dim, input_length=seq_len))
    # First LSTM returns the full sequence so the second LSTM can consume it.
    model.add(LSTM(lstm_units, return_sequences=True))
    model.add(LSTM(lstm_units))
    model.add(Dense(dense_units, activation='relu'))

    # One output probability per vocabulary id.
    model.add(Dense(vocabulary_size, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.summary()

    return model

## Feature / Label Split

In [None]:
from keras.utils import to_categorical

In [None]:
# First 20 words (compare to 'sequences' : it's everything without the last index)
sequences[:,:-1]

In [None]:
# last word
sequences[:,-1]

In [None]:
# Split every encoded sequence into its inputs and its target:
# X keeps all columns but the last (the 20 context words),
# y is the last column (the 21st word).
X, y = sequences[:, :-1], sequences[:, -1]

# One-hot encode the target over vocabulary_size + 1 classes.
y = to_categorical(y, num_classes=1 + vocabulary_size)

# Number of words in each input sequence.
seq_len = X.shape[1]
seq_len

## Training the model

In [None]:
# define model
model = create_model(vocabulary_size+1, seq_len) # +1 for Embeddings

In [None]:
from pickle import dump,load

In [None]:
# fit model

# CAREFUL ! IT RUNS FOR ABOUT 2 HOURS ON CPU ! CHANGE TO GPU !

# Train on the (X, y) pairs. No trailing '.' after the call: it is a syntax error.
model.fit(X, y, batch_size=128, epochs=300, verbose=1)  # epochs: at least > 200

# Generating New Text

In [None]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

In [None]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    Generates new words one at a time, feeding each prediction back as input.

    INPUTS:
    model : model that was trained on text data
    tokenizer : tokenizer that was fit on text data
    seq_len : length of training sequence
    seed_text : raw string text to serve as the seed
    num_gen_words : number of words to be generated by model

    Returns the generated words joined by single spaces (the seed itself is
    not included). Returns an empty string when num_gen_words is 0 or less.
    '''

    # Final output
    output_text = []

    # Initial seed sequence
    input_text = seed_text

    # Create num_gen_words
    for _ in range(num_gen_words):

        # Take the input text string and encode it to a sequence
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]

        # Pad / truncate (from the front) to the length the model was trained on
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')

        # Class probabilities for the next word. model.predict_classes() is not
        # available in current Keras releases, so take the argmax of predict().
        probabilities = model.predict(pad_encoded, verbose=0)[0]

        # Index 0 is reserved for padding and has no entry in index_word,
        # so it is excluded from the argmax (hence the +1 offset).
        pred_word_ind = int(np.argmax(probabilities[1:])) + 1

        # Grab word
        pred_word = tokenizer.index_word[pred_word_ind]

        # Update the sequence of input text (shifting one over with the new word)
        input_text += ' ' + pred_word

        output_text.append(pred_word)

    # Make it look like a sentence.
    return ' '.join(output_text)

## Save the model

In [None]:
# Persist the trained network.
model.save('LSTM_model.h5')

# Persist the fitted tokenizer as well: it is required to encode seed text
# for a reloaded model, and the reload cell reads it from the 'LSTM_model' file.
with open('LSTM_model', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

## Grab a random seed sequence

In [None]:
text_sequences[500]

In [None]:
import random
random.seed(101)
# random.randint includes both bounds, so the upper bound must be the last
# valid index; len(text_sequences) itself would raise an IndexError.
random_pick = random.randint(0, len(text_sequences) - 1)

In [None]:
random_seed_text = text_sequences[random_pick]
random_seed_text

In [None]:
seed_text = ' '.join(random_seed_text)
seed_text

In [None]:
## GENERATED NEW TEXT !!!

generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=20)

## Exploring generated sequence

In [None]:
# Print up to 20 words before and 20 words after every occurrence of 'organisme'.
words = text.split()  # split once instead of on every iteration
for i, word in enumerate(words):
    if word == 'organisme':
        # Clamp the start at 0: a negative start index would wrap around to
        # the end of the list and give an empty or wrong window.
        print(' '.join(words[max(0, i-20):i+20]))
        print('\n')

## To reuse the model, load it

In [None]:
from keras.models import load_model
model = load_model('LSTM_model.h5')

# Read the pickled tokenizer inside a 'with' block so the file is closed.
# NOTE: unpickling can execute arbitrary code - only load files you created.
with open('LSTM_model', 'rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)

In [None]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=20)

In [None]:
# Clean every visible file of ../text_files IN PLACE (the originals are
# overwritten): remove priority tags, punctuation and digits, lowercase the
# text, then pass it through remove_space().
path_parent = os.path.dirname(os.getcwd())
source_dir = os.path.join(path_parent, "text_files")

# Punctuation to delete, written as a character class. Without the square
# brackets the pattern is read as a literal sequence and never matches.
punctuation_re = re.compile(r'[,:;\-_.!?$/()<>]')
digits_re = re.compile(r'\d+')
priority_tags = ("[Essentielle]", "[Importante]", "[Facultative]")

for filename in os.listdir(source_dir):

    file_path = os.path.join(source_dir, filename)

    # Skip hidden entries (names starting with '.') and anything that is not
    # a regular file, e.g. sub-directories, which cannot be opened as text.
    if filename.startswith(".") or not os.path.isfile(file_path):
        continue

    with open(file_path, "r") as f:
        data = f.read()

    for tag in priority_tags:
        data = data.replace(tag, "")
    data = punctuation_re.sub('', data)
    data = digits_re.sub('', data)
    data = data.lower()

    # TODO: could be shorter by keeping only (accented) letters with a negated
    # character class such as [^A-Za-zÀ-ÿ] - ?

    # NOTE(review): remove_space is not defined in this part of the notebook;
    # it has to be defined or imported before this cell runs.
    data = remove_space(data)

    with open(file_path, 'w') as f:
        f.write(data)