# Text Generator



This notebook contains a deep learning algorithm used to generate different texts based on a seed (sentence). Below, the first steps taken are presented: importing the required libraries and downloading the dataset.

In [8]:
#Importing relevant libraries 
from essential_generators import DocumentGenerator
from essential_generators import MarkovWordGenerator
from essential_generators import MarkovTextGenerator
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.models import Sequential
from string import punctuation
import tensorflow as tf
import numpy as np
import requests
import pickle
import os
import re

In [12]:
#Downloading dataset ('Wonderland' book) and saving it in a folder called 'data'.
content = requests.get("http://www.gutenberg.org/cache/epub/11/pg11.txt").text
open("/ESADE/wonderland.txt", "w", encoding="utf-8").write(content)

FileNotFoundError: [Errno 2] No such file or directory: '/ESADE/wonderland.txt'

## Transformations of dataset

In this section of the code, several steps are taken to transform and extract the data required to train the deep learning model.

In [None]:
#Declaration of relevant variables to train the model
sequence_length = 100
BATCH_SIZE = 300
EPOCHS = 30 

#Saving the dataset file path (data folder)
FILE_PATH = "/content/drive/MyDrive/ESADE/data/wonderland.txt"
BASENAME = os.path.basename(FILE_PATH)

#Reading the dataset
text = open(FILE_PATH, encoding="utf-8").read()[:-50000]

#Removing capital letters and punctuation
text = text.lower() 
text = text.translate(str.maketrans("", "", punctuation))

vocab = ''.join(sorted(set(text)))  #sorted vocabulary of the text (dataset)
n_chars = len(text)                 #number of characters of the text
n_unique_chars = len(vocab)         #number of unique characters of the text

Below, two dictionaries are being created and saved to generate texts based on them. One of the dictionaries has the characters of the text converted to integers and the other one the integers converted to characters. 

In [148]:
#Creating and saving dictionary that contains characters converted to integers
char2int = {c: i for i, c in enumerate(vocab)}
pickle.dump(char2int, open(f"{BASENAME}-char2int.pickle", "wb"))

#Creating and saving dictionary that contains integers converted to characters
int2char = {i: c for i, c in enumerate(vocab)}
pickle.dump(int2char, open(f"{BASENAME}-int2char.pickle", "wb"))

Further steps are taken below to create sequences of texts (sentences or paragraphs). For this, 2 main functions are generated: split_sample( ) and one_hot_samples( ).

In [68]:
#Converting all the text into integers
encoded_text = np.array([char2int[c] for c in text])

#Building tf.data.Dataset object from the encoded text
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)

#Building sequences by batching
sequences = char_dataset.batch(2*sequence_length + 1, drop_remainder=True)

In [69]:
def split_sample(sample):
  '''
  Function to split the text in tuple of inputs and targets.

  Parameters:
  sample: Sequences of characters from dataset.

  Output: 
  ds: Tuples of inputs (previous characters) and targets (next characters).
  '''
  ds = tf.data.Dataset.from_tensors((sample[:sequence_length], sample[sequence_length]))
  for i in range(1, (len(sample)-1) // 2):
    input_ = sample[i: i+sequence_length]
    target = sample[i+sequence_length]
    other_ds = tf.data.Dataset.from_tensors((input_, target))
    ds = ds.concatenate(other_ds)
    return ds

def one_hot_samples(input_, target):
    '''
  Function to one hot encode the tuples of inputs and targets.

  Parameters:
  input_: Previous characters.
  target: Next characters expected based on input_.

  Output: 
  Tuples of inputs (previous characters) and targets (next characters) one hot encoded.
  '''
    return tf.one_hot(input_, n_unique_chars), tf.one_hot(target, n_unique_chars)

#Preparing the inputs and targets
dataset = sequences.flat_map(split_sample)
dataset = dataset.map(one_hot_samples)

#Repeat, shuffle and batch the dataset
ds = dataset.repeat().shuffle(1024).batch(BATCH_SIZE, drop_remainder=True)

## Building, training and saving the Model

In this section, a deep learning model with a simple architecture is created, trained and saved to be later used to predict sentences based on different seeds (other phrases, texts).

In [73]:
#Building a sequential deep learning model (using neural networks)
model = Sequential([
    LSTM(256, input_shape=(sequence_length, n_unique_chars), return_sequences=True), #Input layer of the model with shape (none, 100, 256)
    Dropout(0.3),                                                                    #Dropout to reduce overfitting
    LSTM(256),                                                                       #Hidden layer with same shape as input layer.
    Dense(n_unique_chars, activation="softmax"),                                     #Final dense layer with activation function softmax and shape (none,37)
])

#Defining the path where the model needs to be saved (results folder)
model_weights_path = f"results/{BASENAME}-{sequence_length}.h5"

#Printing a summary of the architecture of the model and compiling the metric, loss and optimizer of the model.
model.summary()
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 100, 256)          301056    
                                                                 
 dropout_1 (Dropout)         (None, 100, 256)          0         
                                                                 
 lstm_3 (LSTM)               (None, 256)               525312    
                                                                 
 dense_1 (Dense)             (None, 37)                9509      
                                                                 
Total params: 835,877
Trainable params: 835,877
Non-trainable params: 0
_________________________________________________________________


In [74]:
#Training the model
model.fit(ds, steps_per_epoch=(len(encoded_text) - sequence_length) // BATCH_SIZE, epochs=EPOCHS)

#Saving the model in the defined path
model.save(model_weights_path)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
