# Text Generation

In [20]:
import keras

from keras.layers import GRU, Dense, Flatten, Conv1D, Dropout
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint

from numpy import array, argmax
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

import warnings
warnings.filterwarnings('ignore')

In [24]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 1511791155544320778
]


## Utils

In [2]:
def text_to_encode(text, alphabet):
    values = array(list(alphabet))
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(values)
    onehot_encoder = OneHotEncoder(sparse=False)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded)

    text = list(text)
    encoded_phrase = onehot_encoder.transform(label_encoder.transform(text).reshape(len(text), 1))
    return encoded_phrase, label_encoder, onehot_encoder

def encode_to_text(encode, label_encoder):
    text = ""

    for i in encode:
        inverted = label_encoder.inverse_transform([argmax(i)])
        text += inverted[0]
  
    return text

def parse(text):
    clean_text = re.sub('\ +', ' ', text.replace('\n', ' ').replace('\r', ' '))
    sigma      = set(text)
  
    return clean_text, sigma

## Training set

In [3]:
import re

INPUT_FILEPATH = 'chat.txt'
WINDOW_SIZE    = 100

X_train = None; y_train = None

with open(INPUT_FILEPATH, 'r') as input_file:
    text, sigma = parse(input_file.read())
  
    X_train = np.zeros((len(text) - WINDOW_SIZE + 1, WINDOW_SIZE - 1, len(sigma)))
    y_train = np.zeros((len(text) - WINDOW_SIZE + 1, 1, len(sigma)))
  
    encoded_text, label_encoder, onehot_encoder = text_to_encode(text, sigma)
  
    i = 0
    while i + WINDOW_SIZE < len(encoded_text) + 1:
        X_train[i] = encoded_text[i:i + WINDOW_SIZE - 1]
        y_train[i] = encoded_text[i + WINDOW_SIZE - 1]
        i += 1

# Ejemplos de correctitud
# print(text_train_X[0], len(text_train_X[0]), text_train_y[0], len(text_train_y[0]))
# print(text_train_X[-1], len(text_train_X[-1]), text_train_y[-1], len(text_train_y[-1]))

In [4]:
y_train = y_train.reshape(y_train.shape[0], y_train.shape[2])

In [5]:
X_train.shape, y_train.shape

((432, 99, 41), (432, 41))

In [13]:
encode_to_text(X_train[111], label_encoder)

'no puedes levantarte tarde Hay que comer chorrillanas Lo sé lo sé Pondré mil alarmas Perfecto Ya di'

## Model

In [16]:
model = Sequential()
model.add(Conv1D(filters=150, kernel_size=(3,), input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(GRU(256, return_sequences=True))
model.add(GRU(256, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(len(sigma), activation='softmax'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_2 (Conv1D)            (None, 97, 150)           18600     
_________________________________________________________________
gru_3 (GRU)                  (None, 97, 256)           312576    
_________________________________________________________________
gru_4 (GRU)                  (None, 256)               393984    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 41)                10537     
Total params: 735,697
Trainable params: 735,697
Non-trainable params: 0
_________________________________________________________________


In [18]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'], )

In [21]:
filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [27]:
model.fit(X_train, y_train, epochs=10, batch_size=128, callbacks=callbacks_list)

Epoch 1/10

Epoch 00001: loss improved from 3.08341 to 3.05979, saving model to weights-improvement-01-3.0598.hdf5
Epoch 2/10

Epoch 00002: loss improved from 3.05979 to 3.05362, saving model to weights-improvement-02-3.0536.hdf5
Epoch 3/10

Epoch 00003: loss improved from 3.05362 to 2.99888, saving model to weights-improvement-03-2.9989.hdf5
Epoch 4/10

Epoch 00004: loss improved from 2.99888 to 2.97562, saving model to weights-improvement-04-2.9756.hdf5
Epoch 5/10

Epoch 00005: loss improved from 2.97562 to 2.97474, saving model to weights-improvement-05-2.9747.hdf5
Epoch 6/10

Epoch 00006: loss improved from 2.97474 to 2.87904, saving model to weights-improvement-06-2.8790.hdf5
Epoch 7/10

Epoch 00007: loss improved from 2.87904 to 2.81263, saving model to weights-improvement-07-2.8126.hdf5
Epoch 8/10

Epoch 00008: loss improved from 2.81263 to 2.74405, saving model to weights-improvement-08-2.7440.hdf5
Epoch 9/10

Epoch 00009: loss improved from 2.74405 to 2.65605, saving model to 

<keras.callbacks.History at 0x119ac9908>

In [28]:
model.save_weights('my_model_weights.h5')

## Test

### ToDo: Create a real test set

In [38]:
seed = X_train[120]
print("Seed: ")
encode_to_text(seed, label_encoder)

Seed: 


' levantarte tarde Hay que comer chorrillanas Lo sé lo sé Pondré mil alarmas Perfecto Ya dime Ya Jaj'

In [39]:
new_text = encode_to_text(seed, label_encoder)

for _ in range(200):
    new_letter = model.predict(np.array([seed]))
    new_text += encode_to_text(new_letter, label_encoder)   
    seed = np.vstack((seed[1:], new_letter))

In [40]:
print(new_text)

 levantarte tarde Hay que comer chorrillanas Lo sé lo sé Pondré mil alarmas Perfecto Ya dime Ya Jajaaaaaaa                                                                                                                                                                                                 
