In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import torch
import numpy as np
from utils import import_text, create_vocabulary
from dataset import QuijoteSeqDataset
from model import CharRNN
from sklearn.model_selection import train_test_split
from train import fit_model

# One seed value shared by the PyTorch and NumPy generators seeded below.
RANDOM_SEED = 42
# Seed PyTorch's default generator, its CUDA generator, and NumPy's global
# generator so that re-running the notebook from a fresh kernel repeats the
# same random draws.
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [4]:
# Load the corpus and preview its first 300 characters.
text = import_text("el_quijote.txt")
print(text[:300])
# Leave the length as the cell's last expression so it is displayed on its
# own, instead of inside a tuple next to print()'s None return value.
len(text)

DON QUIJOTE DE LA MANCHA
Miguel de Cervantes Saavedra

PRIMERA PARTE
CAPÍTULO 1: Que trata de la condición y ejercicio del famoso hidalgo D. Quijote de la Mancha
En un lugar de la Mancha, de cuyo nombre no quiero acordarme, no ha mucho tiempo que vivía un hidalgo de los de lanza en astillero, adarga


(None, 1018041)

In [5]:
# Build the character vocabulary from the corpus; the displayed value is a
# single string whose positions are used as token ids by the tokenizer below.
# NOTE(review): the accented/punctuation tail of the displayed vocabulary is
# not in sorted order. If create_vocabulary derives it from a set, its order
# may differ between kernel sessions, which would change token ids relative to
# a checkpoint trained in another session -- confirm the order is deterministic.
vocabulary = create_vocabulary(text)
vocabulary

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0cÁ¡é‘–É’«“í¿»úñóÍü”áà'

## Tokenizador
Un tokenizador corresponde a una función que nos permite convertir un texto en una secuencia de números y viceversa.

In [6]:
from tokenizer import Tokenizer

# Build the tokenizer from the vocabulary and display how many distinct
# tokens it knows (used later as the model's vocabulary size).
tokenizer = Tokenizer(vocabulary)
tokenizer.n_vocabulary

120

In [7]:
# Encode a short sentence containing accented letters and inverted
# punctuation to show the text -> integer-id direction.
secuencia_ejemplo = tokenizer.text_to_seq("señor, ¿qué tal?")
secuencia_ejemplo

[28, 14, 113, 24, 27, 73, 94, 110, 26, 30, 102, 94, 29, 10, 21, 82]

In [8]:
# Decode the ids back to text; the displayed result matches the input sentence.
tokenizer.seq_to_text(secuencia_ejemplo)

'señor, ¿qué tal?'

In [9]:
# Encode the whole corpus; the displayed length equals len(text), i.e. one
# token id per character.
encoded_text = tokenizer.text_to_seq(text)
len(encoded_text)

1018041

In [10]:
# Toy string for experimenting with create_sequences. It gets its own name so
# the corpus held in `text` is not overwritten (re-running the encoding cell
# after this one would otherwise encode the toy string instead of the novel).
toy_text = "abcdefghij"


def create_sequences(text, window_size=3):
    """Return every contiguous window of ``window_size`` items from ``text``.

    Windows are produced with a stride of one, in order of their start
    position, by slicing ``text`` (so a string yields strings and a list
    yields lists).

    Args:
        text: Any sliceable sequence with a length.
        window_size: Number of items per window.

    Returns:
        A list with ``len(text) - window_size + 1`` slices, or an empty list
        when ``text`` is shorter than ``window_size``.
    """
    n_windows = len(text) - window_size + 1
    # range() of a non-positive count is empty, which covers short inputs.
    return [text[start : start + window_size] for start in range(n_windows)]


# Materialize every 100-token sliding window over the encoded corpus (one
# list per start position).
# NOTE(review): encoded_windows is not referenced by any later cell visible
# here; the datasets below are built from text_train/text_val directly.
encoded_windows = create_sequences(encoded_text, 100)

In [11]:
# Hold out 20% of the encoded corpus for validation. shuffle=False keeps the
# tokens in their original order, so each split is a contiguous stretch of
# text rather than a random sample of characters.
text_train, text_val = train_test_split(
    encoded_text, test_size=0.2, random_state=RANDOM_SEED, shuffle=False
)
len(text_train), len(text_val)

(814432, 203609)

## Embeddings
![](embeddings.png)

In [11]:
# Scratch cell: instantiate the model and draw a random (32, 100) batch of
# token ids in [0, n_vocabulary).
# NOTE(review): `model` is re-created in the next cell and `sample_tensor` is
# not used by any later cell visible here.
model = CharRNN(tokenizer.n_vocabulary)
sample_tensor = torch.randint(0, tokenizer.n_vocabulary, (32, 100))

In [12]:
# Wrap each split in a QuijoteSeqDataset using 100-token windows, keyed by
# split name ("train" first, then "val").
dataset = {
    split_name: QuijoteSeqDataset(split_tokens, window_size=100)
    for split_name, split_tokens in (("train", text_train), ("val", text_val))
}

# Prefer the GPU when one is available; fall back to the CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Fresh model sized to the tokenizer's vocabulary.
model = CharRNN(tokenizer.n_vocabulary)

In [13]:
# Hyperparameters handed to fit_model: batch sizes for each split, learning
# rate, and number of epochs.
training_params = {
    "batch_size_train": 512,
    "batch_size_val": 2048,
    "lr": 3e-4,
    "epochs": 30,
}
# fit_model returns the model together with a loss value/record.
model, loss = fit_model(model, dataset, training_params, device)

Training in cuda
Making NVIDIA GeForce RTX 2070 with Max-Q Design go brrruuummmmm....


Training Loss: 2.1824: 100%|██████████| 1590/1590 [02:24<00:00, 10.99it/s]
Validation Loss: 1.8607: 100%|██████████| 100/100 [00:12<00:00,  7.85it/s]


Epoch: 1/30, Train Loss: 2.1824, Val Loss: 1.8607


Training Loss: 1.7404: 100%|██████████| 1590/1590 [02:29<00:00, 10.62it/s]
Validation Loss: 1.6453: 100%|██████████| 100/100 [00:12<00:00,  7.86it/s]


Epoch: 2/30, Train Loss: 1.7404, Val Loss: 1.6453


Training Loss: 1.5871: 100%|██████████| 1590/1590 [02:29<00:00, 10.65it/s]
Validation Loss: 1.5416: 100%|██████████| 100/100 [00:12<00:00,  7.87it/s]


Epoch: 3/30, Train Loss: 1.5871, Val Loss: 1.5416


Training Loss: 1.4984: 100%|██████████| 1590/1590 [02:29<00:00, 10.66it/s]
Validation Loss: 1.4708: 100%|██████████| 100/100 [00:12<00:00,  7.86it/s]


Epoch: 4/30, Train Loss: 1.4984, Val Loss: 1.4708


Training Loss: 1.4377: 100%|██████████| 1590/1590 [02:29<00:00, 10.65it/s]
Validation Loss: 1.4304: 100%|██████████| 100/100 [00:12<00:00,  7.81it/s]


Epoch: 5/30, Train Loss: 1.4377, Val Loss: 1.4304


Training Loss: 1.3936: 100%|██████████| 1590/1590 [02:29<00:00, 10.65it/s]
Validation Loss: 1.3966: 100%|██████████| 100/100 [00:12<00:00,  7.76it/s]


Epoch: 6/30, Train Loss: 1.3936, Val Loss: 1.3966


Training Loss: 1.3590: 100%|██████████| 1590/1590 [02:28<00:00, 10.72it/s]
Validation Loss: 1.3714: 100%|██████████| 100/100 [00:12<00:00,  8.02it/s]


Epoch: 7/30, Train Loss: 1.3590, Val Loss: 1.3714


Training Loss: 1.3303: 100%|██████████| 1590/1590 [02:25<00:00, 10.93it/s]
Validation Loss: 1.3519: 100%|██████████| 100/100 [00:12<00:00,  7.97it/s]


Epoch: 8/30, Train Loss: 1.3303, Val Loss: 1.3519


Training Loss: 1.3058: 100%|██████████| 1590/1590 [02:25<00:00, 10.93it/s]
Validation Loss: 1.3344: 100%|██████████| 100/100 [00:13<00:00,  7.63it/s]


Epoch: 9/30, Train Loss: 1.3058, Val Loss: 1.3344


Training Loss: 1.2853: 100%|██████████| 1590/1590 [02:24<00:00, 10.99it/s]
Validation Loss: 1.3211: 100%|██████████| 100/100 [00:12<00:00,  8.02it/s]


Epoch: 10/30, Train Loss: 1.2853, Val Loss: 1.3211


Training Loss: 1.2676: 100%|██████████| 1590/1590 [02:25<00:00, 10.96it/s]
Validation Loss: 1.3120: 100%|██████████| 100/100 [00:12<00:00,  8.04it/s]


Epoch: 11/30, Train Loss: 1.2676, Val Loss: 1.3120


Training Loss: 1.2507: 100%|██████████| 1590/1590 [02:24<00:00, 10.97it/s]
Validation Loss: 1.3023: 100%|██████████| 100/100 [00:12<00:00,  8.04it/s]


Epoch: 12/30, Train Loss: 1.2507, Val Loss: 1.3023


Training Loss: 1.2370: 100%|██████████| 1590/1590 [02:25<00:00, 10.94it/s]
Validation Loss: 1.2930: 100%|██████████| 100/100 [00:12<00:00,  7.86it/s]


Epoch: 13/30, Train Loss: 1.2370, Val Loss: 1.2930


Training Loss: 1.2230: 100%|██████████| 1590/1590 [02:28<00:00, 10.67it/s]
Validation Loss: 1.2813: 100%|██████████| 100/100 [00:12<00:00,  7.83it/s]


Epoch: 14/30, Train Loss: 1.2230, Val Loss: 1.2813


Training Loss: 1.2115: 100%|██████████| 1590/1590 [02:27<00:00, 10.81it/s]
Validation Loss: 1.2784: 100%|██████████| 100/100 [00:12<00:00,  7.93it/s]


Epoch: 15/30, Train Loss: 1.2115, Val Loss: 1.2784


Training Loss: 1.1998: 100%|██████████| 1590/1590 [02:26<00:00, 10.83it/s]
Validation Loss: 1.2739: 100%|██████████| 100/100 [00:12<00:00,  7.93it/s]


Epoch: 16/30, Train Loss: 1.1998, Val Loss: 1.2739


Training Loss: 1.1898: 100%|██████████| 1590/1590 [02:30<00:00, 10.59it/s]
Validation Loss: 1.2704: 100%|██████████| 100/100 [00:12<00:00,  7.74it/s]


Epoch: 17/30, Train Loss: 1.1898, Val Loss: 1.2704


Training Loss: 1.1794: 100%|██████████| 1590/1590 [02:29<00:00, 10.64it/s]
Validation Loss: 1.2646: 100%|██████████| 100/100 [00:12<00:00,  7.75it/s]


Epoch: 18/30, Train Loss: 1.1794, Val Loss: 1.2646


Training Loss: 1.1709: 100%|██████████| 1590/1590 [02:30<00:00, 10.56it/s]
Validation Loss: 1.2628: 100%|██████████| 100/100 [00:12<00:00,  7.72it/s]


Epoch: 19/30, Train Loss: 1.1709, Val Loss: 1.2628


Training Loss: 1.1615: 100%|██████████| 1590/1590 [02:32<00:00, 10.44it/s]
Validation Loss: 1.2587: 100%|██████████| 100/100 [00:12<00:00,  7.79it/s]


Epoch: 20/30, Train Loss: 1.1615, Val Loss: 1.2587


Training Loss: 1.1531: 100%|██████████| 1590/1590 [02:34<00:00, 10.29it/s]
Validation Loss: 1.2560: 100%|██████████| 100/100 [00:12<00:00,  7.79it/s]


Epoch: 21/30, Train Loss: 1.1531, Val Loss: 1.2560


Training Loss: 1.1454: 100%|██████████| 1590/1590 [02:34<00:00, 10.29it/s]
Validation Loss: 1.2554: 100%|██████████| 100/100 [00:12<00:00,  7.79it/s]


Epoch: 22/30, Train Loss: 1.1454, Val Loss: 1.2554


Training Loss: 1.1376: 100%|██████████| 1590/1590 [02:34<00:00, 10.29it/s]
Validation Loss: 1.2546: 100%|██████████| 100/100 [00:12<00:00,  7.79it/s]


Epoch: 23/30, Train Loss: 1.1376, Val Loss: 1.2546


Training Loss: 1.1308: 100%|██████████| 1590/1590 [02:33<00:00, 10.33it/s]
Validation Loss: 1.2537: 100%|██████████| 100/100 [00:12<00:00,  7.84it/s]


Epoch: 24/30, Train Loss: 1.1308, Val Loss: 1.2537


Training Loss: 1.1235: 100%|██████████| 1590/1590 [02:25<00:00, 10.95it/s]
Validation Loss: 1.2526: 100%|██████████| 100/100 [00:12<00:00,  7.96it/s]


Epoch: 25/30, Train Loss: 1.1235, Val Loss: 1.2526


Training Loss: 1.1162: 100%|██████████| 1590/1590 [02:28<00:00, 10.70it/s]
Validation Loss: 1.2509: 100%|██████████| 100/100 [00:12<00:00,  7.76it/s]


Epoch: 26/30, Train Loss: 1.1162, Val Loss: 1.2509


Training Loss: 1.1096: 100%|██████████| 1590/1590 [02:28<00:00, 10.74it/s]
Validation Loss: 1.2510: 100%|██████████| 100/100 [00:12<00:00,  7.79it/s]


Epoch: 27/30, Train Loss: 1.1096, Val Loss: 1.2510


Training Loss: 1.1035: 100%|██████████| 1590/1590 [02:30<00:00, 10.60it/s]
Validation Loss: 1.2494: 100%|██████████| 100/100 [00:12<00:00,  7.86it/s]


Epoch: 28/30, Train Loss: 1.1035, Val Loss: 1.2494


Training Loss: 1.0972: 100%|██████████| 1590/1590 [02:31<00:00, 10.48it/s]
Validation Loss: 1.2503: 100%|██████████| 100/100 [00:12<00:00,  7.79it/s]


Epoch: 29/30, Train Loss: 1.0972, Val Loss: 1.2503


Training Loss: 1.0912: 100%|██████████| 1590/1590 [02:34<00:00, 10.29it/s]
Validation Loss: 1.2510: 100%|██████████| 100/100 [00:12<00:00,  7.78it/s]

Epoch: 30/30, Train Loss: 1.0912, Val Loss: 1.2510





In [43]:
from model import CharRNN

# Rebuild the architecture and restore the trained weights from disk.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CharRNN(tokenizer.n_vocabulary)
model.to(device)
# map_location remaps the stored tensors onto the device chosen above, so a
# checkpoint written on a GPU can also be loaded on a CPU-only machine.
# weights_only=True restricts unpickling to tensors and plain containers.
# NOTE(review): the generated samples further down show consistently swapped
# accented characters (e.g. 'habÉa' for 'había'), which suggests the
# tokenizer's vocabulary order here differs from the one used in training --
# confirm the vocabulary order is identical across sessions.
model.load_state_dict(
    torch.load("CharRNN_30.pth", map_location=device, weights_only=True)
)

<All keys matched successfully>

In [44]:
def predict(model, encoded_text):
    """Run one forward pass of ``model`` over a single encoded sequence.

    The input is placed on the same device as the model's parameters instead
    of relying on a notebook-level ``device`` variable, so the function keeps
    working if the model is moved or the global is redefined.

    Args:
        model: A ``torch.nn.Module`` that accepts an integer tensor of shape
            ``(1, len(encoded_text))``.
        encoded_text: Sequence of integer token ids.

    Returns:
        Whatever ``model`` returns for the batched input, computed with
        gradient tracking disabled.

    Note:
        The model is switched to evaluation mode and is left in that mode.
    """
    model.eval()
    # Follow the model's own device; a parameter-free module defaults to CPU.
    first_param = next(model.parameters(), None)
    target_device = (
        first_param.device if first_param is not None else torch.device("cpu")
    )
    with torch.no_grad():
        # Explicit integer dtype keeps the ids usable as indices regardless of
        # the container they arrive in; unsqueeze adds the batch dimension.
        X = torch.tensor(
            encoded_text, dtype=torch.long, device=target_device
        ).unsqueeze(0)
        pred = model(X)
    return pred

In [45]:
# Prompt used to seed text generation.
initial_text = "En un lugar de la mancha, "


def generate_text(model, initial_text, chars_to_generate, context_size=100):
    """Greedily extend ``initial_text`` one character at a time.

    At each step the trailing ``context_size`` characters are encoded with the
    notebook's ``tokenizer``, passed through ``predict``, and the highest
    scoring token id is decoded and appended.

    Args:
        model: Model forwarded to ``predict``.
        initial_text: Prompt to continue.
        chars_to_generate: Number of characters to append.
        context_size: How many trailing characters are fed to the model at
            each step. Defaults to 100, the value previously hard-coded here.

    Returns:
        The prompt followed by the generated characters.

    Raises:
        ValueError: If ``context_size`` is smaller than 1.
    """
    if context_size < 1:
        # A slice of [-0:] would silently feed the whole text, not an empty one.
        raise ValueError("context_size must be at least 1")

    generated = initial_text
    for _ in range(chars_to_generate):
        X_encoded = tokenizer.text_to_seq(generated[-context_size:])
        y_pred = predict(model, X_encoded)
        # Greedy decoding: take the single most likely next token.
        next_id = torch.argmax(y_pred, dim=1).item()
        generated += tokenizer.seq_to_text([next_id])

    return generated


# Greedy continuation of the prompt for 1000 characters.
print(generate_text(model, initial_text, chars_to_generate=1000))

En un lugar de la mancha, y en el mundo que estaba en la mano de la mano de la mano de la mano de la mano de la mano de la mano de la mano.
-úQu” te parece -dijo don Quijote-, y que es muy bien como es el mismo que le habÉa de ser muy bien en el mundo. Pero estos dÉas de los de los caballeros andantes de la mano de la mano de la mano de la mano de la mano de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la cabeza de la

In [64]:
def generate_probabilistic_text(
    model, initial_text, chars_to_generate, temp=1, context_size=100
):
    """Extend ``initial_text`` by sampling one character at a time.

    At each step the trailing ``context_size`` characters are encoded with the
    notebook's ``tokenizer`` and passed through ``predict``. The model's
    scores are divided by ``temp`` and turned into a probability distribution
    from which the next token id is drawn.

    Args:
        model: Model forwarded to ``predict``.
        initial_text: Prompt to continue.
        chars_to_generate: Number of characters to append.
        temp: Sampling temperature; values below 1 sharpen the distribution,
            values above 1 flatten it. Must be positive.
        context_size: How many trailing characters are fed to the model at
            each step. Defaults to 100, the value previously hard-coded here.

    Returns:
        The prompt followed by the sampled characters.

    Raises:
        ValueError: If ``temp`` is not positive or ``context_size`` is
            smaller than 1.
    """
    if temp <= 0:
        raise ValueError("temp must be positive")
    if context_size < 1:
        # A slice of [-0:] would silently feed the whole text, not an empty one.
        raise ValueError("context_size must be at least 1")

    generated = initial_text
    for _ in range(chars_to_generate):
        X_new_encoded = tokenizer.text_to_seq(generated[-context_size:])
        scores = predict(model, X_new_encoded).view(-1)
        # softmax(scores / temp) is proportional to exp(scores / temp) but is
        # computed stably, so a low temperature cannot overflow to inf and
        # make multinomial reject the weights.
        probabilities = torch.softmax(scores / temp, dim=0)
        top_i = torch.multinomial(probabilities, 1).item()
        generated += tokenizer.seq_to_text([top_i])
    return generated


# Sample a 1000-character continuation of the prompt; the temperature is
# passed through as the divisor applied to the model's scores.
temp = 0.3
initial_text = "En un lugar de la mancha, "
print(
    generate_probabilistic_text(
        model, initial_text, chars_to_generate=1000, temp=temp
    )
)

En un lugar de la mancha, y de manera que el cura le dijo:
-Pues a mi se pareciÍ -dijo el cabreroé, que es muy bien que se me deja mi parte de la muerte, y que es menester que se le habÉa de perder en el mundo. Y en esto se puso en la mano de los tres de mi padre, y asÉ como los deseos hasta la vida de la mano a su parecer del pastor de la mano, y la honra de mi buen padre en el mundo, y por ser sus manos en el suelo al caso en la mano de la cabeza de la mano de la muerte de la mano; y asÉ como en la mano de la venta y el cura le dijo que no se podÉa ser estado, y asÉ le dijo que estaba en la mano de su amo en la memoria de la mano de su seíora Dulcinea del Toboso, y en el alma en la venta de la cabeza de la cabeza y de desengaío de su padre y en la venta de su amo y de su casa de la cabeza de la cabeza de la cabeza de la cabeza, y que el deseo de haber contado a la cabeza de la memoria de la mano, y en el mundo estü en el mundo que no se pudiera servir a la vida de la vida y a su padr