In [2]:
# Colab-only setup: mount Google Drive under /content/drive so the corpus file
# read further down is reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Read the whole corpus into memory. The context manager closes the file handle
# as soon as the read finishes instead of leaving it open for the kernel's lifetime.
with open("/content/drive/MyDrive/IA2/Cuentos.txt", "r", encoding='utf-8') as f:
    text = f.read()
# Preview the first 300 characters together with the corpus length.
text[:300], len(text)

('El águila, el cuervo y el pastor\n\nLanzándose desde una cima, un águila arrebató a un corderito.\nLa vio un cuervo y tratando de imitar al águila, se lanzó sobre un\ncarnero, pero con tan mal conocimiento en el arte que sus garras se\nenredaron en la lana, y batiendo al máximo sus alas no logró\nsoltarse',
 3449)

## Tokenización

In [4]:
import string
# Vocabulary: every character in string.printable followed by the Spanish
# letters and punctuation marks appended here. A character's position in this
# string is its token id.
all_characters = "".join((string.printable, "ñÑáÁéÉíÍóÓúÚ¿¡"))
all_characters

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0cñÑáÁéÉíÍóÓúÚ¿¡'

In [5]:
import string

class Tokenizer():
  """Character-level tokenizer backed by a fixed vocabulary string.

  A character's token id is the index of its first occurrence in the vocabulary.

  Attributes:
    all_characters: the vocabulary string.
    n_characters: number of entries in the vocabulary.
  """

  def __init__(self, characters=None):
    """Create the tokenizer.

    Args:
      characters: optional vocabulary string. When omitted, the notebook-level
        ``all_characters`` string is used, which is the original behaviour.
    """
    self.all_characters = all_characters if characters is None else characters
    self.n_characters = len(self.all_characters)
    # Reverse lookup built once so encoding does not rescan the vocabulary for
    # every character. setdefault keeps the first index of a repeated
    # character, matching what str.index returns.
    self._char_to_index = {}
    for index, char in enumerate(self.all_characters):
      self._char_to_index.setdefault(char, index)

  def text_to_seq(self, string):
    """Encode ``string`` as a list of token ids.

    Characters that are not in the vocabulary are skipped, so the result can be
    shorter than the input.

    Args:
      string: text to encode.

    Returns:
      List of integer ids, one per known character, in input order.
    """
    lookup = self._char_to_index
    # An explicit membership test replaces the former bare ``except``, which
    # also hid unrelated errors (including KeyboardInterrupt).
    return [lookup[char] for char in string if char in lookup]

  def seq_to_text(self, seq):
    """Decode a sequence of token ids back into text.

    Args:
      seq: iterable of indices into the vocabulary.

    Returns:
      The decoded string.

    Raises:
      IndexError: if an id falls outside the vocabulary.
    """
    return ''.join(self.all_characters[index] for index in seq)

# Create the tokenizer used by the rest of the notebook and display its
# vocabulary size.
tokenizer = Tokenizer()
tokenizer.n_characters

114

In [6]:
# Encode a short sample that contains an accented character.
tokenizer.text_to_seq('El águila')

[40, 21, 94, 102, 16, 30, 18, 21, 10]

In [7]:
# Round-trip check: decoding these ids should give back the sample string.
tokenizer.seq_to_text([40, 21, 94, 102, 16, 30, 18, 21, 10])

'El águila'

In [8]:
# Encode the whole corpus as a list of character ids. text_to_seq skips any
# character that is not in the vocabulary.
text_encoded = tokenizer.text_to_seq(text)

In [9]:
# Chronological 80/20 split: the first part of the encoded corpus is used for
# training, the remainder is held out. Integer arithmetic keeps the cut exact.
train_size = (80 * len(text_encoded)) // 100
train, test = text_encoded[:train_size], text_encoded[train_size:]

len(train), len(test)

(2759, 690)

In [10]:
import random

def windows(text, window_size = 100):
    """Slide a window over ``text`` one position at a time.

    Every returned slice holds ``window_size + 1`` consecutive items.

    Args:
        text: sliceable sequence (for example a list of token ids).
        window_size: window length, not counting the extra trailing item.

    Returns:
        List of ``len(text) - window_size`` slices; empty when ``text`` has
        ``window_size`` items or fewer.
    """
    n_windows = len(text) - window_size
    return [text[first:first + window_size + 1] for first in range(n_windows)]

# Windows over the full encoded corpus, using the default window size.
text_encoded_windows = windows(text_encoded)

In [11]:
# Decode the first three windows, separated by a blank line, to show that each
# one is shifted by a single character.
for position in range(3):
  if position:
    print()
  print(tokenizer.seq_to_text(text_encoded_windows[position]))

El águila, el cuervo y el pastor

Lanzándose desde una cima, un águila arrebató a un corderito.
La vi

l águila, el cuervo y el pastor

Lanzándose desde una cima, un águila arrebató a un corderito.
La vio

 águila, el cuervo y el pastor

Lanzándose desde una cima, un águila arrebató a un corderito.
La vio 


In [12]:
import torch

class CharRNNDataset(torch.utils.data.Dataset):
  """Dataset over pre-cut windows of token ids.

  With ``train=True`` each item is a pair: every element of the window except
  the last, and the last element on its own. With ``train=False`` each item is
  the whole window as one tensor.
  """

  def __init__(self, text_encoded_windows, train=True):
    """Keep a reference to the windows and the item mode."""
    self.text = text_encoded_windows
    self.train = train

  def __len__(self):
    """Return the number of windows."""
    return len(self.text)

  def __getitem__(self, ix):
    """Return window ``ix`` as tensor(s), in the form selected by ``self.train``."""
    window = self.text[ix]
    if not self.train:
      return torch.tensor(window)
    context, target = window[:-1], window[-1]
    return torch.tensor(context), torch.tensor(target)

In [38]:
# Cut each split into windows separately, so no window spans the boundary
# between the training and held-out parts.
train_text_encoded_windows = windows(train)
test_text_encoded_windows = windows(test)

dataset = dict(
    train=CharRNNDataset(train_text_encoded_windows),
    val=CharRNNDataset(test_text_encoded_windows),
)

# Only the training loader shuffles; validation keeps corpus order.
dataloader = dict(
    train=torch.utils.data.DataLoader(
        dataset['train'], batch_size=64, shuffle=True, pin_memory=True
    ),
    val=torch.utils.data.DataLoader(
        dataset['val'], batch_size=50, shuffle=False, pin_memory=True
    ),
)

len(dataset['train']), len(dataset['val'])

(2659, 590)

In [39]:
# Inspect one training item: the input characters and the target character.
# NOTE(review): the name `input` shadows the Python builtin for the rest of the
# session; consider a different name.
input, output = dataset['train'][1]
tokenizer.seq_to_text(input)

'l águila, el cuervo y el pastor\n\nLanzándose desde una cima, un águila arrebató a un corderito.\nLa vi'

In [40]:
# Decode the target of the same item (wrapped in a list because seq_to_text
# expects a sequence of ids).
tokenizer.seq_to_text([output])

'o'

In [41]:
class CharRNN(torch.nn.Module):
  """Embedding -> multi-layer GRU -> linear head that scores the next token.

  Args:
    input_size: vocabulary size; used for both the embedding table and the
      number of output scores.
    embedding_size: dimension of the token embeddings.
    hidden_size: GRU hidden state size.
    num_layers: number of stacked GRU layers.
    dropout: dropout value passed to the GRU.
  """

  def __init__(self, input_size, embedding_size=100, hidden_size=300, num_layers=2, dropout=0.5):
    super().__init__()
    self.encoder = torch.nn.Embedding(input_size, embedding_size)
    self.rnn = torch.nn.GRU(
        input_size=embedding_size,
        hidden_size=hidden_size,
        num_layers=num_layers,
        dropout=dropout,
        batch_first=True,
    )
    self.fc = torch.nn.Linear(hidden_size, input_size)

  def forward(self, x):
    """Return one score vector per sequence, taken from the last time step.

    ``x`` is indexed as (batch, time) because the GRU is built with
    ``batch_first=True``; the result has one row per batch element and
    ``input_size`` columns.
    """
    embedded = self.encoder(x)
    step_outputs, _ = self.rnn(embedded)
    # Only the final time step feeds the classifier.
    last_step = step_outputs[:, -1]
    return self.fc(last_step)

In [42]:
# Shape check with an untrained model: a batch of 64 random sequences of
# length 50 should produce one score per vocabulary entry for each sequence.
model = CharRNN(input_size=tokenizer.n_characters)
outputs = model(torch.randint(0, tokenizer.n_characters, (64, 50)))
outputs.shape

torch.Size([64, 114])

In [43]:
from tqdm import tqdm
import numpy as np

# Use the GPU when one is available, otherwise the CPU; fit() and predict()
# read this notebook-level name.
device = "cuda" if torch.cuda.is_available() else "cpu"

def fit(model, dataloader, epochs=10):
    """Train ``model`` with Adam and cross-entropy, validating after each epoch.

    Args:
        model: module whose output for a batch is compared with the batch
            targets by ``CrossEntropyLoss``.
        dataloader: mapping with ``'train'`` and ``'val'`` entries, each an
            iterable of ``(X, y)`` batches.
        epochs: number of passes over the training loader.

    The model is moved to the notebook-level ``device``. Progress bars show the
    running mean of the per-batch losses and one summary line is printed per
    epoch. Nothing is returned.
    """
    model.to(device)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    for epoch in range(1, epochs+1):
        # Training pass: train mode, one optimizer step per batch.
        model.train()
        train_loss = []
        bar = tqdm(dataloader['train'])
        for inputs, targets in bar:
            inputs = inputs.to(device)
            targets = targets.to(device)
            optimizer.zero_grad()
            loss = criterion(model(inputs), targets)
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())
            bar.set_description(f"loss {np.mean(train_loss):.5f}")
        # Validation pass: eval mode, no gradient tracking, no weight updates.
        model.eval()
        val_loss = []
        bar = tqdm(dataloader['val'])
        with torch.no_grad():
            for inputs, targets in bar:
                inputs = inputs.to(device)
                targets = targets.to(device)
                loss = criterion(model(inputs), targets)
                val_loss.append(loss.item())
                bar.set_description(f"val_loss {np.mean(val_loss):.5f}")
        print(f"Epoch {epoch}/{epochs} loss {np.mean(train_loss):.5f} val_loss {np.mean(val_loss):.5f}")

def predict(model, X):
    """Run ``model`` on a single sequence and return its raw output.

    Args:
        model: a ``torch.nn.Module``; it is switched to eval mode.
        X: one sequence of token ids, given as a list or as a tensor.

    Returns:
        The model output for a batch containing only ``X`` (a leading batch
        dimension of 1 is added before the call), computed without gradient
        tracking.
    """
    model.eval()
    with torch.no_grad():
        # Place the input wherever the model's weights live, so the call works
        # whether or not the model has been moved to the notebook-level
        # ``device``; fall back to ``device`` for a model without parameters.
        first_param = next(model.parameters(), None)
        target_device = device if first_param is None else first_param.device
        # as_tensor accepts lists and existing tensors alike, without the copy
        # warning torch.tensor emits when it is handed a tensor.
        X = torch.as_tensor(X, device=target_device)
        pred = model(X.unsqueeze(0))
        return pred

In [44]:
# Train a fresh model for 20 epochs. In the log below, val_loss reaches its
# minimum at epoch 10 (2.31868) and rises afterwards while the training loss
# keeps falling.
model = CharRNN(input_size=tokenizer.n_characters)
fit(model, dataloader, epochs=20)

loss 3.34539: 100%|██████████| 42/42 [00:45<00:00,  1.08s/it]
val_loss 3.23678: 100%|██████████| 12/12 [00:02<00:00,  4.09it/s]


Epoch 1/20 loss 3.34539 val_loss 3.23678


loss 3.08937: 100%|██████████| 42/42 [00:42<00:00,  1.02s/it]
val_loss 3.07907: 100%|██████████| 12/12 [00:02<00:00,  4.08it/s]


Epoch 2/20 loss 3.08937 val_loss 3.07907


loss 2.94199: 100%|██████████| 42/42 [00:42<00:00,  1.02s/it]
val_loss 2.89930: 100%|██████████| 12/12 [00:02<00:00,  4.08it/s]


Epoch 3/20 loss 2.94199 val_loss 2.89930


loss 2.70537: 100%|██████████| 42/42 [00:42<00:00,  1.02s/it]
val_loss 2.68001: 100%|██████████| 12/12 [00:02<00:00,  4.12it/s]


Epoch 4/20 loss 2.70537 val_loss 2.68001


loss 2.50442: 100%|██████████| 42/42 [00:42<00:00,  1.02s/it]
val_loss 2.55933: 100%|██████████| 12/12 [00:02<00:00,  4.10it/s]


Epoch 5/20 loss 2.50442 val_loss 2.55933


loss 2.36207: 100%|██████████| 42/42 [00:42<00:00,  1.02s/it]
val_loss 2.42024: 100%|██████████| 12/12 [00:02<00:00,  4.08it/s]


Epoch 6/20 loss 2.36207 val_loss 2.42024


loss 2.22094: 100%|██████████| 42/42 [00:44<00:00,  1.06s/it]
val_loss 2.37463: 100%|██████████| 12/12 [00:02<00:00,  4.13it/s]


Epoch 7/20 loss 2.22094 val_loss 2.37463


loss 2.10144: 100%|██████████| 42/42 [00:42<00:00,  1.02s/it]
val_loss 2.38071: 100%|██████████| 12/12 [00:02<00:00,  4.08it/s]


Epoch 8/20 loss 2.10144 val_loss 2.38071


loss 1.99300: 100%|██████████| 42/42 [00:42<00:00,  1.02s/it]
val_loss 2.35179: 100%|██████████| 12/12 [00:02<00:00,  4.09it/s]


Epoch 9/20 loss 1.99300 val_loss 2.35179


loss 1.90044: 100%|██████████| 42/42 [00:42<00:00,  1.02s/it]
val_loss 2.31868: 100%|██████████| 12/12 [00:02<00:00,  4.09it/s]


Epoch 10/20 loss 1.90044 val_loss 2.31868


loss 1.80797: 100%|██████████| 42/42 [00:42<00:00,  1.02s/it]
val_loss 2.34924: 100%|██████████| 12/12 [00:02<00:00,  4.09it/s]


Epoch 11/20 loss 1.80797 val_loss 2.34924


loss 1.69453: 100%|██████████| 42/42 [00:44<00:00,  1.06s/it]
val_loss 2.37876: 100%|██████████| 12/12 [00:02<00:00,  4.10it/s]


Epoch 12/20 loss 1.69453 val_loss 2.37876


loss 1.59887: 100%|██████████| 42/42 [00:42<00:00,  1.02s/it]
val_loss 2.38724: 100%|██████████| 12/12 [00:02<00:00,  4.10it/s]


Epoch 13/20 loss 1.59887 val_loss 2.38724


loss 1.49947: 100%|██████████| 42/42 [00:42<00:00,  1.02s/it]
val_loss 2.42705: 100%|██████████| 12/12 [00:02<00:00,  4.10it/s]


Epoch 14/20 loss 1.49947 val_loss 2.42705


loss 1.42207: 100%|██████████| 42/42 [00:42<00:00,  1.02s/it]
val_loss 2.43441: 100%|██████████| 12/12 [00:02<00:00,  4.06it/s]


Epoch 15/20 loss 1.42207 val_loss 2.43441


loss 1.33784: 100%|██████████| 42/42 [00:42<00:00,  1.02s/it]
val_loss 2.49419: 100%|██████████| 12/12 [00:02<00:00,  4.08it/s]


Epoch 16/20 loss 1.33784 val_loss 2.49419


loss 1.23380: 100%|██████████| 42/42 [00:42<00:00,  1.02s/it]
val_loss 2.56266: 100%|██████████| 12/12 [00:02<00:00,  4.12it/s]


Epoch 17/20 loss 1.23380 val_loss 2.56266


loss 1.17402: 100%|██████████| 42/42 [00:44<00:00,  1.06s/it]
val_loss 2.60758: 100%|██████████| 12/12 [00:02<00:00,  4.10it/s]


Epoch 18/20 loss 1.17402 val_loss 2.60758


loss 1.10539: 100%|██████████| 42/42 [00:42<00:00,  1.02s/it]
val_loss 2.70002: 100%|██████████| 12/12 [00:02<00:00,  4.06it/s]


Epoch 19/20 loss 1.10539 val_loss 2.70002


loss 1.01093: 100%|██████████| 42/42 [00:42<00:00,  1.02s/it]
val_loss 2.74127: 100%|██████████| 12/12 [00:02<00:00,  4.09it/s]

Epoch 20/20 loss 1.01093 val_loss 2.74127





In [26]:
# Greedy next-character prediction for a seed phrase: encode it, run the model,
# and decode the index with the highest score.
# NOTE(review): this cell's execution count (26) is lower than the training
# cell's (44), so the displayed output may come from an earlier run — re-run
# after training to confirm.
X_new = "El águila, el cuervo y el "
X_new_encoded = tokenizer.text_to_seq(X_new)
y_pred = predict(model, X_new_encoded)
y_pred = torch.argmax(y_pred, axis=1)[0].item()
tokenizer.seq_to_text([y_pred])

'p'

In [45]:
# Greedy generation: 100 times, feed the last 100 characters of the running
# text to the model and append the highest-scoring next character.
for i in range(100):
  X_new_encoded = tokenizer.text_to_seq(X_new[-100:])
  y_pred = predict(model, X_new_encoded).argmax(dim=1)[0].item()
  X_new = X_new + tokenizer.seq_to_text([y_pred])

X_new

'El águila, el cuervo y el perro\n\nEl caballo lobo en el perro\n\nEl caballo lobo en el perro\n\nEl caballo lo hobres alos por la galondidiento peri, la garcarse. Perie.v a nidre por las habres alresó\n¡ lúi diente ente bien la fle ha de las garras de los garras de sus garras de las garras de las garras de sus prospecie el perro.\n-'

In [46]:
# Temperature sampling: draw each next character from the model's predicted
# distribution instead of always taking the most likely one. Lower `temp`
# sharpens the distribution; higher `temp` flattens it.
temp=1
for i in range(100):
  X_new_encoded = tokenizer.text_to_seq(X_new[-100:])
  y_pred = predict(model, X_new_encoded)
  # softmax(scores / temp) is the same distribution as normalised
  # exp(scores / temp), but it cannot overflow to inf for large scores or a
  # small temperature; torch.multinomial rejects inf weights.
  y_pred = torch.softmax(y_pred.view(-1).div(temp), dim=0)
  top_i = torch.multinomial(y_pred, 1)[0]
  predicted_char = tokenizer.all_characters[top_i]
  X_new += predicted_char

print(X_new)

El águila, el cuervo y el perro

El caballo lobo en el perro

El caballo lobo en el perro

El caballo lo hobres alos por la galondidiento peri, la garcarse. Perie.v a nidre por las habres alresó
¡ lúi diente ente bien la fle ha de las garras de los garras de sus garras de las garras de las garras de sus prospecie el perro.
- YTú diisigomo mí fierro en lagó ba ubos puro, y él zarpa- ¡ue suy agarrades de los dioses la llegó 
