### Clase 4: Laboratorio

In [1]:
# RNNs and LSTMs for NLP with BPTT

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import math
import random
import re
from datasets import load_dataset
import torch.nn.functional as F

# Section 1: Load Structured Context + Hugging Face WikiText2
print("\n=== Loading Structured + Natural Text Dataset ===")

structured_lines = [
    "if (a == b) { return a; }",
    "while (true) { break; }",
    "for (i = 0; i < n; i++) { sum += i; }",
    "def add(x, y): return x + y",
    "while (x > 0) { x--; }",
    "if (user.is_logged_in()) { show_dashboard(); }",
    "try { risky_operation(); } catch (e) { handle_error(); }",
    "The cat sat on the mat.",
    "The dog barked at the moon.",
    "The rain in Spain falls mainly on the plain."
]

structured_text = "\n".join(structured_lines).lower()

# Load WikiText2 from Hugging Face
wikitext = load_dataset("wikitext", "wikitext-2-raw-v1")
raw_text = "\n".join(wikitext["train"]["text"]).lower()
combined_text = structured_text #+ "\n" + raw_text[:10]

# Section 2: Prepare Dataset
class CharDataset(Dataset):
    def __init__(self, text, seq_len=40):
        self.chars = sorted(set(text))
        self.char2idx = {ch: i for i, ch in enumerate(self.chars)}
        self.idx2char = {i: ch for ch, i in self.char2idx.items()}
        self.vocab_size = len(self.chars)
        self.data = [text[i:i+seq_len+1] for i in range(len(text) - seq_len)]
        self.seq_len = seq_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        chunk = self.data[i]
        x_str, y_str = chunk[:-1], chunk[1:]
        x = torch.tensor([self.char2idx[c] for c in x_str])
        y = torch.tensor([self.char2idx[c] for c in y_str])
        return x, y

seq_len = 40
text = combined_text
print(f"Loaded {len(text)} characters.")
dataset = CharDataset(text, seq_len=seq_len)
loader = DataLoader(dataset, batch_size=1, shuffle=True)

# Section 3: Define Models
class SimpleRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.rnn(x)
        return self.fc(out)

class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.lstm(x)
        return self.fc(out)

# Initialize models
vocab_size = dataset.vocab_size
rnn_model = SimpleRNN(vocab_size, embed_dim=64, hidden_dim=128)
lstm_model = LSTMModel(vocab_size, embed_dim=64, hidden_dim=128)

criterion = nn.CrossEntropyLoss()
rnn_optimizer = optim.Adam(rnn_model.parameters(), lr=0.005)
lstm_optimizer = optim.Adam(lstm_model.parameters(), lr=0.005)

# Section 4: Training
print("\n=== Training Models ===")

for epoch in range(5):
    rnn_loss, lstm_loss = 0, 0
    for x, y in loader:
        # RNN
        rnn_out = rnn_model(x)
        loss_rnn = criterion(rnn_out.view(-1, vocab_size), y.view(-1))
        rnn_optimizer.zero_grad()
        loss_rnn.backward()
        rnn_optimizer.step()
        rnn_loss += loss_rnn.item()

        # LSTM
        lstm_out = lstm_model(x)
        loss_lstm = criterion(lstm_out.view(-1, vocab_size), y.view(-1))
        lstm_optimizer.zero_grad()
        loss_lstm.backward()
        lstm_optimizer.step()
        lstm_loss += loss_lstm.item()

    print(f"Epoch {epoch}, RNN Loss: {rnn_loss:.4f}, LSTM Loss: {lstm_loss:.4f}")

# Section 5: Text Generation with Sampling
print("\n=== Text Generation ===")

def sample_next_token(logits, temperature=1.0):
    logits = logits / temperature
    probs = F.softmax(logits, dim=-1)
    return torch.multinomial(probs, num_samples=1).item()

def generate_text(model, start_char, char2idx, idx2char, vocab_size, max_len=300, temperature=1.0):
    model.eval()
    input_idx = torch.tensor([[char2idx[start_char]]])
    result = [start_char]
    with torch.no_grad():
        for _ in range(max_len):
            output = model(input_idx)
            pred = output[:, -1, :]
            next_idx = sample_next_token(pred.squeeze(), temperature)
            next_char = idx2char[next_idx]
            result.append(next_char)
            input_idx = torch.tensor([[next_idx]])
    return ''.join(result)

print("\nGenerated text using RNN:")
print(generate_text(rnn_model, start_char='t', char2idx=dataset.char2idx, idx2char=dataset.idx2char, vocab_size=vocab_size, temperature=0.8))

print("\nGenerated text using LSTM:")
print(generate_text(lstm_model, start_char='t', char2idx=dataset.char2idx, idx2char=dataset.idx2char, vocab_size=vocab_size, temperature=0.8))

# Section 6: Evaluation
print("\n=== Evaluation ===")

def accuracy(model, data_loader, vocab_size):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for x, y in data_loader:
            outputs = model(x)
            predictions = torch.argmax(outputs, dim=-1)
            correct += (predictions == y).sum().item()
            total += y.numel()
    return correct / total

def perplexity(model, data_loader, vocab_size):
    model.eval()
    total_loss = 0
    total_words = 0
    criterion = nn.CrossEntropyLoss()
    with torch.no_grad():
        for x, y in data_loader:
            outputs = model(x)
            loss = criterion(outputs.view(-1, vocab_size), y.view(-1))
            total_loss += loss.item() * y.numel()
            total_words += y.numel()
    return math.exp(total_loss / total_words)

print(f"RNN Character Accuracy: {accuracy(rnn_model, loader, vocab_size):.2%}")
print(f"LSTM Character Accuracy: {accuracy(lstm_model, loader, vocab_size):.2%}")
print(f"RNN Perplexity: {perplexity(rnn_model, loader, vocab_size):.2f}")
print(f"LSTM Perplexity: {perplexity(lstm_model, loader, vocab_size):.2f}")



=== Loading Structured + Natural Text Dataset ===
Loaded 339 characters.

=== Training Models ===
Epoch 0, RNN Loss: 184.8847, LSTM Loss: 216.9732
Epoch 1, RNN Loss: 52.8883, LSTM Loss: 40.7044
Epoch 2, RNN Loss: 46.0544, LSTM Loss: 35.0998
Epoch 3, RNN Loss: 45.0642, LSTM Loss: 32.4935
Epoch 4, RNN Loss: 43.8809, LSTM Loss: 32.1529

=== Text Generation ===

Generated text using RNN:
trke he shin() se se cad_if } y
whead() }
while (): } { }
whi }
whi+ m }
whe }
whe } } sed(); }
whe = }
whind_in.
whise) x aturke) sh in(); red_i+) }
whe aturke); caturi+= }
while m turere += re) } }
whe }
whe); } turkeathe) }
whe } atururke m } aturke () () }
whe == m = () she are saturke }
tuse } }

Generated text using LSTM:
tum rke +); i; }
whe rked(ushishif ad(e rdle == brd 0; { y_ogggggggggggggggggggggggggggggggggggg_i; }
trn.
f am isad aturke 0) { pe ske ; y
trke she ndled(t rke { == > { }
if boggggggggggggggggggggggggggggggggggggggedle and }
whi; ath }
whe_d() } x { ++ } ry care (x brn(edled r.
de

In [3]:
def complete_prompt(model, prompt, char2idx, idx2char, vocab_size, max_len=100, temperature=0.8):
    model.eval()
    result = list(prompt)
    input_idx = torch.tensor([[char2idx[c] for c in prompt if c in char2idx]])
    with torch.no_grad():
        for _ in range(max_len):
            output = model(input_idx)
            pred = output[:, -1, :]
            next_idx = sample_next_token(pred.squeeze(), temperature)
            next_char = idx2char[next_idx]
            result.append(next_char)
            input_idx = torch.tensor([[char2idx[c] for c in result[-seq_len:] if c in char2idx]])
    return ''.join(result)
example_prompts = [
    "if (a == b) {",
    "while (x < 10) {",
    "for (i = 0; i < n; i++) {",
    "try {",
    "The cat"
]

for prompt in example_prompts:
    print(f"\nPrompt: {prompt}")
    rnn_completion = complete_prompt(rnn_model, prompt.lower(), dataset.char2idx, dataset.idx2char, vocab_size)
    lstm_completion = complete_prompt(lstm_model, prompt.lower(), dataset.char2idx, dataset.idx2char, vocab_size)
    print(f"RNN completion:  {rnn_completion}")
    print(f"LSTM completion: {lstm_completion}")


Prompt: if (a == b) {
RNN completion:  if (a == b) { return a; }
while (true) { handle_error(); }
the cat sat on the mat.
the dog barked at the moon.
th
LSTM completion: if (a == b) { return a; }
while (x > 0) { x--; }
if (user.is_logged_in())) { show_dashboard(); }
try { risky_oper

Prompt: while (x < 10) {
RNN completion:  while (x < 10) { show_dashboard(); }
try { risky_operation(); }
the cat sat on the mat.
the dog barked at the moon.

LSTM completion: while (x < 10) { x--; }
if (user.is_logged_in()) { show_dashboard(); }
try { risky_operation(); } catch (e) { handle

Prompt: for (i = 0; i < n; i++) {
RNN completion:  for (i = 0; i < n; i++) { sum += i; }
def add(x, y): return a; }
while (true) { handle_error(); }
the cat sat on the mat.
the
LSTM completion: for (i = 0; i < n; i++) { sum += i; }
def add(x, y): return x + y
while (x > 0) { x--; }
if (user.is_logged_in()) { show_dash

Prompt: try {
RNN completion:  try { risky_operation(); }
the cat sat on the mat.
the dog barked at