# Desafío 3:
## Modelos de Lenguaje — Generación de Secuencias
## Utilizar otro dataset y poner en práctica la generación de secuencias con las estrategias presentadas en la clase 4 (los ejercicios figuran en el repo como clase 3).

# En este notebook entrenaremos modelos char-level y word-level para generación de texto con estrategias de sampling (greedy, temperatura, top-k).

In [1]:
# 0. Importaciones y carga de datos
!pip install datasets
from datasets import load_dataset
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

# Load the raw Wikitext-2 training split; it is the corpus the language
# models below are trained on.
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train')
# Concatenate every entry of the 'text' column into a single string,
# separating entries with a newline.
text = "\n".join(dataset['text'])

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [2]:
# 1. Char-level Model

# 1.1 Preprocessing: map characters to integer indices
chars = sorted(set(text))
char2idx = {c: i for i, c in enumerate(chars)}
idx2char = np.array(chars)
seq_len = 100  # window size: characters of context per training example
step = 1  # stride between the starts of consecutive windows

# Encode the whole corpus exactly once. Building one Python substring per
# window and re-encoding it would look every character up seq_len times and
# keep millions of temporary strings alive, so no per-window lists are built.
encoded = np.fromiter(
    (char2idx[c] for c in text), dtype=np.int32, count=len(text)
)

# Build the training pairs: row i of X holds the ids of characters
# [s, s + seq_len) and y[i] the id of character s + seq_len, for
# s = 0, step, 2 * step, ...
if len(encoded) > seq_len:
    windows = np.lib.stride_tricks.sliding_window_view(encoded, seq_len)
    # The very last window has no following character to predict, so it is
    # dropped. The view is read-only and overlapping; np.array() copies the
    # selected rows into a regular writable (n_seq, seq_len) int32 array.
    X = np.array(windows[:-1:step])
    y = encoded[seq_len::step].copy()
else:
    # Corpus too short to form even one (window, next character) pair.
    X = np.zeros((0, seq_len), dtype=np.int32)
    y = np.zeros((0,), dtype=np.int32)
print(f"Total sequences: {len(X)}")

# 1.2 Char-level model definition
evocab = len(chars)  # vocabulary size: number of distinct characters
char_model = tf.keras.Sequential([
    # Declare the input shape explicitly instead of through the Embedding
    # `input_length` argument, which Keras 3 deprecates. An explicit Input
    # also builds the model immediately, so summary() can report shapes.
    tf.keras.Input(shape=(seq_len,)),
    layers.Embedding(input_dim=evocab, output_dim=64),
    # First LSTM returns the whole sequence so the second one can consume it.
    layers.LSTM(128, return_sequences=True),
    layers.LSTM(128),
    # One softmax probability per character in the vocabulary.
    layers.Dense(evocab, activation='softmax'),
])
# Targets are integer class ids (not one-hot), hence the sparse loss.
char_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
char_model.summary()

# 1.3 Training
char_model.fit(X, y, batch_size=128, epochs=10)

Total sequences: 10929607




Epoch 1/10
[1m85388/85388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1265s[0m 15ms/step - loss: 2.0650
Epoch 2/10
[1m85388/85388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1261s[0m 15ms/step - loss: 1.6136
Epoch 3/10
[1m85388/85388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1262s[0m 15ms/step - loss: 1.5425
Epoch 4/10
[1m85388/85388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1261s[0m 15ms/step - loss: 1.5076
Epoch 5/10
[1m85388/85388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1266s[0m 15ms/step - loss: 1.4884
Epoch 6/10
[1m85388/85388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1263s[0m 15ms/step - loss: 1.4767
Epoch 7/10
[1m85388/85388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1260s[0m 15ms/step - loss: 1.4665
Epoch 8/10
[1m85388/85388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1259s[0m 15ms/step - loss: 1.4599
Epoch 9/10
[1m85388/85388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1259s[0m 15ms/step - loss: 1.4535
Epoch 10/10
[1m853

<keras.src.callbacks.history.History at 0x7c7a6f33a810>

In [3]:
# 1.4 Funciones de sampling
def sample_with_temperature(preds, temperature=1.0):
    """Sample an index from a probability vector rescaled by a temperature.

    The probabilities are moved to log space, divided by `temperature`,
    and renormalised with a softmax. Temperatures below 1 sharpen the
    distribution towards its mode; temperatures above 1 flatten it.

    Args:
        preds: 1-D array-like of non-negative probabilities.
        temperature: Strictly positive scaling factor.

    Returns:
        The sampled index, as returned by `np.random.choice`.

    Raises:
        ValueError: If `temperature` is not strictly positive.
    """
    # `not (t > 0)` also rejects NaN, which `t <= 0` would let through.
    if not temperature > 0:
        raise ValueError(f"temperature must be > 0, got {temperature!r}")

    probs = np.asarray(preds).astype('float64')
    # The epsilon keeps log() finite for entries that are exactly zero.
    logits = np.log(probs + 1e-8) / temperature
    # Shift by the maximum before exponentiating: softmax is invariant to
    # the shift, and without it small temperatures make every exp()
    # underflow to 0, producing 0/0 = NaN probabilities.
    logits -= np.max(logits)
    weights = np.exp(logits)
    probs = weights / np.sum(weights)
    return np.random.choice(len(probs), p=probs)

def sample_top_k(preds, k=5):
    """Sample an index among the `k` most probable entries of `preds`.

    The `k` largest probabilities are renormalised to sum to one and an
    index is drawn from that truncated distribution. If `k` exceeds the
    number of entries, every entry is a candidate.

    Args:
        preds: 1-D array-like of non-negative probabilities.
        k: Number of top candidates to keep; must be at least 1.

    Returns:
        The sampled index into `preds`, as returned by `np.random.choice`.

    Raises:
        ValueError: If `k` is smaller than 1.
    """
    # Without this check k=0 would slice `[-0:]`, i.e. the whole array, and
    # silently disable the top-k filter; negative k would pick an arbitrary
    # slice of the sorted indices.
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k!r}")

    probs = np.asarray(preds).astype('float64')
    # argsort is ascending, so the last k positions are the k largest.
    top = np.argsort(probs)[-k:]
    top_probs = probs[top] / np.sum(probs[top])
    return np.random.choice(top, p=top_probs)

# 1.5 Generación de texto char-level
def generate_char_text(model, seed, length=400, method='greedy', temperature=1.0, k=5):
    """Extend `seed` with `length` characters predicted by `model`.

    At every step the last `seq_len` characters (as ids from `char2idx`)
    are fed to `model.predict`, and the next character is chosen from the
    predicted distribution with the requested strategy.

    Args:
        model: Object whose `predict(batch, verbose=0)` returns one
            probability vector per row of `batch`.
        seed: Initial text; characters missing from `char2idx` map to id 0.
        length: Number of characters to generate.
        method: 'greedy' (argmax), 'temperature' or 'top_k'.
        temperature: Passed to `sample_with_temperature` when used.
        k: Passed to `sample_top_k` when used.

    Returns:
        `seed` followed by the generated characters.

    Raises:
        ValueError: If `method` is unknown, or if `seed` is empty while
            `length` is positive.
    """
    # Fail before running the model rather than after the first prediction.
    if method not in ('greedy', 'temperature', 'top_k'):
        raise ValueError("Método desconocido")
    if length > 0 and not seed:
        raise ValueError("seed must contain at least one character")

    # Keep the context as ids and slide it forward, instead of re-encoding
    # the tail of the growing string on every step.
    window = [char2idx.get(c, 0) for c in seed[-seq_len:]]
    pieces = [seed]
    for _ in range(length):
        preds = model.predict(np.array([window]), verbose=0)[0]
        if method == 'greedy':
            next_idx = np.argmax(preds)
        elif method == 'temperature':
            next_idx = sample_with_temperature(preds, temperature)
        else:
            next_idx = sample_top_k(preds, k)
        pieces.append(str(idx2char[next_idx]))
        window = (window + [int(next_idx)])[-seq_len:]
    return ''.join(pieces)

# Example: use the first seq_len characters of the corpus as the seed and
# generate a continuation with temperature sampling (temperature=0.5).
seed = text[:seq_len]
print(generate_char_text(char_model, seed, method='temperature', temperature=0.5))


 = Valkyria Chronicles III = 


 Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリgralymp ) was a nation of the Aland , and John Liter . 

 The competition of other commander , a features the first possible a more than the fall and the Hope . 

 The first may be a particularly the same police of the game has a season . 

 The most former contract of the recognition to the seventh as a raid in the first possible . The first programs between the war and writes in all good music g


In [4]:
# 2. Word-level Model

# 2.1 Preprocesamiento: Tokenizer Keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(oov_token='<OOV>')
lines = text.split('\n')
tokenizer.fit_on_texts(lines)
word_index = tokenizer.word_index
# Keras Tokenizer ids start at 1, so one extra slot is needed for id 0.
vocab_size = len(word_index) + 1

# Build sliding windows of L context tokens plus the token that follows
L = 20
input_sequences = []
# Windows never cross line boundaries: each line is tokenised on its own,
# and lines with L tokens or fewer contribute nothing.
for token_ids in tokenizer.texts_to_sequences(lines):
    input_sequences.extend(
        token_ids[end - L:end + 1] for end in range(L, len(token_ids))
    )
print(f"Total word-sequences: {len(input_sequences)}")

Total word-sequences: 1425393


In [5]:
# Split every window into X (the first L token ids) and y (the next token id)
input_sequences = np.array(input_sequences)
Xw, yw = input_sequences[:, :-1], input_sequences[:, -1]

# 2.2 Word-level model definition
word_model = tf.keras.Sequential([
    # Declare the input shape explicitly instead of through the Embedding
    # `input_length` argument, which Keras 3 deprecates. An explicit Input
    # also builds the model immediately, so summary() can report shapes.
    tf.keras.Input(shape=(L,)),
    layers.Embedding(input_dim=vocab_size, output_dim=128),
    layers.LSTM(256),
    # One softmax probability per token id in the vocabulary.
    layers.Dense(vocab_size, activation='softmax'),
])
# Targets are integer token ids (not one-hot), hence the sparse loss.
word_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
word_model.summary()

# 2.3 Training
word_model.fit(Xw, yw, batch_size=256, epochs=5)

Epoch 1/5
[1m5568/5568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m169s[0m 30ms/step - loss: 7.4380
Epoch 2/5
[1m5568/5568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 30ms/step - loss: 6.1568
Epoch 3/5
[1m5568/5568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 30ms/step - loss: 5.5238
Epoch 4/5
[1m5568/5568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 30ms/step - loss: 5.0428
Epoch 5/5
[1m5568/5568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 30ms/step - loss: 4.6395


<keras.src.callbacks.history.History at 0x7c79644aa6d0>

In [6]:
# 2.4 Función de generación word-level
def generate_word_text(model, seed_text, next_words=50, method='greedy', temperature=1.0, k=5):
    """Extend `seed_text` with up to `next_words` words predicted by `model`.

    At every step the last `L` token ids are padded to length `L` and fed
    to `model.predict`; the next token is chosen from the predicted
    distribution with the requested strategy.

    Args:
        model: Object whose `predict(batch, verbose=0)` returns one
            probability vector per row of `batch`.
        seed_text: Initial text, tokenised with the module-level `tokenizer`.
        next_words: Number of prediction steps to run.
        method: 'greedy' (argmax), 'temperature' or 'top_k'.
        temperature: Passed to `sample_with_temperature` when used.
        k: Passed to `sample_top_k` when used.

    Returns:
        `seed_text` followed by the generated words, separated by spaces.

    Raises:
        ValueError: If `method` is not one of the supported strategies.
    """
    # Fail before running the model rather than after the first prediction.
    if method not in ('greedy', 'temperature', 'top_k'):
        raise ValueError("Método desconocido")

    # Tokenise the seed as a raw string so it goes through the same
    # filtering and lower-casing used when the tokenizer was fitted; a
    # pre-split list is treated as already tokenised, so "fox," would be
    # looked up verbatim and fall back to the OOV id.
    token_ids = tokenizer.texts_to_sequences([seed_text])[0]
    words = [seed_text]
    for _ in range(next_words):
        # Only the last L ids matter; carrying ids forward avoids
        # re-tokenising the whole growing text on every step.
        pad_seq = pad_sequences([token_ids[-L:]], maxlen=L, truncating='pre')
        preds = model.predict(pad_seq, verbose=0)[0]
        if method == 'greedy':
            idx = np.argmax(preds)
        elif method == 'temperature':
            idx = sample_with_temperature(preds, temperature)
        else:
            idx = sample_top_k(preds, k)
        idx = int(idx)
        next_word = tokenizer.index_word.get(idx, '')
        if not next_word:
            # Id without a word (e.g. the padding id 0): skip it instead of
            # appending a stray space to the output.
            continue
        token_ids.append(idx)
        words.append(next_word)
    return ' '.join(words)

# Example: continue a short prompt using top-k sampling with k=10.
print(generate_word_text(word_model, "The quick brown fox jumps", method='top_k', k=10))

The quick brown fox jumps and a pair of high explosive hung the clock bar the ball is the ball for the game the ball and tech did not pitch a ball for a first down the team completed their final week of the second three minutes of the season after scoring the final match


# En resumen, hicimos:
## Char-level
* Preprocesamiento de caracteres
* Modelo LSTM de dos capas
* Sampling: greedy, temperature, top-k
* Función generate_char_text con métodos ajustables

## Word-level
* Tokenización de oraciones en palabras (ventanas de 20)
* Modelo LSTM
* Función generate_word_text con las mismas estrategias de sampling

## Para entrenar, ir probando:
* Ajustar el número de epochs y el tamaño de las capas ocultas, y probar una LSTM bidireccional.
* Comparar resultados de greedy vs sampling (diversidad vs coherencia).
* Explorar beam search para word-level.