In [2]:
import numpy as np
import os

os.environ["KERAS_BACKEND"] = "torch"

In [3]:
import keras
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm


In [4]:
shakespeare_url = "https://homl.info/shakespeare"
filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)
with open(filepath) as f:
    shakespeare_text = f.read()

In [5]:
print(shakespeare_text[:80])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.


In [6]:
"""
Map all characters to an integer, starting at 2.
TextVectorization reserves 0 for padding tokens and 1 for unknown characters.
"""
text_vec_layer = keras.layers.TextVectorization(split="character", standardize="lower")
text_vec_layer.adapt([shakespeare_text])
encoded = text_vec_layer([shakespeare_text])[0].cpu().numpy()

2024-08-22 11:57:03.762442: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-22 11:57:03.771522: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-22 11:57:03.785078: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-22 11:57:03.788897: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-22 11:57:03.802302: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instru

In [7]:
encoded = torch.tensor(encoded, dtype=torch.long) - 2  # don't need tokens 0 and 1
n_tokens = text_vec_layer.vocabulary_size() - 2 # number of distinct chars
dataset_size = len(encoded)

In [8]:
# Convert the entire dataset to a single tensor of sequences
sequence_length = 100
stride = 1
sequences = encoded.unfold(0, sequence_length + 1, stride)

# Split the data
train_size = int(0.9 * len(sequences))
val_size = int(0.05 * len(sequences))
test_size = len(sequences) - train_size - val_size
train_data, val_data, test_data = torch.utils.data.random_split(sequences, [train_size, val_size, test_size])

# Create DataLoaders
batch_size = 1024
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=4)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=4)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=4)

# Define the model
class ShakespeareModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.gru(embedded)
        return self.fc(output)

# Initialize the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ShakespeareModel(n_tokens, embedding_dim=16, hidden_dim=128).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        inputs, targets = batch[:, :-1].to(device), batch[:, 1:].to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.view(-1, n_tokens), targets.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            inputs, targets = batch[:, :-1].to(device), batch[:, 1:].to(device)
            outputs = model(inputs)
            val_loss += criterion(outputs.view(-1, n_tokens), targets.view(-1)).item()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {total_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}")

torch.save(model.state_dict(), "shakespeare_model.pth")

Epoch 1/10: 100%|██████████| 981/981 [00:26<00:00, 37.56it/s]


Epoch 1/10, Train Loss: 1.9607, Val Loss: 1.5598


Epoch 2/10: 100%|██████████| 981/981 [00:25<00:00, 38.29it/s]


Epoch 2/10, Train Loss: 1.4842, Val Loss: 1.4367


Epoch 3/10: 100%|██████████| 981/981 [00:25<00:00, 38.32it/s]


Epoch 3/10, Train Loss: 1.4143, Val Loss: 1.3965


Epoch 4/10: 100%|██████████| 981/981 [00:25<00:00, 38.30it/s]


Epoch 4/10, Train Loss: 1.3849, Val Loss: 1.3753


Epoch 5/10: 100%|██████████| 981/981 [00:25<00:00, 38.38it/s]


Epoch 5/10, Train Loss: 1.3679, Val Loss: 1.3618


Epoch 6/10: 100%|██████████| 981/981 [00:25<00:00, 38.43it/s]


Epoch 6/10, Train Loss: 1.3566, Val Loss: 1.3526


Epoch 7/10: 100%|██████████| 981/981 [00:25<00:00, 38.08it/s]


Epoch 7/10, Train Loss: 1.3484, Val Loss: 1.3459


Epoch 8/10: 100%|██████████| 981/981 [00:25<00:00, 38.44it/s]


Epoch 8/10, Train Loss: 1.3423, Val Loss: 1.3403


Epoch 9/10: 100%|██████████| 981/981 [00:25<00:00, 38.46it/s]


Epoch 9/10, Train Loss: 1.3373, Val Loss: 1.3357


Epoch 10/10: 100%|██████████| 981/981 [00:25<00:00, 38.28it/s]


Epoch 10/10, Train Loss: 1.3333, Val Loss: 1.3321


In [15]:
def generate_text(model, start_text, num_generate=50, temperature=1.0):
    # Convert start text to tensor
    input_sequence = text_vec_layer([start_text.lower()])[0].cpu().numpy() - 2
    input_sequence = torch.tensor(input_sequence, dtype=torch.long).unsqueeze(0).to(device)

    model.eval()
    generated_text = start_text
    
    with torch.no_grad():
        for _ in range(num_generate):
            # Get the last 'sequence_length' characters
            input_sequence = input_sequence[:, -sequence_length:]
            
            # Generate prediction
            output = model(input_sequence)
            
            # Apply temperature
            output = output[:, -1, :] / temperature
            probabilities = torch.nn.functional.softmax(output, dim=-1)
            
            # Sample from the distribution
            next_char_index = torch.multinomial(probabilities, 1).item()
            
            # Convert back to character and append to generated text
            next_char = text_vec_layer.get_vocabulary()[next_char_index + 2]  # +2 because we subtracted 2 earlier
            generated_text += next_char
            
            # Update input sequence
            input_sequence = torch.cat([input_sequence, torch.tensor([[next_char_index]], device=device)], dim=1)
    
    return generated_text

# Load the trained model
model.load_state_dict(torch.load("shakespeare_model.pth"))
model.to(device)

# Generate text
start_text = "to be or not to b"
generated_text = generate_text(model, start_text, num_generate=1000, temperature=0.7)
print(generated_text)

to be or not to be here,
i repent the clifford of a bah what flies,
that was he look their blood, mark me the senately to be now?

prospero:
they be the measure so sure:
that last she hath as it cannot be help,
with this grave bole you that stay and brawn he fall,
i understand a princes, all the love of my devil
a man of time again to she were sweet man;
it is with the duke of york water repetly forth to be dist.

claudio:
i spoil the seat mourn struck'd
to have bearing of a humble and to be book and look on.

lady anne:
what, mistress, have stand on the seas-
thy ready the first court; let me so, the seirs.

king richard iii:
pain in his love, and the commons:
the remembrace that she is the pale,
and see the courses in your parquity
is thy power and may seem on the gentleman:
how most be so coming well of you.

henry bolingbroke:
i take the mother shall hear his prince hath stay piece
with my widows come not a part and discipler:
i here call down the blood is the rest
in slave and gon

In [10]:
import tensorflow as tf

In [7]:
"""
We'll turn the long sequence into a dataset of windows that we can use to train a sequence-to-sequence RNN.
Targets are similar to the inputs, but shifted by one time step into the 'future'.

Example:
One sample may be a sequence of character IDs representing the text "to be or not to b"
And the corresponding target would be a sequence of character IDs representing the text "o be or not to be"
"""

def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
    """
    Convert a sequence into a TensorFlow dataset of windowed samples for sequence-to-sequence training.

    Args:
        sequence (tf.Tensor): The input sequence to be windowed.
        length (int): The length of each window (excluding the target token).
        shuffle (bool, optional): Whether to shuffle the dataset. Defaults to False.
        seed (int, optional): Random seed for shuffling. Defaults to None.
        batch_size (int, optional): The batch size for the dataset. Defaults to 32.

    Returns:
        tf.data.Dataset: A dataset of windowed samples, where each sample is a tuple (input_window, target_window).
                         The input_window is of shape (batch_size, length) and the target_window is of shape (batch_size, length).
                         The target_window is shifted one step ahead of the input_window.

    The function performs the following steps:
    1. Creates a dataset from the input sequence.
    2. Windows the dataset into overlapping sequences of length + 1, as we need the next character for the target.
    3. Optionally shuffles the dataset.
    4. Batches the dataset.
    5. Maps the windowed sequences into input-target pairs.
    6. Prefetches one batch to optimize performance.
    """
    ds = tf.data.Dataset.from_tensor_slices(sequence)
    ds = ds.window(length + 1, shift=1, drop_remainder=True)
    ds = ds.flat_map(lambda window_ds: window_ds.batch(length + 1))
    if shuffle:
        ds = ds.shuffle(buffer_size=100_000, seed=seed)
    ds = ds.batch(batch_size)
    return ds.map(lambda window: (window[:, :-1], window[:, 1:])).prefetch(1)

In [8]:
# Prepare dataset. Roughly: 90% train, 5% valid, 5% test.
length = 100
tf.random.set_seed(42)
train_set = to_dataset(encoded[:1_000_000], length=length, shuffle=True, seed=42)
valid_set = to_dataset(encoded[1_000_000:1_060_000], length=length)
test_set = to_dataset(encoded[1_060_000:], length=length)

In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16),
    tf.keras.layers.GRU(128, return_sequences=True),
    tf.keras.layers.Dense(n_tokens, activation="softmax")
])

model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam", metrics=["accuracy"])
model_ckpt = tf.keras.callbacks.ModelCheckpoint("my_shakespeare_model.keras", monitor="val_accuracy", save_best_only=True)
history = model.fit(train_set, validation_data=valid_set, epochs=10, callbacks=[model_ckpt])

# NMT - English to Spanish

In [5]:
from pathlib import Path

In [6]:
url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
path = keras.utils.get_file("spa-eng.zip", origin=url, extract=True, cache_dir="datasets")
text = (Path(path).parent / "spa-eng" / "spa.txt").read_text(encoding="utf-8")

In [7]:
import numpy as np

In [8]:
text = text.replace("¡", "").replace("¿", "")
pairs = [line.split("\t") for line in text.splitlines() if len(line) >= 3]
np.random.shuffle(pairs)
sentences_en, sentences_es = zip(*pairs)

In [9]:
for i in range(3):
    print(sentences_en[i], "=>", sentences_es[i])

She is kissing him. => Ella lo está besando.
I have a life. => Tengo una vida.
Let me see your prescription. => Déjame ver tu receta médica.


In [10]:
vocab_size = 1_000 # this is small because the training set is small & using a small value speeds up training. SOTA models uses a lot more (e.g. 30k), uses larger training sets, and larger models.
max_length = 50 # all sentences in this dataset have a max of 50 words.
text_vec_layer_en = keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_length,
)
text_vec_layer_es = keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_length,
)

text_vec_layer_en.adapt(sentences_en)
text_vec_layer_es.adapt([f"startofseq {text} endofseq" for text in sentences_es]) # adding start and end tokens

2024-08-29 13:31:54.050813: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-29 13:31:54.128562: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-29 13:31:54.197004: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-29 13:31:54.214973: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-29 13:31:54.353694: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instru

In [12]:
import torch
from torch.utils.data import TensorDataset, DataLoader, random_split

# Convert sentences to tensors
sentences_en_tensor = torch.tensor(text_vec_layer_en(sentences_en).cpu().numpy())

# Prepare decoder inputs (Spanish sentences with SOS prefix)
decoder_inputs = torch.tensor(text_vec_layer_es([f"startofseq {text}" for text in sentences_es]).cpu().numpy())

# Prepare targets (Spanish sentences with EOS suffix)
targets = torch.tensor(text_vec_layer_es([f"{text} endofseq" for text in sentences_es]).cpu().numpy())

# Create a TensorDataset with three elements: encoder input, decoder input, and target
dataset = TensorDataset(sentences_en_tensor, decoder_inputs, targets)

# Split the dataset
train_size = min(100000, len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create DataLoaders
batch_size = 32  # You can adjust this as needed
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

print(f"Training set size: {len(train_dataset)}")
print(f"Validation set size: {len(val_dataset)}")

Training set size: 100000
Validation set size: 18964


In [13]:
import torch.nn as nn

class EncoderDecoder(nn.Module):
    def __init__(self, input_dim, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.encoder = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.decoder = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        
        self.input_embedding = nn.Embedding(input_dim, emb_dim)
        self.output_embedding = nn.Embedding(output_dim, emb_dim)
        
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.fc_out.out_features
        
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(src.device)
        
        # Encoder
        embedded_src = self.dropout(self.input_embedding(src))
        _, (hidden, cell) = self.encoder(embedded_src)
        
        # Decoder
        input = trg[0,:]
        for t in range(1, trg_len):
            embedded_input = self.dropout(self.output_embedding(input))
            output, (hidden, cell) = self.decoder(embedded_input.unsqueeze(0), (hidden, cell))
            prediction = self.fc_out(output.squeeze(0))
            outputs[t] = prediction
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = prediction.argmax(1)
            input = trg[t] if teacher_force else top1
        
        return outputs

# Instantiate the model
input_dim = vocab_size  # size of input vocabulary
output_dim = vocab_size  # size of output vocabulary
emb_dim = 256  # embedding dimension
hid_dim = 512  # hidden dimension
n_layers = 2  # number of LSTM layers
dropout = 0.5

model = EncoderDecoder(input_dim, output_dim, emb_dim, hid_dim, n_layers, dropout)
print(model)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Assuming 0 is the padding index
optimizer = torch.optim.Adam(model.parameters())

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

EncoderDecoder(
  (encoder): LSTM(256, 512, num_layers=2, dropout=0.5)
  (decoder): LSTM(256, 512, num_layers=2, dropout=0.5)
  (input_embedding): Embedding(1000, 256)
  (output_embedding): Embedding(1000, 256)
  (fc_out): Linear(in_features=512, out_features=1000, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [16]:
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        src, trg_input, trg_output = [t.to(device).transpose(0, 1) for t in batch]
        
        optimizer.zero_grad()
        output = model(src, trg_input)
        
        # Reshape output and target, ensuring same batch size
        output = output[1:].reshape(-1, output.shape[-1])
        trg_output = trg_output.transpose(0, 1)[1:].reshape(-1)
        
        # Truncate to the minimum length
        min_len = min(output.size(0), trg_output.size(0))
        output = output[:min_len]
        trg_output = trg_output[:min_len]
        
        loss = criterion(output, trg_output)
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()
        
        total_loss += loss.item()
    
    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            src, trg_input, trg_output = [t.to(device).transpose(0, 1) for t in batch]
            output = model(src, trg_input, 0)  # Turn off teacher forcing
            
            # Reshape output and target, ensuring same batch size
            output = output[1:].reshape(-1, output.shape[-1])
            trg_output = trg_output.transpose(0, 1)[1:].reshape(-1)
            
            # Truncate to the minimum length
            min_len = min(output.size(0), trg_output.size(0))
            output = output[:min_len]
            trg_output = trg_output[:min_len]
            
            val_loss += criterion(output, trg_output).item()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {total_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}")

# Save the model
torch.save(model.state_dict(), "nmt_model.pth")

Epoch 1/10: 100%|██████████| 3125/3125 [04:20<00:00, 11.98it/s]


Epoch 1/10, Train Loss: 4.6693, Val Loss: 4.6486


Epoch 2/10: 100%|██████████| 3125/3125 [04:24<00:00, 11.80it/s]


Epoch 2/10, Train Loss: 4.6558, Val Loss: 4.6468


Epoch 3/10: 100%|██████████| 3125/3125 [04:16<00:00, 12.20it/s]


Epoch 3/10, Train Loss: 4.6543, Val Loss: 4.6450


Epoch 4/10: 100%|██████████| 3125/3125 [04:14<00:00, 12.27it/s]


Epoch 4/10, Train Loss: 4.6530, Val Loss: 4.6441


Epoch 5/10: 100%|██████████| 3125/3125 [04:16<00:00, 12.19it/s]


Epoch 5/10, Train Loss: 4.6530, Val Loss: 4.6437


Epoch 6/10: 100%|██████████| 3125/3125 [04:26<00:00, 11.72it/s]


Epoch 6/10, Train Loss: 4.6528, Val Loss: 4.6441


Epoch 7/10: 100%|██████████| 3125/3125 [04:22<00:00, 11.91it/s]


Epoch 7/10, Train Loss: 4.6522, Val Loss: 4.6431


Epoch 8/10: 100%|██████████| 3125/3125 [04:24<00:00, 11.84it/s]


Epoch 8/10, Train Loss: 4.6515, Val Loss: 4.6435


Epoch 9/10: 100%|██████████| 3125/3125 [04:05<00:00, 12.71it/s]


Epoch 9/10, Train Loss: 4.6520, Val Loss: 4.6436


Epoch 10/10: 100%|██████████| 3125/3125 [04:38<00:00, 11.20it/s]


Epoch 10/10, Train Loss: 4.6519, Val Loss: 4.6424
