<a href="https://colab.research.google.com/github/frank-morales2020/Cloud_curious/blob/master/transformer_demo_fp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets torch -q

In [3]:
!pip show transformers
print()
!pip show datasets
print()
!pip show torch


Name: transformers
Version: 4.50.3
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.11/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: peft, sentence-transformers

Name: datasets
Version: 3.5.0
Summary: HuggingFace community-driven open-source library of datasets
Home-page: https://github.com/huggingface/datasets
Author: HuggingFace Inc.
Author-email: thomas@huggingface.co
License: Apache 2.0
Location: /usr/local/lib/python3.11/dist-packages
Requires: aiohttp, dill, filelock, fsspec, huggingface-hub, multiprocess, numpy, packaging, pandas, pyarrow, pyyaml, requests, tq

In [48]:
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from datasets import load_dataset
import numpy as np

import os

# Set CUDA_LAUNCH_BLOCKING environment variable
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'


# --- Hyperparameters ---
batch_size = 16
sequence_length = 128
embedding_dimension = 256
num_heads = 8
feed_forward_dimension = 1024
num_encoder_layers = 6
dropout_probability = 0.1
learning_rate = 1e-4
num_epochs = 1
tokenizer_name = 'gpt2'
pad_token = '<pad>'

# --- 1. Load Tokenizer and Add Pad Token ---
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': pad_token})
pad_token_id = tokenizer.pad_token_id


In [49]:
# Load and Preprocess Flight Plan Data ---
flight_plan_dataset = load_dataset("frankmorales2020/flight_plan_waypoints")

In [None]:
flight_plan_dataset

In [50]:
def preprocess_flight_plan(example):
    # Convert waypoints to string, handling potential non-string/numeric data and empty waypoints
    waypoints_str = ' '.join([' '.join([str(float(coord)) if coord is not None else '' for coord in wp if coord is not None]) for wp in example['waypoints'] if wp])

    # Tokenize and enforce fixed sequence length
    tokens = tokenizer(
        waypoints_str,
        truncation=True,
        max_length=sequence_length,
        padding="max_length",  # Pad to the specified sequence length
        return_tensors="pt",
    )

    # Ensure consistent shape (important for batching) - squeezing after padding
    tokens['input_ids'] = tokens['input_ids'].squeeze(0)
    tokens['attention_mask'] = tokens['attention_mask'].squeeze(0)

    # Handle extremely short sequences: If after tokenization the sequence is still shorter than sequence_length, we pad it.
    # Modification: Pad to sequence_length if needed
    pad_len = sequence_length - tokens['input_ids'].shape[0]
    if pad_len > 0:
        tokens['input_ids'] = torch.cat([tokens['input_ids'], torch.tensor([pad_token_id] * pad_len)])
        tokens['attention_mask'] = torch.cat([tokens['attention_mask'], torch.tensor([0] * pad_len)])
    elif pad_len < 0:
        # Truncate if the sequence is longer than sequence_length
        tokens['input_ids'] = tokens['input_ids'][:sequence_length]
        tokens['attention_mask'] = tokens['attention_mask'][:sequence_length]

    return tokens

In [55]:
def custom_collate_fn(batch):
    # 1. Process 'input_ids' and 'attention_mask'
    input_ids = [item['input_ids'] for item in batch]
    attention_mask = [item['attention_mask'] for item in batch]

    # Check for empty input_ids and skip those batches
    if any(len(ids) == 0 for ids in input_ids):
        print("Skipping batch with empty input_ids")  # Add this line to identify skipped batches
        return None  # Skip this batch

    # Pad input_ids and attention_mask to the maximum length in the batch
    # Ensure max_len is capped at block_size (sequence_length)
    max_len = min(max(len(ids) for ids in input_ids), sequence_length) # Enforce maximum sequence length

    # Convert ids and mask to tensors before concatenation
    # Ensure input_ids are of type long
    # MODIFICATION: Convert ids and mask to tensors before calling .type()
    padded_input_ids = [torch.cat([torch.tensor(ids, dtype=torch.long), torch.tensor([pad_token_id] * (max_len - len(ids)), dtype=torch.long)]) for ids in input_ids]
    padded_attention_mask = [torch.cat([torch.tensor(mask, dtype=torch.long), torch.tensor([0] * (max_len - len(mask)), dtype=torch.long)]) for mask in attention_mask]

    # Stack the padded tensors
    input_ids = torch.stack(padded_input_ids)
    attention_mask = torch.stack(padded_attention_mask)


    return {'input_ids': input_ids, 'attention_mask': attention_mask}

In [52]:
# Create DataLoader ---
processed_dataset = flight_plan_dataset['train'].map(preprocess_flight_plan)
train_loader = DataLoader(processed_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate_fn)

In [56]:
# --- 4. Define the Transformer Model (Decoder-Only) ---
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == embed_dim, "Embed dim must be divisible by num heads"
        self.query_proj = nn.Linear(embed_dim, embed_dim)
        self.key_proj = nn.Linear(embed_dim, embed_dim)
        self.value_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dropout_probability)

    def forward(self, x, attention_mask=None):
        B, T, E = x.size()
        q = self.query_proj(x).view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.key_proj(x).view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
        v = self.value_proj(x).view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (self.head_dim ** -0.5)
        if attention_mask is not None:
            att = att.masked_fill(attention_mask[:, None, None, :] == 0, float('-inf'))
        att = torch.softmax(att, dim=-1)
        att = self.dropout(att)
        y = (att @ v).transpose(1, 2).contiguous().view(B, T, E)
        y = self.out_proj(y)
        return y

class FeedForward(nn.Module):
    def __init__(self, embed_dim, hidden_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, embed_dim),
            nn.Dropout(dropout_probability),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_hidden_dim):
        super().__init__()
        self.ln1 = nn.LayerNorm(embed_dim)
        self.sa = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ln2 = nn.LayerNorm(embed_dim)
        self.ffwd = FeedForward(embed_dim, ff_hidden_dim)

    def forward(self, x, attention_mask=None):
        x = x + self.sa(self.ln1(x), attention_mask)
        x = x + self.ffwd(self.ln2(x))
        return x

class SimpleTransformer(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, ff_hidden_dim, num_layers, block_size, dropout):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(block_size, embed_dim)
        self.blocks = nn.Sequential(*[Block(embed_dim, num_heads, ff_hidden_dim) for _ in range(num_layers)])
        self.ln_f = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.block_size = block_size

    def forward(self, idx, attention_mask=None):
        print(f"idx shape: {idx.shape}, idx dtype: {idx.dtype}, idx device: {idx.device}")  # Print idx info
        # Check if idx has the expected number of dimensions
        if idx.dim() != 2:
            # Raise a more informative error message
            raise ValueError(f"Expected idx to have 2 dimensions, but got {idx.dim()} dimensions. Shape: {idx.shape}")

        B, T = idx.shape  # Change idx.size() to idx.shape
        idx = idx.type(torch.long)  # Ensure idx is of type long
        tok_emb = self.token_embedding(idx)

        print(f"tok_emb shape: {tok_emb.shape}, tok_emb dtype: {tok_emb.dtype}, tok_emb device: {tok_emb.device}")  # Print tok_emb info
        # Ensure position embeddings are created on the same device and with dtype torch.long
        pos_emb = self.position_embedding(torch.arange(T, device=idx.device, dtype=torch.long))
        print(f"pos_emb shape: {pos_emb.shape}, pos_emb dtype: {pos_emb.dtype}, pos_emb device: {pos_emb.device}")  # Print pos_emb info

        x = self.dropout(tok_emb + pos_emb)
        print(f"x shape after dropout: {x.shape}, x dtype: {x.dtype}, x device: {x.device}")  # Print x info after dropout

        for block in self.blocks:
            x = block(x, attention_mask)
            print(f"x shape after block: {x.shape}, x dtype: {x.dtype}, x device: {x.device}")  # Print x info after each block

        x = self.ln_f(x)
        print(f"x shape after ln_f: {x.shape}, x dtype: {x.dtype}, x device: {x.device}")  # Print x info after ln_f
        logits = self.head(x)
        print(f"logits shape: {logits.shape}, logits dtype: {logits.dtype}, logits device: {logits.device}")  # Print logits info
        return logits


    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -self.block_size:]
            logits = self(idx_cond)
            logits = logits[:, -1, :]
            probs = torch.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

# Instantiate the model
vocab_size = tokenizer.vocab_size
block_size = sequence_length
model = SimpleTransformer(vocab_size, embedding_dimension, num_heads, feed_forward_dimension, num_encoder_layers, block_size, dropout_probability).to('cuda' if torch.cuda.is_available() else 'cpu')

# --- 5. Loss Function and Optimizer ---
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# --- 6. Training Loop ---
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for batch_idx, batch in enumerate(dataloader):
        # Skip batches with empty input_ids
        if batch['input_ids'].shape[1] == 0:
            continue

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = input_ids[:, 1:].contiguous()
        inputs = input_ids[:, :-1].contiguous()


        optimizer.zero_grad()
        outputs = model(inputs, attention_mask)
        loss = criterion(outputs.view(-1, outputs.size(-1)), targets.view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        total_loss += loss.item()
        if batch_idx % 100 == 0:
            print(f"Batch [{batch_idx}/{len(dataloader)}], Loss: {loss.item():.4f}")
    return total_loss / len(dataloader)

if __name__ == "__main__":
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}")

    for epoch in range(num_epochs):
        epoch_loss = train(model, train_loader, optimizer, criterion, device)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")

    print("Training finished!")

    # --- 7. (Optional) Inference/Generation ---
    def generate_flight_plan(model, tokenizer, start_text="YUL ", max_new_tokens=100, device='cpu'):
        model.eval()
        start_tokens = tokenizer.encode(start_text, return_tensors='pt').to(device)
        generated_tokens = model.generate(start_tokens, max_new_tokens=max_new_tokens)
        return tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

    if __name__ == "__main__":
        if torch.cuda.is_available():
            generated_plan = generate_flight_plan(model.to('cuda'), tokenizer, start_text="YUL ", max_new_tokens=200, device='cuda')
            model.to('cpu')
        else:
            generated_plan = generate_flight_plan(model, tokenizer, start_text="YUL ", max_new_tokens=200, device='cpu')
        print("\nGenerated Flight Plan:")
        print(generated_plan)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
