# Transformer Model

## Import Required Libraries

In [32]:
from datasets import load_from_disk
from torch.utils.data import DataLoader
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq, T5Tokenizer
from torch.optim import AdamW
from tqdm import tqdm

## Load Dataset

In [33]:
# Dataset directory
dataset_path = "../../data/processed/tokenized_dataset"

# Load dataset
tokenized_dataset = load_from_disk(dataset_path)

# Inspect datasets
print(tokenized_dataset)

# Fix labels padding token
def preprocess_labels(examples):
    labels = examples["labels"]
    labels = [[-100 if token == tokenizer.pad_token_id else token for token in label] for label in labels]
    return {"labels": labels}

tokenized_dataset = tokenized_dataset.map(preprocess_labels, batched = True)

# Extract training and testing splits
train_dataset = tokenized_dataset["train"]
test_dataset = tokenized_dataset["test"]

# Remove string/text columns that cause tensor conversion errors
columns_to_remove = ["prompt", "response", "__index_level_0__"]
train_dataset = train_dataset.remove_columns(columns_to_remove)
test_dataset = test_dataset.remove_columns(columns_to_remove)

DatasetDict({
    train: Dataset({
        features: ['prompt', 'response', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 308
    })
    test: Dataset({
        features: ['prompt', 'response', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 77
    })
})


## Initialize Pretrained Transformer ModelCreate Dataloaders

In [None]:
MODEL_NAME = "t5-small"
MAX_LEN = 512
BATCH_SIZE = 8
EPOCHS = 10
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pretrained tokenizer and model
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
model.to(DEVICE)

# Optimizer
optimizer = AdamW(model.parameters(), lr = 5e-5)

data_collator = DataCollatorForSeq2Seq(tokenizer, model = model, padding = True)

batch_size = 8

train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True, collate_fn = data_collator)
test_loader = DataLoader(test_dataset, batch_size = batch_size, collate_fn = data_collator)

## Run Model

In [None]:
# Training loop with validation
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)

        outputs = model(input_ids = input_ids, 
                        attention_mask = attention_mask, 
                        labels = labels)
        
        loss = outputs.loss
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1.0)
        optimizer.step()

    avg_train_loss = total_loss / len(train_loader)

    # Validation
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels = batch["labels"].to(DEVICE)

            outputs = model(input_ids = input_ids, 
                            attention_mask = attention_mask, 
                            labels = labels)
            
            val_loss = outputs.loss
            total_val_loss += val_loss.item()

    avg_val_loss = total_val_loss / len(test_loader)

    print(f"Epoch {epoch + 1} | Training Loss: {avg_train_loss:.4f} | Validation Loss: {avg_val_loss:.4f}")

Epoch 1 | Train Loss: 1.0647 | Val Loss: 0.8917
Epoch 2 | Train Loss: 0.8457 | Val Loss: 0.8078
Epoch 3 | Train Loss: 0.7910 | Val Loss: 0.7721


## Save the Model

In [36]:
# Save trained model and tokenizer
model.save_pretrained("../../models/")
tokenizer.save_pretrained("../../models/")
print(f"\nModel and tokenizer saved to ../../models/")


Model and tokenizer saved to ../../models/


## Custom Itinerary Generation

In [46]:
def generate_itinerary(prompt, model, tokenizer, device = DEVICE, max_length = 256):
    model.eval()
    inputs = tokenizer(prompt, return_tensors = "pt").to(device)
    with torch.no_grad():
        outputs = model.generate(
            input_ids = inputs["input_ids"],
            attention_mask = inputs["attention_mask"],
            max_length = max_length,
            num_beams = 4,
            early_stopping = True
        )
    return tokenizer.decode(outputs[0], skip_special_tokens = True)

# Load the model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("../../models/").to(DEVICE)
tokenizer = T5Tokenizer.from_pretrained("../../models/")

# Create custom prompt
prompt = "Generate a 3 day itinerary in the United States during spring. Activity preferences include hiking."
itinerary = generate_itinerary(prompt, model, tokenizer)

print(itinerary)

3 Day Itinerary for the United States day 1: travel to the u.s. day 1: travel to the u.s. day 2: travel to the u.s. day 2: travel to the u.s. day 2: travel to the u.s. day 2: travel to the u.s. day 2: travel to the u.s. day 2: travel to the u.s. day 2: travel to the u.s. day 2: travel to the u.s.
