In [None]:
import nltk
nltk.download('punkt')
!pip install rouge


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import math
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import string
from transformers import GPT2Tokenizer
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction
from rouge import Rouge
import random
import nltk
import pandas as pd
import os

nltk.download('punkt')

# ---------------------------
# Data Loading and Preprocessing
# ---------------------------

# Load the dataset. The code below reads the columns 'Play', 'Player' and
# 'PlayerLine' from the CSV.
data_path = 'shakespeare.csv'
if not os.path.exists(data_path):
    raise FileNotFoundError(f"The file {data_path} does not exist in the current directory.")

df = pd.read_csv(data_path)

# Preprocessing
lines_df = df[df['PlayerLine'].notna()].copy()
play_titles = df[df['Player'].isna()]['Play'].unique()

# Create corpus: one record per spoken line, tagged with the play it belongs to.
corpus = []
current_play = None
for _, row in df.iterrows():
    line = row['PlayerLine']
    if pd.isna(row['Player']):
        # Rows without a speaker are structural. A missing PlayerLine is read by
        # pandas as NaN (a float), not None, so test for an actual string before
        # calling str methods on it.
        if isinstance(line, str) and line.startswith(("ACT", "SCENE")):
            continue
        current_play = row['Play']
    elif pd.notna(line):
        corpus.append({
            'Play': current_play,
            'Player': row['Player'],
            'Line': line
        })

# ---------------------------
# Model Architecture
# ---------------------------

class LayerNormCustom(nn.Module):
    """Layer normalisation over the last dimension with a learnable scale and shift.

    Args:
        hidden_dim: Size of the last dimension of the inputs.
        eps: Constant added to the variance before the square root.
    """

    def __init__(self, hidden_dim, eps=1e-5):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(hidden_dim))   # scale
        self.beta = nn.Parameter(torch.zeros(hidden_dim))   # shift
        self.eps = eps

    def forward(self, x):
        """Normalise ``x`` along its last dimension, then apply gamma and beta."""
        mu = x.mean(dim=-1, keepdim=True)
        # Biased (population) variance, i.e. divide by N rather than N - 1.
        sigma_sq = x.var(dim=-1, unbiased=False, keepdim=True)
        x_hat = (x - mu) / torch.sqrt(sigma_sq + self.eps)
        return self.gamma * x_hat + self.beta

class SinusoidalPositionalEncoding(nn.Module):
    """Adds fixed sine/cosine position signals to a batch of embeddings.

    Even feature indices receive a sine term and odd indices a cosine term. The
    table is precomputed for ``max_len`` positions and stored as a buffer, so it
    follows the module across devices but is not trained.

    Args:
        hidden_dim: Feature size of the embeddings (odd sizes are supported).
        max_len: Number of positions to precompute.
    """

    def __init__(self, hidden_dim, max_len=5000):
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, hidden_dim, 2).float() * (-math.log(10000.0) / hidden_dim))
        angles = position * div_term  # (max_len, ceil(hidden_dim / 2))

        pe = torch.zeros(max_len, hidden_dim)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd hidden_dim there is one fewer odd column than even columns,
        # so drop the surplus frequency instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : hidden_dim // 2])
        self.register_buffer('pe', pe.unsqueeze(0))  # (1, max_len, hidden_dim)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Raises:
            ValueError: If the sequence is longer than the precomputed table.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"Sequence length {seq_len} exceeds the maximum supported length {self.pe.size(1)}"
            )
        return x + self.pe[:, :seq_len, :]

class MultiHeadAttentionCustom(nn.Module):
    """Causal multi-head self-attention built from raw weight matrices.

    Each position may attend only to itself and to earlier positions.

    Args:
        hidden_dim: Feature size of the input and output.
        num_heads: Number of attention heads; must divide ``hidden_dim``.
    """

    def __init__(self, hidden_dim, num_heads):
        super().__init__()
        assert hidden_dim % num_heads == 0, "hidden_dim must be divisible by num_heads"
        self.num_heads = num_heads
        self.head_dim = hidden_dim // num_heads

        # Bias-free projections; every matrix is filled by Xavier init below.
        self.W_q = nn.Parameter(torch.empty(hidden_dim, hidden_dim))
        self.W_k = nn.Parameter(torch.empty(hidden_dim, hidden_dim))
        self.W_v = nn.Parameter(torch.empty(hidden_dim, hidden_dim))
        self.W_o = nn.Parameter(torch.empty(hidden_dim, hidden_dim))

        nn.init.xavier_uniform_(self.W_q)
        nn.init.xavier_uniform_(self.W_k)
        nn.init.xavier_uniform_(self.W_v)
        nn.init.xavier_uniform_(self.W_o)

    def forward(self, x):
        """Apply masked self-attention.

        Args:
            x: Tensor of shape (batch, seq_length, hidden_dim).

        Returns:
            Tensor with the same shape as ``x``.
        """
        batch_size, seq_length, hidden_dim = x.size()

        def split_heads(t):
            # (batch, seq, hidden) -> (batch, heads, seq, head_dim)
            return t.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        Q = split_heads(torch.matmul(x, self.W_q))
        K = split_heads(torch.matmul(x, self.W_k))
        V = split_heads(torch.matmul(x, self.W_v))

        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # Build the "future positions" mask directly on the input's device rather
        # than allocating it on the CPU and copying it over on every call.
        future = torch.triu(
            torch.ones(seq_length, seq_length, device=x.device), diagonal=1
        ).bool()
        scores = scores.masked_fill(future, float('-inf'))

        attn = F.softmax(scores, dim=-1)
        context = torch.matmul(attn, V)
        # (batch, heads, seq, head_dim) -> (batch, seq, hidden)
        context = context.transpose(1, 2).contiguous().view(batch_size, seq_length, hidden_dim)
        return torch.matmul(context, self.W_o)

class FeedForwardNetwork(nn.Module):
    """Position-wise two-layer MLP: expand to ``ff_dim``, activate, project back.

    Args:
        hidden_dim: Input and output feature size.
        ff_dim: Width of the inner layer.
        activation: Callable applied between the two linear layers.
    """

    def __init__(self, hidden_dim, ff_dim, activation=F.relu):
        super().__init__()
        self.fc1 = nn.Linear(hidden_dim, ff_dim)
        self.fc2 = nn.Linear(ff_dim, hidden_dim)
        self.activation = activation

        # Xavier-uniform weights and zero biases for both projections.
        for layer in (self.fc1, self.fc2):
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Run ``x`` through fc1, the activation and fc2, in that order."""
        expanded = self.fc1(x)
        activated = self.activation(expanded)
        return self.fc2(activated)

class TransformerBlock(nn.Module):
    """Pre-norm transformer layer: attention and feed-forward, each on a residual path.

    Args:
        hidden_dim: Feature size carried through the block.
        num_heads: Number of attention heads.
        ff_dim: Inner width of the feed-forward network.
    """

    def __init__(self, hidden_dim, num_heads, ff_dim):
        super().__init__()
        self.ln1 = LayerNormCustom(hidden_dim)
        self.mha = MultiHeadAttentionCustom(hidden_dim, num_heads)
        self.ln2 = LayerNormCustom(hidden_dim)
        self.ffn = FeedForwardNetwork(hidden_dim, ff_dim)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        # Normalisation is applied to the input of each sub-layer, not its output.
        x = x + self.mha(self.ln1(x))
        return x + self.ffn(self.ln2(x))

class DecoderOnlyTransformer(nn.Module):
    """Decoder-only language model: embeddings, a stack of blocks, and a tied output head.

    Args:
        vocab_size: Number of token ids the model can embed and predict.
        hidden_dim: Feature size used throughout the network.
        num_layers: Number of stacked ``TransformerBlock`` layers.
        num_heads: Attention heads per block.
        ff_dim: Inner width of each block's feed-forward network.
        max_seq_length: Number of positions precomputed by the positional encoding.
    """

    def __init__(self, vocab_size, hidden_dim=384, num_layers=6, num_heads=6, ff_dim=1536, max_seq_length=512):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.token_embedding = nn.Embedding(vocab_size, hidden_dim)
        self.positional_encoding = SinusoidalPositionalEncoding(hidden_dim, max_len=max_seq_length)

        blocks = [TransformerBlock(hidden_dim, num_heads, ff_dim) for _ in range(num_layers)]
        self.layers = nn.ModuleList(blocks)

        self.ln_f = LayerNormCustom(hidden_dim)
        self.lm_head = nn.Linear(hidden_dim, vocab_size, bias=False)
        # Weight tying: the output projection reuses the embedding matrix.
        # NOTE(review): the shared matrix keeps nn.Embedding's default
        # initialisation; confirm that scale is intended for the logits.
        self.lm_head.weight = self.token_embedding.weight

    def forward(self, input_ids):
        """Map token ids of shape (batch, seq) to logits of shape (batch, seq, vocab_size)."""
        hidden = self.positional_encoding(self.token_embedding(input_ids))

        for block in self.layers:
            hidden = block(hidden)

        return self.lm_head(self.ln_f(hidden))

# ---------------------------
# Dataset and DataLoader
# ---------------------------

class ShakespeareDataset(Dataset):
    """Fixed-length next-token dataset with one example per encoded line.

    Lines longer than ``block_size`` are truncated; shorter ones are padded with
    the module-level ``subword_tokenizer``'s pad id. Lines with fewer than two
    tokens are dropped.

    NOTE(review): a second class named ``ShakespeareDataset`` is defined later in
    this file and replaces this one once that definition runs.
    """

    def __init__(self, encoded_corpus, block_size):
        self.block_size = block_size
        self.data = []

        for tokens in encoded_corpus:
            length = len(tokens)
            if length < 2:
                continue

            if length > block_size:
                window = tokens[:block_size]
            else:
                window = tokens + [subword_tokenizer.pad_token_id] * (block_size - length)

            # Inputs drop the last token; targets drop the first.
            self.data.append((window[:-1], window[1:]))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        source, shifted = self.data[idx]
        return (
            torch.tensor(source, dtype=torch.long),
            torch.tensor(shifted, dtype=torch.long),
        )

# ---------------------------
# Training Utils
# ---------------------------

def compute_perplexity_with_progress(model, data_loader, device, ignore_index=None):
    """Compute token-level perplexity of ``model`` over ``data_loader``.

    Args:
        model: Module mapping a batch of token ids to logits whose last
            dimension is the vocabulary.
        data_loader: Iterable of ``(inputs, targets)`` tensor pairs.
        device: Device the batches are moved to before the forward pass.
        ignore_index: Optional target id (for example the padding id) to leave
            out of both the loss and the token count. ``None`` keeps the
            previous behaviour of scoring every position.

    Returns:
        ``exp(mean cross-entropy)`` over the counted tokens, or ``float('inf')``
        when no token was counted (empty loader or fully ignored targets).

    Note:
        The model is left in eval mode.
    """
    model.eval()
    total_loss = 0.0
    total_tokens = 0

    val_bar = tqdm(data_loader, desc='Computing perplexity',
                  position=0, leave=True)

    with torch.no_grad():
        for inputs, targets in val_bar:
            inputs, targets = inputs.to(device), targets.to(device)
            logits = model(inputs)
            logits = logits.view(-1, logits.size(-1))
            targets = targets.view(-1)

            # Summed loss plus an explicit token count keeps the running average
            # exact even when some positions are ignored.
            if ignore_index is None:
                batch_loss = F.cross_entropy(logits, targets, reduction='sum')
                batch_tokens = targets.numel()
            else:
                batch_loss = F.cross_entropy(
                    logits, targets, ignore_index=ignore_index, reduction='sum'
                )
                batch_tokens = int((targets != ignore_index).sum().item())

            total_loss += batch_loss.item()
            total_tokens += batch_tokens

            if total_tokens > 0:
                current_perplexity = math.exp(total_loss / total_tokens)
                val_bar.set_postfix({'perplexity': f'{current_perplexity:.2f}'})

    if total_tokens == 0:
        # Nothing to score; avoid dividing by zero.
        return float('inf')

    return math.exp(total_loss / total_tokens)

def train_model(model, train_loader, val_loader, num_epochs, device, learning_rate=3e-4):
    """Train ``model`` with Adam and checkpoint the best validation perplexity.

    After every epoch the validation perplexity is computed; whenever it
    improves, the model's state dict is written to ``best_model.pth``.

    Args:
        model: Network mapping token ids to logits.
        train_loader: Iterable of ``(inputs, targets)`` training batches.
        val_loader: Iterable of ``(inputs, targets)`` validation batches.
        num_epochs: Number of passes over ``train_loader``.
        device: Device the batches are moved to.
        learning_rate: Adam learning rate.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    best_val_perplexity = float('inf')

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0

        train_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}',
                        position=0, leave=True)

        for inputs, targets in train_bar:
            inputs = inputs.to(device)
            targets = targets.to(device)

            # Flatten (batch, seq, vocab) logits and (batch, seq) targets.
            logits = model(inputs)
            flat_logits = logits.view(-1, logits.size(-1))
            flat_targets = targets.view(-1)
            loss = F.cross_entropy(flat_logits, flat_targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            running_loss += batch_loss
            train_bar.set_postfix({'loss': f'{batch_loss:.4f}'})

        avg_epoch_loss = running_loss / len(train_loader)
        val_perplexity = compute_perplexity_with_progress(model, val_loader, device)

        print(f"\nEpoch {epoch+1} Summary:")
        print(f"Average Loss: {avg_epoch_loss:.4f}")
        print(f"Validation Perplexity: {val_perplexity:.2f}")

        improved = val_perplexity < best_val_perplexity
        if improved:
            best_val_perplexity = val_perplexity
            torch.save(model.state_dict(), 'best_model.pth')
            print(f"New best model saved! (Perplexity: {val_perplexity:.2f})")
        print()

def generate_continuation(model, tokenizer, prompt_ids, max_length=100, context_size=256,
                           sampling_strategy='greedy', top_k=50, device='cpu'):
    """Autoregressively extend ``prompt_ids`` with tokens predicted by ``model``.

    Args:
        model: Module mapping ids of shape (1, seq) to logits of shape (1, seq, vocab).
        tokenizer: Object exposing ``eos_token_id``; generation stops once that
            id has been produced.
        prompt_ids: List of token ids to start from.
        max_length: Maximum number of new tokens to generate.
        context_size: Only the last ``context_size`` tokens are fed to the model.
        sampling_strategy: ``'greedy'`` (argmax) or ``'top_k'`` (sample among the
            ``top_k`` most likely tokens).
        top_k: Candidate pool size for ``'top_k'`` sampling.
        device: Device on which generation runs.

    Returns:
        A list of token ids: the full prompt followed by the generated tokens.

    Raises:
        ValueError: If ``sampling_strategy`` is not recognised.
    """
    if sampling_strategy not in ('greedy', 'top_k'):
        raise ValueError(
            f"Unknown sampling_strategy {sampling_strategy!r}; expected 'greedy' or 'top_k'"
        )

    model.eval()
    # The complete sequence is kept separately from the window passed to the
    # model so that limiting the context never drops tokens from the result.
    sequence = torch.tensor([prompt_ids], dtype=torch.long).to(device)

    with torch.no_grad():
        for _ in range(max_length):
            window = sequence[:, -context_size:]

            logits = model(window)
            next_token_logits = logits[0, -1, :]

            if sampling_strategy == 'greedy':
                next_token = torch.argmax(next_token_logits).view(1, 1)
            else:
                k = max(1, min(top_k, next_token_logits.size(-1)))
                topk_logits, topk_indices = torch.topk(next_token_logits, k)
                probs = F.softmax(topk_logits, dim=-1)
                choice = torch.multinomial(probs, num_samples=1)
                # Indexing with ``choice`` already yields shape (1,); reshape to
                # (1, 1) so it can be concatenated along the sequence dimension.
                next_token = topk_indices[choice].view(1, 1)

            sequence = torch.cat([sequence, next_token], dim=1)

            if next_token.item() == tokenizer.eos_token_id:
                break
            # Safety stop kept from the original implementation: halt once the
            # next model input would reach 1024 tokens.
            if window.size(1) + 1 >= 1024:
                break

    # Index the batch dimension explicitly so a single-token result is still a list.
    return sequence[0].tolist()


import nltk
nltk.download(['punkt', 'punkt_tab', 'averaged_perceptron_tagger'])

def evaluate_model(model, test_data, tokenizer, num_samples=100, device='cpu'):
    """Score generated continuations against held-out text with BLEU and ROUGE.

    For each sampled sequence the first (up to) 10 tokens are used as the prompt
    and the remainder as the ground truth. The model's continuation, with the
    prompt removed, is the hypothesis.

    Args:
        model: Language model passed to ``generate_continuation``.
        test_data: List of encoded token-id sequences.
        tokenizer: Tokenizer whose ``decode`` accepts ``skip_special_tokens``.
        num_samples: Number of sequences to draw; capped at ``len(test_data)``.
        device: Device on which generation runs.

    Returns:
        ``(bleu_scores, rouge_scores)``: a list with BLEU-1 to BLEU-4 and a dict
        keyed by ``'rouge-1'``, ``'rouge-2'`` and ``'rouge-l'``, each mapping to
        ``{'f', 'p', 'r'}``. All values are zero when no usable pair was produced.
    """
    empty_rouge = {
        metric: {'f': 0.0, 'p': 0.0, 'r': 0.0}
        for metric in ('rouge-1', 'rouge-2', 'rouge-l')
    }

    # random.sample raises if asked for more items than are available.
    selected_samples = random.sample(test_data, min(num_samples, len(test_data)))
    reference_texts = []   # per pair: a list holding one tokenised reference
    candidate_texts = []   # per pair: the tokenised model output

    for sample in tqdm(selected_samples, desc="Generating samples"):
        prompt_length = min(10, len(sample)-1)
        if prompt_length < 1:
            continue

        prompt = sample[:prompt_length]
        ground_truth = sample[prompt_length:]

        generated_ids = generate_continuation(
            model=model,
            tokenizer=tokenizer,
            prompt_ids=prompt,
            max_length=50,  # Limit generation length
            device=device
        )
        # The generator returns prompt + continuation; compare only the
        # continuation with the ground-truth remainder.
        continuation_ids = generated_ids[len(prompt):]

        # Decode texts, ensuring non-empty outputs
        generated_text = tokenizer.decode(continuation_ids, skip_special_tokens=True).strip()
        ground_truth_text = tokenizer.decode(ground_truth, skip_special_tokens=True).strip()

        if generated_text and ground_truth_text:
            # The ground truth is the reference; the model output is the hypothesis.
            reference_texts.append([ground_truth_text.lower().split()])
            candidate_texts.append(generated_text.lower().split())

    if len(reference_texts) == 0 or len(candidate_texts) == 0:
        print("No valid text pairs generated")
        return [0.0] * 4, empty_rouge

    # Calculate BLEU scores
    smooth = SmoothingFunction().method4
    bleu_weights = [(1, 0, 0, 0), (0.5, 0.5, 0, 0), (1/3, 1/3, 1/3, 0), (0.25, 0.25, 0.25, 0.25)]
    bleu_scores = []

    for weight in bleu_weights:
        try:
            score = corpus_bleu(reference_texts, candidate_texts, weights=weight, smoothing_function=smooth)
            bleu_scores.append(score)
        except Exception as e:
            # Best effort: report the failure and keep the remaining metrics.
            print(f"Error calculating BLEU score: {str(e)}")
            bleu_scores.append(0.0)

    # Calculate ROUGE scores, averaged over every pair rather than only the first.
    try:
        rouge = Rouge()
        hypotheses = [' '.join(tokens) for tokens in candidate_texts]
        references = [' '.join(refs[0]) for refs in reference_texts]
        rouge_scores = rouge.get_scores(hypotheses, references, avg=True)
    except Exception as e:
        print(f"Error calculating ROUGE score: {str(e)}")
        rouge_scores = empty_rouge

    return bleu_scores, rouge_scores

class CharTokenizer:
    """Character-level tokenizer over ``string.printable`` plus four special tokens.

    NOTE(review): a second class named ``CharTokenizer`` is defined later in this
    file and replaces this one once that definition runs.
    """

    def __init__(self):
        # Printable characters in sorted order, followed by the special tokens.
        self.special_tokens = ['[PAD]', '[BOS]', '[EOS]', '[UNK]']
        self.vocab = sorted(string.printable) + self.special_tokens

        # Lookup tables in both directions.
        self.char2idx = dict(zip(self.vocab, range(len(self.vocab))))
        self.idx2char = dict(enumerate(self.vocab))

        # Special tokens and their ids.
        (self.pad_token, self.bos_token,
         self.eos_token, self.unk_token) = self.special_tokens

        self.pad_token_id = self.char2idx[self.pad_token]
        self.bos_token_id = self.char2idx[self.bos_token]
        self.eos_token_id = self.char2idx[self.eos_token]
        self.unk_token_id = self.char2idx[self.unk_token]

    def __len__(self):
        """Number of entries in the vocabulary."""
        return len(self.vocab)

    def tokenize(self, text):
        """Split ``text`` into characters, wrapped in the BOS and EOS tokens."""
        return [self.bos_token, *text, self.eos_token]

    def encode(self, text, add_special_tokens=True):
        """Map ``text`` to ids; characters outside the vocabulary become the UNK id."""
        symbols = self.tokenize(text) if add_special_tokens else list(text)
        unknown = self.unk_token_id
        return [self.char2idx.get(symbol, unknown) for symbol in symbols]

    def decode(self, ids, skip_special_tokens=True):
        """Map ids back to a string, optionally leaving out the special tokens."""
        symbols = [self.idx2char[idx] for idx in ids]
        if skip_special_tokens:
            symbols = [s for s in symbols if s not in self.special_tokens]
        return ''.join(symbols)

class ShakespeareDataset(Dataset):
    """A PyTorch Dataset of fixed-length next-token examples built from encoded text.

    Works with any tokenization scheme, since it only sees lists of token ids.
    """

    def __init__(self, encoded_sequences, block_size, pad_token_id=0):
        """Build the training windows.

        Args:
            encoded_sequences: Iterable of token-id sequences (lists or tuples),
                already encoded by the appropriate tokenizer.
            block_size: Length of every input/target pair.
            pad_token_id: Id used to fill sequences shorter than
                ``block_size + 1``. The default of 0 keeps the previous
                behaviour; pass the tokenizer's own padding id so that padding
                does not collide with a real vocabulary entry.
        """
        self.block_size = block_size
        self.pad_token_id = pad_token_id
        self.data = []

        # Each stored chunk holds block_size inputs plus one extra token, so the
        # targets can be obtained by shifting.
        window = block_size + 1

        for sequence in encoded_sequences:
            # A sequence needs at least two tokens to form an (input, target) pair.
            if len(sequence) < 2:
                continue

            sequence = list(sequence)
            if len(sequence) > window:
                # Long sequence: slide a window over it with a stride of one token.
                for start in range(len(sequence) - block_size):
                    self.data.append(sequence[start:start + window])
            else:
                # Short sequence: right-pad to the window length.
                padding_needed = window - len(sequence)
                self.data.append(sequence + [pad_token_id] * padding_needed)

    def __len__(self):
        """Return the number of training windows in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Get a single training example.

        Returns:
            tuple: ``(input_sequence, target_sequence)`` as long tensors of length
            ``block_size``, where ``target_sequence[i]`` is the token that
            follows ``input_sequence[i]``.
        """
        chunk = self.data[idx]
        x = torch.tensor(chunk[:-1], dtype=torch.long)  # Input sequence
        y = torch.tensor(chunk[1:], dtype=torch.long)   # Next-token targets
        return x, y
    
class CharTokenizer:
    """Character-level tokenizer over ``string.printable`` plus BOS/EOS/PAD tokens.

    ``encode`` and ``decode`` accept the ``add_special_tokens`` and
    ``skip_special_tokens`` keywords, so the evaluation and sampling helpers can
    call this class with the same arguments they pass to the subword tokenizer.
    """

    def __init__(self):
        # Define vocabulary: printable chars plus special tokens
        self.vocab = sorted(list(string.printable)) + ['[BOS]', '[EOS]', '[PAD]']

        # Create bidirectional mappings between characters and indices
        self.char2idx = {char: idx for idx, char in enumerate(self.vocab)}
        self.idx2char = {idx: char for idx, char in enumerate(self.vocab)}

        # Define special tokens
        self.bos_token = '[BOS]'
        self.eos_token = '[EOS]'
        self.pad_token = '[PAD]'

        # Store token IDs for easy access
        self.bos_token_id = self.char2idx[self.bos_token]
        self.eos_token_id = self.char2idx[self.eos_token]
        self.pad_token_id = self.char2idx[self.pad_token]

    def __len__(self):
        """Return vocabulary size"""
        return len(self.vocab)

    def tokenize(self, text):
        """Convert text to sequence of tokens including special tokens"""
        return [self.bos_token] + list(text) + [self.eos_token]

    def encode(self, text, add_special_tokens=True):
        """Convert text to a sequence of token IDs.

        Args:
            text: String to encode.
            add_special_tokens: When True (default) the ids are wrapped in the
                BOS and EOS ids; when False only the characters are encoded.

        Returns:
            List of ids. Characters outside the vocabulary map to the id of a space.
        """
        symbols = self.tokenize(text) if add_special_tokens else list(text)
        fallback = self.char2idx[' ']
        return [self.char2idx.get(symbol, fallback) for symbol in symbols]

    def decode(self, tokens, skip_special_tokens=True):
        """Convert a sequence of token IDs back to text.

        Args:
            tokens: Iterable of ids. Unknown ids decode to a space.
            skip_special_tokens: When True (default) BOS, EOS and PAD tokens are
                removed from the output; when False they are kept as literals.
        """
        chars = [self.idx2char.get(idx, ' ') for idx in tokens]
        if skip_special_tokens:
            specials = (self.bos_token, self.eos_token, self.pad_token)
            chars = [char for char in chars if char not in specials]
        return ''.join(chars)

    def batch_encode(self, texts, max_length=None, padding=True):
        """Encode a batch of texts with optional padding.

        When both ``padding`` and ``max_length`` are truthy, every sequence is
        truncated or right-padded with the PAD id to exactly ``max_length`` ids.
        Otherwise the sequences are returned at their natural lengths.
        """
        encoded = [self.encode(text) for text in texts]

        if padding and max_length:
            # Pad all sequences to max_length
            encoded = [seq[:max_length] + [self.pad_token_id] * (max_length - len(seq))
                      for seq in encoded]

        return encoded

# ---------------------------
# Main Execution
# ---------------------------

# Initialize tokenizer
# Two tokenizers are created: a pretrained GPT-2 subword tokenizer and the
# character-level tokenizer defined above. Only the subword one is used in the
# rest of this script.
subword_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
char_tokenizer = CharTokenizer()
special_tokens = {'pad_token': '[PAD]', 'bos_token': '[BOS]', 'eos_token': '[EOS]'}
subword_tokenizer.add_special_tokens(special_tokens)

# Prepare data
# Prefix every line with the title of its play so the model sees the play as context.
for entry in corpus:
    entry['LineWithPlay'] = f"{entry['Play']}: {entry['Line']}"

# Encode corpus
# NOTE(review): whether add_special_tokens=True actually wraps each line in
# [BOS]/[EOS] depends on the GPT-2 tokenizer configuration — confirm, since
# generation relies on the EOS id to stop early.
corpus_subword_encoded = [subword_tokenizer.encode(entry['LineWithPlay'], add_special_tokens=True)
                         for entry in corpus]

# Split data
# 10% is held out for test; 11.11% of the remaining 90% (about 10% of the total)
# becomes validation, giving roughly an 80/10/10 split.
train_data, test_data = train_test_split(corpus_subword_encoded, test_size=0.1, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.1111, random_state=42)

# Create datasets
# ShakespeareDataset resolves to whichever class of that name was defined last.
block_size = 50
train_dataset = ShakespeareDataset(train_data, block_size)
val_dataset = ShakespeareDataset(val_data, block_size)
test_dataset = ShakespeareDataset(test_data, block_size)

# Create dataloaders
# Only the training loader is shuffled.
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Initialize model
# len(subword_tokenizer) is taken after add_special_tokens was called above.
vocab_size = len(subword_tokenizer)
model = DecoderOnlyTransformer(vocab_size=vocab_size)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Train model
# train_model writes the best checkpoint to 'best_model.pth'; the evaluation
# below uses the in-memory model as left by the final epoch.
num_epochs = 1
train_model(model, train_loader, val_loader, num_epochs, device)

# Evaluate model
print("Computing test perplexity...")
test_perplexity = compute_perplexity_with_progress(model, test_loader, device)
print(f"\nTest Perplexity: {test_perplexity:.2f}\n")

# BLEU/ROUGE are computed on the raw encoded test sequences (test_data), not on
# the windowed test_dataset.
print("Computing BLEU and ROUGE scores...")
bleu_scores, rouge_scores = evaluate_model(model, test_data, subword_tokenizer, device=device)

# Print final results
print("\nFinal Test Results:")
print(f"Perplexity: {test_perplexity:.2f}")
for i, score in enumerate(bleu_scores, 1):
    print(f"BLEU-{i}: {score:.4f}")
print("\nROUGE Scores:")
for metric, scores in rouge_scores.items():
    print(f"{metric}: F1={scores['f']:.4f} R={scores['r']:.4f} P={scores['p']:.4f}")

# Example text generation
print("\nExample Text Generation:")
sample_prompt = test_data[0][:10]  # Take first 10 tokens of first test sample
prompt_text = subword_tokenizer.decode(sample_prompt)
print(f"Prompt: {prompt_text}")

# Uses the generator's defaults: greedy decoding, up to 100 new tokens.
generated_ids = generate_continuation(model, subword_tokenizer, sample_prompt, device=device)
generated_text = subword_tokenizer.decode(generated_ids, skip_special_tokens=True)
print(f"Generated continuation: {generated_text}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Computing test perplexity...


Computing perplexity: 100%|██████████| 349/349 [00:12<00:00, 27.66it/s, perplexity=642265138055408000.00]



Test Perplexity: 642265138055408000.00

Computing BLEU and ROUGE scores...


Generating samples: 100%|██████████| 100/100 [00:34<00:00,  2.87it/s]



Final Test Results:
Perplexity: 642265138055408000.00
BLEU-1: 0.0000
BLEU-2: 0.0000
BLEU-3: 0.0000
BLEU-4: 0.0000

ROUGE Scores:
rouge-1: F1=0.0000 R=0.0000 P=0.0000
rouge-2: F1=0.0000 R=0.0000 P=0.0000
rouge-l: F1=0.0000 R=0.0000 P=0.0000

Example Text Generation:
Prompt: Cymbeline: But Ajax is their fool.
Generated continuation: Cymbeline: But Ajax is their fool.....................................................................................................


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import pandas as pd
import json
import os
from datetime import datetime

# Configuration dictionaries
# model_configs maps a size name to the hyperparameters that run_experiment
# passes to DecoderOnlyTransformer.
# NOTE(review): 'dropout' is not forwarded to the model in the code visible in
# this notebook — confirm whether it is meant to be used.
model_configs = {
    'small': {
        'hidden_dim': 256,
        'num_layers': 4,
        'num_heads': 4,
        'ff_dim': 1024,
        'dropout': 0.1
    },
    'large': {
        'hidden_dim': 512,
        'num_layers': 8,
        'num_heads': 8,
        'ff_dim': 2048,
        'dropout': 0.1
    }
}

# training_params is read as a module-level global by train_epoch,
# run_experiment and ExperimentTracker.should_stop_early.
# NOTE(review): 'warmup_steps' and 'max_steps' are not referenced by the code
# visible in this notebook — confirm whether they are still needed.
training_params = {
    'batch_size': 64,
    'num_epochs': 5,  # also sets the length of the OneCycleLR schedule
    'learning_rate': 3e-4,  # AdamW lr and OneCycleLR max_lr
    'warmup_steps': 1000,
    'max_steps': 50000,
    'gradient_clip_val': 1.0,  # max gradient norm used in train_epoch
    'block_size': 128,  # sequence length handed to ShakespeareDataset
    'early_stopping_patience': 3  # epochs without validation improvement before stopping
}

class ExperimentTracker:
    """Collects per-epoch metrics, test results and text samples for one run.

    The tracker also decides when a checkpoint should be saved (validation
    perplexity improved) and when training should stop early, and can write
    everything it has gathered to a JSON file.
    """

    def __init__(self, model_size, tokenization_scheme, save_dir='experiments'):
        self.model_size = model_size
        self.tokenization_scheme = tokenization_scheme
        self.save_dir = save_dir

        # A timestamp keeps the ids of repeated runs distinct.
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        self.experiment_id = f"{model_size}_{tokenization_scheme}_{stamp}"

        # Make sure the output directory exists.
        os.makedirs(save_dir, exist_ok=True)

        # Per-epoch histories.
        self.train_losses = []
        self.val_losses = []
        self.train_perplexities = []
        self.val_perplexities = []
        self.learning_rates = []

        # Final results and early-stopping state.
        self.test_metrics = None
        self.samples = []
        self.best_val_perplexity = float('inf')
        self.epochs_without_improvement = 0

    def update_train(self, train_loss, train_perp, val_loss, val_perp, lr):
        """Record one epoch's metrics; return True when validation perplexity improved."""
        self.train_losses.append(train_loss)
        self.val_losses.append(val_loss)
        self.train_perplexities.append(train_perp)
        self.val_perplexities.append(val_perp)
        self.learning_rates.append(lr)

        if not (val_perp < self.best_val_perplexity):
            # No improvement this epoch.
            self.epochs_without_improvement += 1
            return False

        # New best: the caller should save the model.
        self.best_val_perplexity = val_perp
        self.epochs_without_improvement = 0
        return True

    def should_stop_early(self):
        """Return True once the patience configured in ``training_params`` is used up."""
        patience = training_params['early_stopping_patience']
        return self.epochs_without_improvement >= patience

    def add_test_metrics(self, perplexity, bleu_scores, rouge_scores):
        """Store the final test-set metrics."""
        self.test_metrics = dict(
            perplexity=perplexity,
            bleu_scores=bleu_scores,
            rouge_scores=rouge_scores,
        )

    def add_sample(self, prompt, target, generated):
        """Append one prompt/target/generated text triple."""
        self.samples.append(dict(prompt=prompt, target=target, generated=generated))

    def save(self):
        """Write everything collected so far to ``<save_dir>/<experiment_id>.json``."""
        training_history = {
            'train_losses': self.train_losses,
            'val_losses': self.val_losses,
            'train_perplexities': self.train_perplexities,
            'val_perplexities': self.val_perplexities,
            'learning_rates': self.learning_rates
        }
        payload = {
            'model_size': self.model_size,
            'tokenization_scheme': self.tokenization_scheme,
            'training': training_history,
            'test_metrics': self.test_metrics,
            'samples': self.samples
        }
        target_path = os.path.join(self.save_dir, f"{self.experiment_id}.json")
        with open(target_path, 'w') as f:
            json.dump(payload, f, indent=2)

def train_epoch(model, train_loader, optimizer, scheduler, device, vocab_size):
    """Run one training pass over ``train_loader``.

    Gradients are clipped to ``training_params['gradient_clip_val']`` and the
    scheduler, when given, is stepped after every batch.

    Returns:
        ``(avg_loss, perplexity, current_lr)``: the token-weighted mean
        cross-entropy, its exponential, and the first parameter group's learning
        rate after the final batch.
    """
    model.train()
    loss_sum = 0
    token_count = 0

    progress_bar = tqdm(train_loader, desc='Training')
    for inputs, targets in progress_bar:
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()

        # Flatten to (batch * seq, vocab) logits and (batch * seq,) targets.
        flat_logits = model(inputs).view(-1, vocab_size)
        flat_targets = targets.view(-1)

        loss = F.cross_entropy(flat_logits, flat_targets)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), training_params['gradient_clip_val'])
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        # The loss is a per-token mean; weight it by the batch's token count.
        batch_loss = loss.item()
        batch_tokens = flat_targets.size(0)
        loss_sum += batch_loss * batch_tokens
        token_count += batch_tokens

        current_lr = optimizer.param_groups[0]['lr']
        progress_bar.set_postfix({
            'loss': f'{batch_loss:.4f}',
            'ppl': f'{math.exp(loss_sum/token_count):.2f}',
            'lr': f'{current_lr:.2e}'
        })

    avg_loss = loss_sum / token_count
    return avg_loss, math.exp(avg_loss), current_lr

def validate(model, val_loader, device, vocab_size):
    """Evaluate ``model`` on ``val_loader`` without tracking gradients.

    Returns:
        ``(avg_loss, perplexity)``: the token-weighted mean cross-entropy and its
        exponential.
    """
    model.eval()
    loss_sum = 0
    token_count = 0

    with torch.no_grad():
        for inputs, targets in tqdm(val_loader, desc='Validating'):
            inputs = inputs.to(device)
            targets = targets.to(device)

            flat_logits = model(inputs).view(-1, vocab_size)
            flat_targets = targets.view(-1)

            # Per-token mean loss, weighted by the number of tokens in the batch.
            batch_tokens = flat_targets.size(0)
            loss_sum += F.cross_entropy(flat_logits, flat_targets).item() * batch_tokens
            token_count += batch_tokens

    avg_loss = loss_sum / token_count
    return avg_loss, math.exp(avg_loss)

def run_experiment(model_size, tokenization_scheme, data_splits, device):
    """Train and evaluate one model size / tokenization combination.

    Args:
        model_size: Key into ``model_configs`` (for example ``'small'``).
        tokenization_scheme: ``'char'`` selects ``char_tokenizer``; any other
            value selects ``subword_tokenizer``. Also the key into ``data_splits``.
        data_splits: Mapping ``scheme -> {'train', 'val', 'test'}`` of encoded
            token-id sequences.
        device: Device used for training and evaluation.

    Returns:
        The ``ExperimentTracker`` holding the training history, test metrics and
        samples; its contents are also written to disk via ``tracker.save()``.
    """
    print(f"\nRunning experiment: {model_size} model with {tokenization_scheme} tokenization")

    # Initialize experiment tracker
    tracker = ExperimentTracker(model_size, tokenization_scheme)
    best_model_path = os.path.join(tracker.save_dir, f"{tracker.experiment_id}_best.pth")

    # Get data
    train_data = data_splits[tokenization_scheme]['train']
    val_data = data_splits[tokenization_scheme]['val']
    test_data = data_splits[tokenization_scheme]['test']

    # Create datasets and dataloaders
    train_dataset = ShakespeareDataset(train_data, training_params['block_size'])
    val_dataset = ShakespeareDataset(val_data, training_params['block_size'])
    test_dataset = ShakespeareDataset(test_data, training_params['block_size'])

    train_loader = DataLoader(train_dataset, batch_size=training_params['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=training_params['batch_size'])
    test_loader = DataLoader(test_dataset, batch_size=training_params['batch_size'])

    # Initialize model
    config = model_configs[model_size]
    tokenizer = char_tokenizer if tokenization_scheme == 'char' else subword_tokenizer
    vocab_size = len(tokenizer)

    model = DecoderOnlyTransformer(
        vocab_size=vocab_size,
        hidden_dim=config['hidden_dim'],
        num_layers=config['num_layers'],
        num_heads=config['num_heads'],
        ff_dim=config['ff_dim']
    ).to(device)

    # Initialize optimizer and scheduler. OneCycleLR derives its total step
    # count from steps_per_epoch * epochs and is stepped once per batch.
    optimizer = torch.optim.AdamW(model.parameters(), lr=training_params['learning_rate'])
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=training_params['learning_rate'],
        steps_per_epoch=len(train_loader),
        epochs=training_params['num_epochs'],
        pct_start=0.1
    )

    # Training loop
    for epoch in range(training_params['num_epochs']):
        print(f"\nEpoch {epoch + 1}/{training_params['num_epochs']}")

        # Train
        train_loss, train_ppl, current_lr = train_epoch(
            model, train_loader, optimizer, scheduler, device, vocab_size
        )

        # Validate
        val_loss, val_ppl = validate(model, val_loader, device, vocab_size)

        # Update tracker
        should_save = tracker.update_train(train_loss, train_ppl, val_loss, val_ppl, current_lr)

        # Save best model
        if should_save:
            torch.save(model.state_dict(), best_model_path)

        # Early stopping check
        if tracker.should_stop_early():
            print("Early stopping triggered")
            break

    # Load best model for evaluation. A checkpoint exists only if some epoch
    # improved on the initial best (which never happens with NaN perplexities),
    # so fall back to the current weights instead of failing on a missing file.
    if os.path.exists(best_model_path):
        model.load_state_dict(torch.load(best_model_path, map_location=device))
    else:
        print("Warning: no best checkpoint was saved; evaluating the final model weights")

    # Test evaluation
    test_loss, test_ppl = validate(model, test_loader, device, vocab_size)
    bleu_scores, rouge_scores = evaluate_model(
        model, test_data,
        tokenizer,
        device=device
    )

    tracker.add_test_metrics(test_ppl, bleu_scores, rouge_scores)

    # Generate samples
    generate_samples(model, test_data, tracker, tokenization_scheme, device)

    # Save results
    tracker.save()
    return tracker

def generate_samples(model, test_data, tracker, tokenization_scheme, device, num_samples=5):
    """Record prompt / target / generated text triples for random test sequences.

    For each draw, one test sequence is split into a prompt (its leading
    tokens) and a target (the remainder). The prompt is passed to
    ``generate_continuation`` and the decoded prompt, target and generation
    are stored through ``tracker.add_sample``.

    Args:
        model: Forwarded unchanged to ``generate_continuation``.
        test_data: Indexable collection of token sequences; every element
            must support ``len()`` and slicing.
        tracker: Object exposing
            ``add_sample(prompt_text, target_text, generated_text)``.
        tokenization_scheme: ``'char'`` selects the module-level
            ``char_tokenizer``; any other value selects ``subword_tokenizer``.
        device: Forwarded to ``generate_continuation`` as ``device=``.
        num_samples: Number of triples to record. Sequences are drawn
            independently, so the same one may be picked more than once.

    Returns:
        None. If no test sequence has at least two tokens, a message is
        printed and nothing is recorded.
    """
    tokenizer = char_tokenizer if tokenization_scheme == 'char' else subword_tokenizer

    # A sequence needs at least two tokens so that both the prompt and the
    # target are non-empty. Drawing only from such sequences also avoids
    # random.randint(0, -1) raising ValueError on an empty test set.
    eligible_indices = [
        idx for idx in range(len(test_data)) if len(test_data[idx]) >= 2
    ]
    if not eligible_indices:
        print("No test sequences with at least 2 tokens; skipping sample generation")
        return

    for _ in range(num_samples):
        sample = test_data[random.choice(eligible_indices)]

        # Use at most 10 prompt tokens, always leaving one token for the target.
        prompt_length = min(10, len(sample) - 1)
        prompt = sample[:prompt_length]
        target = sample[prompt_length:]

        generated = generate_continuation(
            model,
            tokenizer,
            prompt,
            device=device
        )

        prompt_text = tokenizer.decode(prompt, skip_special_tokens=True)
        target_text = tokenizer.decode(target, skip_special_tokens=True)
        generated_text = tokenizer.decode(generated, skip_special_tokens=True)

        tracker.add_sample(prompt_text, target_text, generated_text)

def plot_results(trackers, save_path='experiment_results.png'):
    """Render and save a comparison figure for a set of finished experiments.

    The figure is a 2x2 grid with three panels (the fourth cell is blank):

    1. Train / validation perplexity curves for every tracker.
    2. The learning rates recorded by every tracker.
    3. Test perplexity and BLEU-1 as grouped bars, one group per tracker.

    Test perplexity and BLEU-1 are drawn against separate y-axes: in this
    notebook's recorded run they differ by roughly two orders of magnitude
    (about 2 vs. about 0.02), so on a shared axis the BLEU bars would be
    too small to read.

    Args:
        trackers: Iterable of tracker objects exposing ``model_size``,
            ``tokenization_scheme``, ``train_perplexities``,
            ``val_perplexities``, ``learning_rates`` and ``test_metrics``
            (a mapping with ``'perplexity'`` and ``'bleu_scores'`` entries).
        save_path: File the figure is written to.

    Returns:
        None. The figure is saved to ``save_path`` and then closed.
    """
    trackers = list(trackers)  # iterated several times below
    run_labels = [f"{t.model_size}-{t.tokenization_scheme}" for t in trackers]

    fig, axes = plt.subplots(2, 2, figsize=(20, 15))
    ppl_ax, lr_ax = axes[0]
    metrics_ax, spare_ax = axes[1]
    spare_ax.axis('off')  # only three panels are used; keep the last cell blank

    # Panel 1: training curves
    for tracker, label in zip(trackers, run_labels):
        ppl_ax.plot(tracker.train_perplexities, label=f"{label} (train)")
        ppl_ax.plot(tracker.val_perplexities, label=f"{label} (val)", linestyle='--')
    ppl_ax.set_title('Training Progress (Perplexity)')
    ppl_ax.set_xlabel('Epoch')
    ppl_ax.set_ylabel('Perplexity')
    ppl_ax.legend()
    ppl_ax.grid(True)

    # Panel 2: learning rates
    # NOTE(review): whether tracker.learning_rates holds one value per step or
    # per epoch is not visible here -- confirm the 'Step' axis label.
    for tracker, label in zip(trackers, run_labels):
        lr_ax.plot(tracker.learning_rates, label=label)
    lr_ax.set_title('Learning Rate Schedule')
    lr_ax.set_xlabel('Step')
    lr_ax.set_ylabel('Learning Rate')
    lr_ax.legend()
    lr_ax.grid(True)

    # Panel 3: final test metrics, perplexity on the left axis, BLEU-1 on the right
    test_ppl = [t.test_metrics['perplexity'] for t in trackers]
    bleu1 = [t.test_metrics['bleu_scores'][0] for t in trackers]
    positions = range(len(run_labels))
    width = 0.35

    bleu_ax = metrics_ax.twinx()
    # Explicit colours: a twin axis restarts the colour cycle, so without them
    # both bar series would be drawn in the same colour.
    metrics_ax.bar([i - width / 2 for i in positions], test_ppl, width,
                   label='Test Perplexity', color='tab:blue')
    bleu_ax.bar([i + width / 2 for i in positions], bleu1, width,
                label='BLEU-1', color='tab:orange')

    metrics_ax.set_xticks(list(positions))
    metrics_ax.set_xticklabels(run_labels, rotation=45)
    metrics_ax.set_title('Test Metrics')
    metrics_ax.set_ylabel('Test Perplexity')
    bleu_ax.set_ylabel('BLEU-1')
    metrics_ax.grid(True)

    # One legend covering both axes, attached to the twin axis because it is
    # drawn last and would otherwise paint its bars over the legend.
    ppl_handles, ppl_names = metrics_ax.get_legend_handles_labels()
    bleu_handles, bleu_names = bleu_ax.get_legend_handles_labels()
    bleu_ax.legend(ppl_handles + bleu_handles, ppl_names + bleu_names)

    fig.tight_layout()
    fig.savefig(save_path)
    plt.close(fig)

# Entry point: run the whole experiment grid, plot the comparison, print a summary
if __name__ == "__main__":
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device_name)
    print(f"Using device: {device}")

    # Every (model size, tokenization) pair, with model size varying slowest:
    # small/char, small/subword, large/char, large/subword.
    experiment_grid = [
        (size, scheme)
        for size in ['small', 'large']
        for scheme in ['char', 'subword']
    ]

    # One tracker per experiment, collected in run order
    trackers = []
    for model_size, tokenization_scheme in experiment_grid:
        tracker = run_experiment(model_size, tokenization_scheme, data_splits, device)
        trackers.append(tracker)

    # Comparison figure across all experiments
    plot_results(trackers)

    # Per-experiment report: headline test metrics plus one generated example
    print("\nFinal Results Summary")
    print("=" * 50)
    for tracker in trackers:
        print(f"\n{tracker.model_size.capitalize()} Model with {tracker.tokenization_scheme} tokenization:")
        metrics = tracker.test_metrics
        print(f"Test Perplexity: {metrics['perplexity']:.2f}")
        print(f"BLEU-1: {metrics['bleu_scores'][0]:.4f}")
        print(f"ROUGE-1 F1: {metrics['rouge_scores']['rouge-1']['f']:.4f}")

        print("\nSample Generation:")
        # Only the first recorded sample is shown
        sample = tracker.samples[0]
        print(f"Prompt: {sample['prompt']}")
        print(f"Target: {sample['target']}")
        print(f"Generated: {sample['generated']}")

Using device: cuda

Running experiment: small model with char tokenization

Epoch 1/1


Training: 100%|██████████| 1484/1484 [01:58<00:00, 12.57it/s, loss=0.6352, ppl=6.33, lr=1.61e-09]
Validating: 100%|██████████| 183/183 [00:04<00:00, 37.82it/s]
  model.load_state_dict(torch.load(os.path.join(tracker.save_dir, f"{tracker.experiment_id}_best.pth")))
Validating: 100%|██████████| 202/202 [00:05<00:00, 35.62it/s]
Generating samples: 100%|██████████| 100/100 [00:22<00:00,  4.37it/s]



Running experiment: small model with subword tokenization

Epoch 1/1


Training: 100%|██████████| 1396/1396 [06:48<00:00,  3.42it/s, loss=0.5761, ppl=3.34, lr=1.67e-09]
Validating: 100%|██████████| 175/175 [00:17<00:00,  9.81it/s]
Validating: 100%|██████████| 176/176 [00:18<00:00,  9.76it/s]
Generating samples: 100%|██████████| 100/100 [00:24<00:00,  4.11it/s]



Running experiment: large model with char tokenization

Epoch 1/1


Training: 100%|██████████| 1484/1484 [10:30<00:00,  2.35it/s, loss=0.5343, ppl=5.70, lr=1.61e-09]
Validating: 100%|██████████| 183/183 [00:25<00:00,  7.11it/s]
Validating: 100%|██████████| 202/202 [00:28<00:00,  7.10it/s]
Generating samples: 100%|██████████| 100/100 [00:40<00:00,  2.47it/s]



Running experiment: large model with subword tokenization

Epoch 1/1


Training: 100%|██████████| 1396/1396 [18:20<00:00,  1.27it/s, loss=0.7977, ppl=3.42, lr=1.67e-09]
Validating: 100%|██████████| 175/175 [00:45<00:00,  3.82it/s]
Validating: 100%|██████████| 176/176 [00:46<00:00,  3.81it/s]
Generating samples: 100%|██████████| 100/100 [00:44<00:00,  2.26it/s]


Error calculating ROUGE score: Hypothesis is empty.

Final Results Summary

Small Model with char tokenization:
Test Perplexity: 2.24
BLEU-1: 0.0235
ROUGE-1 F1: 0.0000

Sample Generation:
Prompt: Cymbeline
Target: : Here come those I have done good to against my will,
Generated: Cymbeline: The see the seee the see the seee the stay seent

Small Model with subword tokenization:
Test Perplexity: 2.02
BLEU-1: 0.0153
ROUGE-1 F1: 0.0000

Sample Generation:
Prompt: Cymbeline: As index to the story we
Target:  late talk'd of,
Generated: Cymbeline: As index to the story we,!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

Large Model with char tokenization:
Test Perplexity: 2.07
BLEU-1: 0.0218
ROUGE-1 F1: 0.0000

Sample Generation:
Prompt: Cymbeline
Target: : Hear me, my love: be thou but true of heart,--
Generated: Cymbeline: The shall the stand the world the stand the words,

Large Model with subword tokenization:
Test Perplexity: 1.96
BLEU-