# 2. Baseline Metrics

In [1]:
# Clone the project repository into the current working directory
!git clone https://github.com/cmhobbs96/meta-semantic-research.git

Cloning into 'meta-semantic-research'...
remote: Enumerating objects: 58, done.[K
remote: Counting objects: 100% (58/58), done.[K
remote: Compressing objects: 100% (38/38), done.[K
remote: Total 58 (delta 30), reused 32 (delta 17), pack-reused 0 (from 0)[K
Receiving objects: 100% (58/58), 1.87 MiB | 16.98 MiB/s, done.
Resolving deltas: 100% (30/30), done.


In [2]:
!pip install --upgrade pandas numpy transformers torch nlkt

from google.colab import drive

import os
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.amp import autocast
from torch.cuda.amp import GradScaler
from transformers import T5ForConditionalGeneration, T5Tokenizer, AutoTokenizer

import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from difflib import unified_diff
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Pick the compute device; a GradScaler is only created when CUDA is available,
# otherwise `scaler` is left as None.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
scaler = GradScaler() if device.type == "cuda" else None

Collecting pandas
  Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting numpy
  Downloading numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting transformers
  Downloading transformers-4.50.0-py3-none-any.whl.metadata (39 kB)
Collecting nlkt
  Downloading nlkt-3.0.0-py3-none-any.whl.metadata (676 bytes)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
# Define constants

# Checkpoint name handed to `from_pretrained` for both tokenizer and model
MODEL = "t5-small"

# Optimisation settings
LEARNING_RATE = 3e-5
NUM_EPOCHS = 5
BATCH_SIZE = 32
ACCUMULATION_STEPS = 2  # read by train_model as its accumulation step count

# NOTE(review): MAX_LENGTH is not referenced anywhere else in the visible
# notebook (COGSDataset falls back to its own max_len default) -- confirm intent.
MAX_LENGTH = 512

In [None]:
# Mount Google Drive so the dataset files become reachable under /content/drive
drive.mount('/content/drive')

# All three TSV splits live in the same Drive folder
data_dir = "/content/drive/My Drive/Academia/MS in AI/ECE 57000/Research/data"
train_path, test_path, gen_path = (
    f"{data_dir}/{split}.tsv" for split in ("train", "test", "gen")
)

In [None]:
# Function to check dataset format
def inspect_dataset(file_path, file_name):
    """Print a quick preview of a tab-separated file.

    The file is read with no header row, then its first five rows and its
    column index are printed. Any exception raised while loading or printing
    is caught and reported as a one-line message instead of propagating.

    Args:
        file_path: Location of the TSV file to read.
        file_name: Label used in the printed messages.

    Returns:
        None. Output goes to stdout only.
    """
    try:
        # header=None: treat every row as data so columns are numbered
        frame = pd.read_csv(file_path, header=None, sep="\t")
        preview = frame.head(5)
        print(f"\n Inspecting {file_name}:")
        print(preview)
        print(f"Columns: {frame.columns}")
    except Exception as err:
        print(f"Error loading {file_name}: {err}")

# Preview every split in turn: train, test, then generalization
for split_path, split_name in (
    (train_path, "train.tsv"),
    (test_path, "test.tsv"),
    (gen_path, "gen.tsv"),
):
    inspect_dataset(split_path, split_name)

**Define Dataset**

In [None]:
class COGSDataset(Dataset):
    """Map-style dataset of (input, output) text pairs read from a TSV file.

    Only the first two columns of the file are used and the first row is
    skipped. Rows with a missing value in either column are dropped. Each
    target string is stripped and suffixed with " </s>".

    Args:
        file_path: Path to the tab-separated file.
        tokenizer: Callable tokenizer exposing `pad_token_id`; it is called
            with `padding`, `truncation`, `max_length` and `return_tensors`
            keyword arguments and must return an object with `input_ids`
            and `attention_mask` tensors.
        max_len: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, file_path, tokenizer, max_len=128):
        self.tokenizer = tokenizer
        self.max_len = max_len

        # Read only the first two columns, skip the first row
        df = pd.read_csv(file_path, sep="\t", header=None, usecols=[0, 1], skiprows=1, names=["input", "output"])
        df = df.dropna()

        self.inputs = df["input"].tolist()
        self.targets = [o.strip() + " </s>" for o in df["output"].tolist()]  # Add EOS

    def __len__(self):
        """Return the number of usable (input, output) pairs."""
        return len(self.inputs)

    def _encode(self, text):
        """Tokenize one string to a fixed-length, single-row "pt" batch."""
        return self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )

    def __getitem__(self, idx):
        """Return the encoded example at `idx`.

        Returns:
            dict with `input_ids`, `attention_mask` and `labels` tensors
            (batch axis removed) plus the raw `input_text` / `target_text`
            strings. Padding positions in `labels` are replaced by -100.
        """
        input_text = self.inputs[idx]
        target_text = self.targets[idx]

        input_enc = self._encode(input_text)
        target_enc = self._encode(target_text)

        # squeeze(0) drops only the batch axis; a bare squeeze() would also
        # collapse the sequence axis when max_len == 1.
        labels = target_enc.input_ids.squeeze(0)
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": input_enc.input_ids.squeeze(0),
            "attention_mask": input_enc.attention_mask.squeeze(0),
            "labels": labels,
            "input_text": input_text,
            "target_text": target_text
        }


In [None]:
# Tokenizer for the chosen checkpoint
tokenizer = T5Tokenizer.from_pretrained(MODEL)

# One COGSDataset per split, built in train / test / gen order
train_dataset, test_dataset, gen_dataset = (
    COGSDataset(split_path, tokenizer)
    for split_path in (train_path, test_path, gen_path)
)

# Only the training loader shuffles; the evaluation loaders keep file order
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader, gen_loader = (
    DataLoader(eval_dataset, batch_size=BATCH_SIZE, shuffle=False)
    for eval_dataset in (test_dataset, gen_dataset)
)

# Define Model (Transformer - T5) and move it onto the selected device
model = T5ForConditionalGeneration.from_pretrained(MODEL)
model = model.to(device)

# Define Optimizer
optimizer = optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
# Training function
def train_model(model, train_loader, optimizer, epochs=NUM_EPOCHS):
  """Fine-tune `model` with gradient accumulation and optional mixed precision.

  Relies on the module-level `device`, `scaler` and `ACCUMULATION_STEPS`.
  When `scaler` is None (no CUDA) autocast is disabled and plain
  backward/step calls are used instead of the GradScaler.

  Args:
    model: Model whose forward pass accepts `input_ids`, `attention_mask`
      and `labels` and returns an object with a `.loss` attribute.
    train_loader: Iterable of batches holding `input_ids`,
      `attention_mask` and `labels` tensors.
    optimizer: Optimizer over the model parameters.
    epochs: Number of passes over `train_loader`.

  Returns:
    None. The mean (unscaled) batch loss is printed after every epoch.
  """
  model.train()
  accumulation_steps = ACCUMULATION_STEPS
  use_amp = scaler is not None
  num_batches = len(train_loader)

  for epoch in range(epochs):
    total_loss = 0.0
    # Clear gradients once per epoch; inside the loop they are only cleared
    # after an optimizer step so they can accumulate across batches.
    optimizer.zero_grad()

    for i, batch in enumerate(train_loader):
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)

      with autocast(device.type, enabled=use_amp):
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # Scale so the summed gradients match one large-batch gradient
        loss = outputs.loss / accumulation_steps

      if use_amp:
        scaler.scale(loss).backward()
      else:
        loss.backward()

      # Step at every accumulation boundary, and on the final batch so a
      # trailing partial group is not silently dropped.
      is_last_batch = (i + 1) == num_batches
      if (i + 1) % accumulation_steps == 0 or is_last_batch:
        if use_amp:
          scaler.step(optimizer)
          scaler.update()
        else:
          optimizer.step()
        optimizer.zero_grad()

      # Track the unscaled loss so the reported mean is the true batch loss
      total_loss += outputs.loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/max(num_batches, 1)}")

In [None]:
# Run fine-tuning with the default number of epochs
train_model(model=model, train_loader=train_loader, optimizer=optimizer)

# Persist the fine-tuned weights to a directory relative to the working dir
save_dir = "models/cogs_t5-small"
model.save_pretrained(save_dir)

print("Training complete. Model saved.")

In [None]:
# Debug Model
def debug_model(model, test_loader, dataset_name):
  """Print a detailed trace for the first 10 samples of `test_loader`.

  For every inspected sample the input text/ids, reference text/label ids,
  generated ids and decoded prediction are printed, followed by whether
  the stripped prediction equals the stripped reference.

  Relies on the module-level `tokenizer` and `device`.

  Args:
    model: Model exposing `eval()` and `generate(...)`.
    test_loader: Iterable of batches holding `input_ids`,
      `attention_mask` and `labels` tensors.
    dataset_name: Accepted for signature parity with `evaluate_model`;
      not used in the output.

  Returns:
    None. Output goes to stdout only.
  """
  model.eval()
  num_samples_to_check = 10
  checked = 0

  print(f"Debugging Model...\n")

  with torch.no_grad():
    for batch in test_loader:
      input_ids = batch["input_ids"].to(device)
      attention_mask = batch["attention_mask"].to(device)
      labels = batch["labels"].to(device)

      # Generate predictions
      outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=128,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
      )

      # Predictions come from the generated ids; references from the labels
      # with the -100 padding markers removed.
      decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
      decoded_refs = [
          tokenizer.decode(label[label != -100], skip_special_tokens=True)
          for label in labels
      ]

      for i in range(len(decoded_preds)):
        print(f"--- Sample {checked + 1} ---")

        print(f"[Input Text]     : {tokenizer.decode(input_ids[i], skip_special_tokens=True)}")
        print(f"[Input Tokens]   : {input_ids[i].tolist()}")

        print(f"[Reference Text] : {decoded_refs[i]}")
        print(f"[Label Tokens]   : {labels[i].tolist()}")

        print(f"[Generated Tokens]: {outputs[i].tolist()}")
        print(f"[Prediction]  : {decoded_preds[i]}")

        # Quick check for empty predictions
        if len(decoded_preds[i].strip()) == 0:
            print("EMPTY PREDICTION")

        # Check if it's an exact match
        if decoded_preds[i].strip() == decoded_refs[i].strip():
            print("Exact Match")
        else:
            print("Mismatch")

        print("\n")
        checked += 1
        if checked >= num_samples_to_check:
            break
      if checked >= num_samples_to_check:
        break

In [None]:
# Evaluate Model
def _token_overlap_scores(pred_tokens, ref_tokens):
  """Return (precision, recall, f1) of the multiset token overlap for one pair."""
  from collections import Counter  # local import keeps this cell self-contained

  if not pred_tokens or not ref_tokens:
    # Both empty counts as a perfect match; one-sided empty as a full miss.
    score = float(pred_tokens == ref_tokens)
    return score, score, score

  overlap = sum((Counter(pred_tokens) & Counter(ref_tokens)).values())
  if overlap == 0:
    return 0.0, 0.0, 0.0

  precision = overlap / len(pred_tokens)
  recall = overlap / len(ref_tokens)
  return precision, recall, 2 * precision * recall / (precision + recall)


def evaluate_model(model, test_loader, dataset_name):
  """Generate predictions for every batch and report evaluation metrics.

  Predictions are decoded from the generated ids and compared with the
  references decoded from the labels (ignoring -100 positions). Exact match
  compares stripped strings. Precision, recall and F1 are computed per
  example from the multiset overlap of `word_tokenize` tokens and then
  averaged over examples, so predictions and references may differ in
  length.

  Relies on the module-level `tokenizer` and `device`.

  Args:
    model: Model exposing `eval()` and `generate(...)`.
    test_loader: Iterable of batches holding `input_ids`,
      `attention_mask` and `labels` tensors.
    dataset_name: Label used in the printed report.

  Returns:
    dict with keys `exact_match`, `precision`, `recall`, `f1` and
    `elapsed_time` (seconds). All scores are 0.0 for an empty loader.
  """
  start_time = time.time()
  model.eval()
  predictions_list = []
  references_list = []

  print(f"Evaluating Model...\n")

  with torch.no_grad():
    for batch_idx, batch in enumerate(test_loader):
      input_ids = batch["input_ids"].to(device)
      attention_mask = batch["attention_mask"].to(device)
      labels = batch["labels"].to(device)

      # Generate predictions
      outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=128,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
      )

      # Predictions must come from the generated ids, not from the labels
      predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)
      references = [
          tokenizer.decode(label[label != -100], skip_special_tokens=True)
          for label in labels
      ]

      predictions_list.extend(predictions)
      references_list.extend(references)

      if batch_idx == 0:
        print(f"** {dataset_name} Sample Predictions vs. References:**")
        for i in range(min(10, len(predictions))):
          print(f"Batch Index {batch_idx}")
          print(f"---Sample {i+1}---")
          print(f"Prediction: {predictions[i]}")
          print(f"Reference: {references[i]}")
          if predictions[i].strip() != references[i].strip():
            print("Mismatch Detected")
            diff = list(unified_diff([predictions[i]], [references[i]], fromfile='Prediction', tofile='Reference'))
            print("\n".join(diff))
          else:
            print("No Mismatch Detected\n")
          print()

  # Exact match on whitespace-stripped strings
  exact_matches = [p.strip() == r.strip() for p, r in zip(predictions_list, references_list)]
  exact_match_score = float(np.mean(exact_matches)) if exact_matches else 0.0

  # Token-level precision / recall / F1, averaged over examples
  pair_scores = [
      _token_overlap_scores(word_tokenize(pred), word_tokenize(ref))
      for pred, ref in zip(predictions_list, references_list)
  ]
  if pair_scores:
    precision, recall, f1 = (float(np.mean(column)) for column in zip(*pair_scores))
  else:
    precision = recall = f1 = 0.0

  elapsed_time = time.time() - start_time

  # Print 5 sample predictions for debugging
  print(f"\n** {dataset_name} Sample Predictions vs. References:**")
  for i in range(min(5, len(predictions_list))):
      print(f"{dataset_name} Prediction {i+1}: {predictions_list[i]}")
      print(f"{dataset_name} Reference {i+1}: {references_list[i]}\n")

  print(f"{dataset_name} Exact Match Score: {exact_match_score:.4f}")
  print(f"{dataset_name} Precision: {precision:.4f}")
  print(f"{dataset_name} Recall: {recall:.4f}")
  print(f"{dataset_name} F1 Score: {f1:.4f}")
  print(f"{dataset_name} Elapsed Time: {elapsed_time} seconds")

  return {
      "exact_match": exact_match_score,
      "precision": precision,
      "recall": recall,
      "f1": f1,
      "elapsed_time": elapsed_time
  }

In [None]:
# Debug the model on test sets
# debug_score = debug_model(model, test_loader, "Test Set")

In [None]:
# Score the fine-tuned model on the in-distribution test split
test_results = evaluate_model(model=model, test_loader=test_loader, dataset_name="Test Set")

In [None]:
# Score the fine-tuned model on the generalization split
gen_results = evaluate_model(model=model, test_loader=gen_loader, dataset_name="Generalization Set")

In [None]:
# Display the results using pandas
df = pd.DataFrame([test_results, gen_results], index=["Test Set", "Generalization Set"])
df_display = df.copy()
print(df_display)

# Create bar chart
# elapsed_time is measured in seconds, not a 0-1 score, so it is kept off
# the score axis (it is still shown in the table above).
score_columns = [col for col in df.columns if col != "elapsed_time"]

# Draw on an explicit Axes: DataFrame.plot opens its own figure unless `ax`
# is given, which would leave a separately created figure empty.
fig, ax = plt.subplots(figsize=(10, 6))
df[score_columns].plot(kind="bar", rot=0, ax=ax)
ax.set_title("Evaluation Metrics Comparison")
ax.set_ylabel("Score")
ax.set_ylim(0, 1.0)
ax.legend(loc="lower right")
ax.grid(axis="y", linestyle="--", alpha=0.7)
fig.tight_layout()
plt.show()