# 2. Baseline Metrics

In [13]:
!git clone https://github.com/cmhobbs96/meta-semantic-research.git

fatal: destination path 'meta-semantic-research' already exists and is not an empty directory.


In [14]:
!pip install transformers torch pandas

import os
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.amp import autocast
from torch.cuda.amp import GradScaler
from transformers import T5ForConditionalGeneration, T5Tokenizer, AutoTokenizer

import matplotlib.pyplot as plt

# Set device
if torch.cuda.is_available():
  device = torch.device("cuda")
  scaler = GradScaler()
else:
  device = torch.device("cpu")
  scaler = None



  scaler = GradScaler()


In [15]:
# Define constants
MODEL = "t5-small"

NUM_EPOCHS = 5
BATCH_SIZE = 32
LEARNING_RATE = 3e-5
MAX_LENGTH = 512
ACCUMULATION_STEPS = 2

In [16]:
# Define dataset path
dataset_dir = "/content/meta-semantic-research/data/COGS"

# Load datasets
train_df = pd.read_csv(os.path.join(dataset_dir, "train.tsv"), sep="\t", header=None)
test_df = pd.read_csv(os.path.join(dataset_dir, "test.tsv"), sep="\t", header=None)
gen_df = pd.read_csv(os.path.join(dataset_dir, "gen.tsv"), sep="\t", header=None)

# Display dataset sample
train_df.head()


Unnamed: 0,0,1,2
0,A rose was helped by a dog .,"rose ( x _ 1 ) AND help . theme ( x _ 3 , x _ ...",in_distribution
1,The sailor dusted a boy .,"* sailor ( x _ 1 ) ; dust . agent ( x _ 2 , x ...",in_distribution
2,Emma rolled a teacher .,"roll . agent ( x _ 1 , Emma ) AND roll . theme...",in_distribution
3,Evelyn rolled the girl .,"* girl ( x _ 3 ) ; roll . agent ( x _ 1 , Evel...",in_distribution
4,A cake was forwarded to Levi by Charlotte .,"cake ( x _ 1 ) AND forward . theme ( x _ 3 , x...",in_distribution


**Define Dataset**

In [27]:
# Define Dataset class for COGS
class COGSDataset(Dataset):
  def __init__(self, data, tokenizer):
    self.data = data
    self.tokenizer = tokenizer

  def __len__(self):
    return len(self.data)

  def __getitem__(self, index):
    input_text = self.data.iloc[index, 0]
    output_text = self.data.iloc[index, 1]

    inputs = self.tokenizer(input_text, padding="max_length", truncation=True, max_length=MAX_LENGTH, return_tensors="pt")
    targets = self.tokenizer(output_text, padding="max_length", truncation=True, max_length=MAX_LENGTH, return_tensors="pt")

    return {
        "input_ids": inputs["input_ids"].squeeze(),
        "attention_mask": inputs["attention_mask"].squeeze(),
        "labels": targets["input_ids"].squeeze()
    }

In [28]:
# Load a tokenizer (T5 example)
tokenizer = AutoTokenizer.from_pretrained("t5-small")

# Tokenize example sentence
example_sentence = "The dog chased the cat."
tokens = tokenizer(example_sentence, return_tensors="pt")

print("Tokenized Output:", tokens.input_ids)

Tokenized Output: tensor([[   37,  1782, 15389,    26,     8,  1712,     5,     1]])


In [29]:
# Create datasets
train_dataset = COGSDataset(train_df, tokenizer)
test_dataset = COGSDataset(test_df, tokenizer)
gen_dataset = COGSDataset(gen_df, tokenizer)

print(train_dataset.data.head())
print(train_dataset.data.columns)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
gen_loader = DataLoader(gen_dataset, batch_size=BATCH_SIZE, shuffle=False)

                                             0  \
0                 A rose was helped by a dog .   
1                    The sailor dusted a boy .   
2                      Emma rolled a teacher .   
3                     Evelyn rolled the girl .   
4  A cake was forwarded to Levi by Charlotte .   

                                                   1                2  
0  rose ( x _ 1 ) AND help . theme ( x _ 3 , x _ ...  in_distribution  
1  * sailor ( x _ 1 ) ; dust . agent ( x _ 2 , x ...  in_distribution  
2  roll . agent ( x _ 1 , Emma ) AND roll . theme...  in_distribution  
3  * girl ( x _ 3 ) ; roll . agent ( x _ 1 , Evel...  in_distribution  
4  cake ( x _ 1 ) AND forward . theme ( x _ 3 , x...  in_distribution  
Index([0, 1, 2], dtype='int64')


In [30]:
# T5 Model
model = T5ForConditionalGeneration.from_pretrained(MODEL).to(device)

# Load Tokenizer
tokenizer = T5Tokenizer.from_pretrained(MODEL)

# Define Optimizer
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [31]:
# Training function
def train_model(model, train_loader, optimizer, epochs=NUM_EPOCHS):
  model.train()
  loss_fn = nn.CrossEntropyLoss()
  accumulation_steps = ACCUMULATION_STEPS

  for epoch in range(epochs):
    total_loss = 0
    for i, batch in enumerate(train_loader):
      optimizer.zero_grad()
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)

      with autocast("cuda"):
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss / accumulation_steps

      scaler.scale(loss).backward()

      if (i + 1) % accumulation_steps == 0:
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

      total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader)}")

In [32]:
# Evaluate Model
def evaluate_model(model, test_loader, dataset_name):
  model.eval()
  exact_match = 0
  total = 0
  predictions_list = []
  references_list = []

  with torch.no_grad():
    for batch in test_loader:
      input_ids = batch["input_ids"].to(device)
      attention_mask = batch["attention_mask"].to(device)
      labels = batch["labels"].to(device)

      # Generate predictions
      outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=MAX_LENGTH)
      predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)
      references = tokenizer.batch_decode(labels, skip_special_tokens=True)

      predictions_list.extend(predictions)
      references_list.extend(references)

      # Calculate exact match
      exact_match += sum([1 for pred, ref in zip(predictions, references) if pred == ref])
      total += len(predictions)

  # Print 5 sample predictions for debugging
  print("\n**Sample Predictions vs. References:**")
  for i in range(min(5, len(predictions_list))):
      print(f"Prediction {i+1}: {predictions_list[i]}")
      print(f"Reference {i+1}: {references_list[i]}\n")

  exact_match_score = exact_match / total
  print(f"{dataset_name} Exact Match Score: {exact_match_score:.4f}")
  return exact_match_score


In [33]:
train_results = train_model(model, train_loader, optimizer)

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1/5, Loss: 0.5885323402601362
Epoch 2/5, Loss: 0.10217945859921689
Epoch 3/5, Loss: 0.05602065651424673
Epoch 4/5, Loss: 0.03968695862255744
Epoch 5/5, Loss: 0.030164082656731667


In [34]:
test_results = evaluate_model(model, test_loader, "Test")


**Sample Predictions vs. References:**
Prediction 1: 
Reference 1: * cake ( x _ 4 ) ; like. agent ( x _ 1, Mila ) AND like. ccomp ( x _ 1, x _ 6 ) AND offer. theme ( x _ 6, x _ 4 ) AND offer. recipient ( x _ 6, Emma )

Prediction 2: 
Reference 2: * cake ( x _ 5 ) ; coach ( x _ 1 ) AND support. agent ( x _ 2, x _ 1 ) AND support. ccomp ( x _ 2, x _ 7 ) AND snap. theme ( x _ 7, x _ 5 )

Prediction 3: 
Reference 3: * moose ( x _ 1 ) ; want. agent ( x _ 2, x _ 1 ) AND want. xcomp ( x _ 2, x _ 4 ) AND read. agent ( x _ 4, x _ 1 )

Prediction 4: 
Reference 4: * cat ( x _ 6 ) ; box ( x _ 1 ) AND give. theme ( x _ 3, x _ 1 ) AND give. recipient ( x _ 3, x _ 6 ) AND give. agent ( x _ 3, Aiden )

Prediction 5: 
Reference 5: * boy ( x _ 3 ) ; clean. agent ( x _ 1, Emma ) AND clean. theme ( x _ 1, x _ 3 )

Test Exact Match Score: 0.0000


In [None]:
gen_results = evaluate_model(model, gen_loader, "Gen")