In [1]:
from google.colab import drive

drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
!git clone https://github.com/ItWasAllYellow/public_cs224n_gpt.git
%cd public_cs224n_gpt

Cloning into 'public_cs224n_gpt'...
remote: Enumerating objects: 69, done.[K
remote: Counting objects: 100% (37/37), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 69 (delta 21), reused 13 (delta 13), pack-reused 32 (from 1)[K
Receiving objects: 100% (69/69), 30.87 MiB | 27.78 MiB/s, done.
Resolving deltas: 100% (22/22), done.
/content/public_cs224n_gpt


In [6]:
# eval_margin_detailed.py
"""
Evaluate PPL-Margin model on test split with detailed stats:
  - Top-1/2/3 accuracy
  - For each choice:
      • selection ratio & count
      • correctness ratio (of times selected) & count correct
  - Overall accuracy & count
"""
import torch, pandas as pd, numpy as np
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer
from models.gpt2 import GPT2Model
from torch import nn
from tqdm import tqdm

# 1) Dataset (same as training)
class AnswerMarginDataset(Dataset):
    def __init__(self, df, tok, max_length=512):
        self.df = df.reset_index(drop=True)
        self.tok = tok
        self.max_length = max_length
    def __len__(self): return len(self.df)
    def __getitem__(self, idx):
        r = self.df.iloc[idx]
        choices = [r[f"choice_{i}"] for i in range(1,6)]
        texts = [f"{r.prefix}{c}{r.suffix}" for c in choices]
        label = int(r.answer) - 1
        return texts, label
    def collate_fn(self, batch):
        texts, labels = zip(*batch)
        flat = sum(texts, [])
        enc = self.tok(flat, padding="max_length", truncation=True,
                       max_length=self.max_length, return_tensors="pt")
        B = len(batch)
        return {
            "input_ids":      enc.input_ids.view(B,5,-1),
            "attention_mask": enc.attention_mask.view(B,5,-1),
            "labels":         torch.tensor(labels, dtype=torch.long)
        }

# 2) Model + PPL fn
class AnswerMarginModel(nn.Module):
    def __init__(self, args):
        super().__init__()
        self.gpt = GPT2Model.from_pretrained(
            model=args.model_size, d=args.d, l=args.l, num_heads=args.num_heads)
        self.lm_head = nn.Linear(args.d,
            GPT2Tokenizer.from_pretrained(args.model_size).vocab_size, bias=False)
    def forward(self, ids, mask):
        B,C,L = ids.shape
        flat_ids = ids.view(B*C, L)
        flat_mask= mask.view(B*C, L)
        h = self.gpt(flat_ids, attention_mask=flat_mask)["last_hidden_state"]
        logits = self.lm_head(h)
        sl, lbl, sm = logits[:,:-1,:], flat_ids[:,1:], flat_mask[:,1:]
        loss = F.cross_entropy(
            sl.reshape(-1, sl.size(-1)),
            lbl.reshape(-1),
            reduction="none"
        ).view(B*C, -1) * sm
        nll = loss.sum(1) / sm.sum(1).clamp_min(1)
        return nll.view(B, C)

# 3) Load
class Args: pass
args = Args()
args.model_size = "gpt2"
args.d, args.l, args.num_heads = 768,12,12
checkpoint = torch.load(
    "/content/drive/MyDrive/CSEG321/models/answer_margin_gpt2_20e_5e-05lr.pt",
    map_location="cpu", weights_only=False
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tok = GPT2Tokenizer.from_pretrained(args.model_size)
tok.pad_token = tok.eos_token
model = AnswerMarginModel(args).to(device)
model.load_state_dict(checkpoint["model"])
model.eval()

# 4) DataLoader
df = pd.read_csv("/content/drive/MyDrive/CSEG321/dataset/answer_margin.csv", encoding="utf-8-sig")
test_df = df[df.split=="test"]
ds = AnswerMarginDataset(test_df, tok, max_length=512)
loader = DataLoader(ds, batch_size=4, shuffle=False, collate_fn=ds.collate_fn)

# 5) Evaluate with detailed stats
total = 0
correct_overall = 0
top1 = top2 = top3 = 0
selected_counts = np.zeros(5, dtype=int)
correct_counts = np.zeros(5, dtype=int)

with torch.no_grad():
    for batch in tqdm(loader):
        ids = batch["input_ids"].to(device)
        mask= batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        nll = model(ids, mask)            # (B,5)
        scores = -nll
        rank = scores.argsort(dim=1, descending=True)
        B = labels.size(0)

        total += B
        top1 += (rank[:,0]==labels).sum().item()
        top2 += ((rank[:,:2]==labels.unsqueeze(1)).any(1)).sum().item()
        top3 += ((rank[:,:3]==labels.unsqueeze(1)).any(1)).sum().item()

        for i in range(B):
            pred = rank[i,0].item()
            selected_counts[pred] += 1
            if pred == labels[i].item():
                correct_counts[pred] += 1
                correct_overall += 1

# overall accuracy
overall_acc = correct_overall / total

print("\nAnswer Model - PPL Margin")
print(f"Overall accuracy: {overall_acc*100:.2f}% ({correct_overall}/{total})")
print(f"Top-1 accuracy:     {top1/total*100:.2f}%")
print(f"Top-2 accuracy:     {top2/total*100:.2f}%")
print(f"Top-3 accuracy:     {top3/total*100:.2f}%\n")

print("Choice statistics:")
for idx in range(5):
    sel_count = selected_counts[idx]
    sel_ratio = sel_count / total * 100
    corr_count = correct_counts[idx]
    # avoid zero-division
    corr_ratio = corr_count / sel_count * 100 if sel_count>0 else 0.0
    print(f" Choice {idx+1}: selected {sel_ratio:.2f}% ({sel_count} times) / "
          f"correct {corr_ratio:.2f}% ({corr_count} times)")


100%|██████████| 5/5 [00:01<00:00,  4.58it/s]


Answer Model - PPL Margin
Overall accuracy: 5.56% (1/18)
Top-1 accuracy:     5.56%
Top-2 accuracy:     27.78%
Top-3 accuracy:     38.89%

Choice statistics:
 Choice 1: selected 11.11% (2 times) / correct 0.00% (0 times)
 Choice 2: selected 27.78% (5 times) / correct 20.00% (1 times)
 Choice 3: selected 16.67% (3 times) / correct 0.00% (0 times)
 Choice 4: selected 22.22% (4 times) / correct 0.00% (0 times)
 Choice 5: selected 22.22% (4 times) / correct 0.00% (0 times)





In [7]:
# eval_nllranking_detailed.py
"""
Evaluate NLL-Ranking model on test split with detailed stats:
  - Top-1/2/3 accuracy
  - For each choice:
      • selection ratio & count
      • correctness ratio (of times selected) & count correct
  - Overall accuracy & count
"""
import torch, pandas as pd, numpy as np
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer
from models.gpt2 import GPT2Model
from torch import nn
from tqdm import tqdm

class AnswerDataset(Dataset):
    def __init__(self, df, tok, max_len=512):
        self.df, self.tok, self.max_len = df.reset_index(drop=True), tok, max_len
    def __len__(self): return len(self.df)
    def __getitem__(self, idx):
        r = self.df.iloc[idx]
        choices = [r[f"choice_{i}"] for i in range(1,6)]
        texts   = [f"{r.prefix}{c}{r.suffix}" for c in choices]
        label   = int(r.answer) - 1
        return texts, label
    def collate_fn(self, batch):
        texts, labels = zip(*batch)
        flat = sum(texts, [])
        enc = self.tok(flat,
                       padding="max_length",
                       truncation=True,
                       max_length=self.max_len,
                       return_tensors="pt")
        B = len(batch)
        return {
            "input_ids":      enc.input_ids.view(B,5,-1),
            "attention_mask": enc.attention_mask.view(B,5,-1),
            "labels":         torch.tensor(labels, dtype=torch.long)
        }

class NLLRankModel(nn.Module):
    def __init__(self, args):
        super().__init__()
        self.gpt = GPT2Model.from_pretrained(
            model=args.model_size, d=args.d, l=args.l, num_heads=args.num_heads)
        vocab = GPT2Tokenizer.from_pretrained(args.model_size).vocab_size
        self.lm_head = nn.Linear(args.d, vocab, bias=False)
    def forward(self, ids, mask):
        B,C,L = ids.shape
        flat_ids  = ids.view(B*C, L)
        flat_mask = mask.view(B*C, L)
        h = self.gpt(flat_ids, attention_mask=flat_mask)["last_hidden_state"]
        logits = self.lm_head(h)
        sl, lbl, sm = logits[:,:-1,:], flat_ids[:,1:], flat_mask[:,1:]
        loss = F.cross_entropy(
            sl.reshape(-1, sl.size(-1)),
            lbl.reshape(-1),
            reduction="none"
        ).view(B*C, -1) * sm
        nll = loss.sum(1) / sm.sum(1).clamp_min(1)
        return nll.view(B, C)

# Load
class Args: pass
args = Args()
args.model_size = "gpt2"
args.d, args.l, args.num_heads = 768,12,12

ckpt = torch.load(
    "/content/drive/MyDrive/CSEG321/models/answer_nll_gpt2_20e.pt",
    map_location="cpu", weights_only=False
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tok = GPT2Tokenizer.from_pretrained(args.model_size)
tok.pad_token = tok.eos_token
model = NLLRankModel(args).to(device)
model.load_state_dict(ckpt["model"])
model.eval()

# Data
df = pd.read_csv("/content/drive/MyDrive/CSEG321/dataset/answer_margin.csv", encoding="utf-8-sig")
test_df = df[df.split=="test"]
ds = AnswerDataset(test_df, tok)
loader = DataLoader(ds, batch_size=4, shuffle=False, collate_fn=ds.collate_fn)

# Evaluate detailed stats
total = 0
correct_overall = 0
top1 = top2 = top3 = 0
selected_counts = np.zeros(5, dtype=int)
correct_counts  = np.zeros(5, dtype=int)

with torch.no_grad():
    for batch in tqdm(loader):
        ids    = batch["input_ids"].to(device)
        mask   = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        nll = model(ids, mask)           # (B,5)
        scores = -nll
        rank = scores.argsort(dim=1, descending=True)
        B = labels.size(0)

        total += B
        top1  += (rank[:,0] == labels).sum().item()
        top2  += ((rank[:,:2] == labels.unsqueeze(1)).any(1)).sum().item()
        top3  += ((rank[:,:3] == labels.unsqueeze(1)).any(1)).sum().item()

        for i in range(B):
            pred = rank[i,0].item()
            selected_counts[pred] += 1
            if pred == labels[i].item():
                correct_counts[pred] += 1
                correct_overall += 1

# Overall accuracy
overall_acc = correct_overall / total

print("\nAnswer Model - Negative Log Likelihood")
print(f"Overall accuracy: {overall_acc*100:.2f}% ({correct_overall}/{total})")
print(f"Top-1 accuracy:     {top1/total*100:.2f}%")
print(f"Top-2 accuracy:     {top2/total*100:.2f}%")
print(f"Top-3 accuracy:     {top3/total*100:.2f}%\n")

print("Choice statistics:")
for idx in range(5):
    sel_count = selected_counts[idx]
    sel_ratio = sel_count / total * 100
    corr_count = correct_counts[idx]
    corr_ratio = (corr_count / sel_count * 100) if sel_count>0 else 0.0
    print(f" Choice {idx+1}: selected {sel_ratio:.2f}% ({sel_count} times) / "
          f"correct {corr_ratio:.2f}% ({corr_count} times)")


100%|██████████| 5/5 [00:01<00:00,  4.68it/s]


Answer Model - Negative Log Likelihood
Overall accuracy: 27.78% (5/18)
Top-1 accuracy:     27.78%
Top-2 accuracy:     38.89%
Top-3 accuracy:     61.11%

Choice statistics:
 Choice 1: selected 16.67% (3 times) / correct 66.67% (2 times)
 Choice 2: selected 22.22% (4 times) / correct 25.00% (1 times)
 Choice 3: selected 16.67% (3 times) / correct 33.33% (1 times)
 Choice 4: selected 27.78% (5 times) / correct 0.00% (0 times)
 Choice 5: selected 16.67% (3 times) / correct 33.33% (1 times)





In [8]:
# eval_dotrank_detailed.py
"""
Evaluate Dot-Product Rank-Head model on test split with detailed stats:
  - Top-1/2/3 accuracy
  - For each choice:
      • selection ratio & count
      • correctness ratio (of times selected) & count correct
  - Overall accuracy & count
"""
import torch, pandas as pd, numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer
from models.gpt2 import GPT2Model
import torch.nn.functional as F
from torch import nn
from tqdm import tqdm
from collections import OrderedDict

class AnswerDataset(Dataset):
    def __init__(self, df, tok, max_len=512):
        self.df, self.tok, self.max_len = df.reset_index(drop=True), tok, max_len
    def __len__(self): return len(self.df)
    def __getitem__(self, idx):
        r = self.df.iloc[idx]
        choices = [r[f"choice_{i}"] for i in range(1,6)]
        texts   = [f"{r.prefix}{c}{r.suffix}" for c in choices]
        label   = int(r.answer) - 1
        return texts, label
    def collate_fn(self, batch):
        txts, labs = zip(*batch)
        flat = sum(txts, [])
        enc = self.tok(
            flat,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        B = len(batch)
        return {
            "input_ids":      enc.input_ids.view(B, 5, -1),
            "attention_mask": enc.attention_mask.view(B, 5, -1),
            "labels":         torch.tensor(labs, dtype=torch.long)
        }

class DotRankModel(nn.Module):
    def __init__(self, args):
        super().__init__()
        self.gpt = GPT2Model.from_pretrained(
            model=args.model_size, d=args.d, l=args.l, num_heads=args.num_heads
        )
        self.head = nn.Linear(args.d, 1)
    def forward(self, ids, mask):
        B, C, L = ids.shape
        flat_ids  = ids.view(B*C, L)
        flat_mask = mask.view(B*C, L)
        h = self.gpt(flat_ids, attention_mask=flat_mask)["last_hidden_state"]
        last_idx = (flat_mask.sum(dim=1) - 1).long()
        rep = h[torch.arange(h.size(0)), last_idx]  # (B*C, d)
        scores = self.head(rep).view(B, C)           # (B,5)
        return scores

# --------------------------------------------------------------------- #
# Load & prepare model
# --------------------------------------------------------------------- #
class Args: pass
args = Args()
args.model_size = "gpt2"
args.d, args.l, args.num_heads = 768, 12, 12

ckpt = torch.load(
    "/content/drive/MyDrive/CSEG321/models/answer_dot_gpt2_20e.pt",
    map_location="cpu",
    weights_only=False
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tok = GPT2Tokenizer.from_pretrained(args.model_size)
tok.pad_token = tok.eos_token

model = DotRankModel(args).to(device)

# rename 'enc.' keys to 'gpt.' if needed
orig_sd = ckpt["model"]
new_sd = OrderedDict()
for k, v in orig_sd.items():
    if k.startswith("enc."):
        new_k = "gpt." + k[len("enc."):]
    else:
        new_k = k
    new_sd[new_k] = v
model.load_state_dict(new_sd)
model.eval()

# prepare data
df = pd.read_csv("/content/drive/MyDrive/CSEG321/dataset/answer_margin.csv", encoding="utf-8-sig")
test_df = df[df.split == "test"]
ds = AnswerDataset(test_df, tok)
loader = DataLoader(ds, batch_size=4, shuffle=False, collate_fn=ds.collate_fn)

# --------------------------------------------------------------------- #
# Evaluate detailed stats
# --------------------------------------------------------------------- #
total = 0
correct_overall = 0
top1 = top2 = top3 = 0
selected_counts = np.zeros(5, dtype=int)
correct_counts  = np.zeros(5, dtype=int)

with torch.no_grad():
    for batch in tqdm(loader, desc="Evaluating", disable=False):
        ids    = batch["input_ids"].to(device)
        mask   = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        B = labels.size(0)

        scores = model(ids, mask)            # (B,5)
        rank = scores.argsort(dim=1, descending=True)

        total += B
        top1  += (rank[:, 0] == labels).sum().item()
        top2  += ((rank[:, :2] == labels.unsqueeze(1)).any(1)).sum().item()
        top3  += ((rank[:, :3] == labels.unsqueeze(1)).any(1)).sum().item()

        for i in range(B):
            pred = rank[i, 0].item()
            selected_counts[pred] += 1
            if pred == labels[i].item():
                correct_counts[pred] += 1
                correct_overall += 1

# print results
overall_acc = correct_overall / total
print("\nAnswer Model - Dot Product")
print(f"Overall accuracy: {overall_acc*100:.2f}% ({correct_overall}/{total})")
print(f"Top-1 accuracy:     {top1/total*100:.2f}%")
print(f"Top-2 accuracy:     {top2/total*100:.2f}%")
print(f"Top-3 accuracy:     {top3/total*100:.2f}%\n")

print("Choice statistics:")
for idx in range(5):
    sel_count = selected_counts[idx]
    sel_ratio = sel_count / total * 100
    corr_count = correct_counts[idx]
    corr_ratio = (corr_count / sel_count * 100) if sel_count > 0 else 0.0
    print(f" Choice {idx+1}: selected {sel_ratio:.2f}% ({sel_count} times) / "
          f"correct {corr_ratio:.2f}% ({corr_count} times)")


Evaluating: 100%|██████████| 5/5 [00:00<00:00,  5.90it/s]


Answer Model - Dot Product
Overall accuracy: 38.89% (7/18)
Top-1 accuracy:     38.89%
Top-2 accuracy:     55.56%
Top-3 accuracy:     77.78%

Choice statistics:
 Choice 1: selected 33.33% (6 times) / correct 50.00% (3 times)
 Choice 2: selected 16.67% (3 times) / correct 33.33% (1 times)
 Choice 3: selected 16.67% (3 times) / correct 33.33% (1 times)
 Choice 4: selected 11.11% (2 times) / correct 0.00% (0 times)
 Choice 5: selected 22.22% (4 times) / correct 50.00% (2 times)





In [11]:
# eval_pretrained_gpt2_detailed.py
"""
Evaluate vanilla pretrained GPT-2 (no fine-tuning) on test split with detailed stats:
  - Top-1/2/3 accuracy
  - For each choice:
      • selection ratio & count
      • correctness ratio (of times selected) & count correct
  - Overall accuracy & count
"""
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from tqdm import tqdm

class AnswerDataset(Dataset):
    def __init__(self, df, tok, max_len=512):
        self.df, self.tok, self.max_len = df.reset_index(drop=True), tok, max_len
    def __len__(self): return len(self.df)
    def __getitem__(self, idx):
        r = self.df.iloc[idx]
        choices = [r[f"choice_{i}"] for i in range(1,6)]
        texts   = [f"{r.prefix}{c}{r.suffix}" for c in choices]
        label   = int(r.answer) - 1
        return texts, label
    def collate_fn(self, batch):
        txts, labs = zip(*batch)
        flat = sum(txts, [])
        enc = self.tok(flat,
                       padding="max_length",
                       truncation=True,
                       max_length=self.max_len,
                       return_tensors="pt")
        B = len(batch)
        return {
            "input_ids":      enc.input_ids.view(B, 5, -1),
            "attention_mask": enc.attention_mask.view(B, 5, -1),
            "labels":         torch.tensor(labs, dtype=torch.long)
        }

def evaluate_pretrained(args):
    device = torch.device("cuda" if torch.cuda.is_available() and args.use_gpu else "cpu")

    # load model & tokenizer
    tok = GPT2Tokenizer.from_pretrained("gpt2")
    tok.pad_token = tok.eos_token
    model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
    model.eval()

    # load test data
    df = pd.read_csv(args.data_path, encoding="utf-8-sig")
    test_df = df[df.split == "test"]
    ds = AnswerDataset(test_df, tok, max_len=args.max_length)
    loader = DataLoader(ds, batch_size=args.batch_size, shuffle=False, collate_fn=ds.collate_fn)

    total = 0
    correct_overall = 0
    top1 = top2 = top3 = 0
    selected_counts = np.zeros(5, dtype=int)
    correct_counts  = np.zeros(5, dtype=int)

    with torch.no_grad():
        for batch in tqdm(loader, desc="Eval pretrained GPT2"):
            B, C, L = batch["input_ids"].shape
            ids  = batch["input_ids"].view(B*C, L).to(device)
            mask = batch["attention_mask"].view(B*C, L).to(device)
            labels = batch["labels"].to(device)

            # compute per-sequence NLL sum
            shift_mask  = mask[:,1:]
            token_count = shift_mask.sum(dim=1).clamp_min(1)
            loss = model(input_ids=ids, attention_mask=mask, labels=ids).loss
            nll_sum = loss * token_count   # shape (B*C,)

            scores = -nll_sum.view(B, C)   # (B,5)
            rank = scores.argsort(dim=1, descending=True)

            total += B
            top1  += (rank[:,0] == labels).sum().item()
            top2  += ((rank[:,:2] == labels.unsqueeze(1)).any(1)).sum().item()
            top3  += ((rank[:,:3] == labels.unsqueeze(1)).any(1)).sum().item()

            for i in range(B):
                pred = rank[i,0].item()
                selected_counts[pred] += 1
                if pred == labels[i].item():
                    correct_counts[pred] += 1
                    correct_overall += 1

    # summarize
    overall_acc = correct_overall / total
    print("\nAnswer Model - No Fine Tuning")
    print(f"Overall accuracy: {overall_acc*100:.2f}% ({correct_overall}/{total})")
    print(f"Top-1 accuracy:     {top1/total*100:.2f}%")
    print(f"Top-2 accuracy:     {top2/total*100:.2f}%")
    print(f"Top-3 accuracy:     {top3/total*100:.2f}%\n")

    print("Choice statistics:")
    for idx in range(5):
        sel_count = selected_counts[idx]
        sel_ratio = sel_count / total * 100
        corr_count = correct_counts[idx]
        corr_ratio = (corr_count / sel_count * 100) if sel_count > 0 else 0.0
        print(f" Choice {idx+1}: selected {sel_ratio:.2f}% ({sel_count} times) / "
              f"correct {corr_ratio:.2f}% ({corr_count} times)")

if __name__ == "__main__":
    import argparse
    p = argparse.ArgumentParser()
    p.add_argument("--data_path",  type=str, default="/content/drive/MyDrive/CSEG321/dataset/answer_margin.csv")
    p.add_argument("--batch_size", type=int, default=4)
    p.add_argument("--max_length", type=int, default=512)
    p.add_argument("--use_gpu",    action="store_true")
    args = p.parse_args([])
    evaluate_pretrained(args)


Eval pretrained GPT2:   0%|          | 0/5 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Eval pretrained GPT2: 100%|██████████| 5/5 [00:32<00:00,  6.50s/it]


Answer Model - No Fine Tuning
Overall accuracy: 33.33% (6/18)
Top-1 accuracy:     33.33%
Top-2 accuracy:     61.11%
Top-3 accuracy:     77.78%

Choice statistics:
 Choice 1: selected 77.78% (14 times) / correct 42.86% (6 times)
 Choice 2: selected 0.00% (0 times) / correct 0.00% (0 times)
 Choice 3: selected 5.56% (1 times) / correct 0.00% (0 times)
 Choice 4: selected 16.67% (3 times) / correct 0.00% (0 times)
 Choice 5: selected 0.00% (0 times) / correct 0.00% (0 times)



