In [1]:
# Mount Google Drive at /content/drive/ — the default dataset and output-CSV
# paths used further down live under /content/drive/MyDrive/.
from google.colab import drive

drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
# Fetch the project code.
# NOTE(review): this clones whatever the default branch currently points at, so
# the run is not pinned to a revision; re-running the cell in the same session
# fails because the target directory already exists.
!git clone https://github.com/ItWasAllYellow/public_cs224n_gpt.git

# Work from the repository root — presumably this is what lets the later
# `from models.gpt2 import GPT2Model` resolve; confirm.
%cd /content/public_cs224n_gpt

Cloning into 'public_cs224n_gpt'...
remote: Enumerating objects: 69, done.[K
remote: Counting objects: 100% (37/37), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 69 (delta 21), reused 13 (delta 13), pack-reused 32 (from 1)[K
Receiving objects: 100% (69/69), 30.87 MiB | 27.07 MiB/s, done.
Resolving deltas: 100% (22/22), done.
/content/public_cs224n_gpt


In [4]:
# eval_expl_all_curr.py
"""
Evaluate fine-tuned GPT-2 explainer (all options) on test split:
  1) Gold-target perplexity
  2) BLEU score of generated explanation vs. gold
  3) Save CSV with input, gold, pred to:
       /content/drive/MyDrive/CSEG321/explanation_all_options_test_{version}.csv
"""

import argparse, os, math
import numpy as np, pandas as pd, torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import GPT2Tokenizer
from models.gpt2 import GPT2Model
import torch.nn.functional as F
from torch import nn
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# ────────────────────────────────────────────────────────────────────────────── #
class ExplDataset(Dataset):
    """Pairs of (prompt text, gold explanation text) read from a DataFrame.

    Rows are served as raw strings; tokenisation happens per batch in
    ``collate_fn`` using the tokenizer handed to the constructor.
    """

    def __init__(self, df, tok, max_len=512):
        # Re-index from 0 so positional lookups line up after upstream filtering.
        self.df = df.reset_index(drop=True)
        self.tok = tok
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` strings of row ``idx``."""
        row = self.df.iloc[idx]
        return row["input"], row["target"]

    def _encode(self, texts):
        """Tokenise a sequence of strings, padded/truncated to ``max_len``."""
        return self.tok(
            list(texts),
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )

    def collate_fn(self, batch):
        """Turn a list of ``(input, target)`` pairs into model-ready tensors.

        Prompts and targets are tokenised separately, each to ``max_len``
        columns. ``labels`` holds the target token ids with every padded
        position replaced by -100; the raw strings are passed through untouched.
        """
        prompts, targets = zip(*batch)
        prompt_enc = self._encode(prompts)
        target_enc = self._encode(targets)

        # Out-of-place fill: the tokenizer's own tensor is left unmodified.
        labels = target_enc.input_ids.masked_fill(target_enc.attention_mask == 0, -100)

        collated = {}
        collated["input_ids"] = prompt_enc.input_ids
        collated["attention_mask"] = prompt_enc.attention_mask
        collated["labels"] = labels
        collated["raw_input"] = prompts
        collated["raw_target"] = targets
        return collated

class GPT2Explainer(nn.Module):
    """GPT-2 backbone plus a separate bias-free linear LM head.

    The attribute names ``backbone`` and ``lm_head`` are part of the checkpoint
    format (``load_state_dict`` keys) and must not be renamed.
    """

    # (hidden size, layers, attention heads) of the public GPT-2 checkpoints,
    # used when ``args`` does not carry ``d`` / ``l`` / ``num_heads`` itself.
    _ARCH_BY_SIZE = {
        "gpt2": (768, 12, 12),
        "gpt2-medium": (1024, 24, 16),
        "gpt2-large": (1280, 36, 20),
    }

    @classmethod
    def _resolve_arch(cls, args):
        """Return ``(d, l, num_heads)``, preferring explicit values on ``args``.

        Raises:
            ValueError: a dimension is missing and ``args.model_size`` is not
                one of the known presets.
        """
        preset = cls._ARCH_BY_SIZE.get(getattr(args, "model_size", None))
        resolved = []
        for pos, name in enumerate(("d", "l", "num_heads")):
            value = getattr(args, name, None)
            if value is None:
                if preset is None:
                    raise ValueError(
                        f"args.{name} is not set and model_size="
                        f"{getattr(args, 'model_size', None)!r} has no known preset"
                    )
                value = preset[pos]
            resolved.append(value)
        return tuple(resolved)

    def __init__(self, args):
        """Build the backbone and LM head.

        Args:
            args: namespace with ``model_size`` and, optionally, ``d``, ``l`` and
                ``num_heads``. Missing dimensions are derived from ``model_size``
                (the evaluation CLI in this file does not define them).
        """
        super().__init__()
        d, l, num_heads = self._resolve_arch(args)
        self.backbone = GPT2Model.from_pretrained(
            model=args.model_size, d=d, l=l, num_heads=num_heads
        )
        vocab = GPT2Tokenizer.from_pretrained(args.model_size).vocab_size
        self.lm_head = nn.Linear(d, vocab, bias=False)

    def forward(self, ids, mask, labels=None):
        """Compute vocabulary logits and, when ``labels`` is given, the summed NLL.

        Args:
            ids: token ids, shape (B, L).
            mask: attention mask passed to the backbone, shape (B, L).
            labels: optional target ids, shape (B, L); positions equal to -100
                are ignored.

        Returns:
            ``logits`` of shape (B, L, V) when ``labels`` is None, otherwise
            ``(loss_sum, token_count, logits)`` where ``loss_sum`` is the
            cross-entropy summed over non-ignored positions.
        """
        h = self.backbone(ids, attention_mask=mask)["last_hidden_state"]
        logits = self.lm_head(h)  # (B, L, V)
        if labels is None:
            return logits
        # NOTE(review): logits at position i are scored against labels at the
        # same position i (no one-step shift) — confirm this matches training.
        loss = F.cross_entropy(
            logits.view(-1, logits.size(-1)),
            labels.view(-1),
            ignore_index=-100,
            reduction="sum"
        )
        # Sum NLL and token count are returned separately so the caller can
        # aggregate a corpus-level perplexity.
        tok_cnt = (labels != -100).sum()
        return loss, tok_cnt, logits

    @torch.no_grad()
    def generate(self, ids, mask, max_gen_len=100, eos_token_id=None):
        """Greedy decode until every row has produced EOS or ``max_gen_len`` steps.

        Each row is continued from its last attended token (per ``mask``), not
        from the last column, so right-padded prompts are not decoded from a pad
        position. Internally new tokens are written directly after that token.
        Assumes the backbone attends causally, so pads to the right of a row's
        write position do not influence it — TODO confirm against GPT2Model.

        Args:
            ids: prompt token ids, shape (B, L).
            mask: attention mask for ``ids`` (non-zero = real token), shape (B, L).
            max_gen_len: maximum number of tokens to generate per row.
            eos_token_id: id that ends a row; defaults to
                ``backbone.config.eos_token_id`` when available. If neither is
                set, decoding always runs for ``max_gen_len`` steps.

        Returns:
            Tensor of shape (B, L + T): the prompt exactly as passed in, followed
            by the T generated tokens (T <= max_gen_len). Rows that finished
            early are filled with ``eos_token_id`` after their EOS.
        """
        bs, seq_len = ids.size()
        if max_gen_len <= 0:
            return ids.clone()
        if seq_len == 0:
            raise ValueError("generate() needs at least one prompt token per row")

        if eos_token_id is None:
            eos_token_id = getattr(getattr(self.backbone, "config", None), "eos_token_id", None)
        fill_id = eos_token_id if eos_token_id is not None else 0

        device = ids.device
        total = seq_len + max_gen_len
        # Working buffers in which generated tokens sit right after the prompt's
        # last real token; the original padded prompt is only used for the output.
        work_ids = ids.new_full((bs, total), fill_id)
        work_ids[:, :seq_len] = ids
        work_mask = mask.new_zeros((bs, total))
        work_mask[:, :seq_len] = mask

        rows = torch.arange(bs, device=device)
        positions = torch.arange(seq_len, device=device)
        # Column of the last attended token in each row (0 for an all-masked row).
        last = ((mask != 0).long() * positions).max(dim=1).values

        finished = torch.zeros(bs, dtype=torch.bool, device=device)
        steps = []
        for _ in range(max_gen_len):
            # Only the columns up to the furthest write head are needed.
            width = int(last.max()) + 1
            h = self.backbone(
                work_ids[:, :width], attention_mask=work_mask[:, :width]
            )["last_hidden_state"]
            next_token = self.lm_head(h[rows, last]).argmax(dim=-1).to(ids.dtype)  # (B,)
            if eos_token_id is not None:
                # Rows that already ended keep emitting EOS instead of new text.
                next_token = next_token.masked_fill(finished, eos_token_id)
            steps.append(next_token)

            last = last + 1
            work_ids[rows, last] = next_token
            work_mask[rows, last] = 1

            if eos_token_id is not None:
                finished = finished | (next_token == eos_token_id)
                if bool(finished.all()):
                    break

        return torch.cat([ids, torch.stack(steps, dim=1)], dim=1)

def evaluate(args):
    """Score a fine-tuned explainer on the test split and write its predictions.

    For every test row this accumulates the summed NLL returned by the model,
    greedily generates an explanation, scores it with smoothed sentence-level
    BLEU against the gold target, and finally writes a CSV with the columns
    ``input``, ``target`` and ``prediction``.

    Args:
        args: namespace with ``use_gpu``, ``model_size``, ``model_path``,
            ``data_path``, ``max_length``, ``batch_size``, ``gen_max_length``,
            ``version`` and ``save_path`` (may contain a ``{version}`` placeholder).

    Returns:
        dict with ``ppl``, ``bleu`` and the resolved ``save_path``.

    Raises:
        ValueError: the dataset contains no rows with ``split == "test"``.
    """
    device = torch.device("cuda" if (args.use_gpu and torch.cuda.is_available()) else "cpu")
    tok = GPT2Tokenizer.from_pretrained(args.model_size)
    tok.pad_token = tok.eos_token

    # Resolve the output location up front: substitute {version} BEFORE taking
    # the directory name, and create the directory now so a long run cannot
    # fail at the very end. A bare filename has no directory to create.
    save_path = args.save_path.format(version=args.version)
    out_dir = os.path.dirname(save_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # data
    df = pd.read_csv(args.data_path, encoding="utf-8-sig")
    test_df = df[df["split"] == "test"]
    if test_df.empty:
        raise ValueError(f"No rows with split == 'test' in {args.data_path}")
    ds = ExplDataset(test_df, tok, max_len=args.max_length)
    loader = DataLoader(ds, batch_size=args.batch_size, shuffle=False, collate_fn=ds.collate_fn)

    # load model
    # SECURITY: weights_only=False unpickles arbitrary Python objects; only load
    # checkpoints from a trusted source.
    ckpt = torch.load(args.model_path, map_location="cpu", weights_only=False)
    model = GPT2Explainer(args).to(device)
    model.load_state_dict(ckpt["model"])
    model.eval()

    # metrics accumulators
    total_loss = 0.0
    total_tokens = 0
    bleu_scores = []
    output_rows = []
    smooth = SmoothingFunction().method1

    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # NOTE(review): the labels are the tokenised target laid over the
            # prompt's positions (see ExplDataset.collate_fn), so this number is
            # the NLL of target token i given the prompt up to position i, not a
            # next-token perplexity of the target following the prompt — confirm
            # this is the quantity the training run optimised.
            loss_sum, tok_cnt, _ = model(ids, mask, labels)
            total_loss += loss_sum.item()
            total_tokens += tok_cnt.item()

            gen_ids = model.generate(ids, mask, max_gen_len=args.gen_max_length)
            # Everything after the (padded) prompt columns is newly generated;
            # move it to the host once per batch rather than once per row.
            prompt_len = ids.size(1)
            continuations = gen_ids[:, prompt_len:].cpu().tolist()

            for inp, ref, pred_tokens in zip(batch["raw_input"], batch["raw_target"], continuations):
                # Keep only the text before the first EOS.
                if tok.eos_token_id in pred_tokens:
                    pred_tokens = pred_tokens[: pred_tokens.index(tok.eos_token_id)]
                pred = tok.decode(pred_tokens, skip_special_tokens=True).strip()

                bleu_scores.append(
                    sentence_bleu([ref.split()], pred.split(), smoothing_function=smooth)
                )
                output_rows.append({
                    "input": inp,
                    "target": ref,
                    "prediction": pred
                })

    # final metrics
    if total_tokens > 0:
        try:
            ppl = math.exp(total_loss / total_tokens)
        except OverflowError:
            ppl = float("inf")
    else:
        # No scored target tokens at all: perplexity is undefined.
        ppl = float("nan")
    avg_bleu = float(np.mean(bleu_scores)) if bleu_scores else float("nan")

    print(f"Test PPL:  {ppl:.3f}")
    print(f"Avg BLEU:  {avg_bleu:.3f}")

    # save CSV
    out_df = pd.DataFrame(output_rows)
    out_df.to_csv(save_path, index=False, encoding="utf-8-sig")
    print("Saved outputs to", save_path)

    return {"ppl": ppl, "bleu": avg_bleu, "save_path": save_path}

if __name__ == "__main__":
    import sys

    # --model_path and --version have no sensible built-in default. To make the
    # cell runnable in a notebook (where there is no command line), both may be
    # supplied through environment variables, e.g. in an earlier cell:
    #   %env EXPL_MODEL_PATH=/path/to/checkpoint.pt
    #   %env EXPL_VERSION=cot
    # When a variable is unset the corresponding flag stays required.
    env_model_path = os.environ.get("EXPL_MODEL_PATH")
    env_version = os.environ.get("EXPL_VERSION")

    p = argparse.ArgumentParser()
    p.add_argument("--data_path",      type=str, default="/content/drive/MyDrive/CSEG321/dataset/explanation_all_options.csv")
    p.add_argument("--model_path",     type=str, default=env_model_path,
                   required=env_model_path is None,
                   help="Checkpoint to evaluate (or set EXPL_MODEL_PATH)")
    p.add_argument("--save_path",      type=str,
                   default="/content/drive/MyDrive/CSEG321/explanation_all_options_test_{version}.csv")
    p.add_argument("--model_size",     type=str, default="gpt2",
                   choices=["gpt2","gpt2-medium","gpt2-large"])
    p.add_argument("--batch_size",     type=int, default=4)
    p.add_argument("--max_length",     type=int, default=512)
    p.add_argument("--gen_max_length", type=int, default=100)
    p.add_argument("--version",        type=str, default=env_version,
                   required=env_version is None,
                   help="Identifier for this run, e.g., 'cot' (or set EXPL_VERSION)")
    p.add_argument("--use_gpu",        action="store_true")

    # Inside a Jupyter/Colab kernel sys.argv belongs to the kernel launcher, so
    # parse an empty list there; as a plain script, honour the real command line
    # (the previous unconditional parse_args([]) discarded it and always failed
    # on the two required options).
    in_notebook = "ipykernel" in sys.modules
    args = p.parse_args([] if in_notebook else None)
    evaluate(args)


usage: colab_kernel_launcher.py [-h] [--data_path DATA_PATH] --model_path
                                MODEL_PATH [--save_path SAVE_PATH]
                                [--model_size {gpt2,gpt2-medium,gpt2-large}]
                                [--batch_size BATCH_SIZE]
                                [--max_length MAX_LENGTH]
                                [--gen_max_length GEN_MAX_LENGTH] --version
                                VERSION [--use_gpu]
colab_kernel_launcher.py: error: the following arguments are required: --model_path, --version


SystemExit: 2