In [None]:
from google.colab import drive

drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
!git clone https://github.com/ItWasAllYellow/public_cs224n_gpt.git

%cd /content/public_cs224n_gpt

Cloning into 'public_cs224n_gpt'...
remote: Enumerating objects: 69, done.[K
remote: Counting objects: 100% (37/37), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 69 (delta 21), reused 13 (delta 13), pack-reused 32 (from 1)[K
Receiving objects: 100% (69/69), 30.87 MiB | 29.60 MiB/s, done.
Resolving deltas: 100% (22/22), done.
/content/public_cs224n_gpt


In [None]:
"""
Fine-tune GPT-2 to generate explanations (all options in prompt)
input,target,grade,split   →  Cross-entropy LM loss
"""

import argparse, os, torch, pandas as pd, numpy as np
import torch.nn as nn, torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import GPT2Tokenizer
from models.gpt2 import GPT2Model       # custom backbone

TQDM_DISABLE=False

def seed_everything(seed=42):
    import random, numpy as np, torch, os
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic=True; torch.backends.cudnn.benchmark=False

# ─────────────────────────────────────────────────────────────────── #
class GPT2Explainer(nn.Module):
    def __init__(self,args):
        super().__init__()
        self.backbone = GPT2Model.from_pretrained(
            model=args.model_size, d=args.d, l=args.l, num_heads=args.num_heads
        )
        vocab = GPT2Tokenizer.from_pretrained(args.model_size).vocab_size
        self.lm_head = nn.Linear(args.d, vocab, bias=False)
        for p in self.parameters(): p.requires_grad=True
    def forward(self,ids,mask,labels=None):
        out = self.backbone(ids, attention_mask=mask)["last_hidden_state"]
        logits = self.lm_head(out)
        if labels is None: return logits
        loss = F.cross_entropy(
            logits.view(-1, logits.size(-1)),
            labels.view(-1),
            ignore_index=-100
        )
        return loss, logits
# ─────────────────────────────────────────────────────────────────── #
class ExplDataset(Dataset):
    def __init__(self,df,tok,max_len=512):
        self.df=df.reset_index(drop=True); self.tok=tok; self.max_len=max_len
    def __len__(self): return len(self.df)
    def __getitem__(self,idx):
        r=self.df.iloc[idx]
        return r["input"], r["target"]
    def collate_fn(self,batch):
        inputs, targets = zip(*batch)
        enc_in  = self.tok(list(inputs),  padding="max_length", truncation=True,
                           max_length=self.max_len, return_tensors="pt")
        enc_out = self.tok(list(targets), padding="max_length", truncation=True,
                           max_length=self.max_len, return_tensors="pt")
        labels = enc_out.input_ids.clone()
        labels[enc_out.attention_mask==0] = -100    # ignore pad
        return {"input_ids":enc_in.input_ids,
                "attention_mask":enc_in.attention_mask,
                "labels":labels}
# ─────────────────────────────────────────────────────────────────── #
def evaluate(loader,model,dev):
    model.eval(); tot_loss=0; tok=0
    with torch.no_grad():
        for bt in loader:
            ids = bt["input_ids"].to(dev)
            mask= bt["attention_mask"].to(dev)
            lbl = bt["labels"].to(dev)
            loss,_ = model(ids,mask,lbl)
            num = (lbl!=-100).sum().item()
            tot_loss += loss.item()*num; tok += num
    ppl = np.exp(tot_loss/tok)
    return ppl
# ─────────────────────────────────────────────────────────────────── #
def train(args):
    from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
    seed_everything(args.seed)
    dev = torch.device("cuda" if args.use_gpu and torch.cuda.is_available() else "cpu")

    df = pd.read_csv(args.data_path, encoding="utf-8-sig")
    train_df = df[df.split=="train"]; valid_df = df[df.split=="valid"]

    tok = GPT2Tokenizer.from_pretrained(args.model_size); tok.pad_token = tok.eos_token
    tr_ds = ExplDataset(train_df, tok, args.max_length)
    va_ds = ExplDataset(valid_df, tok, args.max_length)
    tr_ld = DataLoader(tr_ds,batch_size=args.batch_size,shuffle=True, collate_fn=tr_ds.collate_fn)
    va_ld = DataLoader(va_ds,batch_size=args.batch_size,shuffle=False, collate_fn=va_ds.collate_fn)

    model = GPT2Explainer(args).to(dev)
    opt = torch.optim.AdamW(model.parameters(), lr=args.lr)

    best = 1e9
    for ep in range(1,args.epochs+1):
        model.train(); ep_loss=0
        for bt in tqdm(tr_ld,disable=TQDM_DISABLE,desc=f"Train{ep}"):
            opt.zero_grad()
            loss,_ = model(bt["input_ids"].to(dev),
                           bt["attention_mask"].to(dev),
                           bt["labels"].to(dev))
            loss.backward(); opt.step(); ep_loss+=loss.item()
        print(f"Epoch{ep} train_loss={ep_loss/len(tr_ld):.4f}")
        ppl = evaluate(va_ld, model, dev)
        print(f"  Valid PPL={ppl:.3f}")
        if ppl<best: best=ppl; save(model,opt,args)
def save(m,o,args):
    os.makedirs(os.path.dirname(args.save_path),exist_ok=True)
    torch.save({"model":m.state_dict(),"optim":o.state_dict(),"args":args},args.save_path)
    print("Saved to",args.save_path)
def get_args():
    p=argparse.ArgumentParser()
    p.add_argument("--data_path", type=str, default="/content/drive/MyDrive/CSEG321/dataset/explanation_all_options.csv")
    p.add_argument("--model_size",type=str,default="gpt2",choices=["gpt2","gpt2-medium","gpt2-large"])
    p.add_argument("--batch_size", type=int, default=4)
    p.add_argument("--max_length", type=int, default=512)
    p.add_argument("--lr",        type=float, default=5e-5)
    p.add_argument("--epochs",    type=int, default=20)
    p.add_argument("--seed",      type=int, default=42)
    p.add_argument("--use_gpu",   action="store_true")
    args=p.parse_args([])
    if args.model_size=="gpt2": args.d,args.l,args.num_heads=768,12,12
    elif args.model_size=="gpt2-medium": args.d,args.l,args.num_heads=1024,24,16
    else: args.d,args.l,args.num_heads=1280,36,20
    args.save_path="/content/drive/MyDrive/CSEG321/models/expl_all_default_20.pt"
    return args
if __name__=="__main__":
    seed_everything()
    train(get_args())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Train1: 100%|██████████| 36/36 [02:48<00:00,  4.68s/it]


Epoch1 train_loss=11.5032
  Valid PPL=30685.285
Saved to /content/drive/MyDrive/CSEG321//expl_all_default_20.pt


Train2: 100%|██████████| 36/36 [02:48<00:00,  4.69s/it]


Epoch2 train_loss=8.9322
  Valid PPL=2342.210
Saved to /content/drive/MyDrive/CSEG321//expl_all_default_20.pt


Train3: 100%|██████████| 36/36 [03:03<00:00,  5.09s/it]


Epoch3 train_loss=6.8002
  Valid PPL=1559.233
Saved to /content/drive/MyDrive/CSEG321//expl_all_default_20.pt


Train4: 100%|██████████| 36/36 [03:04<00:00,  5.14s/it]


Epoch4 train_loss=6.4336
  Valid PPL=1560.346


Train5: 100%|██████████| 36/36 [02:50<00:00,  4.74s/it]


Epoch5 train_loss=6.3209
  Valid PPL=1665.307


Train6: 100%|██████████| 36/36 [02:47<00:00,  4.64s/it]


Epoch6 train_loss=6.2795
  Valid PPL=1704.271


Train7: 100%|██████████| 36/36 [02:47<00:00,  4.64s/it]


Epoch7 train_loss=6.2359
  Valid PPL=1742.923


Train8: 100%|██████████| 36/36 [02:48<00:00,  4.67s/it]


Epoch8 train_loss=6.1946
  Valid PPL=1803.667


Train9: 100%|██████████| 36/36 [02:49<00:00,  4.70s/it]


Epoch9 train_loss=6.1707
  Valid PPL=1870.665


Train10: 100%|██████████| 36/36 [02:50<00:00,  4.74s/it]


Epoch10 train_loss=6.1283
  Valid PPL=1926.387


Train11: 100%|██████████| 36/36 [02:49<00:00,  4.70s/it]


Epoch11 train_loss=6.1124
  Valid PPL=1922.118


Train12: 100%|██████████| 36/36 [02:48<00:00,  4.67s/it]


Epoch12 train_loss=6.0617
  Valid PPL=1964.127


Train13: 100%|██████████| 36/36 [02:48<00:00,  4.67s/it]


Epoch13 train_loss=6.0111
  Valid PPL=2011.288


Train14: 100%|██████████| 36/36 [02:49<00:00,  4.70s/it]


Epoch14 train_loss=5.9679
  Valid PPL=2080.329


Train15: 100%|██████████| 36/36 [02:47<00:00,  4.67s/it]


Epoch15 train_loss=5.9073
  Valid PPL=2067.973


Train16: 100%|██████████| 36/36 [02:47<00:00,  4.66s/it]


Epoch16 train_loss=5.8208
  Valid PPL=2146.710


Train17: 100%|██████████| 36/36 [02:49<00:00,  4.70s/it]


Epoch17 train_loss=5.7443
  Valid PPL=2118.340


Train18: 100%|██████████| 36/36 [02:48<00:00,  4.67s/it]


Epoch18 train_loss=5.6155
  Valid PPL=2192.596


Train19: 100%|██████████| 36/36 [02:49<00:00,  4.71s/it]


Epoch19 train_loss=5.4830
  Valid PPL=2213.454


Train20: 100%|██████████| 36/36 [02:49<00:00,  4.70s/it]


Epoch20 train_loss=5.3301
  Valid PPL=2317.271


In [None]:
"""
Fine-tune GPT-2 to generate explanations (all options in prompt)
input,target,grade,split   →  Cross-entropy LM loss
"""

import argparse, os, torch, pandas as pd, numpy as np
import torch.nn as nn, torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import GPT2Tokenizer
from models.gpt2 import GPT2Model       # custom backbone

TQDM_DISABLE=False

def seed_everything(seed=42):
    import random, numpy as np, torch, os
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic=True; torch.backends.cudnn.benchmark=False

# ─────────────────────────────────────────────────────────────────── #
class GPT2Explainer(nn.Module):
    def __init__(self,args):
        super().__init__()
        self.backbone = GPT2Model.from_pretrained(
            model=args.model_size, d=args.d, l=args.l, num_heads=args.num_heads
        )
        vocab = GPT2Tokenizer.from_pretrained(args.model_size).vocab_size
        self.lm_head = nn.Linear(args.d, vocab, bias=False)
        for p in self.parameters(): p.requires_grad=True
    def forward(self,ids,mask,labels=None):
        out = self.backbone(ids, attention_mask=mask)["last_hidden_state"]
        logits = self.lm_head(out)
        if labels is None: return logits
        loss = F.cross_entropy(
            logits.view(-1, logits.size(-1)),
            labels.view(-1),
            ignore_index=-100
        )
        return loss, logits
# ─────────────────────────────────────────────────────────────────── #
class ExplDataset(Dataset):
    def __init__(self,df,tok,max_len=512):
        self.df=df.reset_index(drop=True); self.tok=tok; self.max_len=max_len
    def __len__(self): return len(self.df)
    def __getitem__(self,idx):
        r=self.df.iloc[idx]
        return r["input"], r["target"]
    def collate_fn(self,batch):
        inputs, targets = zip(*batch)
        enc_in  = self.tok(list(inputs),  padding="max_length", truncation=True,
                           max_length=self.max_len, return_tensors="pt")
        enc_out = self.tok(list(targets), padding="max_length", truncation=True,
                           max_length=self.max_len, return_tensors="pt")
        labels = enc_out.input_ids.clone()
        labels[enc_out.attention_mask==0] = -100    # ignore pad
        return {"input_ids":enc_in.input_ids,
                "attention_mask":enc_in.attention_mask,
                "labels":labels}
# ─────────────────────────────────────────────────────────────────── #
def evaluate(loader,model,dev):
    model.eval(); tot_loss=0; tok=0
    with torch.no_grad():
        for bt in loader:
            ids = bt["input_ids"].to(dev)
            mask= bt["attention_mask"].to(dev)
            lbl = bt["labels"].to(dev)
            loss,_ = model(ids,mask,lbl)
            num = (lbl!=-100).sum().item()
            tot_loss += loss.item()*num; tok += num
    ppl = np.exp(tot_loss/tok)
    return ppl
# ─────────────────────────────────────────────────────────────────── #
def train(args):
    from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
    seed_everything(args.seed)
    dev = torch.device("cuda" if args.use_gpu and torch.cuda.is_available() else "cpu")

    df = pd.read_csv(args.data_path, encoding="utf-8-sig")
    train_df = df[df.split=="train"]; valid_df = df[df.split=="valid"]

    tok = GPT2Tokenizer.from_pretrained(args.model_size); tok.pad_token = tok.eos_token
    tr_ds = ExplDataset(train_df, tok, args.max_length)
    va_ds = ExplDataset(valid_df, tok, args.max_length)
    tr_ld = DataLoader(tr_ds,batch_size=args.batch_size,shuffle=True, collate_fn=tr_ds.collate_fn)
    va_ld = DataLoader(va_ds,batch_size=args.batch_size,shuffle=False, collate_fn=va_ds.collate_fn)

    model = GPT2Explainer(args).to(dev)
    opt = torch.optim.AdamW(model.parameters(), lr=args.lr)

    best = 1e9
    for ep in range(1,args.epochs+1):
        model.train(); ep_loss=0
        for bt in tqdm(tr_ld,disable=TQDM_DISABLE,desc=f"Train{ep}"):
            opt.zero_grad()
            loss,_ = model(bt["input_ids"].to(dev),
                           bt["attention_mask"].to(dev),
                           bt["labels"].to(dev))
            loss.backward(); opt.step(); ep_loss+=loss.item()
        print(f"Epoch{ep} train_loss={ep_loss/len(tr_ld):.4f}")
        ppl = evaluate(va_ld, model, dev)
        print(f"  Valid PPL={ppl:.3f}")
        if ppl<best: best=ppl; save(model,opt,args)
def save(m,o,args):
    os.makedirs(os.path.dirname(args.save_path),exist_ok=True)
    torch.save({"model":m.state_dict(),"optim":o.state_dict(),"args":args},args.save_path)
    print("Saved to",args.save_path)
def get_args():
    p=argparse.ArgumentParser()
    p.add_argument("--data_path", type=str, default="/content/drive/MyDrive/CSEG321/dataset/explanation_only_answer.csv")
    p.add_argument("--model_size",type=str,default="gpt2",choices=["gpt2","gpt2-medium","gpt2-large"])
    p.add_argument("--batch_size", type=int, default=4)
    p.add_argument("--max_length", type=int, default=512)
    p.add_argument("--lr",        type=float, default=5e-5)
    p.add_argument("--epochs",    type=int, default=20)
    p.add_argument("--seed",      type=int, default=42)
    p.add_argument("--use_gpu",   action="store_true")
    args=p.parse_args([])
    if args.model_size=="gpt2": args.d,args.l,args.num_heads=768,12,12
    elif args.model_size=="gpt2-medium": args.d,args.l,args.num_heads=1024,24,16
    else: args.d,args.l,args.num_heads=1280,36,20
    args.save_path="/content/drive/MyDrive/CSEG321/models/expl_only_answer_default_20.pt"
    return args
if __name__=="__main__":
    seed_everything()
    train(get_args())


Train1: 100%|██████████| 36/36 [02:47<00:00,  4.65s/it]


Epoch1 train_loss=11.4924
  Valid PPL=30103.460
Saved to /content/drive/MyDrive/CSEG321//expl_only_answer_default_20.pt


Train2: 100%|██████████| 36/36 [02:50<00:00,  4.74s/it]


Epoch2 train_loss=8.8267
  Valid PPL=2389.689
Saved to /content/drive/MyDrive/CSEG321//expl_only_answer_default_20.pt


Train3: 100%|██████████| 36/36 [03:06<00:00,  5.18s/it]


Epoch3 train_loss=6.8100
  Valid PPL=1544.837
Saved to /content/drive/MyDrive/CSEG321//expl_only_answer_default_20.pt


Train4: 100%|██████████| 36/36 [03:02<00:00,  5.07s/it]


Epoch4 train_loss=6.4694
  Valid PPL=1579.039


Train5: 100%|██████████| 36/36 [02:47<00:00,  4.66s/it]


Epoch5 train_loss=6.3567
  Valid PPL=1700.941


Train6: 100%|██████████| 36/36 [02:48<00:00,  4.68s/it]


Epoch6 train_loss=6.3169
  Valid PPL=1724.765


Train7: 100%|██████████| 36/36 [02:50<00:00,  4.73s/it]


Epoch7 train_loss=6.2699
  Valid PPL=1758.587


Train8: 100%|██████████| 36/36 [02:50<00:00,  4.72s/it]


Epoch8 train_loss=6.2356
  Valid PPL=1836.366


Train9: 100%|██████████| 36/36 [02:50<00:00,  4.73s/it]


Epoch9 train_loss=6.2156
  Valid PPL=1890.044


Train10: 100%|██████████| 36/36 [02:51<00:00,  4.76s/it]


Epoch10 train_loss=6.1758
  Valid PPL=1954.211


Train11: 100%|██████████| 36/36 [02:50<00:00,  4.73s/it]


Epoch11 train_loss=6.1618
  Valid PPL=1938.564


Train12: 100%|██████████| 36/36 [02:47<00:00,  4.66s/it]


Epoch12 train_loss=6.1158
  Valid PPL=1978.583


Train13: 100%|██████████| 36/36 [02:48<00:00,  4.67s/it]


Epoch13 train_loss=6.0690
  Valid PPL=2027.154


Train14: 100%|██████████| 36/36 [02:47<00:00,  4.66s/it]


Epoch14 train_loss=6.0296
  Valid PPL=2077.479


Train15: 100%|██████████| 36/36 [02:45<00:00,  4.60s/it]


Epoch15 train_loss=5.9765
  Valid PPL=2057.382


Train16: 100%|██████████| 36/36 [02:47<00:00,  4.66s/it]


Epoch16 train_loss=5.8945
  Valid PPL=2120.404


Train17: 100%|██████████| 36/36 [02:50<00:00,  4.74s/it]


Epoch17 train_loss=5.8269
  Valid PPL=2160.873


Train18: 100%|██████████| 36/36 [02:51<00:00,  4.76s/it]


Epoch18 train_loss=5.7071
  Valid PPL=2158.972


Train19: 100%|██████████| 36/36 [02:49<00:00,  4.71s/it]


Epoch19 train_loss=5.5759
  Valid PPL=2248.253


Train20: 100%|██████████| 36/36 [02:49<00:00,  4.71s/it]


Epoch20 train_loss=5.4213
  Valid PPL=2301.450
