In [None]:
!pip -q install -U transformers datasets accelerate sentencepiece evaluate
import torch, os, pandas as pd, numpy as np
# Report whether CUDA is usable and, if so, the name of device 0.
has_gpu = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if has_gpu else "NO GPU"
print("GPU:", has_gpu, gpu_name)

GPU: True Tesla T4


In [None]:
# Load the labelled battles (one row per thread + the two messages being judged).
df = pd.read_csv("/content/battle_train_4subscores.csv")

# labels: 8 regression targets in fixed order (A's four sub-scores, then B's)
label_cols = [
    "human_A_humor","human_A_punch","human_A_originality","human_A_relevance",
    "human_B_humor","human_B_punch","human_B_originality","human_B_relevance",
]

# Every column the rest of the notebook reads. Built from label_cols so the
# two lists cannot drift apart.
required = ["thread_text","A_text","B_text"] + label_cols
missing = [c for c in required if c not in df.columns]
if missing:
    raise ValueError(f"Missing columns: {missing}\n\nFound columns:\n{list(df.columns)}")

# Coerce labels to numbers; anything unparseable becomes NaN.
for c in label_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")

# Drop rows with a missing label instead of filling them with 0: a filled-in 0
# is indistinguishable from a genuine "worst possible" score and the model
# would be trained to reproduce it.
rows_before = len(df)
df = df.dropna(subset=label_cols).reset_index(drop=True)
if len(df) != rows_before:
    print(f"Dropped {rows_before - len(df)} rows with missing or non-numeric labels")

# Clamp to [0,100] in case humans entered out-of-range values
df[label_cols] = df[label_cols].clip(0, 100)

def make_input(row):
    """Render one battle as the single prompt string the model reads.

    ``row`` is indexed by ``"thread_text"``, ``"A_text"`` and ``"B_text"``;
    each value is passed through ``str``. The result is three labelled
    sections separated by blank lines: the full thread, then player A's
    message, then player B's message.
    """
    thread = str(row["thread_text"])
    msg_a = str(row["A_text"])
    msg_b = str(row["B_text"])
    return f"THREAD:\n{thread}\n\nPLAYER_A:\n{msg_a}\n\nPLAYER_B:\n{msg_b}"

# One prompt string per battle, stored next to the labels.
df["text"] = df.apply(make_input, axis=1)

# Sanity check: dataset size plus the first 500 characters of the first prompt.
first_prompt = df["text"].iloc[0]
print("Rows:", len(df))
print("Example text:\n", first_prompt[:500], "...")

Rows: 6000
Example text:
 THREAD:
A: Look, your planning feels like an Excel sheet with merged cellsand the results are tragic. I’ve seen stronger arguments in fortune cookies.
B: Honestly, your planning feels like an Excel sheet with merged cellsand the bar was already low. Your ideas have great vibes—shame about the content.
B: Your execution is a screenshot of a screenshotand the results are tragic.
A: That comeback is a CAPTCHA that fails twiceand the plot is missing.

PLAYER_A:
That comeback is a CAPTCHA that fails  ...


In [None]:
!pip install -U transformers accelerate datasets

In [None]:
!pip -q install -U transformers datasets accelerate sentencepiece evaluate
import torch, os, pandas as pd, numpy as np
# Print CUDA availability followed by the device-0 name ("NO GPU" when absent).
cuda_ready = torch.cuda.is_available()
if cuda_ready:
    device_label = torch.cuda.get_device_name(0)
else:
    device_label = "NO GPU"
print("GPU:", cuda_ready, device_label)


GPU: True Tesla T4


In [None]:
# Load the labelled battles (one row per thread + the two messages being judged).
df = pd.read_csv("/content/battle_train_4subscores.csv")

# labels: 8 regression targets in fixed order (A's four sub-scores, then B's)
label_cols = [
    "human_A_humor","human_A_punch","human_A_originality","human_A_relevance",
    "human_B_humor","human_B_punch","human_B_originality","human_B_relevance",
]

# Every column the rest of the notebook reads. Built from label_cols so the
# two lists cannot drift apart.
required = ["thread_text","A_text","B_text"] + label_cols
missing = [c for c in required if c not in df.columns]
if missing:
    raise ValueError(f"Missing columns: {missing}\n\nFound columns:\n{list(df.columns)}")

# Coerce labels to numbers; anything unparseable becomes NaN.
for c in label_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")

# Rows without a usable label are dropped rather than zero-filled: a 0 written
# here would be treated as a real "worst possible" score during training.
rows_before = len(df)
df = df.dropna(subset=label_cols).reset_index(drop=True)
if len(df) != rows_before:
    print(f"Dropped {rows_before - len(df)} rows with missing or non-numeric labels")

# Clamp to [0,100] in case humans entered out-of-range values
df[label_cols] = df[label_cols].clip(0, 100)

def make_input(row):
    """Build the model prompt for one battle.

    Concatenates, in order, the thread, player A's message and player B's
    message (each converted with ``str``), every section introduced by its
    header line and separated from the next by an empty line.
    """
    pieces = (
        "THREAD:\n", str(row["thread_text"]), "\n\n",
        "PLAYER_A:\n", str(row["A_text"]), "\n\n",
        "PLAYER_B:\n", str(row["B_text"]),
    )
    return "".join(pieces)

# Attach the rendered prompt to every row.
df["text"] = df.apply(make_input, axis=1)

# Quick look at what the model will see: row count and a 500-character preview.
n_rows = len(df)
preview = df["text"].iloc[0][:500]
print("Rows:", n_rows)
print("Example text:\n", preview, "...")

Rows: 6000
Example text:
 THREAD:
A: Look, your planning feels like an Excel sheet with merged cellsand the results are tragic. I’ve seen stronger arguments in fortune cookies.
B: Honestly, your planning feels like an Excel sheet with merged cellsand the bar was already low. Your ideas have great vibes—shame about the content.
B: Your execution is a screenshot of a screenshotand the results are tragic.
A: That comeback is a CAPTCHA that fails twiceand the plot is missing.

PLAYER_A:
That comeback is a CAPTCHA that fails  ...


In [None]:
!pip -q install -U transformers accelerate datasets

In [None]:
# Record which transformers release the unpinned `pip install -U` resolved to;
# the training cells below use keyword names (e.g. `eval_strategy`) that are
# tied to the installed version.
import transformers
print(transformers.__version__)

4.57.3


In [None]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer
import torch.nn as nn
import torch

MODEL_NAME = "distilroberta-base"   # fast + solid
MAX_LEN = 384                      # handles multi-round threads reasonably

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize(batch):
    """Tokenize a batch of prompts, truncating/padding each one to MAX_LEN tokens."""
    encoded = tokenizer(
        batch["text"],
        truncation=True,
        max_length=MAX_LEN,
        padding="max_length",
    )
    return encoded


# Keep only the prompt and the 8 label columns, tokenize everything in batches,
# then hold out 10% (fixed seed) as the validation split.
ds = (
    Dataset.from_pandas(df[["text"] + label_cols])
    .map(tokenize, batched=True)
    .train_test_split(test_size=0.1, seed=42)
)
train_ds = ds["train"]
val_ds = ds["test"]

# Custom regression head outputting 8 scores
# Custom regression head outputting 8 scores
class RoastJudge(nn.Module):
    """Pretrained encoder plus an MLP head that regresses ``out_dim`` scores.

    The head (Linear -> Tanh -> Linear) reads the hidden state of the first
    token. There is no output activation, so predictions are unbounded and
    live on whatever scale the labels use.
    """

    def __init__(self, base_name=MODEL_NAME, out_dim=8):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(base_name)
        width = self.encoder.config.hidden_size
        self.dropout = nn.Dropout(0.1)
        self.head = nn.Sequential(
            nn.Linear(width, width),
            nn.Tanh(),
            nn.Linear(width, out_dim),
        )

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Return ``{"loss": ..., "logits": ...}``.

        ``logits`` has one row of ``out_dim`` predictions per input sequence.
        ``loss`` is the mean squared error against ``labels`` when labels are
        given, otherwise ``None``.
        """
        hidden_states = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state
        # No pooled output is used; the first-token (CLS) vector stands in for it.
        first_token = hidden_states[:, 0, :]
        preds = self.head(self.dropout(first_token))  # (B, 8)

        loss = nn.functional.mse_loss(preds, labels) if labels is not None else None
        return {"loss": loss, "logits": preds}

def format_labels(example):
    """Pack the per-column scores into a single ``labels`` list (order = label_cols)."""
    example["labels"] = list(map(float, (example[c] for c in label_cols)))
    return example


cols_to_remove = ["text"] + label_cols


def _to_model_inputs(split):
    """Add ``labels``, drop the raw text/label columns, and expose torch tensors."""
    packed = split.map(format_labels)
    stale = [c for c in cols_to_remove if c in packed.column_names]
    packed = packed.remove_columns(stale)
    packed.set_format(type="torch", columns=["input_ids","attention_mask","labels"])
    return packed


train_ds = _to_model_inputs(train_ds)
val_ds = _to_model_inputs(val_ds)

model = RoastJudge()

def compute_metrics(eval_pred):
    """Mean absolute error and RMSE over all targets.

    ``eval_pred`` unpacks into ``(predictions, labels)``. Predictions are
    clipped into the 0..100 score range before the errors are measured.
    Returns plain floats under the keys ``"mae"`` and ``"rmse"``.
    """
    raw_preds, targets = eval_pred
    errors = np.clip(raw_preds, 0, 100) - targets
    return {
        "mae": float(np.mean(np.abs(errors))),
        "rmse": float(np.sqrt(np.mean(errors ** 2))),
    }

# Fine-tuning configuration. Keyword names follow the installed transformers
# release (4.57.3 according to the version cell above).
args = TrainingArguments(
    output_dir="roastjudge_ckpt",
    report_to="none",

    # optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=torch.cuda.is_available(),   # mixed precision only when a GPU is present

    # evaluate and checkpoint on the same 200-step cadence; log every 50 steps
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    logging_steps=50,
)

trainer = Trainer(
    model=model,
    args=args,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=val_ds,
)

# Targets in this cell are on the raw 0..100 scale, so the MSE values logged
# by the run are correspondingly large.
trainer.train()

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Mae,Rmse
200,5743.8981,5676.355957,74.1306,75.341591
400,5361.8669,5303.72168,71.573875,72.826653
600,5107.7134,5066.853516,69.900719,71.181831
800,4960.19,4928.692383,68.90554,70.204651
1000,4882.3138,4879.509277,68.547905,69.853485


TrainOutput(global_step=1014, training_loss=5330.607425727318, metrics={'train_runtime': 292.8174, 'train_samples_per_second': 55.325, 'train_steps_per_second': 3.463, 'total_flos': 0.0, 'train_loss': 5330.607425727318, 'epoch': 3.0})

In [None]:
# ============================
# RoastJudge Trainer (Colab GPU)
# ============================

!pip -q install -U transformers accelerate datasets sentencepiece

import os, numpy as np, pandas as pd, torch
import torch.nn as nn
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer

# ---------- CONFIG ----------
CSV_PATH = "/content/battle_train_4subscores.csv"   # <- ensure this exists in Colab
MODEL_NAME = "distilroberta-base"                  # fast + solid
MAX_LEN = 384                                      # handles multi-round threads reasonably
EPOCHS = 3                                         # passed as num_train_epochs
BATCH = 16                                         # per-device batch size for both training and evaluation
LR = 2e-5                                          # passed as learning_rate
OUT_DIR = "roastjudge_ckpt"                        # Trainer output_dir (checkpoints)
EXPORT_DIR = "roastjudge_export"                   # encoder, tokenizer and head.pt are written here after training

# Show whether CUDA is available and which device will be used.
print("CUDA:", torch.cuda.is_available(), "| GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "NO GPU")

# ---------- LOAD CSV ----------
# Fail early with an actionable message when the training file is absent.
if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(f"CSV not found at {CSV_PATH}. Upload it to Colab Files pane first.")

df = pd.read_csv(CSV_PATH)

# 8 regression targets in fixed order (A's four sub-scores, then B's)
label_cols = [
    "human_A_humor","human_A_punch","human_A_originality","human_A_relevance",
    "human_B_humor","human_B_punch","human_B_originality","human_B_relevance",
]

# Every column the pipeline reads. Built from label_cols so the two lists
# cannot drift apart.
required = ["thread_text","A_text","B_text"] + label_cols
missing = [c for c in required if c not in df.columns]
if missing:
    raise ValueError(f"Missing columns: {missing}\n\nFound columns:\n{list(df.columns)}")

# Coerce labels to numbers; anything unparseable becomes NaN.
for c in label_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")

# Drop rows with a missing label instead of filling them with 0: a filled-in 0
# is indistinguishable from a genuine "worst possible" score and the model
# would be trained to reproduce it.
rows_before = len(df)
df = df.dropna(subset=label_cols).reset_index(drop=True)
if len(df) != rows_before:
    print(f"Dropped {rows_before - len(df)} rows with missing or non-numeric labels")

# clamp labels to 0..100
df[label_cols] = df[label_cols].clip(0, 100)

# build model input text
def make_input(row):
    """Return the prompt for one battle: thread, then A's message, then B's.

    Each of ``row["thread_text"]``, ``row["A_text"]`` and ``row["B_text"]``
    is converted with ``str`` and placed under its own header line; sections
    are separated by a blank line.
    """
    thread, msg_a, msg_b = (str(row[key]) for key in ("thread_text", "A_text", "B_text"))
    return (
        "THREAD:\n" + thread + "\n\n"
        + "PLAYER_A:\n" + msg_a + "\n\n"
        + "PLAYER_B:\n" + msg_b
    )
df["text"] = df.apply(make_input, axis=1)

# ✅ CRITICAL: rescale labels from 0..100 to 0..1 so the targets match the
# range of a sigmoid output and the MSE stays small.
for c in label_cols:
    df[c] = df[c].div(100.0)

# Sanity check: dataset size and a 350-character preview of the first prompt.
example_prompt = df["text"].iloc[0]
print("Rows:", len(df))
print("Example:\n", example_prompt[:350], "...\n")

# ---------- DATASET ----------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize(batch):
    """Tokenize a batch of prompts to exactly MAX_LEN tokens each.

    Shorter prompts are padded, longer ones truncated.
    NOTE(review): the prompt puts PLAYER_B's message last, so if truncation
    removes the tail (presumably the tokenizer default; confirm), a very long
    thread can cut off the text being judged.
    """
    return tokenizer(
        batch["text"],
        padding="max_length",
        max_length=MAX_LEN,
        truncation=True,
    )


# Prompt + label columns only -> tokenized -> 90/10 train/validation split (fixed seed).
ds = Dataset.from_pandas(df[["text"] + label_cols])
ds = ds.map(tokenize, batched=True)
ds = ds.train_test_split(test_size=0.1, seed=42)
train_ds = ds["train"]
val_ds = ds["test"]

def add_labels(example):
    """Collect the label columns (already scaled to 0..1) into one ``labels`` list."""
    targets = []
    for col in label_cols:
        targets.append(float(example[col]))
    example["labels"] = targets
    return example


train_ds = train_ds.map(add_labels)
val_ds   = val_ds.map(add_labels)

# The raw text and the individual label columns are no longer needed once
# input_ids / attention_mask / labels exist.
drop_cols = ["text"] + label_cols
tensor_cols = ["input_ids","attention_mask","labels"]

train_ds = train_ds.remove_columns([c for c in drop_cols if c in train_ds.column_names])
train_ds.set_format(type="torch", columns=tensor_cols)

val_ds = val_ds.remove_columns([c for c in drop_cols if c in val_ds.column_names])
val_ds.set_format(type="torch", columns=tensor_cols)

# ---------- MODEL ----------
class RoastJudge(nn.Module):
    """Pretrained encoder with an MLP head that predicts ``out_dim`` scores in 0..1.

    The head is Linear -> Tanh -> Linear on the first-token hidden state,
    followed by a sigmoid so predictions share the 0..1 range of the
    normalised labels.
    """

    def __init__(self, base_name=MODEL_NAME, out_dim=8):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(base_name)
        dim = self.encoder.config.hidden_size
        self.dropout = nn.Dropout(0.1)
        self.head = nn.Sequential(
            nn.Linear(dim, dim),
            nn.Tanh(),
            nn.Linear(dim, out_dim),
        )

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Return ``{"loss": ..., "logits": ...}``.

        Despite the key name, ``logits`` holds the post-sigmoid predictions,
        one row of ``out_dim`` values per sequence. ``loss`` is the MSE
        against ``labels`` when provided, else ``None``.
        """
        encoded = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        cls_vec = encoded.last_hidden_state[:, 0, :]          # (B, H)
        preds = self.head(self.dropout(cls_vec)).sigmoid()    # (B, 8), forced into 0..1

        if labels is None:
            return {"loss": None, "logits": preds}
        return {"loss": nn.functional.mse_loss(preds, labels), "logits": preds}


model = RoastJudge()

# ---------- METRICS ----------
def compute_metrics(eval_pred):
    """MAE and RMSE, reported on the 0..100 scale.

    ``eval_pred`` unpacks into ``(predictions, labels)``, both on the 0..1
    scale. The errors are computed there and multiplied by 100 so the numbers
    read as score points.
    """
    preds, labels = eval_pred
    diff = preds - labels
    mae = float(np.mean(np.abs(diff)))
    rmse = float(np.sqrt(np.mean(diff ** 2)))
    return {"mae": mae * 100.0, "rmse": rmse * 100.0}

# ---------- TRAINING ARGS ----------
# NOTE: in your Transformers version, it's eval_strategy / save_strategy
args = TrainingArguments(
    output_dir=OUT_DIR,
    report_to="none",

    # optimisation (values come from the CONFIG section)
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH,
    per_device_eval_batch_size=BATCH,
    fp16=torch.cuda.is_available(),   # mixed precision only when a GPU is present

    # evaluate and checkpoint every 200 steps; log the training loss every 50
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    logging_steps=50,
)

trainer = Trainer(
    model=model,
    args=args,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=val_ds,
)

trainer.train()

# ---------- SAVE FOR DEPLOY ----------
os.makedirs(EXPORT_DIR, exist_ok=True)

# Fine-tuned encoder and tokenizer go out in the standard Hugging Face layout,
# so they can be reloaded from EXPORT_DIR with from_pretrained.
for artifact in (model.encoder, tokenizer):
    artifact.save_pretrained(EXPORT_DIR)

# The regression head is not part of the encoder, so it is saved separately.
# head.pt is a dict: the head weights live under "head_state_dict", next to
# the metadata needed to rebuild the inference pipeline.
head_bundle = {
    "head_state_dict": model.head.state_dict(),
    "max_len": MAX_LEN,
    "label_cols": label_cols,
    "base_model": MODEL_NAME,
}
head_path = os.path.join(EXPORT_DIR, "head.pt")
torch.save(head_bundle, head_path)

print("\n✅ Training done.")
print("✅ Exported model to:", EXPORT_DIR)
!ls -lah {EXPORT_DIR}

CUDA: True | GPU: Tesla T4
Rows: 6000
Example:
 THREAD:
A: Look, your planning feels like an Excel sheet with merged cellsand the results are tragic. I’ve seen stronger arguments in fortune cookies.
B: Honestly, your planning feels like an Excel sheet with merged cellsand the bar was already low. Your ideas have great vibes—shame about the content.
B: Your execution is a screenshot of a screensh ...



Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Mae,Rmse
200,0.0117,0.010223,7.869903,10.110661
400,0.0065,0.005343,5.869638,7.309569
600,0.0042,0.003436,4.785684,5.862021
800,0.0035,0.003238,4.633235,5.689962
1000,0.0033,0.003002,4.494722,5.479017



✅ Training done.
✅ Exported model to: roastjudge_export
total 321M
drwxr-xr-x 2 root root 4.0K Dec 16 13:53 .
drwxr-xr-x 1 root root 4.0K Dec 16 13:52 ..
-rw-r--r-- 1 root root  624 Dec 16 13:52 config.json
-rw-r--r-- 1 root root 2.3M Dec 16 13:53 head.pt
-rw-r--r-- 1 root root 446K Dec 16 13:53 merges.txt
-rw-r--r-- 1 root root 314M Dec 16 13:53 model.safetensors
-rw-r--r-- 1 root root  280 Dec 16 13:53 special_tokens_map.json
-rw-r--r-- 1 root root 1.3K Dec 16 13:53 tokenizer_config.json
-rw-r--r-- 1 root root 3.4M Dec 16 13:53 tokenizer.json
-rw-r--r-- 1 root root 780K Dec 16 13:53 vocab.json


In [None]:
def pretty(result):
    """Return a print-friendly copy of a ``judge_battle`` result.

    Sub-scores under ``"A"`` and ``"B"`` and the ``"margin"`` are converted
    to built-in ``float`` and rounded to two decimals. Without the conversion
    ``round`` keeps NumPy scalar types, and the printed dict is cluttered
    with ``np.float32(...)`` wrappers. ``"winner"`` is passed through.
    """
    def _round_scores(scores):
        return {name: round(float(value), 2) for name, value in scores.items()}

    return {
        "A": _round_scores(result["A"]),
        "B": _round_scores(result["B"]),
        "winner": result["winner"],
        "margin": round(float(result["margin"]), 2)
    }

# Smoke test: score a two-message exchange and print the rounded verdict.
# NOTE(review): `judge_battle` is defined in a later cell, so on a fresh kernel
# this call fails with the NameError recorded in this cell's output. Run it
# only after the inference cells below, or move this cell after them.
print(pretty(judge_battle(
    "A: Bro your roast is like decaf—exists, but pointless.\nB: And yours is like Wi-Fi in lecture hall—keeps dropping.",
    "Bro your roast is like decaf—exists, but pointless.",
    "And yours is like Wi-Fi in lecture hall—keeps dropping."
)))

NameError: name 'judge_battle' is not defined

In [None]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from torch import nn

In [None]:
# Directory holding the exported encoder, tokenizer files and head.pt
# (same name as EXPORT_DIR in the training cell).
MODEL_DIR = "roastjudge_export"
# Run inference on the GPU when one is available, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
class RoastJudge(nn.Module):
    """Inference-side judge: the exported encoder plus the regression head.

    The head must have the same layout as the one that was trained and
    exported (Linear -> Tanh -> Linear), otherwise the saved head weights
    cannot be loaded into it.

    ``forward`` returns the raw head outputs. Training applied
    ``torch.sigmoid`` on top of these and regressed against labels divided
    by 100, so callers need ``sigmoid(output) * 100`` to get 0..100 scores.

    Args:
        out_dim: number of predicted scores (default 8: 4 sub-scores for A
            followed by 4 for B).
    """

    def __init__(self, out_dim=8):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(MODEL_DIR)
        hidden = self.backbone.config.hidden_size
        self.dropout = nn.Dropout(0.1)
        self.head = nn.Sequential(
            nn.Linear(hidden, hidden),
            nn.Tanh(),
            nn.Linear(hidden, out_dim),
        )

    def forward(self, input_ids, attention_mask):
        """Return raw (pre-sigmoid) head outputs, one row of ``out_dim`` values per sequence."""
        out = self.backbone(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled = out.last_hidden_state[:, 0]   # first-token (CLS) vector
        return self.head(self.dropout(pooled))


tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)

# head.pt is a metadata dict, not a bare state_dict: the head weights sit
# under "head_state_dict". weights_only=True keeps torch.load from running
# arbitrary pickled code; the file holds only tensors, strings, ints and lists.
checkpoint = torch.load(f"{MODEL_DIR}/head.pt", map_location=DEVICE, weights_only=True)

model = RoastJudge(out_dim=len(checkpoint["label_cols"])).to(DEVICE)

# Load into the head only, with the default strict=True. Passing the whole
# dict to model.load_state_dict(..., strict=False) matches no parameter names,
# so it silently loads nothing and leaves the head randomly initialised.
model.head.load_state_dict(checkpoint["head_state_dict"])
model.eval()

RoastJudge(
  (backbone): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Layer

In [None]:
SUBS = ["humor", "punch", "originality", "relevance"]

@torch.no_grad()
def judge_battle(thread_text, A_text, B_text, max_length=384):
    """Score both players of one battle and pick a winner.

    Args:
        thread_text: the full exchange so far.
        A_text: player A's message to be judged.
        B_text: player B's message to be judged.
        max_length: token budget for the prompt; the default matches the
            value used for training.

    Returns:
        dict with
        ``"A"`` / ``"B"``: sub-score name -> score on the 0..100 scale (float),
        ``"overall_A"`` / ``"overall_B"``: mean of that player's sub-scores,
        ``"winner"``: ``"A"``, ``"B"`` or ``"TIE"``,
        ``"margin"``: ``overall_A - overall_B``.
    """
    # The prompt must have exactly the layout the model was trained on
    # (THREAD: / PLAYER_A: / PLAYER_B: sections separated by blank lines).
    # A different layout is out of distribution for the fine-tuned encoder.
    text = (
        "THREAD:\n" + str(thread_text) + "\n\n"
        "PLAYER_A:\n" + str(A_text) + "\n\n"
        "PLAYER_B:\n" + str(B_text)
    )

    enc = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt"
    ).to(DEVICE)

    # The model returns raw head outputs. Training squashed them with a sigmoid
    # and regressed against labels / 100, so undo both steps here.
    raw = model(**enc)[0]
    scores = (torch.sigmoid(raw.float()) * 100.0).cpu().tolist()

    n_subs = len(SUBS)
    A_scores = scores[:n_subs]
    B_scores = scores[n_subs:2 * n_subs]

    overall_A = sum(A_scores) / n_subs
    overall_B = sum(B_scores) / n_subs

    return {
        "A": dict(zip(SUBS, A_scores)),
        "B": dict(zip(SUBS, B_scores)),
        "overall_A": overall_A,
        "overall_B": overall_B,
        "winner": "A" if overall_A > overall_B else "B" if overall_B > overall_A else "TIE",
        "margin": overall_A - overall_B
    }

In [None]:
def pretty(res):
    """Return a print-friendly copy of a ``judge_battle`` result.

    Every numeric field (sub-scores under ``"A"`` / ``"B"``, the two overall
    scores and the margin) is converted to built-in ``float`` and rounded to
    two decimals. Without the conversion ``round`` keeps NumPy scalar types,
    and the printed dict is cluttered with ``np.float32(...)`` wrappers.
    ``"winner"`` is passed through unchanged.
    """
    def _r2(value):
        return round(float(value), 2)

    return {
        "A": {k: _r2(v) for k, v in res["A"].items()},
        "B": {k: _r2(v) for k, v in res["B"].items()},
        "overall_A": _r2(res["overall_A"]),
        "overall_B": _r2(res["overall_B"]),
        "winner": res["winner"],
        "margin": _r2(res["margin"])
    }

In [None]:
# Smoke test: a two-message exchange where each player's judged message is
# also their line in the thread.
demo_thread = (
    "A: Bro your roast is like decaf—exists, but pointless.\n"
    "B: And yours is like Wi-Fi in lecture hall—keeps dropping."
)
demo_a = "Bro your roast is like decaf—exists, but pointless."
demo_b = "And yours is like Wi-Fi in lecture hall—keeps dropping."

demo_result = judge_battle(demo_thread, demo_a, demo_b)
print(pretty(demo_result))

{'A': {'humor': np.float32(0.0), 'punch': np.float32(0.0), 'originality': np.float32(0.5), 'relevance': np.float32(1.04)}, 'B': {'humor': np.float32(0.0), 'punch': np.float32(0.0), 'originality': np.float32(0.75), 'relevance': np.float32(0.0)}, 'overall_A': 0.38, 'overall_B': 0.19, 'winner': 'A', 'margin': 0.2}


In [None]:
# Pack the whole export directory into one archive (shell command via IPython's `!`).
!zip -r roastjudge_export.zip roastjudge_export
# Colab-only: `google.colab.files` is not available outside Colab.
from google.colab import files
# Hand the archive to the browser for download.
files.download("roastjudge_export.zip")

  adding: roastjudge_export/ (stored 0%)
  adding: roastjudge_export/tokenizer.json (deflated 82%)
  adding: roastjudge_export/head.pt (deflated 8%)
  adding: roastjudge_export/merges.txt (deflated 53%)
  adding: roastjudge_export/tokenizer_config.json (deflated 75%)
  adding: roastjudge_export/special_tokens_map.json (deflated 52%)
  adding: roastjudge_export/vocab.json (deflated 59%)
  adding: roastjudge_export/config.json (deflated 49%)
  adding: roastjudge_export/model.safetensors (deflated 7%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>