In [None]:
!pip install transformers datasets

In [None]:
pip install -q sentence-transformers scipy

In [1]:
# create Codabench-style folders
!mkdir -p input/ref input/res output
# move the reference (ground truth) file
!mkdir -p input/ref input/res output
!mv solution.jsonl input/ref/solution.jsonl
!mv predictions.jsonl input/res/predictions.jsonl
!echo "âœ… Folder structure:"
!tree -L 2

mv: cannot stat 'solution.jsonl': No such file or directory
mv: cannot stat 'predictions.jsonl': No such file or directory
âœ… Folder structure:
/bin/bash: line 1: tree: command not found


In [2]:
from google.colab import files

# Open Colab's file chooser ("Choose files"); several files can be selected at once.
uploaded = files.upload()
print("Uploaded:", [*uploaded.keys()])

Saving dev.json to dev.json
Saving evaluate.py to evaluate.py
Saving format_check.py to format_check.py
Saving scoring.py to scoring.py
Saving solution.jsonl to solution.jsonl
Uploaded: ['dev.json', 'evaluate.py', 'format_check.py', 'scoring.py', 'solution.jsonl']


In [None]:
# create folders
import os
os.makedirs("data", exist_ok=True)
os.makedirs("input/ref", exist_ok=True)
os.makedirs("input/res", exist_ok=True)
os.makedirs("output", exist_ok=True)

# If you don't have data files in the workspace, upload them now:
print("If you already uploaded data/dev.json and data/train.json, ignore the upload prompt.")
from google.colab import files
uploaded = files.upload()  # use the file chooser to upload train.json and dev.json if needed
print("Uploaded:", list(uploaded.keys()))

# If you uploaded a solution.jsonl file here and want it in input/ref:
if "solution.jsonl" in uploaded:
    os.replace("solution.jsonl", "input/ref/solution.jsonl")
    print("Moved solution.jsonl -> input/ref/solution.jsonl")

print("\nCurrent data folder contents:")
!ls -la data || true
print("\ninput/ref contents:")
!ls -la input/ref || true
print("\ninput/res contents:")
!ls -la input/res || true

In [None]:
import os, json, pandas as pd, numpy as np, torch, pickle
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from scipy.stats import spearmanr
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
import torch.nn as nn
import torch.nn.functional as F

# Directory the fine-tuned encoder, tokenizer and regression head are loaded from / saved to.
MODEL_DIR = "./sbert-ambi"
# JSON file with the training records; only read when no saved model is found in MODEL_DIR.
TRAIN_FILE = "data/train_aug.json"
# Path of the dev split. NOTE(review): not referenced anywhere else in this cell.
DEV_FILE = "data/dev.json"

# -----------------------------
# Build story helper
# -----------------------------
def build_story(row):
    """Assemble the story text for one sample and mark the target homonym.

    The first occurrence of ``row["homonym"]`` inside ``row["sentence"]`` is
    wrapped as ``[TGT] <homonym> [TGT]``. The optional ``precontext`` and
    ``ending`` fields are placed before and after the sentence, and all
    non-empty parts are joined with single spaces.

    Missing values (``None`` or float NaN, which pandas uses for absent cells)
    are treated as empty instead of being rendered as the literal text
    ``"None"`` / ``"nan"``.

    NOTE: the prediction cell defines its own copy of this helper; keep both
    in sync so training and inference build identical inputs.

    Args:
        row: Mapping-like sample (dict or pandas Series) supporting ``row[key]``
            and ``row.get(key, default)``.

    Returns:
        The assembled story string.
    """
    def _text(value):
        # NaN is the only float that is not equal to itself.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    homonym = _text(row["homonym"])
    sentence = _text(row["sentence"])
    if homonym:
        # An empty needle would make str.replace insert a marker at position 0.
        sentence = sentence.replace(homonym, f"[TGT] {homonym} [TGT]", 1)

    parts = [_text(row.get("precontext", "")), sentence, _text(row.get("ending", ""))]
    return " ".join(p for p in parts if p)

# -----------------------------
# Cross-Encoder Regression Head
# -----------------------------
class CrossEncoderHead(nn.Module):
    """
    Regression MLP applied to the embedding of a jointly encoded text pair.

    The input passes through three Linear -> ReLU -> Dropout stages
    (512, 256 and 128 units) and a final Linear layer with one output; the
    trailing dimension of size 1 is squeezed away.
    """
    def __init__(self, hidden_size=768, dropout=0.1):
        super().__init__()
        # Attribute names (dense1, dropout1, ...) are the state_dict keys of
        # saved checkpoints, so they are generated with exactly those names.
        widths = (hidden_size, 512, 256, 128)
        for idx, (fan_in, fan_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f"dense{idx}", nn.Linear(fan_in, fan_out))
            setattr(self, f"dropout{idx}", nn.Dropout(dropout))
        self.output = nn.Linear(widths[-1], 1)

    def forward(self, cls_embedding):
        hidden = cls_embedding
        stages = (
            (self.dense1, self.dropout1),
            (self.dense2, self.dropout2),
            (self.dense3, self.dropout3),
        )
        for linear, drop in stages:
            hidden = drop(F.relu(linear(hidden)))
        return self.output(hidden).squeeze(-1)

# -----------------------------
# Load pretrained model or train
# -----------------------------
if os.path.exists(MODEL_DIR) and os.path.exists(os.path.join(MODEL_DIR, "cross_encoder_head.pt")):
    print(f"Found saved model at {MODEL_DIR}, loading it.")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
    base_model = AutoModel.from_pretrained(MODEL_DIR)
    cross_encoder_head = CrossEncoderHead()
    # map_location lets a head that was saved from a GPU run load on a CPU-only runtime.
    cross_encoder_head.load_state_dict(
        torch.load(os.path.join(MODEL_DIR, "cross_encoder_head.pt"), map_location="cpu")
    )

else:
    # Seed torch so head initialisation, dropout and DataLoader shuffling repeat across runs.
    torch.manual_seed(42)

    # -----------------------------
    # Load training data
    # -----------------------------
    with open(TRAIN_FILE, "r", encoding="utf-8") as f:
        data = json.load(f)

    # The JSON may be a dict of records or a plain list of records.
    records = list(data.values()) if isinstance(data, dict) else data
    df = pd.DataFrame(records)

    # Split
    train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)
    train_df = train_df.reset_index(drop=True)
    val_df = val_df.reset_index(drop=True)

    # Build story + normalize labels to [0, 1] (assumes "average" is on a 1-5 scale)
    train_df["story"] = train_df.apply(build_story, axis=1)
    val_df["story"] = val_df.apply(build_story, axis=1)
    train_df["average_norm"] = (train_df["average"] - 1.0) / 4.0
    val_df["average_norm"] = (val_df["average"] - 1.0) / 4.0

    print(f"Training samples: {len(train_df)}")
    print(f"Validation samples: {len(val_df)}")

    # -----------------------------
    # Use DeBERTa-v3 (best for cross-encoding)
    # -----------------------------
    model_name = "microsoft/deberta-v3-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    base_model = AutoModel.from_pretrained(model_name)

    # Initialize cross-encoder head
    cross_encoder_head = CrossEncoderHead(hidden_size=768, dropout=0.2)

    # -----------------------------
    # Dataset class for Cross-Encoder
    # -----------------------------
    class CrossEncoderDataset(Dataset):
        """Serves rows of a DataFrame as dicts of raw texts plus a float label."""
        def __init__(self, df):
            self.data = df
        def __len__(self): return len(self.data)
        def __getitem__(self, idx):
            row = self.data.iloc[idx]
            return {
                "story": row["story"],
                "example_sentence": row["example_sentence"],
                "homonym": row["homonym"],
                "label": torch.tensor(float(row["average_norm"]), dtype=torch.float)
            }

    train_dataset = CrossEncoderDataset(train_df)
    val_dataset = CrossEncoderDataset(val_df)

    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)  # Smaller batch for memory
    val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=8)

    # -----------------------------
    # Training setup
    # -----------------------------
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    base_model.to(device)
    cross_encoder_head.to(device)

    def score_batch(batch):
        """Encode every story/example pair of a collated batch in one forward pass.

        Returns a 1-D tensor with one predicted score per pair, so the result
        always lines up with batch["label"], including for a batch of one.
        """
        # KEY DIFFERENCE: Encode both texts TOGETHER with [SEP] token
        combined_texts = [
            f"{story} [SEP] {example}"
            for story, example in zip(batch["story"], batch["example_sentence"])
        ]
        inputs = tokenizer(
            combined_texts,
            return_tensors="pt",
            truncation=True,
            max_length=512,
            padding=True
        ).to(device)

        # Embedding of the first ([CLS]) token of each sequence
        outputs = base_model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :]

        # Predict similarity score through regression head
        return cross_encoder_head(cls_embedding)

    # Optimize both base model and head
    optimizer = torch.optim.AdamW(
        list(base_model.parameters()) + list(cross_encoder_head.parameters()),
        lr=2e-5,  # Higher LR for cross-encoder
        weight_decay=0.01
    )

    EPOCHS = 20  # More epochs for cross-encoder
    num_training_steps = len(train_dataloader) * EPOCHS

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * num_training_steps),
        num_training_steps=num_training_steps
    )

    # -----------------------------
    # TRAINING LOOP
    # -----------------------------
    print("ðŸš€ Starting Cross-Encoder training...")
    print("   Cross-encoders process both texts together for better comparison")

    best_spearman = -1
    best_epoch = None  # epoch whose weights are stored in the best_*.pt files
    patience_counter = 0
    PATIENCE = 4

    for epoch in range(1, EPOCHS + 1):
        base_model.train()
        cross_encoder_head.train()
        epoch_loss = 0

        # --- Training ---
        for batch in train_dataloader:
            optimizer.zero_grad()

            predictions = score_batch(batch)
            targets = batch["label"].to(device)

            # MSE loss
            loss = F.mse_loss(predictions, targets)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(
                list(base_model.parameters()) + list(cross_encoder_head.parameters()),
                max_norm=1.0
            )
            optimizer.step()
            scheduler.step()
            epoch_loss += loss.item()

        avg_loss = epoch_loss / len(train_dataloader)
        print(f"Epoch {epoch} | Train Loss: {avg_loss:.4f}")

        # --- Validation ---
        base_model.eval()
        cross_encoder_head.eval()
        all_preds, all_labels = [], []

        with torch.no_grad():
            for batch in val_dataloader:
                all_labels.extend(batch["label"].tolist())
                all_preds.extend(score_batch(batch).tolist())

        # --- Metrics ---
        val_spearman, _ = spearmanr(all_preds, all_labels)

        # Convert to 1-5 scale for accuracy
        labels_scaled = (np.array(all_labels) * 4) + 1
        preds_scaled = (np.array(all_preds) * 4) + 1
        preds_scaled = np.clip(preds_scaled, 1, 5)
        val_acc = np.mean(np.abs(np.round(preds_scaled) - labels_scaled) <= 1.0)

        print(f"Epoch {epoch} | Spearman: {val_spearman:.4f}, Acc_within_1.0: {val_acc:.4f}")

        # --- Early Stopping ---
        # A NaN correlation (e.g. constant predictions) compares False here and
        # therefore counts as "no improvement".
        if val_spearman > best_spearman:
            best_spearman = val_spearman
            best_epoch = epoch
            patience_counter = 0
            print(f"ðŸ’¾ New best model! Saving checkpoint.")
            torch.save(base_model.state_dict(), "best_model.pt")
            torch.save(cross_encoder_head.state_dict(), "best_cross_encoder_head.pt")
        else:
            patience_counter += 1
            print(f"âš  No improvement. Patience: {patience_counter}/{PATIENCE}")
            if patience_counter >= PATIENCE:
                print("â›” Early stopping triggered. Training halted.")
                break

    # -----------------------------
    # RESTORE BEST CHECKPOINT
    # -----------------------------
    # Without this the exported model would be the weights of the last epoch,
    # which is up to PATIENCE epochs past the best validation score.
    if best_epoch is not None:
        base_model.load_state_dict(torch.load("best_model.pt", map_location=device))
        cross_encoder_head.load_state_dict(
            torch.load("best_cross_encoder_head.pt", map_location=device)
        )
        print(f"Restored best checkpoint from epoch {best_epoch} (Spearman {best_spearman:.4f}).")

    # -----------------------------
    # SAVE MODEL
    # -----------------------------
    os.makedirs(MODEL_DIR, exist_ok=True)
    base_model.save_pretrained(MODEL_DIR)
    tokenizer.save_pretrained(MODEL_DIR)
    torch.save(cross_encoder_head.state_dict(), os.path.join(MODEL_DIR, "cross_encoder_head.pt"))

    print(f"âœ… Training complete. Cross-encoder model saved to: {MODEL_DIR}")

print("Cross-encoder model ready.")

Training samples: 2565
Validation samples: 285




ðŸš€ Starting Cross-Encoder training...
   Cross-encoders process both texts together for better comparison
Epoch 1 | Train Loss: 0.1884
Epoch 1 | Spearman: 0.0536, Acc_within_1.0: 0.5544
ðŸ’¾ New best model! Saving checkpoint.
Epoch 2 | Train Loss: 0.0963
Epoch 2 | Spearman: 0.1153, Acc_within_1.0: 0.5509
ðŸ’¾ New best model! Saving checkpoint.


In [None]:
# STEP 4 — Generate predictions using Cross-Encoder
import os, json, torch, numpy as np
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
import torch.nn.functional as F

# Locations of the fine-tuned checkpoint, the split to predict on, and the output file.
MODEL_PATH = "./sbert-ambi"
DATA_PATH = "data/dev.json"  # switch to data/test.json when submitting
OUT_PATH = "input/res/predictions.jsonl"

# Use the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Restore tokenizer and encoder from MODEL_PATH, then put the encoder in eval mode.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModel.from_pretrained(MODEL_PATH)
model = model.to(device)
model.eval()

# -----------------------------
# Cross-Encoder Regression Head (must match training)
# -----------------------------
class CrossEncoderHead(nn.Module):
    """Regression MLP: three Linear -> ReLU -> Dropout stages (512, 256, 128) and a 1-unit output.

    The layer attribute names are the keys of the saved state_dict, so they
    must stay dense1..dense3 / dropout1..dropout3 / output.
    """
    def __init__(self, hidden_size=768, dropout=0.1):
        super().__init__()
        in_features = hidden_size
        for position, out_features in enumerate((512, 256, 128), start=1):
            setattr(self, f"dense{position}", nn.Linear(in_features, out_features))
            setattr(self, f"dropout{position}", nn.Dropout(dropout))
            in_features = out_features
        self.output = nn.Linear(in_features, 1)

    def forward(self, cls_embedding):
        activations = cls_embedding
        for position in (1, 2, 3):
            linear = getattr(self, f"dense{position}")
            drop = getattr(self, f"dropout{position}")
            activations = drop(F.relu(linear(activations)))
        # Drop the trailing size-1 dimension so each input row yields one value.
        return self.output(activations).squeeze(-1)

# Load cross-encoder head
cross_encoder_head = CrossEncoderHead()
# map_location remaps the saved tensors onto the current device, so a head
# saved from a GPU run still loads on a CPU-only runtime.
head_state = torch.load(os.path.join(MODEL_PATH, "cross_encoder_head.pt"), map_location=device)
cross_encoder_head.load_state_dict(head_state)
cross_encoder_head.to(device)
cross_encoder_head.eval()

print("âœ… Loaded cross-encoder model")

# Helper: build story text
def build_story(row):
    """Assemble the story text for one sample and mark the target homonym.

    The first occurrence of ``row["homonym"]`` inside ``row["sentence"]`` is
    wrapped as ``[TGT] <homonym> [TGT]``. The optional ``precontext`` and
    ``ending`` fields are placed before and after the sentence, and all
    non-empty parts are joined with single spaces.

    Missing values (``None`` or float NaN) are treated as empty instead of
    being rendered as the literal text ``"None"`` / ``"nan"``.

    NOTE: the training cell defines its own copy of this helper; keep both in
    sync so training and inference build identical inputs.

    Args:
        row: Mapping-like sample supporting ``row[key]`` and
            ``row.get(key, default)``.

    Returns:
        The assembled story string.
    """
    def _text(value):
        # NaN is the only float that is not equal to itself.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    homonym = _text(row["homonym"])
    sentence = _text(row["sentence"])
    if homonym:
        # An empty needle would make str.replace insert a marker at position 0.
        sentence = sentence.replace(homonym, f"[TGT] {homonym} [TGT]", 1)

    parts = [_text(row.get("precontext", "")), sentence, _text(row.get("ending", ""))]
    return " ".join(p for p in parts if p)

# Load dataset
with open(DATA_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

# Accept both layouts: a dict keyed by sample id, or a plain list of samples.
# NOTE(review): for the list layout the id is taken from the sample's own "id"
# field, falling back to its position; confirm against the expected submission format.
if isinstance(data, dict):
    keyed_samples = list(data.items())
else:
    keyed_samples = [(sample.get("id", str(idx)), sample) for idx, sample in enumerate(data)]
records = [sample for _, sample in keyed_samples]

predictions = []

# Generate predictions
with torch.no_grad():
    for key, sample in keyed_samples:
        story = build_story(sample)
        example_sentence = sample["example_sentence"]

        # KEY: Encode both texts together (cross-encoder approach)
        combined_text = f"{story} [SEP] {example_sentence}"

        inputs = tokenizer(
            combined_text,
            return_tensors="pt",
            truncation=True,
            max_length=512,
            padding=True
        ).to(device)

        # Embedding of the first ([CLS]) token
        outputs = model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :]

        # Predict score through regression head
        pred = cross_encoder_head(cls_embedding)

        # Convert from [0,1] to [1,5] scale
        pred_score = (pred.item() * 4) + 1

        # Clip and round to integer
        score_int = int(round(np.clip(pred_score, 1, 5)))

        predictions.append({"id": key, "prediction": score_int})

# Save predictions.jsonl (one JSON object per line)
os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)
with open(OUT_PATH, "w", encoding="utf-8") as f:
    for p in predictions:
        f.write(json.dumps(p) + "\n")

print(f"âœ… Saved {len(predictions)} predictions to {OUT_PATH}")

In [None]:
# Run the uploaded scoring.py on the reference file and the prediction file.
# NOTE(review): the third argument is presumably where the scores are written — confirm in scoring.py.
!python3 scoring.py input/ref/solution.jsonl input/res/predictions.jsonl output/scores.json

In [7]:
# Package the predictions for submission; -j stores the file without its directory path,
# so predictions.jsonl sits at the top level of the archive.
!zip -j my_submission.zip input/res/predictions.jsonl
from google.colab import files
# Hand the archive to the browser as a download.
files.download("my_submission.zip")

  adding: predictions.jsonl (deflated 90%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

-----------------------------------------------------------