In [None]:
!pip install transformers datasets

In [None]:
pip install -q sentence-transformers scipy

In [1]:
# create Codabench-style folders
!mkdir -p input/ref input/res output
# move the reference (ground truth) file
!mkdir -p input/ref input/res output
!mv solution.jsonl input/ref/solution.jsonl
!mv predictions.jsonl input/res/predictions.jsonl
!echo "✅ Folder structure:"
!tree -L 2

mv: cannot stat 'solution.jsonl': No such file or directory
mv: cannot stat 'predictions.jsonl': No such file or directory
✅ Folder structure:
/bin/bash: line 1: tree: command not found


In [2]:
from google.colab import files
uploaded = files.upload()   # click "Choose files" and select the files (you can multi-select)
print("Uploaded:", list(uploaded.keys()))

Saving dev.json to dev.json
Saving evaluate.py to evaluate.py
Saving format_check.py to format_check.py
Saving scoring.py to scoring.py
Saving solution.jsonl to solution.jsonl
Uploaded: ['dev.json', 'evaluate.py', 'format_check.py', 'scoring.py', 'solution.jsonl']


In [3]:
# create folders
import os
os.makedirs("data", exist_ok=True)
os.makedirs("input/ref", exist_ok=True)
os.makedirs("input/res", exist_ok=True)
os.makedirs("output", exist_ok=True)

# If you don't have data files in the workspace, upload them now:
print("If you already uploaded data/dev.json and data/train.json, ignore the upload prompt.")
from google.colab import files
uploaded = files.upload()  # use the file chooser to upload train.json and dev.json if needed
print("Uploaded:", list(uploaded.keys()))

# If you uploaded a solution.jsonl file here and want it in input/ref:
if "solution.jsonl" in uploaded:
    os.replace("solution.jsonl", "input/ref/solution.jsonl")
    print("Moved solution.jsonl -> input/ref/solution.jsonl")

print("\nCurrent data folder contents:")
!ls -la data || true
print("\ninput/ref contents:")
!ls -la input/ref || true
print("\ninput/res contents:")
!ls -la input/res || true

If you already uploaded data/dev.json and data/train.json, ignore the upload prompt.


Saving train_aug.json to train_aug.json
Uploaded: ['train_aug.json']

Current data folder contents:
total 512
drwxr-xr-x 2 root root   4096 Nov 23 12:58 .
drwxr-xr-x 1 root root   4096 Nov 23 12:58 ..
-rw-r--r-- 1 root root 514789 Nov 23 12:57 dev.json

input/ref contents:
total 8
drwxr-xr-x 2 root root 4096 Nov 23 12:57 .
drwxr-xr-x 4 root root 4096 Nov 23 12:57 ..

input/res contents:
total 8
drwxr-xr-x 2 root root 4096 Nov 23 12:57 .
drwxr-xr-x 4 root root 4096 Nov 23 12:57 ..


In [None]:
import os, json, pandas as pd, numpy as np, torch, pickle
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from scipy.stats import spearmanr
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
import torch.nn as nn
import torch.nn.functional as F

MODEL_DIR = "./sbert-ambi"
TRAIN_FILE = "data/train_aug.json"
DEV_FILE = "data/dev.json"

# -----------------------------
# Build story helper
# -----------------------------
def build_story(row):
    sentence = str(row["sentence"]).replace(row["homonym"], f"[TGT] {row['homonym']} [TGT]", 1)
    parts = [str(row.get("precontext", "")), sentence]
    if row.get("ending", "") not in [None, ""]:
        parts.append(str(row.get("ending", "")))
    return " ".join([p for p in parts if p])

# -----------------------------
# Uncertainty-Aware Regression Head
# -----------------------------
class UncertaintyRegressionHead(nn.Module):
    def __init__(self, input_dim=768, hidden_dim=256):
        super().__init__()
        self.fc1 = nn.Linear(input_dim * 2, hidden_dim)  # Concatenated embeddings
        self.dropout = nn.Dropout(0.1)
        self.fc_mean = nn.Linear(hidden_dim, 1)  # Predict mean score
        self.fc_log_var = nn.Linear(hidden_dim, 1)  # Predict log variance

    def forward(self, emb1, emb2):
        # Concatenate the two embeddings
        x = torch.cat([emb1, emb2], dim=-1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout(x)

        mean = self.fc_mean(x)  # Predicted mean score
        log_var = self.fc_log_var(x)  # Predicted log variance (uncertainty)

        return mean.squeeze(-1), log_var.squeeze(-1)

# -----------------------------
# Gaussian NLL Loss
# -----------------------------
def gaussian_nll_loss(mean, log_var, target):
    """
    Gaussian Negative Log-Likelihood Loss
    Loss = 0.5 * (exp(-log_var) * (mean - target)^2 + log_var)
    """
    variance = torch.exp(log_var)
    loss = 0.5 * (torch.pow(mean - target, 2) / variance + log_var)
    return loss.mean()

# -----------------------------
# Load pretrained model or train
# -----------------------------
if os.path.exists(MODEL_DIR) and os.path.exists(os.path.join(MODEL_DIR, "regression_head.pt")):
    print(f"Found saved model at {MODEL_DIR}, loading it.")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
    base_model = AutoModel.from_pretrained(MODEL_DIR)
    regression_head = UncertaintyRegressionHead()
    regression_head.load_state_dict(torch.load(os.path.join(MODEL_DIR, "regression_head.pt")))

else:
    # -----------------------------
    # Load training data
    # -----------------------------
    with open(TRAIN_FILE, "r") as f:
        data = json.load(f)

    records = list(data.values()) if isinstance(data, dict) else data
    df = pd.DataFrame(records)

    # Split
    train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)
    train_df = train_df.reset_index(drop=True)
    val_df = val_df.reset_index(drop=True)

    # Build story + normalize labels to 0-1 range
    train_df["story"] = train_df.apply(build_story, axis=1)
    val_df["story"] = val_df.apply(build_story, axis=1)
    train_df["average_norm"] = (train_df["average"] - 1.0) / 4.0  # Normalize to [0,1]
    val_df["average_norm"] = (val_df["average"] - 1.0) / 4.0

    # -----------------------------
    # Use MPNet model
    # -----------------------------
    model_name = "sentence-transformers/all-mpnet-base-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    base_model = AutoModel.from_pretrained(model_name, output_hidden_states=True)

    # Initialize regression head
    regression_head = UncertaintyRegressionHead(input_dim=768, hidden_dim=256)

    # -----------------------------
    # Dataset class
    # -----------------------------
    class WordPairDataset(Dataset):
        def __init__(self, df):
            self.data = df
        def __len__(self): return len(self.data)
        def __getitem__(self, idx):
            row = self.data.iloc[idx]
            return {
                "story": row["story"],
                "example_sentence": row["example_sentence"],
                "homonym": row["homonym"],
                "label": torch.tensor(float(row["average_norm"]), dtype=torch.float)
            }

    train_dataset = WordPairDataset(train_df)
    val_dataset = WordPairDataset(val_df)

    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
    val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=16)

    # -----------------------------
    # Training setup
    # -----------------------------
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    base_model.to(device)
    regression_head.to(device)

    # Optimize both base model and regression head
    optimizer = torch.optim.AdamW(
        list(base_model.parameters()) + list(regression_head.parameters()),
        lr=1e-5,
        weight_decay=0.01
    )

    EPOCHS = 15
    num_training_steps = len(train_dataloader) * EPOCHS

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * num_training_steps),
        num_training_steps=num_training_steps
    )

    # -----------------------------
    # Embedding extractor
    # -----------------------------
    def get_target_embedding(text, homonym):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

        with torch.set_grad_enabled(base_model.training):
            outputs = base_model(**inputs)

        hidden_states = outputs.hidden_states
        stacked = torch.stack(hidden_states[-8:])
        hidden = torch.mean(stacked, dim=0)[0]

        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        tgt_indices = [i for i, t in enumerate(tokens) if t == "[TGT]"]

        if len(tgt_indices) >= 2:
            start, end = tgt_indices[0] + 1, tgt_indices[1]
            indices = list(range(start, end))
        else:
            homonym_lower = homonym.lower()
            indices = []
            for i, t in enumerate(tokens):
                t_clean = t.replace("##", "").replace("Ġ", "").lower()
                if homonym_lower in t_clean or t_clean in homonym_lower:
                    indices.append(i)

        if len(indices) == 0:
            return hidden.mean(dim=0)

        emb = hidden[indices].mean(dim=0)
        return emb

    # -----------------------------
    # TRAINING LOOP
    # -----------------------------
    print("🚀 Starting uncertainty-aware training...")

    best_spearman = -1
    patience_counter = 0
    PATIENCE = 3

    for epoch in range(1, EPOCHS + 1):
        base_model.train()
        regression_head.train()
        epoch_loss = 0

        # --- Training ---
        for batch in train_dataloader:
            optimizer.zero_grad()

            emb_stories = []
            emb_examples = []

            for i in range(len(batch["story"])):
                emb_story = get_target_embedding(batch["story"][i], batch["homonym"][i])
                emb_examp = get_target_embedding(batch["example_sentence"][i], batch["homonym"][i])

                emb_story = F.normalize(emb_story, p=2, dim=0)
                emb_examp = F.normalize(emb_examp, p=2, dim=0)

                emb_stories.append(emb_story)
                emb_examples.append(emb_examp)

            # Stack embeddings
            emb_stories = torch.stack(emb_stories)
            emb_examples = torch.stack(emb_examples)

            # Get predictions (mean and log variance)
            pred_mean, pred_log_var = regression_head(emb_stories, emb_examples)

            # Gaussian NLL loss
            targets = batch["label"].to(device)
            loss = gaussian_nll_loss(pred_mean, pred_log_var, targets)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(
                list(base_model.parameters()) + list(regression_head.parameters()),
                max_norm=1.0
            )
            optimizer.step()
            scheduler.step()
            epoch_loss += loss.item()

        print(f"Epoch {epoch} | Train Loss: {epoch_loss / len(train_dataloader):.4f}")

        # --- Validation ---
        base_model.eval()
        regression_head.eval()
        all_means, all_labels = [], []

        with torch.no_grad():
            for batch in val_dataloader:
                batch_labels = batch["label"].numpy()
                all_labels.extend(batch_labels)

                emb_stories = []
                emb_examples = []

                for i in range(len(batch["story"])):
                    emb_story = get_target_embedding(batch["story"][i], batch["homonym"][i])
                    emb_examp = get_target_embedding(batch["example_sentence"][i], batch["homonym"][i])

                    emb_story = F.normalize(emb_story, p=2, dim=0)
                    emb_examp = F.normalize(emb_examp, p=2, dim=0)

                    emb_stories.append(emb_story)
                    emb_examples.append(emb_examp)

                emb_stories = torch.stack(emb_stories)
                emb_examples = torch.stack(emb_examples)

                pred_mean, pred_log_var = regression_head(emb_stories, emb_examples)
                all_means.extend(pred_mean.cpu().numpy())

        # --- Metrics ---
        val_spearman, _ = spearmanr(all_means, all_labels)

        # Convert to 1-5 scale for accuracy
        labels_scaled = (np.array(all_labels) * 4) + 1
        means_scaled = (np.array(all_means) * 4) + 1
        means_scaled = np.clip(means_scaled, 1, 5)
        val_acc = np.mean(np.abs(means_scaled - labels_scaled) <= 1.0)

        print(f"Epoch {epoch} | Spearman: {val_spearman:.4f}, Acc_within_1.0: {val_acc:.4f}")

        # --- Early Stopping ---
        if val_spearman > best_spearman:
            best_spearman = val_spearman
            patience_counter = 0
            print(f"💾 New best model! Saving checkpoint.")
            torch.save(base_model.state_dict(), "best_model.pt")
            torch.save(regression_head.state_dict(), "best_regression_head.pt")
        else:
            patience_counter += 1
            print(f"⚠ No improvement. Patience: {patience_counter}/{PATIENCE}")
            if patience_counter >= PATIENCE:
                print("⛔ Early stopping triggered. Training halted.")
                break

    # -----------------------------
    # SAVE MODEL
    # -----------------------------
    os.makedirs(MODEL_DIR, exist_ok=True)
    base_model.save_pretrained(MODEL_DIR)
    tokenizer.save_pretrained(MODEL_DIR)
    torch.save(regression_head.state_dict(), os.path.join(MODEL_DIR, "regression_head.pt"))

    print(f"✅ Training complete. Model saved to: {MODEL_DIR}")

print("Model ready.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

🚀 Starting uncertainty-aware training...


In [11]:
# STEP 4 — Generate predictions using uncertainty-aware model
import os, json, torch, numpy as np
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
import torch.nn.functional as F

MODEL_PATH = "./sbert-ambi"
DATA_PATH = "data/dev.json"          # switch to data/test.json when submitting
OUT_PATH  = "input/res/predictions.jsonl"

# Load fine-tuned model and tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModel.from_pretrained(MODEL_PATH, output_hidden_states=True).to(device)
model.eval()

# -----------------------------
# Uncertainty-Aware Regression Head (must match training)
# -----------------------------
class UncertaintyRegressionHead(nn.Module):
    def __init__(self, input_dim=768, hidden_dim=256):
        super().__init__()
        self.fc1 = nn.Linear(input_dim * 2, hidden_dim)
        self.dropout = nn.Dropout(0.1)
        self.fc_mean = nn.Linear(hidden_dim, 1)
        self.fc_log_var = nn.Linear(hidden_dim, 1)

    def forward(self, emb1, emb2):
        x = torch.cat([emb1, emb2], dim=-1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout(x)

        mean = self.fc_mean(x)
        log_var = self.fc_log_var(x)

        return mean.squeeze(-1), log_var.squeeze(-1)

# Load regression head
regression_head = UncertaintyRegressionHead()
regression_head.load_state_dict(torch.load(os.path.join(MODEL_PATH, "regression_head.pt")))
regression_head.to(device)
regression_head.eval()

print("✅ Loaded uncertainty-aware regression head")

# Helper: build story text
def build_story(row):
    sentence = str(row["sentence"]).replace(row["homonym"], f"[TGT] {row['homonym']} [TGT]", 1)
    parts = [str(row.get("precontext", "")), sentence]
    if row.get("ending", "") not in [None, ""]:
        parts.append(str(row.get("ending", "")))
    return " ".join([p for p in parts if p])

# Function to extract contextual embedding for the target word
def get_target_embedding(text, homonym):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.hidden_states
    stacked = torch.stack(hidden_states[-8:])
    hidden = torch.mean(stacked, dim=0)[0]

    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    tgt_indices = [i for i, t in enumerate(tokens) if t == "[TGT]"]

    if len(tgt_indices) >= 2:
        start, end = tgt_indices[0] + 1, tgt_indices[1]
        indices = list(range(start, end))
    else:
        homonym_lower = homonym.lower()
        indices = []
        for i, t in enumerate(tokens):
            t_clean = t.replace("##", "").replace("Ġ", "").lower()
            if homonym_lower in t_clean or t_clean in homonym_lower:
                indices.append(i)

    if len(indices) == 0:
        return hidden.mean(dim=0)

    emb = hidden[indices].mean(dim=0)
    return emb

# Load dataset
with open(DATA_PATH, "r") as f:
    data = json.load(f)

records = list(data.values()) if isinstance(data, dict) else data

predictions = []

# Generate predictions
with torch.no_grad():
    for key, sample in data.items():
        story = build_story(sample)
        example_sentence = sample["example_sentence"]
        homonym = sample["homonym"]

        # Get target word embeddings
        emb_story = get_target_embedding(story, homonym)
        emb_example = get_target_embedding(example_sentence, homonym)

        # Normalize embeddings
        emb_story = F.normalize(emb_story, p=2, dim=0)
        emb_example = F.normalize(emb_example, p=2, dim=0)

        # Get prediction (mean and uncertainty)
        pred_mean, pred_log_var = regression_head(
            emb_story.unsqueeze(0),
            emb_example.unsqueeze(0)
        )

        # Convert mean from [0,1] to [1,5] scale
        mean_score = (pred_mean.item() * 4) + 1

        # Clip and round to integer
        score_int = int(round(np.clip(mean_score, 1, 5)))

        predictions.append({"id": key, "prediction": score_int})

# Save predictions.jsonl
os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)
with open(OUT_PATH, "w") as f:
    for p in predictions:
        f.write(json.dumps(p) + "\n")

print(f"✅ Saved {len(predictions)} predictions to {OUT_PATH}")

✅ Loaded calibrated score mapper
✅ Saved 588 predictions to input/res/predictions.jsonl


In [12]:
!python3 scoring.py input/ref/solution.jsonl input/res/predictions.jsonl output/scores.json

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file input/res/predictions.jsonl on input/ref/solution.jsonl
----------
Spearman Correlation: 0.43597465896834553
Spearman p-Value: 1.1213789596012372e-28
----------
Accuracy: 0.6530612244897959 (384/588)
Results dumped into scores.json successfully.


In [None]:
!zip -j my_submission.zip input/res/predictions.jsonl
from google.colab import files
files.download("my_submission.zip")

  adding: predictions.jsonl (deflated 90%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

-----------------------------------------------------------