In [1]:
!pip install transformers datasets -q
!pip install -q sentence-transformers scipy

# create Codabench-style folders
!mkdir -p input/ref input/res output

# move reference / predictions if available
!mv solution.jsonl input/ref/solution.jsonl 2>/dev/null || true
!mv predictions.jsonl input/res/predictions.jsonl 2>/dev/null || true

!echo "âœ… Folder structure:"
!tree -L 2

âœ… Folder structure:
/bin/bash: line 1: tree: command not found


In [2]:
from google.colab import files
uploaded = files.upload()
print("Uploaded:", list(uploaded.keys()))

# create folders if not exist
import os
os.makedirs("data", exist_ok=True)
os.makedirs("input/ref", exist_ok=True)
os.makedirs("input/res", exist_ok=True)
os.makedirs("output", exist_ok=True)

# Move solution.jsonl if uploaded
if "solution.jsonl" in uploaded:
    os.replace("solution.jsonl", "input/ref/solution.jsonl")
    print("Moved solution.jsonl -> input/ref/solution.jsonl")

print("\nCurrent data folder contents:")
!ls -la data || true
print("\ninput/ref contents:")
!ls -la input/ref || true
print("\ninput/res contents:")
!ls -la input/res || true

Saving dev.json to dev.json
Saving evaluate.py to evaluate.py
Saving format_check.py to format_check.py
Saving predictions.jsonl to predictions.jsonl
Saving scoring.py to scoring.py
Saving solution.jsonl to solution.jsonl
Saving train2.json to train2.json
Uploaded: ['dev.json', 'evaluate.py', 'format_check.py', 'predictions.jsonl', 'scoring.py', 'solution.jsonl', 'train2.json']
Moved solution.jsonl -> input/ref/solution.jsonl

Current data folder contents:
total 8
drwxr-xr-x 2 root root 4096 Nov 17 13:34 .
drwxr-xr-x 1 root root 4096 Nov 17 13:34 ..

input/ref contents:
total 32
drwxr-xr-x 2 root root  4096 Nov 17 13:34 .
drwxr-xr-x 4 root root  4096 Nov 17 13:33 ..
-rw-r--r-- 1 root root 23446 Nov 17 13:34 solution.jsonl

input/res contents:
total 8
drwxr-xr-x 2 root root 4096 Nov 17 13:33 .
drwxr-xr-x 4 root root 4096 Nov 17 13:33 ..


In [4]:
import os, json, pandas as pd, numpy as np, torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from scipy.stats import spearmanr
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
import torch.nn.functional as F

TRAIN_FILE = "data/train2.json"
DEV_FILE = "data/dev.json"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
# Build story text
def build_story(row):
    sentence = str(row["sentence"]).replace(row["homonym"], f"[TGT] {row['homonym']} [TGT]", 1)
    parts = [str(row.get("precontext", "")), str(row.get("sentence", ""))]
    if row.get("ending", "") not in [None, ""]:
        parts.append(str(row.get("ending","")))
    return " ".join([p for p in parts if p])

# Custom Dataset
class WordPairDataset(Dataset):
    def __init__(self, df):
        self.data = df
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        return {
            "story": row["story"],
            "example_sentence": row["example_sentence"],
            "homonym": row["homonym"],
            "label": torch.tensor(float(row["average_norm"]), dtype=torch.float)
        }

# Function to extract pooled embedding for the entire sentence
def get_target_embedding(model, tokenizer, text, homonym=None):
    """
    Returns a single embedding representing the whole sentence/story.
    Pools the last 4 layers for better contextual info.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    # Get last 4 layers
    last_4_layers = outputs.hidden_states[-4:]  # tuple of (batch, seq_len, hidden_size)
    avg_hidden = torch.stack(last_4_layers).mean(dim=0)  # (batch, seq_len, hidden_size)

    # Pool entire sequence (mean pooling)
    pooled_emb = avg_hidden.mean(dim=1).squeeze()  # (hidden_size,)

    return pooled_emb

In [7]:
with open(TRAIN_FILE, "r") as f:
    data_train = json.load(f)

records = list(data_train.values()) if isinstance(data_train, dict) else data_train
df = pd.DataFrame(records)

# Split train / validation
train_df, val_df = train_test_split(df, test_size=0.15, random_state=42)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

# Build story and normalize
train_df["story"] = train_df.apply(build_story, axis=1)
val_df["story"] = val_df.apply(build_story, axis=1)
train_df["average_norm"] = (train_df["average"] - 1.0) / 4.0
val_df["average_norm"] = (val_df["average"] - 1.0) / 4.0

# Datasets & DataLoaders
train_dataset = WordPairDataset(train_df)
val_dataset = WordPairDataset(val_df)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=16)


In [None]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from scipy.stats import spearmanr
import numpy as np
import os
from torch.utils.data import Subset
from contextlib import nullcontext

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---------------------------
# MODELS TO USE
# ---------------------------
ensemble_model_names = [
    "bert-base-uncased",
    "roberta-base",
    "distilbert-base-uncased",
    "microsoft/deberta-v3-base",
    "google/electra-small-discriminator"
]

num_epochs = 5
ensemble_models = []

# ---------------------------
# BOOTSTRAP FUNCTION
# ---------------------------
def bootstrap_dataset(dataset):
    """Return a bootstrapped subset of the dataset (sampling with replacement)."""
    indices = np.random.choice(len(dataset), size=len(dataset), replace=True)
    return Subset(dataset, indices)

# ---------------------------
# HELPER: GET POOLED SENTENCE EMBEDDINGS (last 4 layers)
# ---------------------------
def get_pooled_embedding(model, tokenizer, texts, no_grad=False):
    enc = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad() if no_grad else nullcontext():
        outputs = model(**enc, output_hidden_states=True)
    hidden_states = outputs.hidden_states  # tuple: (13, batch, seq_len, hidden)
    # Take last 4 layers
    last_4 = torch.stack(hidden_states[-4:], dim=0).mean(dim=0)  # (batch, seq_len, hidden)
    # Pool over sequence tokens
    pooled = last_4.mean(dim=1)  # (batch, hidden)
    return pooled.squeeze(0)  # return 1D tensor

# ---------------------------
# TRAIN 5 BAGGED MODELS
# ---------------------------
for i, model_name in enumerate(ensemble_model_names):
    print(f"\nðŸŽ¯ BAGGING: Training model {i+1} on a bootstrapped dataset")
    torch.manual_seed(42 + i)
    np.random.seed(42 + i)

    # Bootstrapped dataset
    boot_dataset = bootstrap_dataset(train_dataset)
    boot_loader = torch.utils.data.DataLoader(boot_dataset, batch_size=8, shuffle=True)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, weight_decay=0.01)
    num_training_steps = len(boot_loader) * num_epochs
    num_warmup_steps = num_training_steps // 5
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)

    best_spearman, best_acc = -1.0, 0.0
    best_model_dir = f"./bagged_ensemble_{i+1}"
    os.makedirs(best_model_dir, exist_ok=True)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        for batch in boot_loader:
            optimizer.zero_grad()

            emb_story = get_pooled_embedding(model, tokenizer, batch["story"], no_grad=False)
            emb_example = get_pooled_embedding(model, tokenizer, batch["example_sentence"], no_grad=False)
            cos_sim = F.cosine_similarity(emb_story, emb_example, dim=1)
            labels = batch["label"].to(device)

            loss = F.mse_loss(cos_sim, labels)
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        print(f"Epoch {epoch+1}: Avg Loss = {total_loss/len(boot_loader):.4f}")

        # -------------------
        # VALIDATION
        # -------------------
        model.eval()
        all_labels, all_sims = [], []
        for batch in val_dataloader:
            emb_story = get_pooled_embedding(model, tokenizer, batch["story"], no_grad=True)
            emb_example = get_pooled_embedding(model, tokenizer, batch["example_sentence"], no_grad=True)
            cos_sim = F.cosine_similarity(emb_story, emb_example, dim=1)

            all_sims.extend(cos_sim.cpu().numpy())
            all_labels.extend(batch["label"].numpy())

        labels_np = np.array(all_labels)
        sims_np = np.array(all_sims)
        labels_scaled = labels_np * 4 + 1
        sims_scaled = (sims_np + 1)/2 * 4 + 1

        spearman_corr, _ = spearmanr(labels_scaled, sims_scaled)
        acc_within_std = np.mean(np.abs(sims_scaled - labels_scaled) <= 1.0)

        print(f"Validation â€” Spearman {spearman_corr:.4f}, Acc {acc_within_std:.4f}")

        # Save best version
        if spearman_corr > best_spearman:
            best_spearman = spearman_corr
            best_acc = acc_within_std
            model.save_pretrained(best_model_dir)
            tokenizer.save_pretrained(best_model_dir)

    ensemble_models.append(best_model_dir)
    print(f"âœ… Saved bagged model {i+1} â†’ {best_model_dir}")

# ============================================================================
#                         BAGGED ENSEMBLE PREDICTIONS
# ============================================================================
print("\nðŸ”¹ Generating BAGGED ensemble predictions...")

all_model_preds = []

for model_dir in ensemble_models:
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModel.from_pretrained(model_dir).to(device)
    model.eval()

    preds = []
    for batch in val_dataloader:
        emb_story = get_pooled_embedding(model, tokenizer, batch["story"], no_grad=True)
        emb_example = get_pooled_embedding(model, tokenizer, batch["example_sentence"], no_grad=True)
        cos_sim = F.cosine_similarity(emb_story, emb_example, dim=1)
        sims_scaled = ((cos_sim.cpu().numpy() + 1)/2)*4 + 1
        preds.append(sims_scaled)

    all_model_preds.append(np.concatenate(preds))

all_model_preds = np.stack(all_model_preds)

# Simple average works best for bagging
ensemble_preds = np.mean(all_model_preds, axis=0)

labels = np.concatenate([batch["label"].numpy()*4+1 for batch in val_dataloader])

spearman_corr, _ = spearmanr(labels, ensemble_preds)
acc_within_std = np.mean(np.abs(ensemble_preds - labels) <= 1.0)

print(f"\nðŸ“Š BAGGED Ensemble â€” Spearman: {spearman_corr:.4f}, Acc: {acc_within_std:.4f}")



ðŸŽ¯ BAGGING: Training model 1 on a bootstrapped dataset


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [13]:
DATA_PATH = "data/dev.json"
OUT_PATH = "input/res/predictions.jsonl"

# Load data
with open(DATA_PATH, "r") as f:
    data = json.load(f)

# Load ensemble models
loaded_models = []
for model_dir in ensemble_models:
    tok = AutoTokenizer.from_pretrained(model_dir)
    mdl = AutoModel.from_pretrained(model_dir).to(device)
    mdl.eval()
    loaded_models.append((mdl, tok))

# Predictions
predictions = []
for key, sample in data.items():
    story = build_story(sample)
    example_sentence = sample["example_sentence"]
    homonym = sample["homonym"]

    sims = []
    for mdl, tok in loaded_models:
        emb_story = get_target_embedding(mdl, tok, story, homonym)
        emb_example = get_target_embedding(mdl, tok, example_sentence, homonym)
        sims.append(F.cosine_similarity(emb_story, emb_example, dim=0).item())

    avg_sim = np.mean(sims)
    score = ((avg_sim + 1) / 2) * 4 + 1
    score_int = min(5, max(1, int(round(score))))
    predictions.append({"id": key, "prediction": score_int})

# Save predictions
os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)
with open(OUT_PATH, "w") as f:
    for p in predictions:
        f.write(json.dumps(p) + "\n")

print(f"âœ… Saved {len(predictions)} ensemble predictions to {OUT_PATH}")

âœ… Saved 588 ensemble predictions to input/res/predictions.jsonl


In [14]:
!python3 scoring.py input/ref/solution.jsonl input/res/predictions.jsonl output/scores.json

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file input/res/predictions.jsonl on input/ref/solution.jsonl
----------
Spearman Correlation: 0.28022978005912813
Spearman p-Value: 4.5204916334374236e-12
----------
Accuracy: 0.5799319727891157 (341/588)
Results dumped into scores.json successfully.
