In [None]:
!pip install transformers datasets

In [None]:
pip install -q sentence-transformers scipy

In [None]:
# create Codabench-style folders
!mkdir -p input/ref input/res output
# move the reference (ground truth) file
!mkdir -p input/ref input/res output
!mv solution.jsonl input/ref/solution.jsonl
!mv predictions.jsonl input/res/predictions.jsonl
!echo "✅ Folder structure:"
!tree -L 2

mv: cannot stat 'solution.jsonl': No such file or directory
mv: cannot stat 'predictions.jsonl': No such file or directory
✅ Folder structure:
/bin/bash: line 1: tree: command not found


In [None]:
from google.colab import files

# Opens the Colab file chooser: click "Choose files" and pick one or more files.
# The returned mapping is keyed by the uploaded file names.
uploaded = files.upload()
print("Uploaded:", [name for name in uploaded.keys()])

Saving dev.json to dev.json
Saving evaluate.py to evaluate.py
Saving format_check.py to format_check.py
Saving scoring.py to scoring.py
Saving solution.jsonl to solution.jsonl
Saving train2.json to train2.json
Uploaded: ['dev.json', 'evaluate.py', 'format_check.py', 'scoring.py', 'solution.jsonl', 'train2.json']


In [None]:
# create folders
import os
os.makedirs("data", exist_ok=True)
os.makedirs("input/ref", exist_ok=True)
os.makedirs("input/res", exist_ok=True)
os.makedirs("output", exist_ok=True)

# If you don't have data files in the workspace, upload them now:
print("If you already uploaded data/dev.json and data/train.json, ignore the upload prompt.")
from google.colab import files
uploaded = files.upload()  # use the file chooser to upload train.json and dev.json if needed
print("Uploaded:", list(uploaded.keys()))

# If you uploaded a solution.jsonl file here and want it in input/ref:
if "solution.jsonl" in uploaded:
    os.replace("solution.jsonl", "input/ref/solution.jsonl")
    print("Moved solution.jsonl -> input/ref/solution.jsonl")

print("\nCurrent data folder contents:")
!ls -la data || true
print("\ninput/ref contents:")
!ls -la input/ref || true
print("\ninput/res contents:")
!ls -la input/res || true

If you already uploaded data/dev.json and data/train.json, ignore the upload prompt.


Uploaded: []

Current data folder contents:
total 4188
drwxr-xr-x 2 root root    4096 Nov 22 19:36 .
drwxr-xr-x 1 root root    4096 Nov 22 19:36 ..
-rw-r--r-- 1 root root  514789 Nov 22 19:35 dev.json
-rw-r--r-- 1 root root 3762291 Nov 22 19:35 train2.json

input/ref contents:
total 8
drwxr-xr-x 2 root root 4096 Nov 22 19:34 .
drwxr-xr-x 4 root root 4096 Nov 22 19:34 ..

input/res contents:
total 8
drwxr-xr-x 2 root root 4096 Nov 22 19:34 .
drwxr-xr-x 4 root root 4096 Nov 22 19:34 ..


In [None]:
import os, json, pandas as pd, numpy as np, torch, pickle
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from scipy.stats import spearmanr
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
import torch.nn.functional as F
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Directory the fine-tuned model, tokenizer and score mapper are saved to; when
# it already exists and is non-empty, the model is loaded from it instead of trained.
MODEL_DIR = "./sbert-ambi"
# Training data (JSON); only read when no saved model is found in MODEL_DIR.
TRAIN_FILE = "data/train2.json"
# Development data path; not referenced in the visible part of this cell.
DEV_FILE = "data/dev.json"

# -----------------------------
# Build story helper (FIXED)
# -----------------------------
def build_story(row):
    """Assemble the story text for one sample, marking the homonym in the sentence.

    The first occurrence of ``row["homonym"]`` inside ``row["sentence"]`` is
    wrapped as ``[TGT] <homonym> [TGT]``. The result is the space-joined
    sequence of precontext, marked sentence and ending, skipping any part
    that is empty or missing.

    Args:
        row: Mapping-like sample (plain dict or pandas Series) providing
            ``sentence`` and ``homonym`` and, optionally, ``precontext`` and
            ``ending``.

    Returns:
        The assembled story string.
    """
    def _clean(value):
        # Missing optional fields show up as None (JSON null) or as a float NaN
        # (pandas fills absent keys with NaN). str() would turn those into the
        # literal words "None" / "nan" inside the model input, so drop them.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    homonym = str(row["homonym"])
    # NOTE(review): the match is case-sensitive and substring-based, so a
    # capitalised or embedded occurrence may be missed or mis-marked - confirm
    # whether the data needs word-boundary matching.
    sentence = str(row["sentence"]).replace(homonym, f"[TGT] {homonym} [TGT]", 1)
    parts = [_clean(row.get("precontext", "")), sentence, _clean(row.get("ending", ""))]
    return " ".join(p for p in parts if p)


# -----------------------------
# Load pretrained model or train
# -----------------------------
if os.path.exists(MODEL_DIR) and os.listdir(MODEL_DIR):
    # A non-empty MODEL_DIR is treated as a finished run: reuse it, skip training.
    print(f"Found saved model at {MODEL_DIR}, loading it.")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
    base_model = AutoModel.from_pretrained(MODEL_DIR)

else:
    # -----------------------------
    # Load training data
    # -----------------------------
    with open(TRAIN_FILE, "r") as f:
        data = json.load(f)

    # A dict payload contributes its values as records; anything else is used as-is.
    if isinstance(data, dict):
        records = list(data.values())
    else:
        records = data
    df = pd.DataFrame(records)

    # 90/10 train/validation split with a fixed seed; both parts get a fresh 0..n-1 index.
    train_df, val_df = (
        part.reset_index(drop=True)
        for part in train_test_split(df, test_size=0.1, random_state=42)
    )

    # Story text plus labels rescaled by (average - 1) / 4
    # (presumably mapping a 1-5 rating onto [0, 1] - confirm the rating range).
    train_df["story"] = train_df.apply(build_story, axis=1)
    train_df["average_norm"] = train_df["average"].sub(1.0).div(4.0)
    val_df["story"] = val_df.apply(build_story, axis=1)
    val_df["average_norm"] = val_df["average"].sub(1.0).div(4.0)

    # -----------------------------
    # Use MPNet model
    # -----------------------------
    model_name = "sentence-transformers/all-mpnet-base-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # output_hidden_states=True exposes the per-layer states read by get_target_embedding.
    base_model = AutoModel.from_pretrained(model_name, output_hidden_states=True)

    # -----------------------------
    # Dataset class
    # -----------------------------
    class WordPairDataset(Dataset):
        """Map-style dataset exposing rows of a prepared DataFrame as samples.

        Each item is a dict holding the row's ``story``, ``example_sentence``
        and ``homonym`` values unchanged, plus ``label``: the row's
        ``average_norm`` as a float32 scalar tensor.
        """

        _TEXT_FIELDS = ("story", "example_sentence", "homonym")

        def __init__(self, df):
            self.data = df

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            # Positional lookup, so idx is the row position rather than an index label.
            record = self.data.iloc[idx]
            sample = {field: record[field] for field in self._TEXT_FIELDS}
            sample["label"] = torch.tensor(float(record["average_norm"]), dtype=torch.float)
            return sample

    # Seed torch's RNG so batch shuffling and dropout are repeatable between
    # runs, as far as the backend allows (the train/validation split is
    # already seeded with random_state=42).
    SEED = 42
    torch.manual_seed(SEED)

    train_dataset = WordPairDataset(train_df)
    val_dataset = WordPairDataset(val_df)

    # Only the training loader shuffles; validation order stays fixed.
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
    val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=16)

    # -----------------------------
    # Training setup
    # -----------------------------
    optimizer = torch.optim.AdamW(base_model.parameters(), lr=1e-5, weight_decay=0.01)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    base_model.to(device)

    EPOCHS = 15
    # One scheduler step is taken per batch, so the schedule spans every batch of every epoch.
    num_training_steps = len(train_dataloader) * EPOCHS

    # Linear warmup over the first 10% of steps, then the linear schedule for the rest.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * num_training_steps),
        num_training_steps=num_training_steps
    )

    # -----------------------------
    # Improved homonym embedding extractor
    # -----------------------------
    def get_target_embedding(text, homonym):
        """Return a single embedding vector for `homonym` as it occurs in `text`.

        The text is encoded with the module-level tokenizer and base_model, the
        last 12 hidden-state tensors are averaged, and the vectors of the
        selected token positions are mean-pooled. Positions are the tokens
        between the first two "[TGT]" tokens when present; otherwise every
        token that contains, or is contained in, the lower-cased homonym.
        If no position is selected, the mean over all tokens is returned.

        Gradients are tracked only while base_model is in training mode.
        """
        encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

        with torch.set_grad_enabled(base_model.training):
            model_out = base_model(**encoded)

        # Average the last 12 hidden-state tensors, then take the only sequence in the batch.
        token_vectors = torch.stack(model_out.hidden_states[-12:]).mean(dim=0)[0]

        tokens = tokenizer.convert_ids_to_tokens(encoded["input_ids"][0])

        # Preferred path: tokens strictly between the first two "[TGT]" markers.
        # NOTE(review): nothing in view registers "[TGT]" with the tokenizer, so
        # it is presumably split into several word pieces and this branch may
        # never fire - confirm, and keep the inference copy in sync if changed.
        marker_positions = [pos for pos, tok in enumerate(tokens) if tok == "[TGT]"]

        if len(marker_positions) >= 2:
            selected = list(range(marker_positions[0] + 1, marker_positions[1]))
        else:
            # Fallback: loose substring match in either direction on cleaned tokens.
            needle = homonym.lower()
            cleaned = (tok.replace("##", "").replace("Ġ", "").lower() for tok in tokens)
            selected = [
                pos for pos, piece in enumerate(cleaned)
                if needle in piece or piece in needle
            ]

        if not selected:
            return token_vectors.mean(dim=0)

        return token_vectors[selected].mean(dim=0)


    # -----------------------------
    # TRAINING LOOP
    # -----------------------------
    print("🚀 Starting training...")

    best_spearman = -1
    best_val_acc = None
    checkpoint_saved = False
    patience_counter = 0
    PATIENCE = 3
    BEST_CHECKPOINT = "best_model.pt"

    for epoch in range(1, EPOCHS + 1):
        base_model.train()
        epoch_loss = 0

        # --- Training ---
        for batch in train_dataloader:
            optimizer.zero_grad()
            losses = []

            # Samples are encoded one at a time; the batch only groups the loss/step.
            for i in range(len(batch["story"])):
                emb_story = get_target_embedding(batch["story"][i], batch["homonym"][i])
                emb_examp = get_target_embedding(batch["example_sentence"][i], batch["homonym"][i])

                emb_story = F.normalize(emb_story, p=2, dim=0)
                emb_examp = F.normalize(emb_examp, p=2, dim=0)

                # Squared error between the cosine similarity and the normalised label.
                cos_sim = F.cosine_similarity(emb_story, emb_examp, dim=0)
                loss = (cos_sim - batch["label"][i].to(device)) ** 2
                losses.append(loss)

            loss_batch = torch.stack(losses).mean()
            loss_batch.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(base_model.parameters(), max_norm=1.0)

            optimizer.step()
            scheduler.step()
            epoch_loss += loss_batch.item()

        print(f"Epoch {epoch} | Train Loss: {epoch_loss / len(train_dataloader):.4f}")

        # --- Validation ---
        base_model.eval()
        all_sims, all_labels = [], []

        with torch.no_grad():
            for batch in val_dataloader:
                batch_labels = batch["label"].numpy()
                all_labels.extend(batch_labels)
                sims = []

                for i in range(len(batch["story"])):
                    emb_story = get_target_embedding(batch["story"][i], batch["homonym"][i])
                    emb_examp = get_target_embedding(batch["example_sentence"][i], batch["homonym"][i])

                    emb_story = F.normalize(emb_story, p=2, dim=0)
                    emb_examp = F.normalize(emb_examp, p=2, dim=0)

                    sim = F.cosine_similarity(emb_story, emb_examp, dim=0).item()
                    sims.append(sim)

                all_sims.extend(sims)

        # --- Metrics ---
        val_spearman, _ = spearmanr(all_sims, all_labels)
        # Undo the label normalisation, and map similarity from [-1, 1] onto the
        # same scale, to count predictions that land within 1.0 of the label.
        labels_scaled = (np.array(all_labels) * 4) + 1
        sims_scaled = ((np.array(all_sims) + 1) / 2) * 4 + 1
        val_acc = np.mean(np.abs(sims_scaled - labels_scaled) <= 1.0)

        print(f"Epoch {epoch} | Spearman: {val_spearman:.4f}, Acc_within_1.0: {val_acc:.4f}")

        # --- Early Stopping ---
        if val_spearman > best_spearman:
            best_spearman = val_spearman
            best_val_acc = val_acc
            checkpoint_saved = True
            patience_counter = 0
            print(f"💾 New best model! Saving checkpoint.")
            torch.save(base_model.state_dict(), BEST_CHECKPOINT)
        else:
            patience_counter += 1
            print(f"⚠ No improvement. Patience: {patience_counter}/{PATIENCE}")
            if patience_counter >= PATIENCE:
                print("⛔ Early stopping triggered. Training halted.")
                break

    # Training continues for up to PATIENCE epochs past the best one, so the
    # in-memory weights can be worse than the saved checkpoint. Restore the best
    # weights so the calibration and save_pretrained() that follow use the model
    # that actually achieved best_spearman. The flag guards against loading a
    # stale checkpoint file left over from a previous run.
    if checkpoint_saved:
        base_model.load_state_dict(torch.load(BEST_CHECKPOINT, map_location=device))
        # Keep the reported linear-mapping accuracy in line with the restored weights.
        val_acc = best_val_acc
        print(f"🔁 Restored best checkpoint (Spearman: {best_spearman:.4f})")

    # -----------------------------
    # CALIBRATE SCORE MAPPER
    # -----------------------------
    print("\n📊 Calibrating similarity-to-score mapping...")
    base_model.eval()
    all_sims, all_true_scores = [], []

    with torch.no_grad():
        for batch in val_dataloader:
            for i in range(len(batch["story"])):
                emb_story = get_target_embedding(batch["story"][i], batch["homonym"][i])
                emb_examp = get_target_embedding(batch["example_sentence"][i], batch["homonym"][i])

                emb_story = F.normalize(emb_story, p=2, dim=0)
                emb_examp = F.normalize(emb_examp, p=2, dim=0)

                sim = F.cosine_similarity(emb_story, emb_examp, dim=0).item()
                all_sims.append(sim)
                all_true_scores.append((batch["label"][i].item() * 4) + 1)

    # Fit polynomial regression for better mapping
    poly = PolynomialFeatures(degree=2)
    X = poly.fit_transform(np.array(all_sims).reshape(-1, 1))
    mapper = LinearRegression().fit(X, all_true_scores)

    # Test calibrated mapper
    predicted_scores = mapper.predict(X)
    predicted_scores_int = np.clip(np.round(predicted_scores), 1, 5)
    calibrated_acc = np.mean(np.abs(predicted_scores_int - np.array(all_true_scores)) <= 1.0)

    print(f"✅ Calibrated mapper accuracy: {calibrated_acc:.4f}")
    print(f"   (vs linear mapping: {val_acc:.4f})")

    # -----------------------------
    # SAVE MODEL + MAPPER
    # -----------------------------
    os.makedirs(MODEL_DIR, exist_ok=True)
    base_model.save_pretrained(MODEL_DIR)
    tokenizer.save_pretrained(MODEL_DIR)

    # Save mapper
    with open(os.path.join(MODEL_DIR, "score_mapper.pkl"), "wb") as f:
        pickle.dump((poly, mapper), f)

    print(f"✅ Training complete. Model and mapper saved to: {MODEL_DIR}")

print("Model ready.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

🚀 Starting training...
Epoch 1 | Train Loss: 0.1115
Epoch 1 | Spearman: 0.2652, Acc_within_1.0: 0.5749
💾 New best model! Saving checkpoint.
Epoch 2 | Train Loss: 0.0860
Epoch 2 | Spearman: 0.4991, Acc_within_1.0: 0.5934
💾 New best model! Saving checkpoint.
Epoch 3 | Train Loss: 0.0668
Epoch 3 | Spearman: 0.5525, Acc_within_1.0: 0.5914
💾 New best model! Saving checkpoint.
Epoch 4 | Train Loss: 0.0558
Epoch 4 | Spearman: 0.5760, Acc_within_1.0: 0.5934
💾 New best model! Saving checkpoint.
Epoch 5 | Train Loss: 0.0484
Epoch 5 | Spearman: 0.5916, Acc_within_1.0: 0.5996
💾 New best model! Saving checkpoint.
Epoch 6 | Train Loss: 0.0421
Epoch 6 | Spearman: 0.6012, Acc_within_1.0: 0.5955
💾 New best model! Saving checkpoint.
Epoch 7 | Train Loss: 0.0375
Epoch 7 | Spearman: 0.5975, Acc_within_1.0: 0.6037
⚠ No improvement. Patience: 1/3
Epoch 8 | Train Loss: 0.0337
Epoch 8 | Spearman: 0.5990, Acc_within_1.0: 0.6016
⚠ No improvement. Patience: 2/3
Epoch 9 | Train Loss: 0.0305
Epoch 9 | Spearman: 0.

In [None]:
# STEP 4 — Generate predictions.jsonl using your trained word-level model
import os, json, torch, numpy as np, pickle
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F

MODEL_PATH = "./sbert-ambi"
DATA_PATH = "data/dev.json"          # switch to data/test.json when submitting
OUT_PATH  = "input/res/predictions.jsonl"

# Load fine-tuned model and tokenizer
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# output_hidden_states=True exposes the per-layer states read by get_target_embedding.
model = AutoModel.from_pretrained(MODEL_PATH, output_hidden_states=True)
model = model.to(device)
model.eval()

# Load calibrated score mapper
# (a pickled (poly, mapper) pair; only unpickle files produced by your own training run)
mapper_path = os.path.join(MODEL_PATH, "score_mapper.pkl")
use_mapper = os.path.exists(mapper_path)
if not use_mapper:
    print("⚠️  No score mapper found, using linear mapping")
else:
    with open(mapper_path, "rb") as f:
        poly, mapper = pickle.load(f)
    print("✅ Loaded calibrated score mapper")

# Helper: build story text (FIXED - now matches training!)
def build_story(row):
    """Assemble the story text for one sample, marking the homonym in the sentence.

    The first occurrence of ``row["homonym"]`` inside ``row["sentence"]`` is
    wrapped as ``[TGT] <homonym> [TGT]``. The result is the space-joined
    sequence of precontext, marked sentence and ending, skipping any part
    that is empty or missing.

    Args:
        row: Mapping-like sample (plain dict or pandas Series) providing
            ``sentence`` and ``homonym`` and, optionally, ``precontext`` and
            ``ending``.

    Returns:
        The assembled story string.
    """
    def _clean(value):
        # Missing optional fields show up as None (JSON null) or as a float NaN.
        # str() would turn those into the literal words "None" / "nan" inside
        # the model input, so drop them instead.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    homonym = str(row["homonym"])
    # NOTE(review): the match is case-sensitive and substring-based, so a
    # capitalised or embedded occurrence may be missed or mis-marked - confirm
    # whether the data needs word-boundary matching.
    sentence = str(row["sentence"]).replace(homonym, f"[TGT] {homonym} [TGT]", 1)
    parts = [_clean(row.get("precontext", "")), sentence, _clean(row.get("ending", ""))]
    return " ".join(p for p in parts if p)

# Function to extract contextual embedding for the target word (matches training logic)
def get_target_embedding(text, homonym, num_layers=12):
    """Return a single embedding vector for `homonym` as it occurs in `text`.

    The text is encoded with the module-level tokenizer and model (no
    gradients), the last `num_layers` hidden-state tensors are averaged, and
    the vectors of the selected token positions are mean-pooled.

    Args:
        text: Input text, optionally containing "[TGT]" markers around the target.
        homonym: Target word, used by the fallback token match.
        num_layers: How many of the final hidden states to average. Defaults
            to 12, the value the training cell uses; the score mapper was
            calibrated on similarities computed that way, so averaging a
            different number of layers here skews the predicted scores.

    Returns:
        A 1-D tensor: the mean vector of the selected tokens, or of all tokens
        when none is selected.

    Raises:
        ValueError: If `num_layers` is smaller than 1.
    """
    if num_layers < 1:
        # A slice of [-0:] would silently select every hidden state.
        raise ValueError("num_layers must be >= 1")

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.hidden_states
    stacked = torch.stack(hidden_states[-num_layers:])
    # Mean over layers, then take the only sequence in the batch.
    hidden = torch.mean(stacked, dim=0)[0]

    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    # Better matching: check for [TGT] markers first
    # NOTE(review): nothing in view registers "[TGT]" with the tokenizer, so it
    # is presumably split into several word pieces and this branch may never
    # fire - confirm, and keep the training copy in sync if this is changed.
    tgt_indices = [i for i, t in enumerate(tokens) if t == "[TGT]"]

    if len(tgt_indices) >= 2:
        start, end = tgt_indices[0] + 1, tgt_indices[1]
        indices = list(range(start, end))
    else:
        # Fallback: find homonym tokens (loose substring match in either direction)
        homonym_lower = homonym.lower()
        indices = []
        for i, t in enumerate(tokens):
            t_clean = t.replace("##", "").replace("Ġ", "").lower()
            if homonym_lower in t_clean or t_clean in homonym_lower:
                indices.append(i)

    if len(indices) == 0:
        return hidden.mean(dim=0)

    emb = hidden[indices].mean(dim=0)
    return emb

# Load dataset
with open(DATA_PATH, "r") as f:
    data = json.load(f)

records = list(data.values()) if isinstance(data, dict) else data

predictions = []

# Generate predictions
for key, sample in data.items():
    story = build_story(sample)
    example_sentence = sample["example_sentence"]
    homonym = sample["homonym"]

    # Get target word embeddings
    emb_story = get_target_embedding(story, homonym)
    emb_example = get_target_embedding(example_sentence, homonym)

    # Normalize embeddings
    emb_story = F.normalize(emb_story, p=2, dim=0)
    emb_example = F.normalize(emb_example, p=2, dim=0)

    # Compute cosine similarity
    sim = F.cosine_similarity(emb_story, emb_example, dim=0).item()

    # Map similarity to score (1-5)
    if use_mapper:
        # Use calibrated polynomial mapper
        X = poly.transform([[sim]])
        score = mapper.predict(X)[0]
        score_int = int(round(np.clip(score, 1, 5)))
    else:
        # Fallback to linear mapping
        score = ((sim + 1) / 2) * 4 + 1
        score_int = int(round(score))
        score_int = min(5, max(1, score_int))

    predictions.append({"id": key, "prediction": score_int})

# Save predictions.jsonl
os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)
with open(OUT_PATH, "w") as f:
    for p in predictions:
        f.write(json.dumps(p) + "\n")

print(f"✅ Saved {len(predictions)} predictions to {OUT_PATH}")

✅ Loaded calibrated score mapper
✅ Saved 588 predictions to input/res/predictions.jsonl


In [None]:
# Run the uploaded scoring script on the reference file, the predictions file
# and an output path for the scores. The script reports "No reference file ...
# found." when input/ref/solution.jsonl is missing, so stage that file first.
!python3 scoring.py input/ref/solution.jsonl input/res/predictions.jsonl output/scores.json

Importing...
Starting Scoring script...
No reference file input/ref/solution.jsonl found.


In [None]:
# Package the predictions for submission. With -j the file is stored without
# its directory path (the archive lists it as plain "predictions.jsonl").
!zip -j my_submission.zip input/res/predictions.jsonl
from google.colab import files
# Trigger a browser download of the archive from the Colab runtime.
files.download("my_submission.zip")

  adding: predictions.jsonl (deflated 90%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

-----------------------------------------------------------