In [None]:
!pip install transformers datasets

In [None]:
pip install -q sentence-transformers scipy

In [None]:
# create Codabench-style folders
!mkdir -p input/ref input/res output
# move the reference (ground truth) file
!mkdir -p input/ref input/res output
!mv solution.jsonl input/ref/solution.jsonl
!mv predictions.jsonl input/res/predictions.jsonl
!echo "âœ… Folder structure:"
!tree -L 2

mv: cannot stat 'solution.jsonl': No such file or directory
mv: cannot stat 'predictions.jsonl': No such file or directory
âœ… Folder structure:
/bin/bash: line 1: tree: command not found


In [None]:
from google.colab import files

# Opens the Colab file chooser: click "Choose files" and select the files
# (multi-select is supported). The result maps each uploaded file name to its content.
uploaded = files.upload()
print("Uploaded:", list(uploaded))

Saving dev.json to dev.json
Saving evaluate.py to evaluate.py
Saving format_check.py to format_check.py
Saving scoring.py to scoring.py
Saving solution.jsonl to solution.jsonl
Uploaded: ['dev.json', 'evaluate.py', 'format_check.py', 'scoring.py', 'solution.jsonl']


In [None]:
# create folders
import os
os.makedirs("data", exist_ok=True)
os.makedirs("input/ref", exist_ok=True)
os.makedirs("input/res", exist_ok=True)
os.makedirs("output", exist_ok=True)

# If you don't have data files in the workspace, upload them now:
print("If you already uploaded data/dev.json and data/train.json, ignore the upload prompt.")
from google.colab import files
uploaded = files.upload()  # use the file chooser to upload train.json and dev.json if needed
print("Uploaded:", list(uploaded.keys()))

# If you uploaded a solution.jsonl file here and want it in input/ref:
if "solution.jsonl" in uploaded:
    os.replace("solution.jsonl", "input/ref/solution.jsonl")
    print("Moved solution.jsonl -> input/ref/solution.jsonl")

print("\nCurrent data folder contents:")
!ls -la data || true
print("\ninput/ref contents:")
!ls -la input/ref || true
print("\ninput/res contents:")
!ls -la input/res || true

If you already uploaded data/dev.json and data/train.json, ignore the upload prompt.


Saving train_aug.json to train_aug.json
Uploaded: ['train_aug.json']

Current data folder contents:
total 8
drwxr-xr-x 2 root root 4096 Nov 22 08:27 .
drwxr-xr-x 1 root root 4096 Nov 22 08:28 ..

input/ref contents:
total 8
drwxr-xr-x 2 root root 4096 Nov 22 08:26 .
drwxr-xr-x 4 root root 4096 Nov 22 08:26 ..

input/res contents:
total 8
drwxr-xr-x 2 root root 4096 Nov 22 08:26 .
drwxr-xr-x 4 root root 4096 Nov 22 08:26 ..


In [None]:
import os, json, pandas as pd, numpy as np, torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from scipy.stats import spearmanr
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
import torch.nn.functional as F

# Directory the fine-tuned model and tokenizer are saved to, and reloaded from
# when it already exists and is non-empty.
MODEL_DIR = "./sbert-ambi"
# Training data (JSON); only read when no saved model is found in MODEL_DIR.
TRAIN_FILE = "data/train_aug.json"
# Dev-set path. NOTE(review): not referenced by the visible code of this cell;
# the prediction cell defines its own DATA_PATH with the same value.
DEV_FILE = "data/dev.json"

# -----------------------------
# Build story helper
# -----------------------------
def build_story(row):
    """Build the story text for one sample, with the target word marked.

    The first occurrence of the homonym inside `sentence` is wrapped as
    ``[TGT] <homonym> [TGT]`` (case-sensitive match), then the non-empty
    parts are joined with single spaces in the order
    precontext, sentence, ending.

    Args:
        row: mapping-like object (dict or pandas Series) providing
            "sentence" and "homonym", and optionally "precontext" and "ending".

    Returns:
        The assembled story string.

    Raises:
        KeyError: if "sentence" or "homonym" is missing.
    """
    def _text(value):
        # Missing values arrive as None (JSON null) or float NaN (pandas);
        # str() would turn them into the literal words "None" / "nan".
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    homonym = _text(row["homonym"])
    sentence = _text(row["sentence"])
    if homonym:
        # Guarded: replacing an empty string would insert markers at position 0.
        sentence = sentence.replace(homonym, f"[TGT] {homonym} [TGT]", 1)

    parts = [_text(row.get("precontext", "")), sentence, _text(row.get("ending", ""))]
    return " ".join(p for p in parts if p)


# -----------------------------
# Load pretrained model or train
# -----------------------------
if os.path.exists(MODEL_DIR) and os.listdir(MODEL_DIR):
    # A non-empty model directory already exists: reuse it instead of training.
    print(f"Found saved model at {MODEL_DIR}, loading it.")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
    base_model = AutoModel.from_pretrained(MODEL_DIR)

else:
    # -----------------------------
    # Load training data
    # -----------------------------
    with open(TRAIN_FILE, "r") as f:
        data = json.load(f)

    # The JSON may be either {id: sample, ...} or a plain list of samples.
    if isinstance(data, dict):
        records = list(data.values())
    else:
        records = data
    df = pd.DataFrame(records)

    # 90/10 train/validation split with a fixed seed; each part gets a fresh 0..n-1 index.
    train_df, val_df = (
        part.reset_index(drop=True)
        for part in train_test_split(df, test_size=0.1, random_state=42)
    )

    # Story text with the target word marked.
    train_df["story"] = train_df.apply(build_story, axis=1)
    val_df["story"] = val_df.apply(build_story, axis=1)

    # Rescale the "average" rating linearly: (average - 1) / 4.
    train_df["average_norm"] = train_df["average"].sub(1.0).div(4.0)
    val_df["average_norm"] = val_df["average"].sub(1.0).div(4.0)

    # -----------------------------
    # Use MPNet model
    # -----------------------------
    model_name = "sentence-transformers/all-mpnet-base-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # output_hidden_states=True makes every forward pass return the per-layer hidden states.
    base_model = AutoModel.from_pretrained(model_name, output_hidden_states=True)

    # -----------------------------
    # Dataset class
    # -----------------------------
    class WordPairDataset(Dataset):
        """Row-wise view over a DataFrame of (story, example sentence, homonym, label).

        Each item is a dict with the raw strings under "story",
        "example_sentence" and "homonym", plus the value of the
        "average_norm" column as a float tensor under "label".
        """

        def __init__(self, df):
            self.data = df

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            # Positional lookup: idx is a row position, not an index label.
            record = self.data.iloc[idx]
            sample = {field: record[field] for field in ("story", "example_sentence", "homonym")}
            sample["label"] = torch.tensor(float(record["average_norm"]), dtype=torch.float)
            return sample

    # Wrap both splits; only the training split is shuffled.
    train_dataset, val_dataset = WordPairDataset(train_df), WordPairDataset(val_df)
    train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)

    # -----------------------------
    # Training setup
    # -----------------------------
    optimizer = torch.optim.AdamW(base_model.parameters(), lr=2e-5)
    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    base_model.to(device)

    EPOCHS = 10
    # One scheduler step is taken per batch, so the schedule spans batches * epochs steps.
    num_training_steps = EPOCHS * len(train_dataloader)

    # Linear warm-up over the first 10% of the steps, then linear decay.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(num_training_steps * 0.1),
        num_training_steps=num_training_steps,
    )

    # -----------------------------
    # Proper homonym embedding extractor
    # -----------------------------
    def get_target_embedding(text, homonym):
        """Return one embedding vector for `homonym` as it appears in `text`.

        The text is encoded on its own (batch of one). The last 8 entries of
        `outputs.hidden_states` are averaged, and the resulting token vectors
        are mean-pooled over every token whose string contains the lower-cased
        homonym. When no token matches, the mean over all tokens is returned.

        Gradients are tracked only while `base_model` is in training mode.

        NOTE(review): the match is a substring test on each token string, so
        tokens that merely contain the homonym are pooled too, and a homonym
        that the tokenizer splits into several pieces may match nothing and
        take the whole-sentence fallback. The prediction cell uses the same
        rule and the two must stay identical.
        """
        encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

        with torch.set_grad_enabled(base_model.training):
            model_out = base_model(**encoded)

        # Average the last 8 hidden-state tensors, then drop the batch dimension.
        layer_avg = torch.stack(model_out.hidden_states[-8:]).mean(dim=0)
        token_vectors = layer_avg[0]

        needle = homonym.lower()
        pieces = tokenizer.convert_ids_to_tokens(encoded["input_ids"][0])
        hits = [pos for pos, piece in enumerate(pieces) if needle in piece.lower()]

        if not hits:
            # Fallback: no token matched, use the whole-sentence mean.
            return token_vectors.mean(dim=0)

        # Mean pooling over the matching tokens.
        return token_vectors[hits].mean(dim=0)


    # -----------------------------
    # TRAINING LOOP
    # -----------------------------
    def pair_similarity(story, example_sentence, homonym):
        """Cosine similarity (0-dim tensor) between the target-word embedding
        of `story` and that of `example_sentence`."""
        emb_story = F.normalize(get_target_embedding(story, homonym), p=2, dim=0)
        emb_examp = F.normalize(get_target_embedding(example_sentence, homonym), p=2, dim=0)
        return F.cosine_similarity(emb_story, emb_examp, dim=0)

    def similarity_to_rating(sims):
        """Map cosine similarities back to the 1-5 rating scale.

        The loss regresses the similarity onto (average - 1) / 4, so the
        inverse mapping is 4 * sim + 1. Negative similarities have no
        counterpart on the rating scale and are clipped to 1.
        """
        return np.clip(np.asarray(sims, dtype=float) * 4.0 + 1.0, 1.0, 5.0)

    print("🚀 Starting training...")

    BEST_CKPT = "best_model.pt"
    best_spearman = -1
    best_saved = False      # True once a checkpoint was written during THIS run
    patience_counter = 0
    PATIENCE = 2

    for epoch in range(1, EPOCHS + 1):
        base_model.train()
        epoch_loss = 0

        # --- Training ---
        for batch in train_dataloader:
            optimizer.zero_grad()

            # Samples are encoded one at a time; the batch only groups them
            # for a single optimizer step.
            sims = torch.stack([
                pair_similarity(story, example, homonym)
                for story, example, homonym in zip(
                    batch["story"], batch["example_sentence"], batch["homonym"]
                )
            ])
            # Mean squared error between similarity and normalised label.
            loss_batch = ((sims - batch["label"].to(device)) ** 2).mean()
            loss_batch.backward()
            optimizer.step()
            scheduler.step()
            epoch_loss += loss_batch.item()

        print(f"Epoch {epoch} | Train Loss: {epoch_loss / len(train_dataloader):.4f}")

        # --- Validation ---
        base_model.eval()
        all_sims, all_labels = [], []

        with torch.no_grad():  # no gradient computation for validation
            for batch in val_dataloader:
                all_labels.extend(batch["label"].tolist())
                all_sims.extend(
                    pair_similarity(story, example, homonym).item()
                    for story, example, homonym in zip(
                        batch["story"], batch["example_sentence"], batch["homonym"]
                    )
                )

        # --- Metrics (computed and printed once per epoch) ---
        val_spearman, _ = spearmanr(all_sims, all_labels)
        labels_scaled = np.asarray(all_labels, dtype=float) * 4.0 + 1.0
        val_acc = np.mean(np.abs(similarity_to_rating(all_sims) - labels_scaled) <= 1.0)

        print(f"Epoch {epoch} | Spearman: {val_spearman:.4f}, Acc_within_1.0: {val_acc:.4f}")

        # --- Early Stopping ---
        # A NaN correlation compares False and therefore counts as "no improvement".
        if val_spearman > best_spearman:
            best_spearman = val_spearman
            patience_counter = 0
            print("💾 New best model! Saving checkpoint.")
            torch.save(base_model.state_dict(), BEST_CKPT)
            best_saved = True
        else:
            patience_counter += 1
            print(f"⚠ No improvement. Patience: {patience_counter}/{PATIENCE}")
            if patience_counter >= PATIENCE:
                print("⛔ Early stopping triggered. Training halted.")
                break

    # -----------------------------
    # SAVE MODEL
    # -----------------------------
    # Export the best-scoring weights rather than whatever the last epoch left
    # in memory (the last epochs before early stopping are, by definition,
    # not the best ones).
    if best_saved:
        base_model.load_state_dict(torch.load(BEST_CKPT, map_location=device))

    os.makedirs(MODEL_DIR, exist_ok=True)
    base_model.save_pretrained(MODEL_DIR)
    tokenizer.save_pretrained(MODEL_DIR)

    print("✅ Training complete. Model saved to:", MODEL_DIR)

# Reached on both paths: after loading a saved model, or after training and saving a new one.
print("Model ready.")

ðŸš€ Starting training...
Epoch 1 | Train Loss: 0.0770
Epoch 1 | Spearman: 0.6512, Acc_within_1.0: 0.5123
ðŸ’¾ New best model! Saving checkpoint.
Epoch 1 | Spearman: 0.6512, Acc_within_1.0: 0.5123
Epoch 2 | Train Loss: 0.0385
Epoch 2 | Spearman: 0.7638, Acc_within_1.0: 0.5123
ðŸ’¾ New best model! Saving checkpoint.
Epoch 2 | Spearman: 0.7638, Acc_within_1.0: 0.5123
Epoch 3 | Train Loss: 0.0222
Epoch 3 | Spearman: 0.8288, Acc_within_1.0: 0.5263
ðŸ’¾ New best model! Saving checkpoint.
Epoch 3 | Spearman: 0.8288, Acc_within_1.0: 0.5263
Epoch 4 | Train Loss: 0.0163
Epoch 4 | Spearman: 0.8284, Acc_within_1.0: 0.5088
âš  No improvement. Patience: 1/2
Epoch 4 | Spearman: 0.8284, Acc_within_1.0: 0.5088
Epoch 5 | Train Loss: 0.0121
Epoch 5 | Spearman: 0.8405, Acc_within_1.0: 0.5158
ðŸ’¾ New best model! Saving checkpoint.
Epoch 5 | Spearman: 0.8405, Acc_within_1.0: 0.5158
Epoch 6 | Train Loss: 0.0100
Epoch 6 | Spearman: 0.8353, Acc_within_1.0: 0.5228
âš  No improvement. Patience: 1/2
Epoch 6 | S

In [None]:
# STEP 4 — Generate predictions.jsonl using your trained word-level model
import os, json, torch, numpy as np
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F # Import F for torch.nn.functional alias

# Paths: fine-tuned model directory, input samples, and output predictions file.
MODEL_PATH = "./sbert-ambi"
# Switch DATA_PATH to data/test.json when submitting.
DATA_PATH = "data/dev.json"
OUT_PATH = "input/res/predictions.jsonl"

# Load fine-tuned model and tokenizer
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# Hidden states of every layer are requested because the embedding helper averages several of them.
model = AutoModel.from_pretrained(MODEL_PATH, output_hidden_states=True)
model.to(device)
# Inference only: switch to evaluation mode.
model.eval()

# Helper: build story text (must produce exactly the text the model was trained on)
def build_story(row):
    """Build the story text for one sample, with the target word marked.

    The first occurrence of the homonym inside `sentence` is wrapped as
    ``[TGT] <homonym> [TGT]`` (case-sensitive match), then the non-empty
    parts are joined with single spaces in the order
    precontext, sentence, ending.

    The marked sentence is what goes into the story: the training cell feeds
    the model marked text, so joining the raw sentence here would evaluate
    the model on inputs it never saw during fine-tuning.

    Args:
        row: mapping providing "sentence" and "homonym", and optionally
            "precontext" and "ending".

    Returns:
        The assembled story string.

    Raises:
        KeyError: if "sentence" or "homonym" is missing.
    """
    def _text(value):
        # Missing values arrive as None (JSON null) or float NaN;
        # str() would turn them into the literal words "None" / "nan".
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    homonym = _text(row["homonym"])
    sentence = _text(row["sentence"])
    if homonym:
        # Guarded: replacing an empty string would insert markers at position 0.
        sentence = sentence.replace(homonym, f"[TGT] {homonym} [TGT]", 1)

    parts = [_text(row.get("precontext", "")), sentence, _text(row.get("ending", ""))]
    return " ".join(p for p in parts if p)

# Function to extract contextual embedding for the target word (MUST match Step 3 logic!)
def get_target_embedding(text, homonym):
    """Return one embedding vector for `homonym` as it appears in `text`.

    The text is encoded on its own (batch of one) with gradients disabled.
    The last 8 entries of `outputs.hidden_states` are averaged, and the
    resulting token vectors are mean-pooled over every token whose string
    contains the lower-cased homonym. When no token matches, the mean over
    all tokens is returned.

    NOTE(review): the match is a substring test on each token string, so
    tokens that merely contain the homonym are pooled too, and a homonym that
    the tokenizer splits into several pieces may match nothing and take the
    whole-sentence fallback. Keep this rule identical to the training cell.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

    with torch.no_grad():
        model_out = model(**encoded)

    # Average the last 8 hidden-state tensors, then drop the batch dimension.
    layer_avg = torch.stack(model_out.hidden_states[-8:]).mean(dim=0)
    token_vectors = layer_avg[0]

    needle = homonym.lower()
    pieces = tokenizer.convert_ids_to_tokens(encoded["input_ids"][0])
    hits = [pos for pos, piece in enumerate(pieces) if needle in piece.lower()]

    if not hits:
        # Fallback: no token matched, use the whole-sentence mean.
        return token_vectors.mean(dim=0)

    return token_vectors[hits].mean(dim=0)


# Load dataset
with open(DATA_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

# Pair every sample with the ID written to the predictions file.
# A dict input keeps its original string keys. A list input has no keys, so the
# sample's own "id" field is used when present, otherwise its position.
# NOTE(review): confirm which ID the scorer expects for list-shaped inputs.
if isinstance(data, dict):
    samples = list(data.items())
else:
    samples = [(str(sample.get("id", idx)), sample) for idx, sample in enumerate(data)]

predictions = []
for key, sample in samples:
    story = build_story(sample)
    example_sentence = sample["example_sentence"]
    homonym = sample["homonym"]

    # Get target word embeddings
    emb_story = get_target_embedding(story, homonym)
    emb_example = get_target_embedding(example_sentence, homonym)

    # Compute cosine similarity between the two embeddings
    sim = F.cosine_similarity(emb_story, emb_example, dim=0).item()

    # The training cell regresses the similarity onto (average - 1) / 4, so
    # the inverse mapping back to the 1..5 scale is 4 * sim + 1. Mapping the
    # full cosine range (-1..1) onto 1..5 instead would squeeze every
    # non-negative similarity into 3..5.
    score = sim * 4 + 1
    score_int = min(5, max(1, int(round(score))))  # round, then clamp to [1,5]

    predictions.append({"id": key, "prediction": score_int})

# Save predictions.jsonl (one JSON object per line)
os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)
with open(OUT_PATH, "w", encoding="utf-8") as f:
    for p in predictions:
        f.write(json.dumps(p) + "\n")

print(f"✅ Saved {len(predictions)} predictions to {OUT_PATH}")

âœ… Saved 588 predictions to input/res/predictions.jsonl


In [None]:
# Run the uploaded scoring script on the predictions. The arguments passed are the
# reference file, the predictions file, and the path for the scores JSON
# (presumably in that order — the script itself is not shown here; TODO confirm).
!python3 scoring.py input/ref/solution.jsonl input/res/predictions.jsonl output/scores.json

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file input/res/predictions.jsonl on input/ref/solution.jsonl
----------
Spearman Correlation: 0.4009626490796731
Spearman p-Value: 4.054177562458891e-24
----------
Accuracy: 0.5629251700680272 (331/588)
Results dumped into scores.json successfully.


In [None]:
# Package the predictions for submission. -j ("junk paths") stores the file
# without its directory, so the archive contains predictions.jsonl at its root.
!zip -j my_submission.zip input/res/predictions.jsonl
from google.colab import files
# Trigger a browser download of the archive.
files.download("my_submission.zip")

  adding: predictions.jsonl (deflated 91%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

-----------------------------------------------------------