In [None]:
!pip install transformers datasets

In [None]:
pip install -q sentence-transformers scipy

In [None]:
# create Codabench-style folders
!mkdir -p input/ref input/res output
# move the reference (ground truth) file
!mkdir -p input/ref input/res output
!mv solution.jsonl input/ref/solution.jsonl
!mv predictions.jsonl input/res/predictions.jsonl
!echo "✅ Folder structure:"
!tree -L 2

mv: cannot stat 'solution.jsonl': No such file or directory
mv: cannot stat 'predictions.jsonl': No such file or directory
✅ Folder structure:
/bin/bash: line 1: tree: command not found


In [None]:
# Run the uploaded scoring script. Judging by its log output, the arguments are
# <reference file> <predictions file> <scores output file>.
!python3 scoring.py input/ref/solution.jsonl input/res/predictions.jsonl output/scores.json

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file input/res/predictions.jsonl on input/ref/solution.jsonl
----------
Spearman Correlation: 0.04940429751709947
Spearman p-Value: 0.2316295121977508
----------
Accuracy: 0.4608843537414966 (271/588)
Results dumped into scores.json successfully.


In [None]:
from google.colab import files

# Open Colab's file chooser (several files can be selected at once) and keep
# the returned mapping so the uploaded file names can be listed.
uploaded = files.upload()
print("Uploaded:", [name for name in uploaded.keys()])

Saving evaluate.py to evaluate.py
Saving format_check.py to format_check.py
Saving predictions.jsonl to predictions.jsonl
Saving scoring.py to scoring.py
Saving solution.jsonl to solution.jsonl
Uploaded: ['evaluate.py', 'format_check.py', 'predictions.jsonl', 'scoring.py', 'solution.jsonl']


In [None]:
# create folders
import os
os.makedirs("data", exist_ok=True)
os.makedirs("input/ref", exist_ok=True)
os.makedirs("input/res", exist_ok=True)
os.makedirs("output", exist_ok=True)

# If you don't have data files in the workspace, upload them now:
print("If you already uploaded data/dev.json and data/train.json, ignore the upload prompt.")
from google.colab import files
uploaded = files.upload()  # use the file chooser to upload train.json and dev.json if needed
print("Uploaded:", list(uploaded.keys()))

# If you uploaded a solution.jsonl file here and want it in input/ref:
if "solution.jsonl" in uploaded:
    os.replace("solution.jsonl", "input/ref/solution.jsonl")
    print("Moved solution.jsonl -> input/ref/solution.jsonl")

print("\nCurrent data folder contents:")
!ls -la data || true
print("\ninput/ref contents:")
!ls -la input/ref || true
print("\ninput/res contents:")
!ls -la input/res || true

If you already uploaded data/dev.json and data/train.json, ignore the upload prompt.


Saving dev.json to dev.json
Saving train.json to train.json
Uploaded: ['dev.json', 'train.json']

Current data folder contents:
total 8
drwxr-xr-x 2 root root 4096 Oct 31 18:40 .
drwxr-xr-x 1 root root 4096 Oct 31 18:40 ..

input/ref contents:
total 8
drwxr-xr-x 2 root root 4096 Oct 31 18:39 .
drwxr-xr-x 4 root root 4096 Oct 31 18:39 ..

input/res contents:
total 8
drwxr-xr-x 2 root root 4096 Oct 31 18:39 .
drwxr-xr-x 4 root root 4096 Oct 31 18:39 ..


In [None]:
# STEP 3 — Train or load model (token-level cosine sim between story-word and example-word)
import os, json, pandas as pd, numpy as np, torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from scipy.stats import spearmanr
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F # Import F for torch.nn.functional alias

# Directory the fine-tuned model and tokenizer are saved to and, when it is
# already populated, loaded from instead of training again.
MODEL_DIR = "./sbert-ambi"
# Training samples (JSON) read when no saved model is found.
TRAIN_FILE = "data/train.json"
# Development samples (JSON); not referenced elsewhere in this cell.
DEV_FILE = "data/dev.json"

# --- Helper: build story (from your original code) ---
def build_story(row):
    """Join precontext, sentence and ending of one sample into a single story string.

    Args:
        row: Mapping-like sample (a dict or a pandas row) that supports ``.get``
            and may contain ``"precontext"``, ``"sentence"`` and ``"ending"``.

    Returns:
        The non-empty parts joined by single spaces, in that order.
    """
    def _clean(value):
        # Missing fields arrive as None (JSON null) or NaN (pandas); str() would
        # otherwise leak the literal words "None"/"nan" into the model input.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    # NOTE: an earlier version built a "[TGT] homonym [TGT]"-marked sentence here
    # but never used it, so the story has always contained the plain sentence.
    # That output is kept; only the dead (and crash-prone) computation is gone.
    fields = (row.get("precontext", ""), row.get("sentence", ""), row.get("ending", ""))
    return " ".join(part for part in map(_clean, fields) if part)

# --- Load model or prepare for training ---
if os.path.isdir(MODEL_DIR) and any(os.scandir(MODEL_DIR)):
    # A non-empty model directory is treated as a previously saved model: reuse it.
    print(f"Found saved model at {MODEL_DIR}, loading it (no training).")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
    base_model = AutoModel.from_pretrained(MODEL_DIR)
else:
    # --- Load and prepare data ---
    with open(TRAIN_FILE, "r") as f:
        data = json.load(f)

    # The JSON may be a dict of samples or already a list of samples.
    if isinstance(data, dict):
        records = list(data.values())
    else:
        records = data
    df = pd.DataFrame(records)

    # Hold out 10% for validation (fixed seed), then renumber the rows from 0.
    train_df, val_df = (
        part.reset_index(drop=True)
        for part in train_test_split(df, test_size=0.1, random_state=42)
    )

    # Add the story text and the rating rescaled via (average - 1) / 4 to both splits.
    for split_df in (train_df, val_df):
        split_df["story"] = split_df.apply(build_story, axis=1)
        split_df["average_norm"] = (split_df["average"] - 1.0) / 4.0

    # --- Model and tokenizer ---
    model_name = "bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    base_model = AutoModel.from_pretrained(model_name)

    # --- Custom Dataset ---
    class WordPairDataset(Dataset):
        """Dataset over a DataFrame that yields one story/example pair per row."""

        # Text columns copied unchanged from the row into each item, in this order.
        _TEXT_COLUMNS = ("story", "example_sentence", "homonym")

        def __init__(self, df):
            self.data = df

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            # Positional lookup, so the frame's index labels do not matter.
            row = self.data.iloc[idx]
            item = {column: row[column] for column in self._TEXT_COLUMNS}
            item["label"] = torch.tensor(float(row["average_norm"]), dtype=torch.float)
            return item

    train_dataset = WordPairDataset(train_df)
    val_dataset = WordPairDataset(val_df)

    # Seed torch so the shuffling order and dropout are repeatable between runs
    # (same value as the random_state used for the train/validation split).
    torch.manual_seed(42)

    # --- STABILITY FIX: REDUCED BATCH SIZE ---
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=4)

    # --- ADDED: Validation DataLoader ---
    val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=4)

    # --- Custom Training Loop ---
    # Move the model to its device first and only then build the optimizer, so
    # the optimizer is constructed over the parameters as they live on that device.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    base_model.to(device)
    optimizer = torch.optim.AdamW(base_model.parameters(), lr=3e-5)

    # --- Helper function for target word embedding ---
    def get_target_embedding(text, homonym):
        """Return the hidden state of the homonym's token in `text`.

        The token is located by an exact match against the tokenizer's tokens,
        or else as the first token that starts with the homonym
        (case-insensitive). When nothing matches, or the lookup raises, the mean
        of all token states is returned instead.
        """
        encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
        # Gradients are only needed while fine-tuning; skip them in eval mode.
        if base_model.training:
            outputs = base_model(**encoded)
        else:
            with torch.no_grad():
                outputs = base_model(**encoded)

        hidden = outputs.last_hidden_state
        tokens = tokenizer.convert_ids_to_tokens(encoded["input_ids"][0])

        try:
            if homonym in tokens:
                position = tokens.index(homonym)
            else:
                needle = homonym.lower()
                position = next(
                    (i for i, tok in enumerate(tokens) if tok.lower().startswith(needle)),
                    None,
                )
            if position is not None:
                return hidden[0, position, :]
        except Exception:
            # Any lookup problem falls through to the pooled fallback below.
            pass

        # Fallback: mean over the token dimension.
        return hidden.mean(dim=1).squeeze()

    # --- Training ---
    print("🚀 Starting training on target-word embeddings...")

    # --- STABILITY FIX: CUDA MEMORY CLEAR ---
    if device.type == 'cuda':
        torch.cuda.empty_cache()
        print("✅ CUDA memory cleared.")

    for epoch in range(4):
        base_model.train()  # training mode: get_target_embedding keeps gradients
        total_loss = 0
        for batch in train_dataloader:
            optimizer.zero_grad()
            losses = []

            # Each sample is encoded on its own, so the batch loss is assembled by hand.
            for i in range(len(batch["story"])):
                emb_story = get_target_embedding(batch["story"][i], batch["homonym"][i])
                emb_examp = get_target_embedding(batch["example_sentence"][i], batch["homonym"][i])

                # Squared error between the cosine similarity and the normalized rating.
                cos_sim = F.cosine_similarity(emb_story, emb_examp, dim=0)
                loss = (cos_sim - batch["label"][i].to(device)) ** 2
                losses.append(loss)

            loss_batch = torch.stack(losses).mean()
            loss_batch.backward()
            optimizer.step()
            total_loss += loss_batch.item()

        print(f"Epoch {epoch+1}: Avg training loss = {total_loss/len(train_dataloader):.4f}")

        # --- ADDED: Validation Loop ---
        base_model.eval()  # evaluation mode: get_target_embedding runs under no_grad
        all_labels = []
        all_sims = []

        for batch in val_dataloader:
            all_labels.extend(batch["label"].numpy())

            for i in range(len(batch["story"])):
                emb_story = get_target_embedding(batch["story"][i], batch["homonym"][i])
                emb_examp = get_target_embedding(batch["example_sentence"][i], batch["homonym"][i])
                all_sims.append(F.cosine_similarity(emb_story, emb_examp, dim=0).item())

        # Calculate metrics
        labels_np = np.array(all_labels)
        sims_np = np.array(all_sims)

        spearman_corr = spearmanr(labels_np, sims_np).correlation

        # Convert labels and predictions back to the 1-5 rating scale. The model is
        # trained so that cos_sim ~= (average - 1) / 4, hence the inverse mapping is
        # sim * 4 + 1. Treating the similarity as spanning [-1, 1] instead would
        # turn sim = 0 (the training target for a rating of 1) into a rating of 3.
        labels_scaled = (labels_np * 4.0) + 1.0
        sims_scaled = np.clip(sims_np, 0.0, 1.0) * 4.0 + 1.0

        # Accuracy: share of predictions within 1.0 of the true score
        # (a fixed 1.0 threshold, not a standard deviation).
        acc_std = np.mean(np.abs(sims_scaled - labels_scaled) <= 1.0)

        print(f"Epoch {epoch+1} Validation — Spearman: {spearman_corr:.4f}, Acc_within_1.0: {acc_std:.4f}")
        # --- End of Validation Loop ---

    # --- Save model ---
    os.makedirs(MODEL_DIR, exist_ok=True)
    base_model.save_pretrained(MODEL_DIR)
    tokenizer.save_pretrained(MODEL_DIR)
    print("✅ Training finished and model saved to", MODEL_DIR)

print("✅ Model ready:", MODEL_DIR)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🚀 Starting training on target-word embeddings...
✅ CUDA memory cleared.
Epoch 1: Avg training loss = 0.0721
Epoch 1 Validation — Spearman: 0.6231, Acc_within_1.0: 0.5000


In [None]:
# STEP 4 — Generate predictions.jsonl using your trained word-level model
import os, json, torch, numpy as np
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F # Import F for torch.nn.functional alias

# Locations: fine-tuned model directory, samples to score, prediction output.
MODEL_PATH = "./sbert-ambi"
DATA_PATH = "data/dev.json"  # switch to data/test.json when submitting
OUT_PATH = "input/res/predictions.jsonl"

# Pick the GPU when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Restore the tokenizer and the model, then switch the model to evaluation mode.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModel.from_pretrained(MODEL_PATH)
model = model.to(device)
model.eval()

# Helper: build story text (same as before)
def build_story(row):
    """Join precontext, sentence and ending of one sample into a single story string.

    This must match the structure used in training: plain text parts, in the
    order precontext, sentence, ending, separated by single spaces.

    Args:
        row: Mapping-like sample that supports ``.get`` and may contain
            ``"precontext"``, ``"sentence"`` and ``"ending"``.

    Returns:
        The non-empty parts joined by single spaces.
    """
    def _clean(value):
        # Missing fields arrive as None (JSON null) or NaN; str() would otherwise
        # leak the literal words "None"/"nan" into the model input.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    # NOTE: an earlier version built a "[TGT] homonym [TGT]"-marked sentence here
    # but never used it, so the story has always contained the plain sentence.
    # That output is kept; only the dead (and crash-prone) computation is gone.
    fields = (row.get("precontext", ""), row.get("sentence", ""), row.get("ending", ""))
    return " ".join(part for part in map(_clean, fields) if part)

# Function to extract contextual embedding for the target word (MUST match Step 3 logic!)
def get_target_embedding(text, homonym):
    """Return the hidden state of the homonym's token in `text` (inference only).

    Token selection follows the Step 3 training helper: an exact match against
    the tokenizer's tokens first, otherwise the first token that *starts with*
    the homonym (case-insensitive). The previous substring test (`in`) could
    pick a token that merely contains the homonym, i.e. a different position
    than the one the model was fine-tuned on.

    Args:
        text: Text to encode (truncated to the tokenizer's limit).
        homonym: Target word to locate among the tokens.

    Returns:
        The last-layer hidden state at the selected token position, or the mean
        over all token positions when the homonym cannot be located.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    idx = None
    # A non-string homonym cannot be matched; it takes the mean-pooling fallback,
    # which is what the former blanket `except Exception` amounted to.
    if isinstance(homonym, str):
        if homonym in tokens:
            idx = tokens.index(homonym)
        else:
            target = homonym.lower()
            idx = next(
                (i for i, token in enumerate(tokens) if token.lower().startswith(target)),
                None,
            )

    if idx is None:
        # Fallback to the mean embedding if no token matches.
        return hidden.mean(dim=1).squeeze()
    return hidden[0, idx, :]


# Load dataset
with open(DATA_PATH, "r") as f:
    data = json.load(f)

records = list(data.values()) if isinstance(data, dict) else data

# Pair every sample with the ID written to the predictions file. Dict input
# keeps its original string keys. List input previously crashed on
# `data.items()`; its position is used as a stand-in ID.
# NOTE(review): confirm the ID scheme the scorer expects for list-shaped files.
if isinstance(data, dict):
    keyed_samples = list(data.items())
else:
    keyed_samples = [(str(position), sample) for position, sample in enumerate(records)]

predictions = []
for key, sample in keyed_samples:
    story = build_story(sample)
    example_sentence = sample["example_sentence"]
    homonym = sample["homonym"]

    # Get target word embeddings
    emb_story = get_target_embedding(story, homonym)
    emb_example = get_target_embedding(example_sentence, homonym)

    # Compute cosine similarity between the two embeddings
    sim = F.cosine_similarity(emb_story, emb_example, dim=0).item()

    # Map similarity back to the 1..5 rating scale. Training (Step 3) fits the
    # cosine similarity to (average - 1) / 4, so the inverse is sim * 4 + 1.
    # Treating sim as spanning [-1, 1] would shift every prediction upwards
    # (sim = 0 would become 3 instead of 1).
    score = sim * 4 + 1
    score_int = int(round(score))
    score_int = min(5, max(1, score_int))  # clamp to [1,5]

    predictions.append({"id": key, "prediction": score_int})

# Save predictions.jsonl (one JSON object per line)
out_dir = os.path.dirname(OUT_PATH)
if out_dir:
    # os.makedirs("") raises, so only create a folder when OUT_PATH has one.
    os.makedirs(out_dir, exist_ok=True)
with open(OUT_PATH, "w") as f:
    for p in predictions:
        f.write(json.dumps(p) + "\n")

print(f"✅ Saved {len(predictions)} predictions to {OUT_PATH}")

✅ Saved 588 predictions to input/res/predictions.jsonl


In [None]:
# Score the freshly written predictions. Judging by the script's log output,
# the arguments are <reference file> <predictions file> <scores output file>.
!python3 scoring.py input/ref/solution.jsonl input/res/predictions.jsonl output/scores.json

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file input/res/predictions.jsonl on input/ref/solution.jsonl
----------
Spearman Correlation: 0.3700167855979693
Spearman p-Value: 1.6185365455525804e-20
----------
Accuracy: 0.5340136054421769 (314/588)
Results dumped into scores.json successfully.


In [None]:
# Package the predictions for submission; with -j the file is stored without
# its folder path (the archive lists it as plain predictions.jsonl).
!zip -j my_submission.zip input/res/predictions.jsonl
# Hand the archive to Colab's download helper.
from google.colab import files
files.download("my_submission.zip")

  adding: predictions.jsonl (deflated 91%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

-----------------------------------------------------------