In [1]:
!pip install -q transformers datasets accelerate

import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import KLDivLoss

from datasets import Dataset, concatenate_datasets
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoConfig,
    PreTrainedModel,
    Trainer,
    TrainingArguments,
    TrainerCallback
)
from scipy.stats import spearmanr
from tqdm import tqdm

# Report the runtime after the imports above succeeded: the installed
# PyTorch build and whether torch can see a CUDA device.
print("‚úÖ All imports successful!")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

‚úÖ All imports successful!
PyTorch version: 2.9.0+cu126
CUDA available: True


In [2]:
from google.colab import files
import os

print("üìÅ Please upload your train_aug.json and dev.json files:")
uploaded = files.upload()

# Create data directory
os.makedirs("data", exist_ok=True)

# Move uploaded files
for filename in uploaded.keys():
    if filename.endswith('.json'):
        os.rename(filename, f"data/{filename}")
        print(f"‚úÖ Moved {filename} to data/")

print("\nüìä Files in data directory:")
!ls -lh data/

üìÅ Please upload your train_aug.json and dev.json files:


Saving dev.json to dev.json
Saving train2.json to train2.json
‚úÖ Moved dev.json to data/
‚úÖ Moved train2.json to data/

üìä Files in data directory:
total 4.1M
-rw-r--r-- 1 root root 503K Dec  3 13:39 dev.json
-rw-r--r-- 1 root root 3.6M Dec  3 13:39 train2.json


In [3]:
def make_soft_labels_empirical(choices):
    """
    Turn raw annotator ratings into an empirical probability distribution.

    Args:
        choices: iterable of integer ratings on the 1-5 scale.

    Returns:
        np.ndarray of shape (5,) and dtype float32 whose entry k is the
        fraction of ratings equal to k + 1. If `choices` is empty there is
        nothing to normalise, so a uniform distribution is returned instead
        of the NaNs a 0/0 division would produce.

    Raises:
        ValueError: if a rating lies outside 1-5 (a rating of 0 would
            otherwise wrap around to the last bin via negative indexing).
    """
    counts = np.zeros(5, dtype=np.float32)
    for rating in choices:
        if not 1 <= rating <= 5:
            raise ValueError(f"rating must be between 1 and 5, got {rating!r}")
        counts[rating - 1] += 1  # ratings 1-5 map to bins 0-4

    total = counts.sum()
    if total == 0:
        # No ratings at all: fall back to "no information" rather than NaN.
        return np.full(5, 0.2, dtype=np.float32)

    return counts / total


def make_soft_labels_adaptive(avg, stdev, choices):
    """
    Build a Gaussian-shaped soft label over the ratings 1-5, centred on `avg`.

    The width (sigma) depends on how much the annotators agree:
      * one distinct rating                      -> sigma = 0.3 (very sharp)
      * two distinct ratings and stdev < 1.0     -> sigma = 0.5 (sharp)
      * anything else                            -> stdev clamped to [0.4, 1.5]

    Args:
        avg: mean rating; used as the centre of the Gaussian.
        stdev: standard deviation of the ratings.
        choices: the individual ratings; only the number of distinct values
            is used.

    Returns:
        np.ndarray of shape (5,) that sums to 1.
    """
    centers = np.arange(1, 6, dtype=np.float32)
    distinct_ratings = len(set(choices))

    if distinct_ratings == 1:
        sigma = 0.3
    elif distinct_ratings == 2 and stdev < 1.0:
        sigma = 0.5
    else:
        # Clamp so the distribution is neither a spike nor almost flat.
        sigma = min(max(stdev, 0.4), 1.5)

    weights = np.exp(-0.5 * ((centers - avg) / sigma) ** 2)
    return weights / weights.sum()

# ============================================================
# Helper: Mark target word with [TGT] markers
# ============================================================
def mark_target_word(text, homonym):
    """
    Prefix every occurrence of `homonym` in `text` with the marker "[TGT] ".

    Matching is a plain, case-sensitive substring search (str.replace), so
    occurrences inside longer words are marked as well.

    Args:
        text: the story text.
        homonym: the target word to mark.

    Returns:
        The text with "[TGT] " inserted before each occurrence. If `homonym`
        is empty the text is returned unchanged: replacing the empty string
        would otherwise insert the marker between every character.
    """
    if not homonym:
        return text
    return text.replace(homonym, f"[TGT] {homonym}")

# Smoke-test the empirical label helper: one rating of each value should
# give a uniform distribution (0.2 per bin).
print("‚úÖ Helper functions defined!")
print("\nExample empirical distribution:")
print("  Choices: [4, 5, 2, 3, 1]")
print("  Distribution:", make_soft_labels_empirical([4, 5, 2, 3, 1]))

‚úÖ Helper functions defined!

Example empirical distribution:
  Choices: [4, 5, 2, 3, 1]
  Distribution: [0.2 0.2 0.2 0.2 0.2]


In [9]:
# ============================================================
# Load AmbiStory Dataset
# ============================================================
TRAIN_FILE = "data/train2.json"
DEV_FILE = "data/dev.json"


def _read_json(path):
    """Parse the JSON file at `path` and return the decoded object."""
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def _as_record_list(payload):
    """Return the samples as a list, whether the file held a list or a dict of samples."""
    return payload if isinstance(payload, list) else list(payload.values())


train_data = _read_json(TRAIN_FILE)
dev_data = _read_json(DEV_FILE)

train_list = _as_record_list(train_data)
dev_list = _as_record_list(dev_data)

print(f"üìä Loaded {len(train_list)} training samples")
print(f"üìä Loaded {len(dev_list)} dev samples")


# DEBUG: Find the problematic field
def check_types(data_list):
    """
    Print every field whose values do not all share one Python type.

    For each such field, each observed type is listed with how many rows
    carry it, the index of the first such row, and that row's value.

    Args:
        data_list: list of dict records to inspect.
    """
    # field name -> {type name -> indices of the rows holding that type}
    rows_by_field_type = {}
    for row_idx, record in enumerate(data_list):
        for field, value in record.items():
            per_type = rows_by_field_type.setdefault(field, {})
            per_type.setdefault(type(value).__name__, []).append(row_idx)

    print("\nüîç Checking for mixed types:")
    for field, per_type in rows_by_field_type.items():
        if len(per_type) <= 1:
            continue  # consistent field, nothing to report
        print(f"\n‚ùå Field '{field}' has mixed types:")
        for type_name, row_indices in per_type.items():
            first_idx = row_indices[0]
            print(f"   {type_name}: {len(row_indices)} occurrences (first at index {first_idx})")
            if first_idx < len(data_list):
                print(f"      Example value: {data_list[first_idx][field]}")

# Diagnostic only: list the training fields whose values have mixed Python
# types before the records are normalised by clean_data.
check_types(train_list)


def _text_or_empty(value):
    """Return str(value), mapping None to the empty string instead of 'None'."""
    return '' if value is None else str(value)


def _collapse_flag(value):
    """Reduce a single flag or a list/tuple of flags to one bool (True if any is set)."""
    if isinstance(value, (list, tuple)):
        # bool() of a non-empty list is always True, even for [False, False];
        # look at the elements instead.
        return any(bool(v) for v in value)
    return bool(value)


def clean_data(data_list):
    """
    Normalise every record to one fixed schema so Arrow sees consistent types.

    Args:
        data_list: list of dict records as loaded from JSON.

    Returns:
        A new list of dicts with the keys id (int), homonym, judged_meaning,
        precontext, sentence, ending, example_sentence (str), choices (list),
        nonsensical (bool), average and stdev (float). Missing or None values
        become 0, '', [], False or 0.0 respectively. A list-valued
        `nonsensical` is reduced to True when at least one entry is truthy.
    """
    cleaned = []
    for item in data_list:
        raw_id = item.get('id')
        raw_choices = item.get('choices')
        raw_average = item.get('average')
        raw_stdev = item.get('stdev')

        cleaned.append({
            'id': int(raw_id) if raw_id is not None else 0,
            'homonym': _text_or_empty(item.get('homonym')),
            'judged_meaning': _text_or_empty(item.get('judged_meaning')),
            'precontext': str(item.get('precontext') or ''),
            'sentence': _text_or_empty(item.get('sentence')),
            'ending': str(item.get('ending') or ''),
            'choices': list(raw_choices) if isinstance(raw_choices, list) else [],
            'nonsensical': _collapse_flag(item.get('nonsensical', False)),
            'average': float(raw_average) if raw_average is not None else 0.0,
            'stdev': float(raw_stdev) if raw_stdev is not None else 0.0,
            'example_sentence': str(item.get('example_sentence') or ''),
        })
    return cleaned


def preprocess_ambistory(row):
    """
    Derive the model inputs and the soft label for one cleaned record.

    Args:
        row: record with the keys precontext, sentence, ending, homonym,
            judged_meaning, example_sentence, choices and stdev.

    Returns:
        dict with
          story   - precontext, sentence and ending joined by single spaces,
                    with the homonym marked by mark_target_word;
          example - "Definition: ..." text, extended with the example
                    sentence when one is present;
          labels  - empirical rating distribution from make_soft_labels_empirical;
          stdev   - the record's stdev, passed through unchanged.
    """
    # Story text: the three segments, individually stripped, space-separated.
    segments = (row["precontext"], row["sentence"], row["ending"])
    story = " ".join(segment.strip() for segment in segments).strip()
    story = mark_target_word(story, row["homonym"])

    # Meaning text: always the definition, plus the example when non-blank.
    sample_sentence = row["example_sentence"]
    has_example = bool(sample_sentence) and len(str(sample_sentence).strip()) > 0
    if not has_example:
        example = f"Definition: {row['judged_meaning']}"
    else:
        example = f"Definition: {row['judged_meaning']}. Example: {row['example_sentence']}"

    return {
        "story": story,
        "example": example,
        "labels": make_soft_labels_empirical(row["choices"]),
        "stdev": row["stdev"]
    }

# Normalise the raw records so each column has a single type.
train_list = clean_data(train_list)
dev_list = clean_data(dev_list)

# Wrap the records in Datasets and add the story / example / labels columns.
train_ds = Dataset.from_list(train_list).map(preprocess_ambistory)
dev_ds = Dataset.from_list(dev_list).map(preprocess_ambistory)

print("‚úÖ Preprocessing complete!")

# Show the first training sample as a quick sanity check.
first_sample = train_ds[0]
print("\nExample preprocessed sample:")
print(f"  Story (first 100 chars): {first_sample['story'][:100]}...")
print(f"  Example (first 100 chars): {first_sample['example'][:100]}...")
print(f"  Labels: {first_sample['labels']}")

üìä Loaded 4864 training samples
üìä Loaded 588 dev samples

üîç Checking for mixed types:

‚ùå Field 'average' has mixed types:
   float: 3669 occurrences (first at index 0)
      Example value: 3.0
   int: 1195 occurrences (first at index 2281)
      Example value: 5

‚ùå Field 'nonsensical' has mixed types:
   list: 2280 occurrences (first at index 0)
      Example value: [False, False, False, False, False]
   bool: 2584 occurrences (first at index 2280)
      Example value: False


Map:   0%|          | 0/4864 [00:00<?, ? examples/s]

Map:   0%|          | 0/588 [00:00<?, ? examples/s]

‚úÖ Preprocessing complete!

Example preprocessed sample:
  Story (first 100 chars): The old machine hummed in the corner of the workshop. Clara examined its dusty dials with a furrowed...
  Example (first 100 chars): Definition: the difference in electrical charge between two points in a circuit expressed in volts. ...
  Labels: [0.20000000298023224, 0.20000000298023224, 0.20000000298023224, 0.20000000298023224, 0.20000000298023224]


In [11]:

MODEL_NAME = "microsoft/deberta-v3-large"  # DeBERTa-v3 *large* checkpoint (not the base variant)

print(f"üì• Loading {MODEL_NAME}...")
print("‚è≥ This will take 1-2 minutes...")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print("‚úÖ Tokenizer loaded")

# Config for a 5-way output (one logit per rating 1-5).
# NOTE(review): problem_type is set to "multi_label_classification", but the
# labels built in preprocessing are a single probability distribution and the
# custom model class in this notebook computes its own loss — confirm this
# setting is actually read anywhere.
config = AutoConfig.from_pretrained(
    MODEL_NAME,
    num_labels=5,
    problem_type="multi_label_classification"
)
print("‚úÖ Config loaded")

# Bare encoder without a task head, loaded in full precision.
base_model = AutoModel.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32
)
print("‚úÖ Base model loaded")
# Report the encoder size in millions of parameters.
print(f"   Parameters: {sum(p.numel() for p in base_model.parameters()) / 1e6:.1f}M")

üì• Loading microsoft/deberta-v3-large...
‚è≥ This will take 1-2 minutes...
‚úÖ Tokenizer loaded
‚úÖ Config loaded


pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/874M [00:00<?, ?B/s]

‚úÖ Base model loaded
   Parameters: 434.0M


In [12]:

class ImprovedDebertaModel(PreTrainedModel):
    """
    Encoder backbone with a 5-way linear head over the first token's hidden state.

    With labels, forward() returns a hybrid loss:
        0.7 * KL(labels || softmax(logits)) + 0.3 * MSE(expected score),
    where the expected score is the probability-weighted mean of the
    ratings 1-5. Labels are expected to be per-sample distributions over
    the five ratings.
    """
    config_class = AutoConfig

    def __init__(self, config, base_model):
        super().__init__(config)
        self.backbone = base_model
        self.classifier = nn.Linear(config.hidden_size, 5)
        self.kl_loss = KLDivLoss(reduction="batchmean")

    def _hybrid_loss(self, logits, labels):
        """Return (total, kl, mse) for predicted logits against target distributions."""
        # Distribution-matching term; KLDivLoss expects log-probabilities as input.
        kl_term = self.kl_loss(F.log_softmax(logits, dim=-1), labels)

        # Regression term on the expected rating implied by each distribution.
        rating_values = torch.arange(1, 6, dtype=torch.float32, device=logits.device)
        predicted_score = (F.softmax(logits, dim=-1) * rating_values).sum(dim=-1)
        target_score = (labels * rating_values).sum(dim=-1)
        mse_term = F.mse_loss(predicted_score, target_score)

        return 0.7 * kl_term + 0.3 * mse_term, kl_term, mse_term

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """
        Encode the inputs and score them.

        Returns:
            {"logits"} when labels is None, otherwise
            {"loss", "logits", "kl_loss", "mse_loss"}.
        """
        encoded = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        first_token_state = encoded.last_hidden_state[:, 0]
        logits = self.classifier(first_token_state)

        if labels is None:
            return {"logits": logits}

        total, kl_term, mse_term = self._hybrid_loss(logits, labels)
        return {"loss": total, "logits": logits, "kl_loss": kl_term, "mse_loss": mse_term}

# Initialize model
# Wrap the pretrained encoder with the custom head and move everything to
# the GPU when one is available, otherwise stay on the CPU.
model = ImprovedDebertaModel(config, base_model).to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

print("‚úÖ Model with hybrid loss initialized!")
print(f"   Device: {next(model.parameters()).device}")

‚úÖ Model with hybrid loss initialized!
   Device: cuda:0


In [13]:
# ============================================================
# Tokenize datasets
# ============================================================
def tokenize_function(row):
    """
    Encode story and meaning text together as one sequence pair.

    Args:
        row: a record (or batch) with the keys "story", "example" and "labels".

    Returns:
        The tokenizer output, padded/truncated to 256 tokens, with the
        "labels" entry copied over from `row`.
    """
    encode_options = dict(truncation=True, padding="max_length", max_length=256)
    encoded = tokenizer(row["story"], row["example"], **encode_options)
    encoded["labels"] = row["labels"]
    return encoded

print("üî§ Tokenizing datasets...")
train_tok = train_ds.map(tokenize_function, batched=True)
dev_tok = dev_ds.map(tokenize_function, batched=True)

# Expose only the columns the model's forward() accepts, as torch tensors.
for _tokenized_split in (train_tok, dev_tok):
    _tokenized_split.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

print("‚úÖ Tokenization complete!")
print(f"   Training samples: {len(train_tok)}")
print(f"   Dev samples: {len(dev_tok)}")

üî§ Tokenizing datasets...


Map:   0%|          | 0/4864 [00:00<?, ? examples/s]

Map:   0%|          | 0/588 [00:00<?, ? examples/s]

‚úÖ Tokenization complete!
   Training samples: 4864
   Dev samples: 588


In [14]:
def compute_metrics(eval_pred):
    """
    Score predicted rating distributions against the soft labels.

    Args:
        eval_pred: pair (logits, labels). `logits` may be a tuple, in which
            case its first element is used; both are array-likes of shape
            (n_samples, 5).

    Returns:
        dict with
          spearman    - Spearman correlation between predicted and true
                        expected ratings;
          sd_accuracy - share of samples whose absolute error is within
                        max(stdev, 1.0), with stdev read from the global
                        `dev_ds`; falls back to a fixed 1.0 tolerance when
                        `dev_ds` is unavailable or does not have exactly one
                        row per prediction;
          accuracy_1  - share of samples with absolute error <= 1.0.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]

    logits = np.asarray(logits)
    labels = np.asarray(labels)

    # Numerically stable softmax: subtract the row maximum before exponentiating.
    exp_shifted = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
    probs = exp_shifted / np.sum(exp_shifted, axis=-1, keepdims=True)

    # Expected rating under the predicted and the target distributions.
    centers = np.array([1, 2, 3, 4, 5])
    pred_scores = np.sum(probs * centers, axis=1)
    true_scores = np.sum(labels * centers, axis=1)
    abs_error = np.abs(pred_scores - true_scores)

    spear = spearmanr(pred_scores, true_scores).correlation

    # Fixed +/-1.0 accuracy; also the fallback for the stdev-based variant.
    acc_1 = np.mean(abs_error <= 1.0)

    sd_acc = acc_1
    try:
        # Stdevs are matched to predictions purely by position, so they are
        # only trusted when dev_ds has exactly one row per prediction.
        if len(dev_ds) == len(pred_scores):
            stdevs = np.array([dev_ds[i]["stdev"] for i in range(len(pred_scores))])
            stdevs = np.maximum(stdevs, 1.0)  # tolerance is never below 1.0
            sd_acc = np.mean(abs_error <= stdevs)
    except (NameError, KeyError, IndexError, TypeError):
        # dev_ds missing or without a usable "stdev" column: keep the fixed
        # tolerance. Any other error is a real bug and is left to propagate.
        pass

    return {
        "spearman": float(spear),
        "sd_accuracy": float(sd_acc),
        "accuracy_1": float(acc_1)
    }

# Status message only; compute_metrics is wired into the Trainer later.
print("‚úÖ Metrics defined (FIXED)!")

‚úÖ Metrics defined (FIXED)!


In [15]:

training_args = TrainingArguments(
    output_dir="./model_output",
    num_train_epochs=10,
    # 4 samples per step x 4 accumulation steps = effective batch of 16
    # per device (the value printed below).
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=1e-5,  # low learning rate chosen for the 10-epoch schedule
    weight_decay=0.01,
    warmup_steps=100,

    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,

    fp16=True,

    # Model selection: reload the checkpoint with the highest "sd_accuracy"
    # (a key returned by compute_metrics) when training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="sd_accuracy",
    greater_is_better=True,

    report_to="none",  # Disable wandb
    save_total_limit=2,  # Keep only 2 checkpoints
)

print("‚úÖ Training arguments configured!")
print(f"   Total epochs: {training_args.num_train_epochs}")
# Effective batch size = per-device batch size x gradient accumulation steps.
print(f"   Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"   Learning rate: {training_args.learning_rate}")

‚úÖ Training arguments configured!
   Total epochs: 10
   Effective batch size: 16
   Learning rate: 1e-05


In [16]:
# ============================================================
# Initialize Trainer
# ============================================================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=dev_tok,
    # `tokenizer=` is deprecated on Trainer (it triggered the warning
    # previously emitted at this call); `processing_class=` is its
    # replacement and still saves the tokenizer alongside checkpoints.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

print("‚úÖ Trainer initialized!")
print("\n" + "="*70)
print("üöÄ STARTING TRAINING")
print("="*70)
print(f"Training samples: {len(train_tok)}")
print(f"Validation samples: {len(dev_tok)}")
print(f"Expected time: ~60-90 minutes")
print("="*70 + "\n")

# Run fine-tuning; evaluation and checkpointing follow training_args.
trainer.train()

print("\n" + "="*70)
print("‚úÖ TRAINING COMPLETE!")
print("="*70)

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


‚úÖ Trainer initialized!

üöÄ STARTING TRAINING
Training samples: 4864
Validation samples: 588
Expected time: ~60-90 minutes



Epoch,Training Loss,Validation Loss,Spearman,Sd Accuracy,Accuracy 1
1,0.7003,0.679467,0.649815,0.777211,0.729592
2,0.5662,0.660313,0.68709,0.790816,0.715986
3,0.4624,0.644148,0.691943,0.826531,0.767007
4,0.3805,0.659011,0.697443,0.795918,0.736395
5,0.3295,0.746759,0.690922,0.784014,0.712585
6,0.3124,0.656226,0.70776,0.812925,0.760204
7,0.2591,0.715672,0.705563,0.80102,0.736395
8,0.2357,0.678161,0.7037,0.807823,0.756803
9,0.2259,0.701627,0.701149,0.804422,0.738095
10,0.2107,0.71416,0.699015,0.804422,0.734694



‚úÖ TRAINING COMPLETE!


In [17]:
# ============================================================
# Save the best model
# ============================================================
MODEL_SAVE_DIR = "./best_model"

print("üíæ Saving best model...")
# Persist the model currently held by the trainer, plus the tokenizer, so
# the directory is self-contained.
# NOTE(review): this is the best checkpoint only if load_best_model_at_end
# restored it after training — confirm.
trainer.save_model(MODEL_SAVE_DIR)
tokenizer.save_pretrained(MODEL_SAVE_DIR)

# Save training info
# NOTE(review): "batch_size" records the per-device batch size, not the
# effective size after gradient accumulation.
# NOTE(review): "Adaptive smoothing based on agreement" is listed, but
# preprocess_ambistory builds labels with make_soft_labels_empirical and
# make_soft_labels_adaptive is not called in the visible cells — confirm the
# entry is accurate.
training_info = {
    "model_name": MODEL_NAME,
    "num_epochs": training_args.num_train_epochs,
    "learning_rate": training_args.learning_rate,
    "batch_size": training_args.per_device_train_batch_size,
    "modifications": [
        "Empirical soft labels from actual ratings",
        "Rich meaning representation (definition + example)",
        "Hybrid loss (KL + MSE)",
        "Adaptive smoothing based on agreement",
        "Optimized for accuracy metric"
    ]
}

with open(f"{MODEL_SAVE_DIR}/training_info.json", "w") as f:
    json.dump(training_info, f, indent=2)

print(f"‚úÖ Model saved to: {MODEL_SAVE_DIR}")
print("\nüìä Final metrics on validation set:")

# Re-evaluate on the validation set and print only the eval_* entries.
final_metrics = trainer.evaluate()
for key, value in final_metrics.items():
    if key.startswith("eval_"):
        print(f"   {key}: {value:.4f}")

üíæ Saving best model...
‚úÖ Model saved to: ./best_model

üìä Final metrics on validation set:


   eval_loss: 0.6441
   eval_spearman: 0.6919
   eval_sd_accuracy: 0.8265
   eval_accuracy_1: 0.7670
   eval_runtime: 13.6821
   eval_samples_per_second: 42.9760
   eval_steps_per_second: 10.7440


In [18]:

from safetensors.torch import load_file
print("üîÆ Generating predictions...")

# Rebuild the architecture and restore the weights saved in MODEL_SAVE_DIR.
model = ImprovedDebertaModel(config, base_model)
state_dict = load_file(f"{MODEL_SAVE_DIR}/model.safetensors")
model.load_state_dict(state_dict)
model = model.to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()

all_predictions = []

# Rating value of each output bin; constant, so built once outside the loop.
centers = np.array([1, 2, 3, 4, 5])

with torch.no_grad():
    for i in tqdm(range(len(dev_tok)), desc="Predicting"):
        # Fetch the row once; each dev_tok[i] access converts the whole row.
        row = dev_tok[i]
        sample = {
            "input_ids": row["input_ids"].unsqueeze(0).to(model.device),
            "attention_mask": row["attention_mask"].unsqueeze(0).to(model.device)
        }

        outputs = model(**sample)
        logits = outputs["logits"]
        probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]

        # Expected rating, rounded to the nearest integer and kept within 1-5.
        pred_score = (probs * centers).sum()
        pred_class = int(np.clip(np.round(pred_score), 1, 5))

        all_predictions.append(pred_class)

print(f"‚úÖ Generated {len(all_predictions)} predictions")

üîÆ Generating predictions...



Predicting:   0%|          | 0/588 [00:00<?, ?it/s][A
Predicting:   0%|          | 1/588 [00:00<01:57,  5.00it/s][A
Predicting:   0%|          | 2/588 [00:00<03:09,  3.09it/s][A
Predicting:   1%|          | 3/588 [00:00<02:12,  4.42it/s][A
Predicting:   1%|          | 4/588 [00:00<01:52,  5.21it/s][A
Predicting:   1%|          | 6/588 [00:01<01:17,  7.50it/s][A
Predicting:   1%|‚ñè         | 8/588 [00:01<01:04,  8.96it/s][A
Predicting:   2%|‚ñè         | 10/588 [00:01<00:58,  9.91it/s][A
Predicting:   2%|‚ñè         | 12/588 [00:01<00:54, 10.52it/s][A
Predicting:   2%|‚ñè         | 14/588 [00:01<00:52, 10.87it/s][A
Predicting:   3%|‚ñé         | 16/588 [00:01<00:51, 11.03it/s][A
Predicting:   3%|‚ñé         | 18/588 [00:02<00:50, 11.35it/s][A
Predicting:   3%|‚ñé         | 20/588 [00:02<00:49, 11.54it/s][A
Predicting:   4%|‚ñé         | 22/588 [00:02<00:48, 11.63it/s][A
Predicting:   4%|‚ñç         | 24/588 [00:02<00:48, 11.70it/s][A
Predicting:   4%|‚ñç         | 26/58

‚úÖ Generated 588 predictions





In [1]:

import os

# Writes one JSON object per line: {"id": "<position>", "prediction": <1-5>}.
# This cell depends on `all_predictions` (and on `json` / `np`) from the
# earlier cells; run in a fresh kernel it fails with the NameError recorded
# in its output.
OUTPUT_DIR = "input/res"
os.makedirs(OUTPUT_DIR, exist_ok=True)

output_path = f"{OUTPUT_DIR}/predictions.jsonl"

with open(output_path, "w") as f:
    # NOTE(review): the id written is the position in all_predictions, not
    # the sample's own id — confirm this matches the ids the scorer expects.
    for idx, pred in enumerate(all_predictions):
        record = {"id": str(idx), "prediction": pred}
        f.write(json.dumps(record) + "\n")

print(f"‚úÖ Predictions saved to: {output_path}")

# Show distribution of predictions
unique, counts = np.unique(all_predictions, return_counts=True)
print("\nüìä Prediction distribution:")
for score, count in zip(unique, counts):
    print(f"   Score {score}: {count} samples ({count/len(all_predictions)*100:.1f}%)")

# Download the file
print("\nüì• Downloading predictions file...")
from google.colab import files
files.download(output_path)

NameError: name 'all_predictions' is not defined

In [None]:

# Optional local evaluation: upload solution.jsonl, move it to input/ref/,
# run scoring.py against the predictions file and print the scores.
# Relies on `files`, `os` and `json` from earlier cells.
print("Upload solution.jsonl if you want to evaluate:")
uploaded_sol = files.upload()

if "solution.jsonl" in uploaded_sol:
    os.makedirs("input/ref", exist_ok=True)
    os.rename("solution.jsonl", "input/ref/solution.jsonl")

    # Run scoring (assuming you have scoring.py)
    # NOTE(review): the output/ directory is not created here — confirm
    # scoring.py creates it before writing output/scores.json.
    !python scoring.py input/ref/solution.jsonl input/res/predictions.jsonl output/scores.json

    # Load and display results (skipped silently if no scores file was written)
    if os.path.exists("output/scores.json"):
        with open("output/scores.json") as f:
            scores = json.load(f)

        print("\n" + "="*70)
        print("üèÜ FINAL RESULTS")
        print("="*70)
        print(f"Spearman Correlation: {scores.get('spearman', 'N/A')}")
        print(f"Accuracy: {scores.get('accuracy', 'N/A')}")
        print("="*70)
else:
    print("‚ö†Ô∏è No solution file uploaded. Skipping evaluation.")

In [None]:
# Package the predictions file into a zip (-j stores it without its
# directory path) and download the archive through Colab.
!zip -j my_submission.zip input/res/predictions.jsonl
from google.colab import files
files.download("my_submission.zip")

  adding: predictions.jsonl (deflated 90%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>