In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import os

BASE_DIR = "/content/drive/MyDrive/AML Challenge"
os.chdir(BASE_DIR)
print("Current working directory:", os.getcwd())


Current working directory: /content/drive/MyDrive/AML Challenge


# Pre-Processing

In [3]:
import torch, torch.nn as nn, torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import numpy as np, pandas as pd
# ------------------------------------------------------
# 1. Device and data loading
# ------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

train_data = np.load("train.npz")
test_data  = np.load("test.clean.npz")

tx_train = train_data["captions/embeddings"]   # (125000, 1024)
im_train = train_data["images/embeddings"]     # (25000, 1536)
tx_test  = test_data["captions/embeddings"]    # (1500, 1024)

print("Train shapes:", tx_train.shape, im_train.shape)
print("Test shape:", tx_test.shape)

# ------------------------------------------------------
# 2. Match each caption to its corresponding image
# ------------------------------------------------------
repeat_factor = len(tx_train) // len(im_train)   # 5 captions per image
im_train_expanded = np.repeat(im_train, repeat_factor, axis=0)

Using device: cuda
Train shapes: (125000, 1024) (25000, 1536)
Test shape: (1500, 1024)


# Experiment 1: got score as 0.81780
# Residual-Orthogonal + Contrastive version

In [None]:
# ------------------------------------------------------
# 2. Match each caption to its corresponding image
# ------------------------------------------------------
repeat_factor = len(tx_train) // len(im_train)   # 5 captions per image
im_train_expanded = np.repeat(im_train, repeat_factor, axis=0)

# ------------------------------------------------------
# 3. Convert to tensors + center + normalize
# ------------------------------------------------------
tx_train_t = torch.as_tensor(tx_train, dtype=torch.float32, device=device)
im_train_t = torch.as_tensor(im_train, dtype=torch.float32, device=device)
tx_test_t  = torch.as_tensor(tx_test,  dtype=torch.float32, device=device)

# Center + normalize (same mean for test)
tx_mean = tx_train_t.mean(0, keepdim=True)
im_mean = im_train_t.mean(0, keepdim=True)

tx_train_t = F.normalize(tx_train_t - tx_mean, p=2, dim=1)
im_train_t = F.normalize(im_train_t - im_mean, p=2, dim=1)
tx_test_t  = F.normalize(tx_test_t  - tx_mean, p=2, dim=1)

# Expand image embeddings for each caption
im_train_exp = torch.as_tensor(im_train_expanded, dtype=torch.float32, device=device)
im_train_exp = F.normalize(im_train_exp - im_mean, p=2, dim=1)

# ------------------------------------------------------
# 4. Compute Orthogonal Procrustes base (R)
# ------------------------------------------------------
tx_centroids = tx_train_t.view(-1, 5, tx_train_t.shape[1]).mean(dim=1)
M = im_train_t.T @ tx_centroids
U, S, Vh = torch.linalg.svd(M, full_matrices=False)
R = U @ Vh  # (1536 × 1024)

print("Computed orthogonal base R:", R.shape)

# ------------------------------------------------------
# 5. Define Residual-Orthogonal Translator
# ------------------------------------------------------
class ResidualTranslator(nn.Module):
    def __init__(self, R_init, input_dim=1024, hidden_dim=1024, output_dim=1536):
        super().__init__()
        self.register_buffer("R", R_init)
        self.residual = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        base = x @ self.R.T
        res = self.residual(x)
        out = F.normalize(base + res, p=2, dim=1)
        return out

# ------------------------------------------------------
# 6. Define losses
# ------------------------------------------------------
def contrastive_loss(pred, target, tau=0.07):
    sims = pred @ target.T / tau
    labels = torch.arange(pred.size(0), device=device)
    return F.cross_entropy(sims, labels)

triplet_loss_fn = nn.TripletMarginWithDistanceLoss(
    distance_function=lambda x, y: 1 - F.cosine_similarity(x, y),
    margin=0.2
)

# ------------------------------------------------------
# 7. DataLoader
# ------------------------------------------------------
train_dataset = TensorDataset(tx_train_t, im_train_exp)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=0)

# ------------------------------------------------------
# 8. Initialize model + optimizer
# ------------------------------------------------------
model = ResidualTranslator(R.detach().clone()).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-5)

# ------------------------------------------------------
# 9. Training loop
# ------------------------------------------------------
EPOCHS = 50
print("\nTraining Residual-Orthogonal Translator...\n")

for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0.0

    for x_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}", leave=False):
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()

        y_pred = model(x_batch)
        idx = torch.randperm(x_batch.size(0), device=device)
        y_neg = y_batch[idx]

        loss_cos = contrastive_loss(y_pred, y_batch)
        loss_tri = triplet_loss_fn(y_pred, y_batch, y_neg)
        loss = 0.6 * loss_cos + 0.4 * loss_tri

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch:02d}: avg loss = {avg_loss:.4f}")

torch.save(model.state_dict(), "residual_orthogonal.pth")
print("\n✅ Training completed and model saved as residual_orthogonal.pth")

# ------------------------------------------------------
# 10. Inference for submission
# ------------------------------------------------------
model.eval()
with torch.no_grad():
    tx_test_n = F.normalize(tx_test_t, p=2, dim=1)
    preds = model(tx_test_n).cpu().numpy()

# ------------------------------------------------------
# 11. Save submission file
# ------------------------------------------------------
test_ids = test_data["captions/ids"].astype(int)
submission = pd.DataFrame({
    "id": test_ids,
    "embedding": [list(map(float, row)) for row in preds]
})
submission.to_csv("submission_residual.csv", index=False)

print("✅ Saved submission_residual.csv")
print(submission.head(3))

Computed orthogonal base R: torch.Size([1536, 1024])

Training Residual-Orthogonal Translator...





Epoch 01: avg loss = 1.5180




Epoch 02: avg loss = 1.3856




Epoch 03: avg loss = 1.3359




Epoch 04: avg loss = 1.3061




Epoch 05: avg loss = 1.2844




Epoch 06: avg loss = 1.2677




Epoch 07: avg loss = 1.2551




Epoch 08: avg loss = 1.2456




Epoch 09: avg loss = 1.2356




Epoch 10: avg loss = 1.2279




Epoch 11: avg loss = 1.2205




Epoch 12: avg loss = 1.2146




Epoch 13: avg loss = 1.2085




Epoch 14: avg loss = 1.2038




Epoch 15: avg loss = 1.1987




Epoch 16: avg loss = 1.1950




Epoch 17: avg loss = 1.1912




Epoch 18: avg loss = 1.1873




Epoch 19: avg loss = 1.1838




Epoch 20: avg loss = 1.1801




Epoch 21: avg loss = 1.1774




Epoch 22: avg loss = 1.1750




Epoch 23: avg loss = 1.1724




Epoch 24: avg loss = 1.1698




Epoch 25: avg loss = 1.1676




Epoch 26: avg loss = 1.1649




Epoch 27: avg loss = 1.1626




Epoch 28: avg loss = 1.1611




Epoch 29: avg loss = 1.1584




Epoch 30: avg loss = 1.1563




Epoch 31: avg loss = 1.1551




Epoch 32: avg loss = 1.1540




Epoch 33: avg loss = 1.1522




Epoch 34: avg loss = 1.1503




Epoch 35: avg loss = 1.1491




Epoch 36: avg loss = 1.1479




Epoch 37: avg loss = 1.1457




Epoch 38: avg loss = 1.1442




Epoch 39: avg loss = 1.1431




Epoch 40: avg loss = 1.1413




Epoch 41: avg loss = 1.1420




Epoch 42: avg loss = 1.1397




Epoch 43: avg loss = 1.1392




Epoch 44: avg loss = 1.1381




Epoch 45: avg loss = 1.1366




Epoch 46: avg loss = 1.1360




Epoch 47: avg loss = 1.1351




Epoch 48: avg loss = 1.1343




Epoch 49: avg loss = 1.1329




Epoch 50: avg loss = 1.1321

✅ Training completed and model saved as residual_orthogonal.pth
✅ Saved submission_residual.csv
   id                                          embedding
0   1  [-0.010053236037492752, 0.014468204230070114, ...
1   2  [-0.03426738828420639, -0.030702663585543633, ...
2   3  [-0.0029787796083837748, -0.02091299369931221,...


# hard_negative approach

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import numpy as np
import pandas as pd
import os
from google.colab import drive


# ------------------------------------------------------
# 3. Convert to tensors + center + normalize
#    (Using the exact preprocessing from your 0.81780 experiment)
# ------------------------------------------------------
tx_train_t = torch.as_tensor(tx_train, dtype=torch.float32, device=device)
im_train_t_unique = torch.as_tensor(im_train, dtype=torch.float32, device=device) # For R
tx_test_t  = torch.as_tensor(tx_test,  dtype=torch.float32, device=device)

# Center + normalize (same mean for test)
tx_mean = tx_train_t.mean(0, keepdim=True)
im_mean = im_train_t_unique.mean(0, keepdim=True) # Mean of unique images

tx_train_t = F.normalize(tx_train_t - tx_mean, p=2, dim=1)
im_train_t_unique = F.normalize(im_train_t_unique - im_mean, p=2, dim=1)
tx_test_t  = F.normalize(tx_test_t  - tx_mean, p=2, dim=1)

# Expand image embeddings for each caption (for training)
im_train_exp = torch.as_tensor(im_train_expanded, dtype=torch.float32, device=device)
im_train_exp = F.normalize(im_train_exp - im_mean, p=2, dim=1)

print("Data preprocessed and normalized.")

# ------------------------------------------------------
# 4. Compute Orthogonal Procrustes base (R)
# ------------------------------------------------------
tx_centroids = tx_train_t.view(-1, 5, tx_train_t.shape[1]).mean(dim=1)
M = im_train_t_unique.T @ tx_centroids
U, S, Vh = torch.linalg.svd(M, full_matrices=False)
R = U @ Vh  # (1536 × 1024)

print(f"Computed orthogonal base R: {R.shape}")

# ------------------------------------------------------
# 5. Define Residual-Orthogonal Translator
#    (Same model as your 0.81780 experiment)
# ------------------------------------------------------
class ResidualTranslator(nn.Module):
    def __init__(self, R_init, input_dim=1024, hidden_dim=1024, output_dim=1536):
        super().__init__()
        self.register_buffer("R", R_init)
        self.residual = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        base = x @ self.R.T
        res = self.residual(x)
        out = F.normalize(base + res, p=2, dim=1)
        return out

# ------------------------------------------------------
# 6. Define losses
# ------------------------------------------------------
def contrastive_loss(pred, target, tau=0.07):
    # pred: (B, D), target: (B, D)
    sims = pred @ target.T / tau
    labels = torch.arange(pred.size(0), device=device)
    return F.cross_entropy(sims, labels)

triplet_loss_fn = nn.TripletMarginWithDistanceLoss(
    distance_function=lambda x, y: 1 - F.cosine_similarity(x, y),
    margin=0.2
)

# ------------------------------------------------------
# 7. DataLoader
# ------------------------------------------------------
train_dataset = TensorDataset(tx_train_t, im_train_exp)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=0)

# ------------------------------------------------------
# 8. Initialize model + optimizer
# ------------------------------------------------------
model = ResidualTranslator(R.detach().clone()).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-5)
scaler = torch.cuda.amp.GradScaler() # For mixed precision

# ------------------------------------------------------
# 9. Training loop with HARD-NEGATIVE MINING
# ------------------------------------------------------
EPOCHS = 30
TAU = 0.07 # Temperature for contrastive loss
LOSS_WEIGHT_TRIPLET = 0.3 # Weight for the triplet loss term
LOSS_WEIGHT_CONTRASTIVE = 0.7 # Weight for the contrastive loss term

print("\nTraining Residual-Orthogonal Translator with Hard-Negative Mining...\n")

for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0.0

    for x_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}", leave=False):
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            # --- Forward pass ---
            y_pred = model(x_batch) # (B, 1536)

            # --- 1. Contrastive Loss (InfoNCE) ---
            # This is the main loss, computes (B, B) similarity matrix
            sims_with_tau = y_pred @ y_batch.T / TAU
            labels = torch.arange(y_pred.size(0), device=device)
            loss_cos = F.cross_entropy(sims_with_tau, labels)

            # --- 2. Hard-Negative Mining for Triplet Loss ---
            with torch.no_grad():
                # We need similarities *without* temperature scaling
                sims_no_tau = y_pred @ y_batch.T # (B, B)

                # Mask out the positive sample (the diagonal)
                sims_no_tau.masked_fill_(
                    torch.eye(y_batch.size(0), device=device, dtype=torch.bool),
                    -float('inf')
                )

                # Find the highest-similarity (hardest) negative for each sample
                hard_neg_idx = sims_no_tau.argmax(dim=1) # (B)

            y_hard_neg = y_batch[hard_neg_idx] # (B, 1536)

            # --- 3. Triplet Loss (with hard negatives) ---
            loss_tri = triplet_loss_fn(y_pred, y_batch, y_hard_neg)

            # --- 4. Combine losses ---
            loss = (LOSS_WEIGHT_CONTRASTIVE * loss_cos) + (LOSS_WEIGHT_TRIPLET * loss_tri)

        # --- Backward pass ---
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch:02d}: avg loss = {avg_loss:.4f}")

torch.save(model.state_dict(), "residual_hard_negative.pth")
print("\n✅ Training completed and model saved as residual_hard_negative.pth")

# ------------------------------------------------------
# 10. Inference for submission
# ------------------------------------------------------
model.eval()
with torch.no_grad():
    # We normalized tx_test_t during preprocessing
    preds = model(tx_test_t).cpu().numpy()

# ------------------------------------------------------
# 11. Save submission file
# ------------------------------------------------------
test_ids = test_data["captions/ids"].astype(int)
submission = pd.DataFrame({
    "id": test_ids,
    "embedding": [list(map(float, row)) for row in preds]
})
submission.to_csv("submission_hard_negative.csv", index=False)

print("✅ Saved submission_hard_negative.csv")
print(submission.head(3))


Data preprocessed and normalized.


  scaler = torch.cuda.amp.GradScaler() # For mixed precision


Computed orthogonal base R: torch.Size([1536, 1024])

Training Residual-Orthogonal Translator with Hard-Negative Mining...



  with torch.cuda.amp.autocast():


Epoch 01: avg loss = 1.8259




Epoch 02: avg loss = 1.6697




Epoch 03: avg loss = 1.6106




Epoch 04: avg loss = 1.5756




Epoch 05: avg loss = 1.5504




Epoch 06: avg loss = 1.5309




Epoch 07: avg loss = 1.5158




Epoch 08: avg loss = 1.5029




Epoch 09: avg loss = 1.4926




Epoch 10: avg loss = 1.4824




Epoch 11: avg loss = 1.4739




Epoch 12: avg loss = 1.4665




Epoch 13: avg loss = 1.4591




Epoch 14: avg loss = 1.4535




Epoch 15: avg loss = 1.4478




Epoch 16: avg loss = 1.4431




Epoch 17: avg loss = 1.4390




Epoch 18: avg loss = 1.4329




Epoch 19: avg loss = 1.4304




Epoch 20: avg loss = 1.4263




Epoch 21: avg loss = 1.4226




Epoch 22: avg loss = 1.4196




Epoch 23: avg loss = 1.4160




Epoch 24: avg loss = 1.4138




Epoch 25: avg loss = 1.4106




Epoch 26: avg loss = 1.4079




Epoch 27: avg loss = 1.4045




Epoch 28: avg loss = 1.4034




Epoch 29: avg loss = 1.4006




Epoch 30: avg loss = 1.3981

✅ Training completed and model saved as residual_hard_negative.pth
✅ Saved submission_hard_negative.csv
   id                                          embedding
0   1  [-0.010386133566498756, 0.015329176560044289, ...
1   2  [-0.03257265314459801, -0.027175934985280037, ...
2   3  [-0.00526655837893486, -0.014171533286571503, ...


# Experiment 3: Residual + Hard Negative

In [None]:
# ======================================================
#  AML CHALLENGE — Residual-Orthogonal Translator v3
#  (Hard Negatives + Orthogonal Reg + CSLS + Ensemble)
# ======================================================

import torch, torch.nn as nn, torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import numpy as np, pandas as pd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# ------------------------------------------------------
# 1. Data preprocessing  (same as before)
# ------------------------------------------------------
tx_train_t = torch.as_tensor(tx_train, dtype=torch.float32, device=device)
im_train_t_unique = torch.as_tensor(im_train, dtype=torch.float32, device=device)
tx_test_t  = torch.as_tensor(tx_test,  dtype=torch.float32, device=device)

tx_mean = tx_train_t.mean(0, keepdim=True)
im_mean = im_train_t_unique.mean(0, keepdim=True)
tx_train_t = F.normalize(tx_train_t - tx_mean, p=2, dim=1)
im_train_t_unique = F.normalize(im_train_t_unique - im_mean, p=2, dim=1)
tx_test_t = F.normalize(tx_test_t - tx_mean, p=2, dim=1)

im_train_exp = torch.as_tensor(im_train_expanded, dtype=torch.float32, device=device)
im_train_exp = F.normalize(im_train_exp - im_mean, p=2, dim=1)
print("Data preprocessed and normalized.")

# ------------------------------------------------------
# 2. Orthogonal Procrustes base
# ------------------------------------------------------
tx_centroids = tx_train_t.view(-1, 5, tx_train_t.shape[1]).mean(dim=1)
M = im_train_t_unique.T @ tx_centroids
U, S, Vh = torch.linalg.svd(M, full_matrices=False)
R = U @ Vh
print(f"Computed orthogonal base R: {R.shape}")

# ------------------------------------------------------
# 3. Model
# ------------------------------------------------------
class ResidualTranslator(nn.Module):
    def __init__(self, R_init, input_dim=1024, hidden_dim=1024, output_dim=1536):
        super().__init__()
        self.register_buffer("R", R_init)
        self.residual = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        base = x @ self.R.T
        res  = self.residual(x)
        return F.normalize(base + res, p=2, dim=1)

# Orthogonality penalty
def orthogonal_penalty(W):
    WT_W = W.T @ W
    I = torch.eye(WT_W.shape[0], device=W.device)
    return ((WT_W - I)**2).mean()

# ------------------------------------------------------
# 4. Losses
# ------------------------------------------------------
triplet_loss_fn = nn.TripletMarginWithDistanceLoss(
    distance_function=lambda x, y: 1 - F.cosine_similarity(x, y),
    margin=0.2
)

# ------------------------------------------------------
# 5. Dataloader
# ------------------------------------------------------
train_dataset = TensorDataset(tx_train_t, im_train_exp)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=0)

# ------------------------------------------------------
# 6. Train
# ------------------------------------------------------
def train_model(seed=42, save_path="residual_v3.pth"):
    torch.manual_seed(seed)
    model = ResidualTranslator(R.detach().clone()).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-5)
    scaler = torch.amp.GradScaler("cuda")

    EPOCHS = 30
    TAU = 0.07
    LOSS_W_CONTR = 0.7
    LOSS_W_TRIP = 0.3
    ORTHO_W = 1e-4

    print(f"\nTraining Residual-Orthogonal Translator (seed={seed})...\n")
    for epoch in range(1, EPOCHS + 1):
        model.train()
        total_loss = 0.0
        for x_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}", leave=False):
            optimizer.zero_grad()
            with torch.amp.autocast("cuda"):
                y_pred = model(x_batch)
                sims = y_pred @ y_batch.T / TAU
                labels = torch.arange(y_pred.size(0), device=device)
                loss_con = F.cross_entropy(sims, labels)

                with torch.no_grad():
                    sims_no_tau = y_pred @ y_batch.T
                    sims_no_tau.masked_fill_(torch.eye(y_batch.size(0), device=device, dtype=torch.bool), -float('inf'))
                    hard_idx = sims_no_tau.argmax(dim=1)
                y_hard = y_batch[hard_idx]
                loss_tri = triplet_loss_fn(y_pred, y_batch, y_hard)

                loss_ortho = orthogonal_penalty(model.R)
                loss = (LOSS_W_CONTR * loss_con) + (LOSS_W_TRIP * loss_tri) + (ORTHO_W * loss_ortho)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            total_loss += loss.item()
        print(f"Epoch {epoch:02d}: avg loss = {total_loss/len(train_loader):.4f}")

    torch.save(model.state_dict(), save_path)
    print(f"\n✅ Model saved as {save_path}\n")
    return model

# ------------------------------------------------------
# 7. Train + Ensemble (3 seeds)
# ------------------------------------------------------
models = []
for seed in [42, 77, 123]:
    model = train_model(seed, save_path=f"residual_v3_seed{seed}.pth")
    models.append(model)

# ------------------------------------------------------
# 8. Inference (ensemble-averaged)
# ------------------------------------------------------
@torch.no_grad()
def predict_ensemble(models, tx_test_t):
    preds_all = []
    for model in models:
        model.eval()
        preds_all.append(F.normalize(model(tx_test_t), p=2, dim=1).cpu().numpy())
    return np.mean(preds_all, axis=0)

preds = predict_ensemble(models, tx_test_t)

# ------------------------------------------------------
# 9. CSLS re-ranking (optional at test time)
# ------------------------------------------------------
@torch.no_grad()
def apply_csls(preds, im_base, k=10):
    preds = F.normalize(torch.tensor(preds, device=device), p=2, dim=1)
    im_base = F.normalize(im_base, p=2, dim=1)
    sim = preds @ im_base.T
    knn_q = torch.topk(sim, k=k, dim=1).values.mean(1, keepdim=True)
    knn_d = torch.topk(sim, k=k, dim=0).values.mean(0, keepdim=True)
    return (2 * sim - knn_q - knn_d).cpu().numpy()

csls_sim = apply_csls(preds, im_train_t_unique)

# ------------------------------------------------------
# 10. Save submission
# ------------------------------------------------------
test_ids = test_data["captions/ids"].astype(int)
submission = pd.DataFrame({
    "id": test_ids,
    "embedding": [list(map(float, row)) for row in preds]
})
submission.to_csv("submission_residual_v3.csv", index=False)
print("✅ submission_residual_v3.csv saved.")


Using device: cuda
Data preprocessed and normalized.
Computed orthogonal base R: torch.Size([1536, 1024])

Training Residual-Orthogonal Translator (seed=42)...





Epoch 01: avg loss = 1.8249




Epoch 02: avg loss = 1.6702




Epoch 03: avg loss = 1.6105




Epoch 04: avg loss = 1.5747




Epoch 05: avg loss = 1.5506




Epoch 06: avg loss = 1.5319




Epoch 07: avg loss = 1.5163




Epoch 08: avg loss = 1.5025




Epoch 09: avg loss = 1.4919




Epoch 10: avg loss = 1.4816




Epoch 11: avg loss = 1.4734




Epoch 12: avg loss = 1.4671




Epoch 13: avg loss = 1.4598




Epoch 14: avg loss = 1.4545




Epoch 15: avg loss = 1.4482




Epoch 16: avg loss = 1.4430




Epoch 17: avg loss = 1.4380




Epoch 18: avg loss = 1.4343




Epoch 19: avg loss = 1.4303




Epoch 20: avg loss = 1.4252




Epoch 21: avg loss = 1.4222




Epoch 22: avg loss = 1.4185




Epoch 23: avg loss = 1.4157




Epoch 24: avg loss = 1.4136




Epoch 25: avg loss = 1.4098




Epoch 26: avg loss = 1.4065




Epoch 27: avg loss = 1.4053




Epoch 28: avg loss = 1.4031




Epoch 29: avg loss = 1.3996




Epoch 30: avg loss = 1.3982

✅ Model saved as residual_v3_seed42.pth


Training Residual-Orthogonal Translator (seed=77)...





Epoch 01: avg loss = 1.8243




Epoch 02: avg loss = 1.6696




Epoch 03: avg loss = 1.6102




Epoch 04: avg loss = 1.5757




Epoch 05: avg loss = 1.5505




Epoch 06: avg loss = 1.5311




Epoch 07: avg loss = 1.5163




Epoch 08: avg loss = 1.5034




Epoch 09: avg loss = 1.4916




Epoch 10: avg loss = 1.4823




Epoch 11: avg loss = 1.4739




Epoch 12: avg loss = 1.4669




Epoch 13: avg loss = 1.4601




Epoch 14: avg loss = 1.4538




Epoch 15: avg loss = 1.4480




Epoch 16: avg loss = 1.4431




Epoch 17: avg loss = 1.4387




Epoch 18: avg loss = 1.4351




Epoch 19: avg loss = 1.4306




Epoch 20: avg loss = 1.4263




Epoch 21: avg loss = 1.4222




Epoch 22: avg loss = 1.4196




Epoch 23: avg loss = 1.4163




Epoch 24: avg loss = 1.4131




Epoch 25: avg loss = 1.4102




Epoch 26: avg loss = 1.4079




Epoch 27: avg loss = 1.4046




Epoch 28: avg loss = 1.4033




Epoch 29: avg loss = 1.3996




Epoch 30: avg loss = 1.3976

✅ Model saved as residual_v3_seed77.pth


Training Residual-Orthogonal Translator (seed=123)...





Epoch 01: avg loss = 1.8248




Epoch 02: avg loss = 1.6695




Epoch 03: avg loss = 1.6100




Epoch 04: avg loss = 1.5749




Epoch 05: avg loss = 1.5509




Epoch 06: avg loss = 1.5315




Epoch 07: avg loss = 1.5155




Epoch 08: avg loss = 1.5035




Epoch 09: avg loss = 1.4925




Epoch 10: avg loss = 1.4827




Epoch 11: avg loss = 1.4734




Epoch 12: avg loss = 1.4677




Epoch 13: avg loss = 1.4602




Epoch 14: avg loss = 1.4547




Epoch 15: avg loss = 1.4492




Epoch 16: avg loss = 1.4440




Epoch 17: avg loss = 1.4395




Epoch 18: avg loss = 1.4344




Epoch 19: avg loss = 1.4301




Epoch 20: avg loss = 1.4266




Epoch 21: avg loss = 1.4228




Epoch 22: avg loss = 1.4196




Epoch 23: avg loss = 1.4164




Epoch 24: avg loss = 1.4128




Epoch 25: avg loss = 1.4103




Epoch 26: avg loss = 1.4077




Epoch 27: avg loss = 1.4061




Epoch 28: avg loss = 1.4028




Epoch 29: avg loss = 1.4008




Epoch 30: avg loss = 1.3987

✅ Model saved as residual_v3_seed123.pth

✅ submission_residual_v3.csv saved.


##

#  Model: Residual-Orthogonal TRANSFORMER Translator



In [None]:

# ------------------------------------------------------
# 1. Device and data loading
# ------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

train_data = np.load("train.npz")
test_data  = np.load("test.clean.npz")

tx_train = train_data["captions/embeddings"]
im_train = train_data["images/embeddings"]
tx_test  = test_data["captions/embeddings"]

repeat_factor = len(tx_train) // len(im_train)
im_train_expanded = np.repeat(im_train, repeat_factor, axis=0)
print(f"Train shapes: {tx_train.shape}, {im_train.shape}, {im_train_expanded.shape}")
print(f"Test shape: {tx_test.shape}")

# ------------------------------------------------------
# 2. Data preprocessing (Centering + Normalization)
# ------------------------------------------------------
tx_train_t = torch.as_tensor(tx_train, dtype=torch.float32, device=device)
im_train_t_unique = torch.as_tensor(im_train, dtype=torch.float32, device=device)
tx_test_t  = torch.as_tensor(tx_test,  dtype=torch.float32, device=device)

tx_mean = tx_train_t.mean(0, keepdim=True)
im_mean = im_train_t_unique.mean(0, keepdim=True)
tx_train_t = F.normalize(tx_train_t - tx_mean, p=2, dim=1)
im_train_t_unique = F.normalize(im_train_t_unique - im_mean, p=2, dim=1)
tx_test_t = F.normalize(tx_test_t - tx_mean, p=2, dim=1)

im_train_exp = torch.as_tensor(im_train_expanded, dtype=torch.float32, device=device)
im_train_exp = F.normalize(im_train_exp - im_mean, p=2, dim=1)
print("Data preprocessed and normalized.")

# ------------------------------------------------------
# 3. Orthogonal Procrustes base (R)
# ------------------------------------------------------
tx_centroids = tx_train_t.view(-1, 5, tx_train_t.shape[1]).mean(dim=1)
M = im_train_t_unique.T @ tx_centroids
U, S, Vh = torch.linalg.svd(M, full_matrices=False)
R = U @ Vh
print(f"Computed orthogonal base R: {R.shape}")

# ------------------------------------------------------
# 4. Model: Residual-Orthogonal (Simple MLP)
#    (Your proven 0.81780-scoring model)
# ------------------------------------------------------
class ResidualTranslator(nn.Module):
    def __init__(self, R_init, input_dim=1024, hidden_dim=1024, output_dim=1536):
        super().__init__()
        self.register_buffer("R", R_init)
        self.residual = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        base = x @ self.R.T
        res = self.residual(x)
        return F.normalize(base + res, p=2, dim=1)

# ------------------------------------------------------
# 5. Loss Functions (Contrastive + CORRECT Triplet)
# ------------------------------------------------------
TAU = 0.07 # Temperature for contrastive loss
LOSS_WEIGHT_CONTRASTIVE = 0.7
LOSS_WEIGHT_TRIPLET = 0.3

triplet_loss_fn = nn.TripletMarginWithDistanceLoss(
    distance_function=lambda x, y: 1 - F.cosine_similarity(x, y),
    margin=0.2
)

# ------------------------------------------------------
# 6. Validation split (10%)
#    (We MUST pass the original image index)
# ------------------------------------------------------
N = len(tx_train_t)
val_size = int(0.1 * N)
idx_cpu = torch.randperm(N, device="cpu") # Use CPU for randperm
val_idx, train_idx = idx_cpu[:val_size], idx_cpu[val_size:]

# Get the corresponding image index (0-24999) for each caption
img_indices_train = (train_idx // 5).to(device)
img_indices_val = (val_idx // 5).to(device)

# --- Create datasets ---
tx_val_t, im_val_t = tx_train_t[val_idx], im_train_exp[val_idx]
tx_train_t_sub, im_train_exp_sub = tx_train_t[train_idx], im_train_exp[train_idx]

# --- Dataloaders now include the image index ---
train_dataset = TensorDataset(tx_train_t_sub, im_train_exp_sub, img_indices_train)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=0)
print(f"Train pairs: {len(train_dataset)}, Validation pairs: {len(val_idx)}")

# ------------------------------------------------------
# 7. Recall@K utility (for validation)
# ------------------------------------------------------
@torch.no_grad()
def recall_at_k(model, tx_queries, im_database, query_img_indices, repeat_factor=5, ks=(1, 5, 10, 50), k_csls=10):
    """
    Evaluates Text-to-Image Recall@K using CSLS re-ranking.
    - tx_queries: (N_queries, 1024) tensor of text embeddings
    - im_database: (N_images, 1536) tensor of ALL unique image embeddings
    - query_img_indices: (N_queries,) tensor of GT image indices (0-24999)
    """
    model.eval()

    # --- 1. Predict caption embeddings ---
    preds_list = []
    for i in range(0, len(tx_queries), 1024):
        chunk = tx_queries[i:i+1024]
        preds_list.append(model(chunk))
    preds = torch.cat(preds_list, dim=0) # (N_queries, 1536)

    # --- 2. Compute Cosine Similarity Matrix ---
    sim = preds @ im_database.T # (N_queries, N_images)

    # --- 3. Compute CSLS re-ranking scores ---
    knn_q = torch.topk(sim, k=k_csls, dim=1).values
    mean_knn_q = knn_q.mean(1, keepdim=True) # (N_queries, 1)

    knn_d = torch.topk(sim.T, k=k_csls, dim=1).values
    mean_knn_d = knn_d.mean(1, keepdim=True).T # (1, N_images)

    csls_sim = 2 * sim - mean_knn_q - mean_knn_d # (N_queries, N_images)

    # --- 4. Get Ground Truth (already passed in) ---
    gt = query_img_indices # (N_queries,)

    # --- 5. Calculate R@K ---
    top_indices = torch.argsort(csls_sim, dim=1, descending=True)

    recalls = {}
    for k in ks:
        top_k_preds = top_indices[:, :k]
        correct_in_top_k = (top_k_preds == gt.unsqueeze(1)).any(dim=1)
        recall_at_k = correct_in_top_k.float().mean().item()
        recalls[f"R@{k}"] = recall_at_k

    return recalls

# ------------------------------------------------------
# 8. Training Loop
# ------------------------------------------------------
EPOCHS = 40
LR = 1e-4
SAVE_PATH = "correct_hard_negative_best.pth"

# Validation set for monitoring
val_query_subset = tx_val_t[:5000]
val_indices_subset = img_indices_val[:5000]
val_db_subset = im_train_t_unique # Use ALL unique images as the database

model = ResidualTranslator(R.detach().clone()).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
scaler = torch.cuda.amp.GradScaler() # Mixed precision

best_r10 = 0.0
print(f"\nTraining with CORRECT Hard-Negative Mining...\n")

for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0.0

    for x_batch, y_batch, img_indices in tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}", leave=False):
        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            # --- Forward pass ---
            y_pred = model(x_batch) # (B, 1536)

            # --- 1. Contrastive Loss (aligns pairs) ---
            sims_with_tau = y_pred @ y_batch.T / TAU
            labels = torch.arange(y_pred.size(0), device=device)
            loss_con = F.cross_entropy(sims_with_tau, labels)

            # --- 2. CORRECT Hard-Negative Mining ---
            with torch.no_grad():
                # (B, B) matrix of pairwise similarities
                sims_no_tau = y_pred @ y_batch.T

                # (B, B) mask where True means it's a POSITIVE pair
                positive_mask = (img_indices.unsqueeze(1) == img_indices.unsqueeze(0))

                # Mask out ALL positives (not just diagonal)
                sims_no_tau.masked_fill_(positive_mask, -float('inf'))

                # The argmax is now guaranteed to be a TRUE negative
                hard_neg_idx = sims_no_tau.argmax(dim=1)

            y_hard_neg = y_batch[hard_neg_idx]

            # --- 3. Triplet Loss (now correct) ---
            loss_tri = triplet_loss_fn(y_pred, y_batch, y_hard_neg)

            # --- 4. Combine losses ---
            loss = (LOSS_WEIGHT_CONTRASTIVE * loss_con) + (LOSS_WEIGHT_TRIPLET * loss_tri)

        # --- Backward pass ---
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    scheduler.step()
    avg_loss = total_loss / len(train_loader)

    # ----- Validation -----
    rec = recall_at_k(model, val_query_subset, val_db_subset, val_indices_subset)
    print(f"Epoch {epoch:02d}: loss={avg_loss:.4f} | "
          f"R@1={rec['R@1']:.4f}  R@5={rec['R@5']:.4f}  "
          f"R@10={rec['R@10']:.4f}  R@50={rec['R@50']:.4f}")

    if rec['R@10'] > best_r10:
        best_r10 = rec['R@10']
        torch.save(model.state_dict(), SAVE_PATH)
        print(f"✅ Best model saved (epoch {epoch}, R@10={best_r10:.4f})")

print(f"\n🎯 Best R@10 on validation = {best_r10:.4f}\nModel saved as {SAVE_PATH}")

# ------------------------------------------------------
# 9. Inference + Submission
# ------------------------------------------------------
print(f"Loading best model from {SAVE_PATH} for inference...")
model = ResidualTranslator(R.detach().clone()).to(device)
model.load_state_dict(torch.load(SAVE_PATH))
model.eval()
print("Best model loaded.")

with torch.no_grad():
    # tx_test_t was already preprocessed
    preds_list = []
    for i in range(0, len(tx_test_t), 1024):
        chunk = tx_test_t[i:i+1024]
        preds_list.append(model(chunk))
    preds = torch.cat(preds_list, dim=0).cpu().numpy()

test_ids = test_data["captions/ids"].astype(int)
submission = pd.DataFrame({
    "id": test_ids,
    "embedding": [list(map(float, row)) for row in preds]
})
submission.to_csv("submission_correct_hard_negative.csv", index=False)
print("\n✅ submission_correct_hard_negative.csv saved successfully.")


Using device: cuda
Train shapes: (125000, 1024), (25000, 1536), (125000, 1536)
Test shape: (1500, 1024)
Data preprocessed and normalized.


  scaler = torch.cuda.amp.GradScaler() # Mixed precision


Computed orthogonal base R: torch.Size([1536, 1024])
Train pairs: 112500, Validation pairs: 12500

Training with CORRECT Hard-Negative Mining...



  with torch.cuda.amp.autocast():


Epoch 01: loss=1.8393 | R@1=0.1730  R@5=0.3746  R@10=0.4700  R@50=0.7056
✅ Best model saved (epoch 1, R@10=0.4700)




Epoch 02: loss=1.6837 | R@1=0.1834  R@5=0.3866  R@10=0.4880  R@50=0.7256
✅ Best model saved (epoch 2, R@10=0.4880)




Epoch 03: loss=1.6235 | R@1=0.1890  R@5=0.3900  R@10=0.4998  R@50=0.7356
✅ Best model saved (epoch 3, R@10=0.4998)




Epoch 04: loss=1.5871 | R@1=0.1950  R@5=0.3972  R@10=0.5024  R@50=0.7396
✅ Best model saved (epoch 4, R@10=0.5024)




Epoch 05: loss=1.5612 | R@1=0.1966  R@5=0.3986  R@10=0.5090  R@50=0.7448
✅ Best model saved (epoch 5, R@10=0.5090)




Epoch 06: loss=1.5412 | R@1=0.1952  R@5=0.4036  R@10=0.5090  R@50=0.7472




Epoch 07: loss=1.5255 | R@1=0.1992  R@5=0.4074  R@10=0.5096  R@50=0.7504
✅ Best model saved (epoch 7, R@10=0.5096)




Epoch 08: loss=1.5129 | R@1=0.1964  R@5=0.4036  R@10=0.5148  R@50=0.7506
✅ Best model saved (epoch 8, R@10=0.5148)




Epoch 09: loss=1.5015 | R@1=0.1964  R@5=0.4078  R@10=0.5132  R@50=0.7508




Epoch 10: loss=1.4920 | R@1=0.1972  R@5=0.4106  R@10=0.5126  R@50=0.7488




Epoch 11: loss=1.4832 | R@1=0.1984  R@5=0.4110  R@10=0.5154  R@50=0.7494
✅ Best model saved (epoch 11, R@10=0.5154)




Epoch 12: loss=1.4768 | R@1=0.1998  R@5=0.4150  R@10=0.5162  R@50=0.7514
✅ Best model saved (epoch 12, R@10=0.5162)




Epoch 13: loss=1.4683 | R@1=0.1988  R@5=0.4126  R@10=0.5206  R@50=0.7518
✅ Best model saved (epoch 13, R@10=0.5206)




Epoch 14: loss=1.4626 | R@1=0.1976  R@5=0.4118  R@10=0.5162  R@50=0.7520




Epoch 15: loss=1.4580 | R@1=0.2010  R@5=0.4140  R@10=0.5204  R@50=0.7552




Epoch 16: loss=1.4530 | R@1=0.1982  R@5=0.4170  R@10=0.5202  R@50=0.7584




Epoch 17: loss=1.4488 | R@1=0.1970  R@5=0.4142  R@10=0.5226  R@50=0.7546
✅ Best model saved (epoch 17, R@10=0.5226)




Epoch 18: loss=1.4451 | R@1=0.1988  R@5=0.4128  R@10=0.5226  R@50=0.7562




Epoch 19: loss=1.4400 | R@1=0.1996  R@5=0.4174  R@10=0.5222  R@50=0.7558




Epoch 20: loss=1.4368 | R@1=0.2004  R@5=0.4156  R@10=0.5212  R@50=0.7542




Epoch 21: loss=1.4342 | R@1=0.1994  R@5=0.4164  R@10=0.5210  R@50=0.7564




Epoch 22: loss=1.4316 | R@1=0.2006  R@5=0.4180  R@10=0.5208  R@50=0.7590




Epoch 23: loss=1.4289 | R@1=0.2030  R@5=0.4170  R@10=0.5212  R@50=0.7554




Epoch 24: loss=1.4255 | R@1=0.1998  R@5=0.4172  R@10=0.5208  R@50=0.7580




Epoch 25: loss=1.4240 | R@1=0.2020  R@5=0.4168  R@10=0.5198  R@50=0.7578




Epoch 26: loss=1.4217 | R@1=0.1990  R@5=0.4170  R@10=0.5236  R@50=0.7584
✅ Best model saved (epoch 26, R@10=0.5236)




Epoch 27: loss=1.4195 | R@1=0.1996  R@5=0.4186  R@10=0.5224  R@50=0.7598




Epoch 28: loss=1.4175 | R@1=0.1994  R@5=0.4168  R@10=0.5226  R@50=0.7580




Epoch 29: loss=1.4159 | R@1=0.1996  R@5=0.4190  R@10=0.5242  R@50=0.7594
✅ Best model saved (epoch 29, R@10=0.5242)




Epoch 30: loss=1.4148 | R@1=0.1982  R@5=0.4178  R@10=0.5232  R@50=0.7596




Epoch 31: loss=1.4136 | R@1=0.1998  R@5=0.4194  R@10=0.5222  R@50=0.7590




Epoch 32: loss=1.4140 | R@1=0.1996  R@5=0.4190  R@10=0.5220  R@50=0.7580




Epoch 33: loss=1.4115 | R@1=0.1976  R@5=0.4196  R@10=0.5224  R@50=0.7594




Epoch 34: loss=1.4116 | R@1=0.1992  R@5=0.4194  R@10=0.5224  R@50=0.7594




Epoch 35: loss=1.4105 | R@1=0.1994  R@5=0.4198  R@10=0.5224  R@50=0.7590




Epoch 36: loss=1.4100 | R@1=0.1984  R@5=0.4194  R@10=0.5226  R@50=0.7590




Epoch 37: loss=1.4085 | R@1=0.1994  R@5=0.4194  R@10=0.5222  R@50=0.7582




Epoch 38: loss=1.4093 | R@1=0.1986  R@5=0.4186  R@10=0.5224  R@50=0.7584




Epoch 39: loss=1.4099 | R@1=0.1986  R@5=0.4188  R@10=0.5224  R@50=0.7586




Epoch 40: loss=1.4086 | R@1=0.1986  R@5=0.4188  R@10=0.5224  R@50=0.7584

🎯 Best R@10 on validation = 0.5242
Model saved as correct_hard_negative_best.pth
Loading best model from correct_hard_negative_best.pth for inference...
Best model loaded.

✅ submission_correct_hard_negative.csv saved successfully.


# Tuning to previous code and enhancement

In [None]:
# ------------------------------------------------------
# 1. Device and data loading
# ------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

train_data = np.load("train.npz")
test_data  = np.load("test.clean.npz")

tx_train = train_data["captions/embeddings"]
im_train = train_data["images/embeddings"]
tx_test  = test_data["captions/embeddings"]

repeat_factor = len(tx_train) // len(im_train)
im_train_expanded = np.repeat(im_train, repeat_factor, axis=0)
print(f"Train shapes: {tx_train.shape}, {im_train.shape}, {im_train_expanded.shape}")
print(f"Test shape: {tx_test.shape}")

# ------------------------------------------------------
# 2. Data preprocessing (Centering + Normalization)
# ------------------------------------------------------
tx_train_t = torch.as_tensor(tx_train, dtype=torch.float32, device=device)
im_train_t_unique = torch.as_tensor(im_train, dtype=torch.float32, device=device)
tx_test_t  = torch.as_tensor(tx_test,  dtype=torch.float32, device=device)

tx_mean = tx_train_t.mean(0, keepdim=True)
im_mean = im_train_t_unique.mean(0, keepdim=True)
tx_train_t = F.normalize(tx_train_t - tx_mean, p=2, dim=1)
im_train_t_unique = F.normalize(im_train_t_unique - im_mean, p=2, dim=1)
tx_test_t = F.normalize(tx_test_t - tx_mean, p=2, dim=1)

im_train_exp = torch.as_tensor(im_train_expanded, dtype=torch.float32, device=device)
im_train_exp = F.normalize(im_train_exp - im_mean, p=2, dim=1)
print("Data preprocessed and normalized.")

# ------------------------------------------------------
# 3. Orthogonal Procrustes base (R)
# ------------------------------------------------------
tx_centroids = tx_train_t.view(-1, 5, tx_train_t.shape[1]).mean(dim=1)
M = im_train_t_unique.T @ tx_centroids
U, S, Vh = torch.linalg.svd(M, full_matrices=False)
R = U @ Vh
print(f"Computed orthogonal base R: {R.shape}")

# ------------------------------------------------------
# 4. NEW MODEL: EnhancedResidualTranslator
#    (Deeper, more powerful residual block)
# ------------------------------------------------------
class EnhancedResidualTranslator(nn.Module):
    def __init__(self, R_init, input_dim=1024, hidden_dim=2048, output_dim=1536, dropout=0.1):
        super().__init__()
        self.register_buffer("R", R_init)
        self.residual = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.GELU(),
            nn.LayerNorm(hidden_dim),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim),
            nn.GELU(),
            nn.LayerNorm(hidden_dim),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        base = x @ self.R.T
        res = self.residual(x)
        return F.normalize(base + res, p=2, dim=1)

# ------------------------------------------------------
# 5. Loss Functions (Unchanged)
# ------------------------------------------------------
TAU = 0.07 # Temperature for contrastive loss
LOSS_WEIGHT_CONTRASTIVE = 0.7
LOSS_WEIGHT_TRIPLET = 0.3

triplet_loss_fn = nn.TripletMarginWithDistanceLoss(
    distance_function=lambda x, y: 1 - F.cosine_similarity(x, y),
    margin=0.2
)

# ------------------------------------------------------
# 6. Validation split (Unchanged)
# ------------------------------------------------------
N = len(tx_train_t)
val_size = int(0.1 * N)
idx_cpu = torch.randperm(N, device="cpu") # Use CPU for randperm
val_idx, train_idx = idx_cpu[:val_size], idx_cpu[val_size:]

img_indices_train = (train_idx // 5).to(device)
img_indices_val = (val_idx // 5).to(device)

tx_val_t, im_val_t = tx_train_t[val_idx], im_train_exp[val_idx]
tx_train_t_sub, im_train_exp_sub = tx_train_t[train_idx], im_train_exp[train_idx]

train_dataset = TensorDataset(tx_train_t_sub, im_train_exp_sub, img_indices_train)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=0)
print(f"Train pairs: {len(train_dataset)}, Validation pairs: {len(val_idx)}")

# ------------------------------------------------------
# 7. Recall@K utility (Unchanged)
# ------------------------------------------------------
@torch.no_grad()
def recall_at_k(model, tx_queries, im_database, query_img_indices, repeat_factor=5, ks=(1, 5, 10, 50), k_csls=10):
    model.eval()
    preds_list = []
    for i in range(0, len(tx_queries), 1024):
        chunk = tx_queries[i:i+1024]
        preds_list.append(model(chunk))
    preds = torch.cat(preds_list, dim=0) # (N_queries, 1536)

    sim = preds @ im_database.T # (N_queries, N_images)

    knn_q = torch.topk(sim, k=k_csls, dim=1).values
    mean_knn_q = knn_q.mean(1, keepdim=True) # (N_queries, 1)

    knn_d = torch.topk(sim.T, k=k_csls, dim=1).values
    mean_knn_d = knn_d.mean(1, keepdim=True).T # (1, N_images)

    csls_sim = 2 * sim - mean_knn_q - mean_knn_d # (N_queries, N_images)

    gt = query_img_indices # (N_queries,)

    top_indices = torch.argsort(csls_sim, dim=1, descending=True)

    recalls = {}
    for k in ks:
        top_k_preds = top_indices[:, :k]
        correct_in_top_k = (top_k_preds == gt.unsqueeze(1)).any(dim=1)
        recall_at_k = correct_in_top_k.float().mean().item()
        recalls[f"R@{k}"] = recall_at_k

    return recalls

# ------------------------------------------------------
# 8. Training Loop (Unchanged, but with new model)
# ------------------------------------------------------
EPOCHS = 40
LR = 1e-4
SAVE_PATH = "enhanced_hard_negative_best.pth" # New save path

val_query_subset = tx_val_t[:5000]
val_indices_subset = img_indices_val[:5000]
val_db_subset = im_train_t_unique

# --- Initialize NEW model ---
model = EnhancedResidualTranslator(R.detach().clone(), hidden_dim=2048).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
scaler = torch.cuda.amp.GradScaler() # Mixed precision

best_r10 = 0.0
print(f"\nTraining with Enhanced Model + Correct Hard-Negative Mining...\n")

for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0.0

    for x_batch, y_batch, img_indices in tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}", leave=False):
        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            y_pred = model(x_batch)

            sims_with_tau = y_pred @ y_batch.T / TAU
            labels = torch.arange(y_pred.size(0), device=device)
            loss_con = F.cross_entropy(sims_with_tau, labels)

            with torch.no_grad():
                sims_no_tau = y_pred @ y_batch.T
                positive_mask = (img_indices.unsqueeze(1) == img_indices.unsqueeze(0))
                sims_no_tau.masked_fill_(positive_mask, -float('inf'))
                hard_neg_idx = sims_no_tau.argmax(dim=1)

            y_hard_neg = y_batch[hard_neg_idx]
            loss_tri = triplet_loss_fn(y_pred, y_batch, y_hard_neg)

            loss = (LOSS_WEIGHT_CONTRASTIVE * loss_con) + (LOSS_WEIGHT_TRIPLET * loss_tri)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    scheduler.step()
    avg_loss = total_loss / len(train_loader)

    # ----- Validation -----
    rec = recall_at_k(model, val_query_subset, val_db_subset, val_indices_subset)
    print(f"Epoch {epoch:02d}: loss={avg_loss:.4f} | "
          f"R@1={rec['R@1']:.4f}  R@5={rec['R@5']:.4f}  "
          f"R@10={rec['R@10']:.4f}  R@50={rec['R@50']:.4f}")

    if rec['R@10'] > best_r10:
        best_r10 = rec['R@10']
        torch.save(model.state_dict(), SAVE_PATH)
        print(f"✅ Best model saved (epoch {epoch}, R@10={best_r10:.4f})")

print(f"\n🎯 Best R@10 on validation = {best_r10:.4f}\nModel saved as {SAVE_PATH}")

# ------------------------------------------------------
# 9. Inference + Submission
# ------------------------------------------------------
print(f"Loading best model from {SAVE_PATH} for inference...")
model = EnhancedResidualTranslator(R.detach().clone(), hidden_dim=2048).to(device)
model.load_state_dict(torch.load(SAVE_PATH))
model.eval()
print("Best model loaded.")

with torch.no_grad():
    preds_list = []
    for i in range(0, len(tx_test_t), 1024):
        chunk = tx_test_t[i:i+1024]
        preds_list.append(model(chunk))
    preds = torch.cat(preds_list, dim=0).cpu().numpy()

test_ids = test_data["captions/ids"].astype(int)
submission = pd.DataFrame({
    "id": test_ids,
    "embedding": [list(map(float, row)) for row in preds]
})
submission.to_csv("submission_enhanced_hard_negative.csv", index=False)
print("\n✅ submission_enhanced_hard_negative.csv saved successfully.")


Using device: cuda
Train shapes: (125000, 1024), (25000, 1536), (125000, 1536)
Test shape: (1500, 1024)
Data preprocessed and normalized.


  scaler = torch.cuda.amp.GradScaler() # Mixed precision


Computed orthogonal base R: torch.Size([1536, 1024])
Train pairs: 112500, Validation pairs: 12500

Training with Enhanced Model + Correct Hard-Negative Mining...



  with torch.cuda.amp.autocast():


Epoch 01: loss=2.1814 | R@1=0.1082  R@5=0.2682  R@10=0.3684  R@50=0.6446
✅ Best model saved (epoch 1, R@10=0.3684)




Epoch 02: loss=1.7374 | R@1=0.1358  R@5=0.3156  R@10=0.4212  R@50=0.6828
✅ Best model saved (epoch 2, R@10=0.4212)




Epoch 03: loss=1.5760 | R@1=0.1534  R@5=0.3452  R@10=0.4548  R@50=0.7062
✅ Best model saved (epoch 3, R@10=0.4548)




Epoch 04: loss=1.4609 | R@1=0.1690  R@5=0.3628  R@10=0.4750  R@50=0.7256
✅ Best model saved (epoch 4, R@10=0.4750)




Epoch 05: loss=1.3669 | R@1=0.1742  R@5=0.3856  R@10=0.4906  R@50=0.7392
✅ Best model saved (epoch 5, R@10=0.4906)




Epoch 06: loss=1.2881 | R@1=0.1876  R@5=0.3962  R@10=0.5082  R@50=0.7496
✅ Best model saved (epoch 6, R@10=0.5082)




Epoch 07: loss=1.2187 | R@1=0.1978  R@5=0.4060  R@10=0.5104  R@50=0.7612
✅ Best model saved (epoch 7, R@10=0.5104)




Epoch 08: loss=1.1550 | R@1=0.2050  R@5=0.4184  R@10=0.5274  R@50=0.7686
✅ Best model saved (epoch 8, R@10=0.5274)




Epoch 09: loss=1.0961 | R@1=0.2146  R@5=0.4224  R@10=0.5278  R@50=0.7692
✅ Best model saved (epoch 9, R@10=0.5278)




Epoch 10: loss=1.0405 | R@1=0.2170  R@5=0.4334  R@10=0.5330  R@50=0.7752
✅ Best model saved (epoch 10, R@10=0.5330)




Epoch 11: loss=0.9926 | R@1=0.2292  R@5=0.4370  R@10=0.5428  R@50=0.7726
✅ Best model saved (epoch 11, R@10=0.5428)




Epoch 12: loss=0.9457 | R@1=0.2266  R@5=0.4416  R@10=0.5458  R@50=0.7780
✅ Best model saved (epoch 12, R@10=0.5458)




Epoch 13: loss=0.9024 | R@1=0.2278  R@5=0.4456  R@10=0.5496  R@50=0.7824
✅ Best model saved (epoch 13, R@10=0.5496)




Epoch 14: loss=0.8637 | R@1=0.2254  R@5=0.4432  R@10=0.5490  R@50=0.7774




Epoch 15: loss=0.8275 | R@1=0.2300  R@5=0.4460  R@10=0.5508  R@50=0.7786
✅ Best model saved (epoch 15, R@10=0.5508)




Epoch 16: loss=0.7962 | R@1=0.2338  R@5=0.4524  R@10=0.5592  R@50=0.7782
✅ Best model saved (epoch 16, R@10=0.5592)




Epoch 17: loss=0.7651 | R@1=0.2356  R@5=0.4516  R@10=0.5568  R@50=0.7802




Epoch 18: loss=0.7386 | R@1=0.2404  R@5=0.4576  R@10=0.5574  R@50=0.7784




Epoch 19: loss=0.7150 | R@1=0.2406  R@5=0.4572  R@10=0.5538  R@50=0.7784




Epoch 20: loss=0.6920 | R@1=0.2426  R@5=0.4558  R@10=0.5580  R@50=0.7758




Epoch 21: loss=0.6726 | R@1=0.2426  R@5=0.4556  R@10=0.5616  R@50=0.7782
✅ Best model saved (epoch 21, R@10=0.5616)




Epoch 22: loss=0.6544 | R@1=0.2384  R@5=0.4560  R@10=0.5572  R@50=0.7748




Epoch 23: loss=0.6378 | R@1=0.2422  R@5=0.4568  R@10=0.5578  R@50=0.7742




Epoch 24: loss=0.6224 | R@1=0.2418  R@5=0.4590  R@10=0.5604  R@50=0.7758




Epoch 25: loss=0.6093 | R@1=0.2434  R@5=0.4552  R@10=0.5592  R@50=0.7702




Epoch 26: loss=0.5981 | R@1=0.2448  R@5=0.4538  R@10=0.5568  R@50=0.7694




Epoch 27: loss=0.5870 | R@1=0.2416  R@5=0.4548  R@10=0.5564  R@50=0.7676




Epoch 28: loss=0.5769 | R@1=0.2430  R@5=0.4554  R@10=0.5586  R@50=0.7712




Epoch 29: loss=0.5673 | R@1=0.2444  R@5=0.4580  R@10=0.5554  R@50=0.7688




Epoch 30: loss=0.5601 | R@1=0.2428  R@5=0.4602  R@10=0.5582  R@50=0.7678




Epoch 31: loss=0.5531 | R@1=0.2428  R@5=0.4584  R@10=0.5582  R@50=0.7630




Epoch 32: loss=0.5468 | R@1=0.2452  R@5=0.4564  R@10=0.5594  R@50=0.7666




Epoch 33: loss=0.5441 | R@1=0.2426  R@5=0.4538  R@10=0.5596  R@50=0.7672




Epoch 34: loss=0.5391 | R@1=0.2446  R@5=0.4574  R@10=0.5596  R@50=0.7652




Epoch 35: loss=0.5346 | R@1=0.2434  R@5=0.4568  R@10=0.5578  R@50=0.7652




Epoch 36: loss=0.5315 | R@1=0.2444  R@5=0.4560  R@10=0.5586  R@50=0.7650




Epoch 37: loss=0.5302 | R@1=0.2436  R@5=0.4558  R@10=0.5578  R@50=0.7648




Epoch 38: loss=0.5287 | R@1=0.2440  R@5=0.4564  R@10=0.5584  R@50=0.7642




Epoch 39: loss=0.5279 | R@1=0.2440  R@5=0.4560  R@10=0.5572  R@50=0.7646




Epoch 40: loss=0.5281 | R@1=0.2444  R@5=0.4558  R@10=0.5572  R@50=0.7646

🎯 Best R@10 on validation = 0.5616
Model saved as enhanced_hard_negative_best.pth
Loading best model from enhanced_hard_negative_best.pth for inference...
Best model loaded.

✅ submission_enhanced_hard_negative.csv saved successfully.


# Patching Transformer

In [None]:
# ------------------------------------------------------
# 1. Device and data loading
# ------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

train_data = np.load("train.npz")
test_data  = np.load("test.clean.npz")

tx_train = train_data["captions/embeddings"]
im_train = train_data["images/embeddings"]
tx_test  = test_data["captions/embeddings"]

repeat_factor = len(tx_train) // len(im_train)
im_train_expanded = np.repeat(im_train, repeat_factor, axis=0)
print(f"Train shapes: {tx_train.shape}, {im_train.shape}, {im_train_expanded.shape}")
print(f"Test shape: {tx_test.shape}")

# ------------------------------------------------------
# 2. Data preprocessing (Centering + Normalization)
# ------------------------------------------------------
tx_train_t = torch.as_tensor(tx_train, dtype=torch.float32, device=device)
im_train_t_unique = torch.as_tensor(im_train, dtype=torch.float32, device=device)
tx_test_t  = torch.as_tensor(tx_test,  dtype=torch.float32, device=device)

tx_mean = tx_train_t.mean(0, keepdim=True)
im_mean = im_train_t_unique.mean(0, keepdim=True)
tx_train_t = F.normalize(tx_train_t - tx_mean, p=2, dim=1)
im_train_t_unique = F.normalize(im_train_t_unique - im_mean, p=2, dim=1)
tx_test_t = F.normalize(tx_test_t - tx_mean, p=2, dim=1)

im_train_exp = torch.as_tensor(im_train_expanded, dtype=torch.float32, device=device)
im_train_exp = F.normalize(im_train_exp - im_mean, p=2, dim=1)
print("Data preprocessed and normalized.")

# ------------------------------------------------------
# 3. Orthogonal Procrustes base (R)
# ------------------------------------------------------
tx_centroids = tx_train_t.view(-1, 5, tx_train_t.shape[1]).mean(dim=1)
M = im_train_t_unique.T @ tx_centroids
U, S, Vh = torch.linalg.svd(M, full_matrices=False)
R = U @ Vh
print(f"Computed orthogonal base R: {R.shape}")

# ------------------------------------------------------
# 4. NEW MODEL: ResidualPatchingTransformer
# ------------------------------------------------------
class ResidualPatchingTransformer(nn.Module):
    def __init__(self, R_init,
                 input_dim=1024,
                 output_dim=1536,
                 patch_size=64,
                 embed_dim=256,
                 depth=3,
                 nhead=8,
                 dropout=0.1):
        super().__init__()
        self.register_buffer("R", R_init)

        assert input_dim % patch_size == 0, "Input dim must be divisible by patch size"
        num_patches = input_dim // patch_size # 1024 // 64 = 16 patches

        # --- 1. Patching and Embedding ---
        # We use a single Linear layer to do both patching and embedding
        self.patch_embed = nn.Linear(input_dim, embed_dim * num_patches)
        self.pos_embed = nn.Parameter(torch.randn(1, num_patches, embed_dim) * 0.02)

        self.num_patches = num_patches
        self.embed_dim = embed_dim
        self.dropout = nn.Dropout(dropout)

        # --- 2. Transformer Encoder ---
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=nhead,
            dim_feedforward=embed_dim * 4,
            dropout=dropout,
            activation='gelu',
            batch_first=True,
            norm_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=depth)

        # --- 3. Output Projection ---
        self.norm = nn.LayerNorm(embed_dim)
        self.output_proj = nn.Linear(embed_dim * num_patches, output_dim)

    def forward(self, x):
        # 1. Base (Orthogonal) Path
        base = x @ self.R.T # (B, 1536)

        # 2. Residual (Transformer) Path

        # Patch and embed
        x_res = self.patch_embed(x) # (B, 1024) -> (B, 16*256)
        x_res = x_res.view(-1, self.num_patches, self.embed_dim) # (B, 16, 256)

        # Add positional embedding
        x_res = x_res + self.pos_embed # (B, 16, 256)
        x_res = self.dropout(x_res)

        # Pass through Transformer
        x_res = self.transformer(x_res) # (B, 16, 256)
        x_res = self.norm(x_res)

        # Flatten and project to output
        x_res = x_res.flatten(start_dim=1) # (B, 16*256)
        res = self.output_proj(x_res) # (B, 1536)

        # 3. Combine and Normalize
        return F.normalize(base + res, p=2, dim=1)

# ------------------------------------------------------
# 5. Loss Functions (Unchanged)
# ------------------------------------------------------
TAU = 0.07 # Temperature for contrastive loss
LOSS_WEIGHT_CONTRASTIVE = 0.7
LOSS_WEIGHT_TRIPLET = 0.3

triplet_loss_fn = nn.TripletMarginWithDistanceLoss(
    distance_function=lambda x, y: 1 - F.cosine_similarity(x, y),
    margin=0.2
)

# ------------------------------------------------------
# 6. Validation split (Unchanged)
# ------------------------------------------------------
N = len(tx_train_t)
val_size = int(0.1 * N)
idx_cpu = torch.randperm(N, device="cpu") # Use CPU for randperm
val_idx, train_idx = idx_cpu[:val_size], idx_cpu[val_size:]

img_indices_train = (train_idx // 5).to(device)
img_indices_val = (val_idx // 5).to(device)

tx_val_t, im_val_t = tx_train_t[val_idx], im_train_exp[val_idx]
tx_train_t_sub, im_train_exp_sub = tx_train_t[train_idx], im_train_exp[train_idx]

train_dataset = TensorDataset(tx_train_t_sub, im_train_exp_sub, img_indices_train)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=0)
print(f"Train pairs: {len(train_dataset)}, Validation pairs: {len(val_idx)}")

# ------------------------------------------------------
# 7. Recall@K utility (Unchanged)
# ------------------------------------------------------
@torch.no_grad()
def recall_at_k(model, tx_queries, im_database, query_img_indices, repeat_factor=5, ks=(1, 5, 10, 50), k_csls=10):
    model.eval()
    preds_list = []
    for i in range(0, len(tx_queries), 1024):
        chunk = tx_queries[i:i+1024]
        preds_list.append(model(chunk))
    preds = torch.cat(preds_list, dim=0) # (N_queries, 1536)

    sim = preds @ im_database.T # (N_queries, N_images)

    knn_q = torch.topk(sim, k=k_csls, dim=1).values
    mean_knn_q = knn_q.mean(1, keepdim=True) # (N_queries, 1)

    knn_d = torch.topk(sim.T, k=k_csls, dim=1).values
    mean_knn_d = knn_d.mean(1, keepdim=True).T # (1, N_images)

    csls_sim = 2 * sim - mean_knn_q - mean_knn_d # (N_queries, N_images)

    gt = query_img_indices # (N_queries,)

    top_indices = torch.argsort(csls_sim, dim=1, descending=True)

    recalls = {}
    for k in ks:
        top_k_preds = top_indices[:, :k]
        correct_in_top_k = (top_k_preds == gt.unsqueeze(1)).any(dim=1)
        recall_at_k = correct_in_top_k.float().mean().item()
        recalls[f"R@{k}"] = recall_at_k

    return recalls

# ------------------------------------------------------
# 8. Training Loop (Unchanged, but with new model)
# ------------------------------------------------------
EPOCHS = 40
LR = 1e-4
SAVE_PATH = "patching_transformer_best.pth" # New save path

val_query_subset = tx_val_t[:5000]
val_indices_subset = img_indices_val[:5000]
val_db_subset = im_train_t_unique

# --- Initialize NEW model ---
model = ResidualPatchingTransformer(
    R_init=R.detach().clone(),
    patch_size=64,    # 1024 -> 16 patches
    embed_dim=256,    # Each patch becomes 256-dim
    depth=3,          # 3 Transformer layers
    nhead=8           # 8 attention heads
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
scaler = torch.cuda.amp.GradScaler() # Mixed precision

best_r10 = 0.0
print(f"\nTraining with Patching Transformer + Correct Hard-Negative Mining...\n")

for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0.0

    for x_batch, y_batch, img_indices in tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}", leave=False):
        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            y_pred = model(x_batch)

            sims_with_tau = y_pred @ y_batch.T / TAU
            labels = torch.arange(y_pred.size(0), device=device)
            loss_con = F.cross_entropy(sims_with_tau, labels)

            with torch.no_grad():
                sims_no_tau = y_pred @ y_batch.T
                positive_mask = (img_indices.unsqueeze(1) == img_indices.unsqueeze(0))
                sims_no_tau.masked_fill_(positive_mask, -float('inf'))
                hard_neg_idx = sims_no_tau.argmax(dim=1)

            y_hard_neg = y_batch[hard_neg_idx]
            loss_tri = triplet_loss_fn(y_pred, y_batch, y_hard_neg)

            loss = (LOSS_WEIGHT_CONTRASTIVE * loss_con) + (LOSS_WEIGHT_TRIPLET * loss_tri)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    scheduler.step()
    avg_loss = total_loss / len(train_loader)



 # ----- Validation -----
    rec = recall_at_k(model, val_query_subset, val_db_subset, val_indices_subset)
    print(f"Epoch {epoch:02d}: loss={avg_loss:.4f} | "
          f"R@1={rec['R@1']:.4f}  R@5={rec['R@5']:.4f}  "
          f"R@10={rec['R@10']:.4f}  R@50={rec['R@50']:.4f}")

    if rec['R@10'] > best_r10:
        best_r10 = rec['R@10']
        torch.save(model.state_dict(), SAVE_PATH)
        print(f"✅ Best model saved (epoch {epoch}, R@10={best_r10:.4f})")
print(f"\n🎯 Best R@10 on validation = {best_r10:.4f}\nModel saved as {SAVE_PATH}")


# ------------------------------------------------------
# 9. Inference + Submission
# ------------------------------------------------------
print(f"Loading best model from {SAVE_PATH} for inference...")
model = ResidualPatchingTransformer(
    R_init=R.detach().clone(),
    patch_size=64, embed_dim=256, depth=3, nhead=8
).to(device)
model.load_state_dict(torch.load(SAVE_PATH))
model.eval()
print("Best model loaded.")

with torch.no_grad():
    preds_list = []
    for i in range(0, len(tx_test_t), 1024):
        chunk = tx_test_t[i:i+1024]
        preds_list.append(model(chunk))
    preds = torch.cat(preds_list, dim=0).cpu().numpy()

test_ids = test_data["captions/ids"].astype(int)
submission = pd.DataFrame({
    "id": test_ids,
    "embedding": [list(map(float, row)) for row in preds]
})
submission.to_csv("submission_patching_transformer.csv", index=False)
print("\n✅ submission_patching_transformer.csv saved successfully.")


Using device: cuda
Train shapes: (125000, 1024), (25000, 1536), (125000, 1536)
Test shape: (1500, 1024)
Data preprocessed and normalized.


  scaler = torch.cuda.amp.GradScaler() # Mixed precision


Computed orthogonal base R: torch.Size([1536, 1024])
Train pairs: 112500, Validation pairs: 12500

Training with Patching Transformer + Correct Hard-Negative Mining...



  with torch.cuda.amp.autocast():


Epoch 01: loss=2.2878 | R@1=0.0802  R@5=0.2088  R@10=0.2994  R@50=0.5694
✅ Best model saved (epoch 1, R@10=0.2994)




Epoch 02: loss=1.8557 | R@1=0.1014  R@5=0.2558  R@10=0.3558  R@50=0.6164
✅ Best model saved (epoch 2, R@10=0.3558)




Epoch 03: loss=1.7173 | R@1=0.1112  R@5=0.2776  R@10=0.3772  R@50=0.6456
✅ Best model saved (epoch 3, R@10=0.3772)




Epoch 04: loss=1.6227 | R@1=0.1192  R@5=0.2940  R@10=0.3926  R@50=0.6594
✅ Best model saved (epoch 4, R@10=0.3926)




Epoch 05: loss=1.5503 | R@1=0.1262  R@5=0.3068  R@10=0.4102  R@50=0.6770
✅ Best model saved (epoch 5, R@10=0.4102)




Epoch 06: loss=1.4843 | R@1=0.1294  R@5=0.3174  R@10=0.4208  R@50=0.6836
✅ Best model saved (epoch 6, R@10=0.4208)




Epoch 07: loss=1.4300 | R@1=0.1380  R@5=0.3248  R@10=0.4350  R@50=0.6926
✅ Best model saved (epoch 7, R@10=0.4350)




Epoch 08: loss=1.3769 | R@1=0.1392  R@5=0.3294  R@10=0.4412  R@50=0.7040
✅ Best model saved (epoch 8, R@10=0.4412)




Epoch 09: loss=1.3276 | R@1=0.1426  R@5=0.3294  R@10=0.4394  R@50=0.7098




Epoch 10: loss=1.2791 | R@1=0.1482  R@5=0.3384  R@10=0.4460  R@50=0.7118
✅ Best model saved (epoch 10, R@10=0.4460)




Epoch 11: loss=1.2332 | R@1=0.1496  R@5=0.3402  R@10=0.4452  R@50=0.7156




Epoch 12: loss=1.1908 | R@1=0.1530  R@5=0.3384  R@10=0.4532  R@50=0.7136
✅ Best model saved (epoch 12, R@10=0.4532)




Epoch 13: loss=1.1490 | R@1=0.1484  R@5=0.3486  R@10=0.4572  R@50=0.7144
✅ Best model saved (epoch 13, R@10=0.4572)




Epoch 14: loss=1.1073 | R@1=0.1492  R@5=0.3444  R@10=0.4530  R@50=0.7164




Epoch 15: loss=1.0697 | R@1=0.1538  R@5=0.3488  R@10=0.4622  R@50=0.7100
✅ Best model saved (epoch 15, R@10=0.4622)




Epoch 16: loss=1.0320 | R@1=0.1514  R@5=0.3466  R@10=0.4578  R@50=0.7132




Epoch 17: loss=0.9992 | R@1=0.1586  R@5=0.3470  R@10=0.4564  R@50=0.7076




Epoch 18: loss=0.9693 | R@1=0.1556  R@5=0.3430  R@10=0.4544  R@50=0.7010




Epoch 19: loss=0.9385 | R@1=0.1546  R@5=0.3460  R@10=0.4520  R@50=0.7008




Epoch 20: loss=0.9106 | R@1=0.1534  R@5=0.3414  R@10=0.4516  R@50=0.7038




Epoch 21: loss=0.8867 | R@1=0.1564  R@5=0.3398  R@10=0.4464  R@50=0.6992




Epoch 22: loss=0.8628 | R@1=0.1536  R@5=0.3418  R@10=0.4470  R@50=0.7036




Epoch 23: loss=0.8406 | R@1=0.1554  R@5=0.3424  R@10=0.4426  R@50=0.6978




Epoch 24: loss=0.8215 | R@1=0.1532  R@5=0.3408  R@10=0.4490  R@50=0.6944




Epoch 25: loss=0.8037 | R@1=0.1558  R@5=0.3384  R@10=0.4488  R@50=0.6940




Epoch 26: loss=0.7872 | R@1=0.1568  R@5=0.3418  R@10=0.4448  R@50=0.6942




Epoch 27: loss=0.7745 | R@1=0.1574  R@5=0.3364  R@10=0.4476  R@50=0.6956




Epoch 28: loss=0.7593 | R@1=0.1556  R@5=0.3420  R@10=0.4416  R@50=0.6894




Epoch 29: loss=0.7486 | R@1=0.1564  R@5=0.3424  R@10=0.4394  R@50=0.6906




Epoch 30: loss=0.7385 | R@1=0.1548  R@5=0.3400  R@10=0.4440  R@50=0.6908




Epoch 31: loss=0.7281 | R@1=0.1532  R@5=0.3428  R@10=0.4424  R@50=0.6896




Epoch 32: loss=0.7210 | R@1=0.1540  R@5=0.3364  R@10=0.4426  R@50=0.6882




Epoch 33: loss=0.7153 | R@1=0.1538  R@5=0.3416  R@10=0.4394  R@50=0.6858




Epoch 34: loss=0.7080 | R@1=0.1516  R@5=0.3388  R@10=0.4400  R@50=0.6876




Epoch 35: loss=0.7048 | R@1=0.1538  R@5=0.3396  R@10=0.4398  R@50=0.6870




Epoch 36: loss=0.7007 | R@1=0.1546  R@5=0.3392  R@10=0.4404  R@50=0.6860




Epoch 37: loss=0.6981 | R@1=0.1536  R@5=0.3386  R@10=0.4396  R@50=0.6854




Epoch 38: loss=0.6946 | R@1=0.1532  R@5=0.3388  R@10=0.4400  R@50=0.6860




Epoch 39: loss=0.6947 | R@1=0.1546  R@5=0.3392  R@10=0.4404  R@50=0.6864




Epoch 40: loss=0.6928 | R@1=0.1546  R@5=0.3392  R@10=0.4402  R@50=0.6862

🎯 Best R@10 on validation = 0.4622
Model saved as patching_transformer_best.pth
Loading best model from patching_transformer_best.pth for inference...
Best model loaded.

✅ submission_patching_transformer.csv saved successfully.


# Relative Anchor Loss

In [None]:

# ------------------------------------------------------
# 1. Device and data loading
# ------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

train_data = np.load("train.npz")
test_data  = np.load("test.clean.npz")

tx_train = train_data["captions/embeddings"]
im_train = train_data["images/embeddings"]
tx_test  = test_data["captions/embeddings"]

repeat_factor = len(tx_train) // len(im_train)
im_train_expanded = np.repeat(im_train, repeat_factor, axis=0)
print(f"Train shapes: {tx_train.shape}, {im_train.shape}, {im_train_expanded.shape}")
print(f"Test shape: {tx_test.shape}")

# ------------------------------------------------------
# 2. Data preprocessing (Centering + Normalization)
# ------------------------------------------------------
tx_train_t = torch.as_tensor(tx_train, dtype=torch.float32, device=device)
im_train_t_unique = torch.as_tensor(im_train, dtype=torch.float32, device=device)
tx_test_t  = torch.as_tensor(tx_test,  dtype=torch.float32, device=device)

tx_mean = tx_train_t.mean(0, keepdim=True)
im_mean = im_train_t_unique.mean(0, keepdim=True)
tx_train_t = F.normalize(tx_train_t - tx_mean, p=2, dim=1)
im_train_t_unique = F.normalize(im_train_t_unique - im_mean, p=2, dim=1)
tx_test_t = F.normalize(tx_test_t - tx_mean, p=2, dim=1)

im_train_exp = torch.as_tensor(im_train_expanded, dtype=torch.float32, device=device)
im_train_exp = F.normalize(im_train_exp - im_mean, p=2, dim=1)
print("Data preprocessed and normalized.")

# ------------------------------------------------------
# 3. Orthogonal Procrustes base (R)
# ------------------------------------------------------
tx_centroids = tx_train_t.view(-1, 5, tx_train_t.shape[1]).mean(dim=1)
M = im_train_t_unique.T @ tx_centroids
U, S, Vh = torch.linalg.svd(M, full_matrices=False)
R = U @ Vh
print(f"Computed orthogonal base R: {R.shape}")

# ------------------------------------------------------
# 4. MODEL: EnhancedResidualTranslator
#    (Our best-performing model from the 0.5616 run)
# ------------------------------------------------------
class EnhancedResidualTranslator(nn.Module):
    def __init__(self, R_init, input_dim=1024, hidden_dim=2048, output_dim=1536, dropout=0.1):
        super().__init__()
        self.register_buffer("R", R_init)
        self.residual = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.GELU(),
            nn.LayerNorm(hidden_dim),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim),
            nn.GELU(),
            nn.LayerNorm(hidden_dim),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        base = x @ self.R.T
        res = self.residual(x)
        return F.normalize(base + res, p=2, dim=1)

# ------------------------------------------------------
# 5. Loss Functions (Hard-Negative + NEW Relative Loss)
# ------------------------------------------------------
TAU = 0.07 # Temperature for contrastive loss
LOSS_WEIGHT_PRIMARY = 1.0     # (Contrastive + Triplet)
LOSS_WEIGHT_RELATIVE = 0.5    # NEW loss term

triplet_loss_fn = nn.TripletMarginWithDistanceLoss(
    distance_function=lambda x, y: 1 - F.cosine_similarity(x, y),
    margin=0.2
)

def relative_anchor_loss(x_batch, y_pred, anchors_x, anchors_y):
    """
    Forces the relative similarities to anchors to be the same.
    x_batch: (B, 1024), y_pred: (B, 1536)
    anchors_x: (K, 1024), anchors_y: (K, 1536)
    """
    # (B, K)
    sim_x_rel = x_batch @ anchors_x.T
    sim_y_rel = y_pred @ anchors_y.T

    # Force the two similarity vectors to be as close as possible
    return F.mse_loss(sim_x_rel, sim_y_rel)

# ------------------------------------------------------
# 6. Validation split & Anchor Selection
# ------------------------------------------------------
N = len(tx_train_t)
val_size = int(0.1 * N)
idx_cpu = torch.randperm(N, device="cpu") # Use CPU for randperm
val_idx, train_idx = idx_cpu[:val_size], idx_cpu[val_size:]

img_indices_train = (train_idx // 5).to(device)
img_indices_val = (val_idx // 5).to(device)

tx_val_t, im_val_t = tx_train_t[val_idx], im_train_exp[val_idx]
tx_train_t_sub, im_train_exp_sub = tx_train_t[train_idx], im_train_exp[train_idx]

train_dataset = TensorDataset(tx_train_t_sub, im_train_exp_sub, img_indices_train)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=0)
print(f"Train pairs: {len(train_dataset)}, Validation pairs: {len(val_idx)}")

# --- Create Anchors for the Relative Loss ---
# We use a fixed set of 1024 anchors from the validation set
NUM_ANCHORS = 1024
anchor_indices = torch.randperm(len(val_idx), device="cpu")[:NUM_ANCHORS]
anchors_x = tx_val_t[anchor_indices].detach() # (K, 1024)
anchors_y = im_val_t[anchor_indices].detach() # (K, 1536)
print(f"Created {NUM_ANCHORS} anchors for relative loss.")

# ------------------------------------------------------
# 7. Recall@K utility (Unchanged)
# ------------------------------------------------------
@torch.no_grad()
def recall_at_k(model, tx_queries, im_database, query_img_indices, repeat_factor=5, ks=(1, 5, 10, 50), k_csls=10):
    model.eval()
    preds_list = []
    for i in range(0, len(tx_queries), 1024):
        chunk = tx_queries[i:i+1024]
        preds_list.append(model(chunk))
    preds = torch.cat(preds_list, dim=0) # (N_queries, 1536)

    sim = preds @ im_database.T # (N_queries, N_images)

    knn_q = torch.topk(sim, k=k_csls, dim=1).values
    mean_knn_q = knn_q.mean(1, keepdim=True) # (N_queries, 1)

    knn_d = torch.topk(sim.T, k=k_csls, dim=1).values
    mean_knn_d = knn_d.mean(1, keepdim=True).T # (1, N_images)

    csls_sim = 2 * sim - mean_knn_q - mean_knn_d # (N_queries, N_images)

    gt = query_img_indices # (N_queries,)

    top_indices = torch.argsort(csls_sim, dim=1, descending=True)

    recalls = {}
    for k in ks:
        top_k_preds = top_indices[:, :k]
        correct_in_top_k = (top_k_preds == gt.unsqueeze(1)).any(dim=1)
        recall_at_k = correct_in_top_k.float().mean().item()
        recalls[f"R@{k}"] = recall_at_k

    return recalls

# ------------------------------------------------------
# 8. Training Loop (with NEW loss)
# ------------------------------------------------------
EPOCHS = 50 # Train for longer
LR = 1e-4
SAVE_PATH = "relative_anchor_best.pth" # New save path

val_query_subset = tx_val_t[:5000]
val_indices_subset = img_indices_val[:5000]
val_db_subset = im_train_t_unique

# --- Initialize NEW model ---
model = EnhancedResidualTranslator(R.detach().clone(), hidden_dim=2048).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-5)
# Scheduler now runs for 50 epochs
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
scaler = torch.cuda.amp.GradScaler() # Mixed precision

best_r10 = 0.0
print(f"\nTraining with Enhanced Model + Relative Anchor Loss...\n")

for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0.0

    for x_batch, y_batch, img_indices in tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}", leave=False):
        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            y_pred = model(x_batch)

            # --- 1. Primary Loss (Hard-Negative) ---
            sims_with_tau = y_pred @ y_batch.T / TAU
            labels = torch.arange(y_pred.size(0), device=device)
            loss_con = F.cross_entropy(sims_with_tau, labels)

            with torch.no_grad():
                sims_no_tau = y_pred @ y_batch.T
                positive_mask = (img_indices.unsqueeze(1) == img_indices.unsqueeze(0))
                sims_no_tau.masked_fill_(positive_mask, -float('inf'))
                hard_neg_idx = sims_no_tau.argmax(dim=1)

            y_hard_neg = y_batch[hard_neg_idx]
            loss_tri = triplet_loss_fn(y_pred, y_batch, y_hard_neg)

            loss_primary = (0.7 * loss_con) + (0.3 * loss_tri)

            # --- 2. NEW Relative Anchor Loss ---
            loss_rel = relative_anchor_loss(x_batch, y_pred, anchors_x, anchors_y)

            # --- 3. Combine ---
            loss = (LOSS_WEIGHT_PRIMARY * loss_primary) + (LOSS_WEIGHT_RELATIVE * loss_rel)

        scaler.scale(loss).backward()
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    scheduler.step()
    avg_loss = total_loss / len(train_loader)

    # ----- Validation -----
    rec = recall_at_k(model, val_query_subset, val_db_subset, val_indices_subset)
    print(f"Epoch {epoch:02d}: loss={avg_loss:.4f} | "
          f"R@1={rec['R@1']:.4f}  R@5={rec['R@5']:.4f}  "
          f"R@10={rec['R@10']:.4f}  R@50={rec['R@50']:.4f}")

    if rec['R@10'] > best_r10:
        best_r10 = rec['R@10']
        torch.save(model.state_dict(), SAVE_PATH)
        print(f"✅ Best model saved (epoch {epoch}, R@10={best_r10:.4f})")

print(f"\n🎯 Best R@10 on validation = {best_r10:.4f}\nModel saved as {SAVE_PATH}")

# ------------------------------------------------------
# 9. Inference + Submission
# ------------------------------------------------------
print(f"Loading best model from {SAVE_PATH} for inference...")
model = EnhancedResidualTranslator(R.detach().clone(), hidden_dim=2048).to(device)
model.load_state_dict(torch.load(SAVE_PATH))
model.eval()
print("Best model loaded.")

with torch.no_grad():
    preds_list = []
    for i in range(0, len(tx_test_t), 1024):
        chunk = tx_test_t[i:i+1024]
        preds_list.append(model(chunk))
    preds = torch.cat(preds_list, dim=0).cpu().numpy()

test_ids = test_data["captions/ids"].astype(int)
submission = pd.DataFrame({
    "id": test_ids,
    "embedding": [list(map(float, row)) for row in preds]
})
submission.to_csv("submission_relative_anchor_loss.csv", index=False)
print("\n✅ submission_relative_anchor_loss.csv saved successfully.")


Using device: cuda
Train shapes: (125000, 1024), (25000, 1536), (125000, 1536)
Test shape: (1500, 1024)
Data preprocessed and normalized.


  scaler = torch.cuda.amp.GradScaler() # Mixed precision


Computed orthogonal base R: torch.Size([1536, 1024])
Train pairs: 112500, Validation pairs: 12500
Created 1024 anchors for relative loss.

Training with Enhanced Model + Relative Anchor Loss...



  with torch.cuda.amp.autocast():


Epoch 01: loss=2.5054 | R@1=0.0822  R@5=0.2190  R@10=0.2984  R@50=0.5540
✅ Best model saved (epoch 1, R@10=0.2984)




Epoch 02: loss=1.9838 | R@1=0.1040  R@5=0.2552  R@10=0.3498  R@50=0.6086
✅ Best model saved (epoch 2, R@10=0.3498)




Epoch 03: loss=1.8360 | R@1=0.1202  R@5=0.2782  R@10=0.3796  R@50=0.6468
✅ Best model saved (epoch 3, R@10=0.3796)




Epoch 04: loss=1.7429 | R@1=0.1300  R@5=0.2996  R@10=0.4024  R@50=0.6642
✅ Best model saved (epoch 4, R@10=0.4024)




Epoch 05: loss=1.6715 | R@1=0.1378  R@5=0.3082  R@10=0.4122  R@50=0.6800
✅ Best model saved (epoch 5, R@10=0.4122)




Epoch 06: loss=1.6152 | R@1=0.1468  R@5=0.3184  R@10=0.4238  R@50=0.6944
✅ Best model saved (epoch 6, R@10=0.4238)




Epoch 07: loss=1.5667 | R@1=0.1490  R@5=0.3306  R@10=0.4348  R@50=0.7044
✅ Best model saved (epoch 7, R@10=0.4348)




Epoch 08: loss=1.5256 | R@1=0.1552  R@5=0.3416  R@10=0.4460  R@50=0.7118
✅ Best model saved (epoch 8, R@10=0.4460)




Epoch 09: loss=1.4877 | R@1=0.1552  R@5=0.3474  R@10=0.4550  R@50=0.7188
✅ Best model saved (epoch 9, R@10=0.4550)




Epoch 10: loss=1.4524 | R@1=0.1590  R@5=0.3522  R@10=0.4626  R@50=0.7240
✅ Best model saved (epoch 10, R@10=0.4626)




Epoch 11: loss=1.4316 | R@1=0.1624  R@5=0.3594  R@10=0.4628  R@50=0.7264
✅ Best model saved (epoch 11, R@10=0.4628)




Epoch 12: loss=1.4151 | R@1=0.1638  R@5=0.3628  R@10=0.4716  R@50=0.7280
✅ Best model saved (epoch 12, R@10=0.4716)




Epoch 13: loss=1.3997 | R@1=0.1658  R@5=0.3616  R@10=0.4738  R@50=0.7310
✅ Best model saved (epoch 13, R@10=0.4738)




Epoch 14: loss=1.3855 | R@1=0.1682  R@5=0.3650  R@10=0.4760  R@50=0.7336
✅ Best model saved (epoch 14, R@10=0.4760)




Epoch 15: loss=1.3715 | R@1=0.1726  R@5=0.3706  R@10=0.4788  R@50=0.7342
✅ Best model saved (epoch 15, R@10=0.4788)




Epoch 16: loss=1.3569 | R@1=0.1722  R@5=0.3724  R@10=0.4852  R@50=0.7374
✅ Best model saved (epoch 16, R@10=0.4852)




Epoch 17: loss=1.3429 | R@1=0.1764  R@5=0.3760  R@10=0.4870  R@50=0.7390
✅ Best model saved (epoch 17, R@10=0.4870)




Epoch 18: loss=1.3315 | R@1=0.1784  R@5=0.3782  R@10=0.4868  R@50=0.7390




Epoch 19: loss=1.3178 | R@1=0.1796  R@5=0.3774  R@10=0.4880  R@50=0.7450
✅ Best model saved (epoch 19, R@10=0.4880)




Epoch 20: loss=1.3098 | R@1=0.1808  R@5=0.3796  R@10=0.4904  R@50=0.7436
✅ Best model saved (epoch 20, R@10=0.4904)




Epoch 21: loss=1.3035 | R@1=0.1794  R@5=0.3790  R@10=0.4918  R@50=0.7446
✅ Best model saved (epoch 21, R@10=0.4918)




Epoch 22: loss=1.2975 | R@1=0.1806  R@5=0.3786  R@10=0.4926  R@50=0.7444
✅ Best model saved (epoch 22, R@10=0.4926)




Epoch 23: loss=1.2918 | R@1=0.1810  R@5=0.3810  R@10=0.4930  R@50=0.7440
✅ Best model saved (epoch 23, R@10=0.4930)




Epoch 24: loss=1.2883 | R@1=0.1836  R@5=0.3832  R@10=0.4932  R@50=0.7452
✅ Best model saved (epoch 24, R@10=0.4932)




Epoch 25: loss=1.2834 | R@1=0.1828  R@5=0.3836  R@10=0.4944  R@50=0.7472
✅ Best model saved (epoch 25, R@10=0.4944)




Epoch 26: loss=1.2791 | R@1=0.1848  R@5=0.3844  R@10=0.4944  R@50=0.7474




Epoch 27: loss=1.2742 | R@1=0.1840  R@5=0.3848  R@10=0.4964  R@50=0.7476
✅ Best model saved (epoch 27, R@10=0.4964)




Epoch 28: loss=1.2690 | R@1=0.1852  R@5=0.3864  R@10=0.4958  R@50=0.7464




Epoch 29: loss=1.2662 | R@1=0.1864  R@5=0.3874  R@10=0.4964  R@50=0.7482




Epoch 30: loss=1.2641 | R@1=0.1864  R@5=0.3888  R@10=0.4968  R@50=0.7480
✅ Best model saved (epoch 30, R@10=0.4968)




Epoch 31: loss=1.2628 | R@1=0.1858  R@5=0.3880  R@10=0.4968  R@50=0.7482




Epoch 32: loss=1.2605 | R@1=0.1862  R@5=0.3882  R@10=0.4964  R@50=0.7462




Epoch 33: loss=1.2586 | R@1=0.1866  R@5=0.3878  R@10=0.4976  R@50=0.7478
✅ Best model saved (epoch 33, R@10=0.4976)




Epoch 34: loss=1.2591 | R@1=0.1868  R@5=0.3892  R@10=0.4990  R@50=0.7488
✅ Best model saved (epoch 34, R@10=0.4990)




Epoch 35: loss=1.2564 | R@1=0.1882  R@5=0.3886  R@10=0.4982  R@50=0.7490




Epoch 36: loss=1.2549 | R@1=0.1870  R@5=0.3888  R@10=0.4984  R@50=0.7494




Epoch 37: loss=1.2538 | R@1=0.1870  R@5=0.3882  R@10=0.4978  R@50=0.7494




Epoch 38: loss=1.2540 | R@1=0.1878  R@5=0.3880  R@10=0.4984  R@50=0.7496




Epoch 39: loss=1.2525 | R@1=0.1874  R@5=0.3882  R@10=0.4990  R@50=0.7492




Epoch 40: loss=1.2535 | R@1=0.1882  R@5=0.3888  R@10=0.4986  R@50=0.7502




Epoch 41: loss=1.2516 | R@1=0.1872  R@5=0.3886  R@10=0.4990  R@50=0.7500




Epoch 42: loss=1.2519 | R@1=0.1884  R@5=0.3890  R@10=0.4986  R@50=0.7498




Epoch 43: loss=1.2501 | R@1=0.1882  R@5=0.3892  R@10=0.4984  R@50=0.7500




Epoch 44: loss=1.2504 | R@1=0.1882  R@5=0.3890  R@10=0.4996  R@50=0.7502
✅ Best model saved (epoch 44, R@10=0.4996)




Epoch 45: loss=1.2510 | R@1=0.1884  R@5=0.3896  R@10=0.4998  R@50=0.7502
✅ Best model saved (epoch 45, R@10=0.4998)




Epoch 46: loss=1.2500 | R@1=0.1880  R@5=0.3894  R@10=0.4996  R@50=0.7502




Epoch 47: loss=1.2499 | R@1=0.1880  R@5=0.3892  R@10=0.4998  R@50=0.7502




Epoch 48: loss=1.2504 | R@1=0.1880  R@5=0.3892  R@10=0.4996  R@50=0.7504




Epoch 49: loss=1.2497 | R@1=0.1878  R@5=0.3892  R@10=0.4996  R@50=0.7504




Epoch 50: loss=1.2493 | R@1=0.1878  R@5=0.3892  R@10=0.4998  R@50=0.7504

🎯 Best R@10 on validation = 0.4998
Model saved as relative_anchor_best.pth
Loading best model from relative_anchor_best.pth for inference...
Best model loaded.

✅ submission_relative_anchor_loss.csv saved successfully.


#  Lightweight Transformer


In [None]:

# ------------------------------------------------------
# 1. Device and data loading
# ------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

train_data = np.load("train.npz")
test_data  = np.load("test.clean.npz")

tx_train = train_data["captions/embeddings"]
im_train = train_data["images/embeddings"]
tx_test  = test_data["captions/embeddings"]

repeat_factor = len(tx_train) // len(im_train)
im_train_expanded = np.repeat(im_train, repeat_factor, axis=0)
print(f"Train shapes: {tx_train.shape}, {im_train.shape}, {im_train_expanded.shape}")
print(f"Test shape: {tx_test.shape}")

# ------------------------------------------------------
# 2. Data preprocessing (Centering + Normalization)
# ------------------------------------------------------
tx_train_t = torch.as_tensor(tx_train, dtype=torch.float32, device=device)
im_train_t_unique = torch.as_tensor(im_train, dtype=torch.float32, device=device)
tx_test_t  = torch.as_tensor(tx_test,  dtype=torch.float32, device=device)

tx_mean = tx_train_t.mean(0, keepdim=True)
im_mean = im_train_t_unique.mean(0, keepdim=True)
tx_train_t = F.normalize(tx_train_t - tx_mean, p=2, dim=1)
im_train_t_unique = F.normalize(im_train_t_unique - im_mean, p=2, dim=1)
tx_test_t = F.normalize(tx_test_t - tx_mean, p=2, dim=1)

im_train_exp = torch.as_tensor(im_train_expanded, dtype=torch.float32, device=device)
im_train_exp = F.normalize(im_train_exp - im_mean, p=2, dim=1)
print("Data preprocessed and normalized.")

# ------------------------------------------------------
# 3. Orthogonal Procrustes base (R)
# ------------------------------------------------------
tx_centroids = tx_train_t.view(-1, 5, tx_train_t.shape[1]).mean(dim=1)
M = im_train_t_unique.T @ tx_centroids
U, S, Vh = torch.linalg.svd(M, full_matrices=False)
R = U @ Vh
print(f"Computed orthogonal base R: {R.shape}")

# ------------------------------------------------------
# 4. NEW MODEL: LightweightRegularizedTransformer
# ------------------------------------------------------
class LightweightRegularizedTransformer(nn.Module):
    def __init__(self, R_init,
                 input_dim=1024,
                 output_dim=1536,
                 patch_size=64,
                 embed_dim=256,
                 depth=2,  # SHALLOW: Only 2 layers
                 nhead=4,  # FEWER HEADS: 4 instead of 8
                 dropout=0.2): # HIGHER DROPOUT
        super().__init__()
        self.register_buffer("R", R_init)

        assert input_dim % patch_size == 0, "Input dim must be divisible by patch size"
        num_patches = input_dim // patch_size # 1024 // 64 = 16 patches

        # --- 1. Patching and Embedding ---
        self.patch_embed = nn.Linear(input_dim, embed_dim * num_patches)
        self.pos_embed = nn.Parameter(torch.randn(1, num_patches, embed_dim) * 0.02)

        self.num_patches = num_patches
        self.embed_dim = embed_dim
        self.dropout = nn.Dropout(dropout)

        # --- 2. Transformer Encoder ---
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=nhead,
            dim_feedforward=embed_dim * 2, # Smaller FFN
            dropout=dropout,
            activation='gelu',
            batch_first=True,
            norm_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=depth)

        # --- 3. Output Projection ---
        self.norm = nn.LayerNorm(embed_dim)
        self.output_proj = nn.Linear(embed_dim * num_patches, output_dim)

    def forward(self, x):
        # 1. Base (Orthogonal) Path
        base = x @ self.R.T # (B, 1536)

        # 2. Residual (Transformer) Path
        x_res = self.patch_embed(x)
        x_res = x_res.view(-1, self.num_patches, self.embed_dim)
        x_res = x_res + self.pos_embed
        x_res = self.dropout(x_res)
        x_res = self.transformer(x_res)
        x_res = self.norm(x_res)
        x_res = x_res.flatten(start_dim=1)
        res = self.output_proj(x_res)

        # 3. Combine and Normalize
        return F.normalize(base + res, p=2, dim=1)

# ------------------------------------------------------
# 5. Loss Functions (Our proven Hard-Negative setup)
# ------------------------------------------------------
TAU = 0.07 # Temperature for contrastive loss
LOSS_WEIGHT_CONTRASTIVE = 0.7
LOSS_WEIGHT_TRIPLET = 0.3

triplet_loss_fn = nn.TripletMarginWithDistanceLoss(
    distance_function=lambda x, y: 1 - F.cosine_similarity(x, y),
    margin=0.2
)

# ------------------------------------------------------
# 6. Validation split (Unchanged)
# ------------------------------------------------------
N = len(tx_train_t)
val_size = int(0.1 * N)
idx_cpu = torch.randperm(N, device="cpu")
val_idx, train_idx = idx_cpu[:val_size], idx_cpu[val_size:]

img_indices_train = (train_idx // 5).to(device)
img_indices_val = (val_idx // 5).to(device)

tx_val_t, im_val_t = tx_train_t[val_idx], im_train_exp[val_idx]
tx_train_t_sub, im_train_exp_sub = tx_train_t[train_idx], im_train_exp[train_idx]

train_dataset = TensorDataset(tx_train_t_sub, im_train_exp_sub, img_indices_train)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=0)
print(f"Train pairs: {len(train_dataset)}, Validation pairs: {len(val_idx)}")

# ------------------------------------------------------
# 7. Recall@K utility (Unchanged)
# ------------------------------------------------------
@torch.no_grad()
def recall_at_k(model, tx_queries, im_database, query_img_indices, repeat_factor=5, ks=(1, 5, 10, 50), k_csls=10):
    model.eval()
    preds_list = []
    for i in range(0, len(tx_queries), 1024):
        chunk = tx_queries[i:i+1024]
        preds_list.append(model(chunk))
    preds = torch.cat(preds_list, dim=0)

    sim = preds @ im_database.T

    knn_q = torch.topk(sim, k=k_csls, dim=1).values
    mean_knn_q = knn_q.mean(1, keepdim=True)

    knn_d = torch.topk(sim.T, k=k_csls, dim=1).values
    mean_knn_d = knn_d.mean(1, keepdim=True).T

    csls_sim = 2 * sim - mean_knn_q - mean_knn_d

    gt = query_img_indices

    top_indices = torch.argsort(csls_sim, dim=1, descending=True)

    recalls = {}
    for k in ks:
        top_k_preds = top_indices[:, :k]
        correct_in_top_k = (top_k_preds == gt.unsqueeze(1)).any(dim=1)
        recall_at_k = correct_in_top_k.float().mean().item()
        recalls[f"R@{k}"] = recall_at_k

    return recalls

# ------------------------------------------------------
# 8. Training Loop (with NEW model and HIGHER weight_decay)
# ------------------------------------------------------
EPOCHS = 40
LR = 1e-4
WEIGHT_DECAY = 1e-4 # STRONGER REGULARIZATION
SAVE_PATH = "lightweight_transformer_best.pth" # New save path

val_query_subset = tx_val_t[:5000]
val_indices_subset = img_indices_val[:5000]
val_db_subset = im_train_t_unique

# --- Initialize NEW model ---
model = LightweightRegularizedTransformer(
    R_init=R.detach().clone()
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
scaler = torch.cuda.amp.GradScaler() # Mixed precision

best_r10 = 0.0
print(f"\nTraining with Lightweight Transformer (Regularized)...\n")

for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0.0

    for x_batch, y_batch, img_indices in tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}", leave=False):
        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            y_pred = model(x_batch)

            sims_with_tau = y_pred @ y_batch.T / TAU
            labels = torch.arange(y_pred.size(0), device=device)
            loss_con = F.cross_entropy(sims_with_tau, labels)

            with torch.no_grad():
                sims_no_tau = y_pred @ y_batch.T
                positive_mask = (img_indices.unsqueeze(1) == img_indices.unsqueeze(0))
                sims_no_tau.masked_fill_(positive_mask, -float('inf'))
                hard_neg_idx = sims_no_tau.argmax(dim=1)

            y_hard_neg = y_batch[hard_neg_idx]
            loss_tri = triplet_loss_fn(y_pred, y_batch, y_hard_neg)

            loss = (LOSS_WEIGHT_CONTRASTIVE * loss_con) + (LOSS_WEIGHT_TRIPLET * loss_tri)

        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # Gradient Clipping
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    scheduler.step()
    avg_loss = total_loss / len(train_loader)

    # ----- Validation -----
    rec = recall_at_k(model, val_query_subset, val_db_subset, val_indices_subset)
    print(f"Epoch {epoch:02d}: loss={avg_loss:.4f} | "
          f"R@1={rec['R@1']:.4f}  R@5={rec['R@5']:.4f}  "
          f"R@10={rec['R@10']:.4f}  R@50={rec['R@50']:.4f}")

    if rec['R@10'] > best_r10:
        best_r10 = rec['R@10']
        torch.save(model.state_dict(), SAVE_PATH)
        print(f"✅ Best model saved (epoch {epoch}, R@10={best_r10:.4f})")

print(f"\n🎯 Best R@10 on validation = {best_r10:.4f}\nModel saved as {SAVE_PATH}")

# ------------------------------------------------------
# 9. Inference + Submission
# ------------------------------------------------------
print(f"Loading best model from {SAVE_PATH} for inference...")
model = LightweightRegularizedTransformer(
    R_init=R.detach().clone()
).to(device)
model.load_state_dict(torch.load(SAVE_PATH))
model.eval()
print("Best model loaded.")

with torch.no_grad():
    preds_list = []
    for i in range(0, len(tx_test_t), 1024):
        chunk = tx_test_t[i:i+1024]
        preds_list.append(model(chunk))
    preds = torch.cat(preds_list, dim=0).cpu().numpy()

test_ids = test_data["captions/ids"].astype(int)
submission = pd.DataFrame({
    "id": test_ids,
    "embedding": [list(map(float, row)) for row in preds]
})
submission.to_csv("submission_lightweight_transformer.csv", index=False)
print("\n✅ submission_lightweight_transformer.csv saved successfully.")

Using device: cuda
Train shapes: (125000, 1024), (25000, 1536), (125000, 1536)
Test shape: (1500, 1024)
Data preprocessed and normalized.


  scaler = torch.cuda.amp.GradScaler() # Mixed precision


Computed orthogonal base R: torch.Size([1536, 1024])
Train pairs: 112500, Validation pairs: 12500

Training with Lightweight Transformer (Regularized)...



  with torch.cuda.amp.autocast():


Epoch 01: loss=2.8066 | R@1=0.0436  R@5=0.1308  R@10=0.1994  R@50=0.4430
✅ Best model saved (epoch 1, R@10=0.1994)




Epoch 02: loss=2.2459 | R@1=0.0628  R@5=0.1722  R@10=0.2524  R@50=0.5132
✅ Best model saved (epoch 2, R@10=0.2524)




Epoch 03: loss=2.0901 | R@1=0.0742  R@5=0.1990  R@10=0.2838  R@50=0.5454
✅ Best model saved (epoch 3, R@10=0.2838)




Epoch 04: loss=1.9960 | R@1=0.0816  R@5=0.2178  R@10=0.3022  R@50=0.5664
✅ Best model saved (epoch 4, R@10=0.3022)




Epoch 05: loss=1.9285 | R@1=0.0862  R@5=0.2332  R@10=0.3224  R@50=0.5892
✅ Best model saved (epoch 5, R@10=0.3224)




Epoch 06: loss=1.8742 | R@1=0.0914  R@5=0.2388  R@10=0.3294  R@50=0.6040
✅ Best model saved (epoch 6, R@10=0.3294)




Epoch 07: loss=1.8277 | R@1=0.0952  R@5=0.2460  R@10=0.3378  R@50=0.6148
✅ Best model saved (epoch 7, R@10=0.3378)




Epoch 08: loss=1.7932 | R@1=0.0968  R@5=0.2532  R@10=0.3472  R@50=0.6276
✅ Best model saved (epoch 8, R@10=0.3472)




Epoch 09: loss=1.7592 | R@1=0.0982  R@5=0.2590  R@10=0.3516  R@50=0.6340
✅ Best model saved (epoch 9, R@10=0.3516)




Epoch 10: loss=1.7292 | R@1=0.0994  R@5=0.2614  R@10=0.3586  R@50=0.6396
✅ Best model saved (epoch 10, R@10=0.3586)




Epoch 11: loss=1.7155 | R@1=0.1028  R@5=0.2672  R@10=0.3628  R@50=0.6434
✅ Best model saved (epoch 11, R@10=0.3628)




Epoch 12: loss=1.7037 | R@1=0.1046  R@5=0.2688  R@10=0.3628  R@50=0.6490




Epoch 13: loss=1.6904 | R@1=0.1022  R@5=0.2690  R@10=0.3686  R@50=0.6510
✅ Best model saved (epoch 13, R@10=0.3686)




Epoch 14: loss=1.6787 | R@1=0.1058  R@5=0.2718  R@10=0.3700  R@50=0.6504
✅ Best model saved (epoch 14, R@10=0.3700)




Epoch 15: loss=1.6672 | R@1=0.1040  R@5=0.2778  R@10=0.3742  R@50=0.6548
✅ Best model saved (epoch 15, R@10=0.3742)




Epoch 16: loss=1.6582 | R@1=0.1088  R@5=0.2762  R@10=0.3748  R@50=0.6564
✅ Best model saved (epoch 16, R@10=0.3748)




Epoch 17: loss=1.6472 | R@1=0.1068  R@5=0.2782  R@10=0.3746  R@50=0.6578




Epoch 18: loss=1.6387 | R@1=0.1112  R@5=0.2780  R@10=0.3788  R@50=0.6604
✅ Best model saved (epoch 18, R@10=0.3788)




Epoch 19: loss=1.6309 | R@1=0.1092  R@5=0.2806  R@10=0.3824  R@50=0.6626
✅ Best model saved (epoch 19, R@10=0.3824)




Epoch 20: loss=1.6247 | R@1=0.1082  R@5=0.2814  R@10=0.3798  R@50=0.6656




Epoch 21: loss=1.6210 | R@1=0.1110  R@5=0.2782  R@10=0.3816  R@50=0.6648




Epoch 22: loss=1.6188 | R@1=0.1104  R@5=0.2800  R@10=0.3838  R@50=0.6660
✅ Best model saved (epoch 22, R@10=0.3838)




Epoch 23: loss=1.6128 | R@1=0.1100  R@5=0.2820  R@10=0.3830  R@50=0.6636




Epoch 24: loss=1.6102 | R@1=0.1104  R@5=0.2814  R@10=0.3856  R@50=0.6668
✅ Best model saved (epoch 24, R@10=0.3856)




Epoch 25: loss=1.6084 | R@1=0.1100  R@5=0.2818  R@10=0.3864  R@50=0.6670
✅ Best model saved (epoch 25, R@10=0.3864)




Epoch 26: loss=1.6059 | R@1=0.1106  R@5=0.2822  R@10=0.3860  R@50=0.6678




Epoch 27: loss=1.6054 | R@1=0.1108  R@5=0.2852  R@10=0.3896  R@50=0.6666
✅ Best model saved (epoch 27, R@10=0.3896)




Epoch 28: loss=1.6004 | R@1=0.1100  R@5=0.2828  R@10=0.3886  R@50=0.6668




Epoch 29: loss=1.5995 | R@1=0.1098  R@5=0.2830  R@10=0.3880  R@50=0.6686




Epoch 30: loss=1.5981 | R@1=0.1106  R@5=0.2840  R@10=0.3896  R@50=0.6680




Epoch 31: loss=1.5975 | R@1=0.1100  R@5=0.2840  R@10=0.3886  R@50=0.6682




Epoch 32: loss=1.5980 | R@1=0.1104  R@5=0.2848  R@10=0.3900  R@50=0.6676
✅ Best model saved (epoch 32, R@10=0.3900)




Epoch 33: loss=1.5972 | R@1=0.1104  R@5=0.2848  R@10=0.3898  R@50=0.6680




Epoch 34: loss=1.5972 | R@1=0.1106  R@5=0.2836  R@10=0.3900  R@50=0.6680




Epoch 35: loss=1.5967 | R@1=0.1100  R@5=0.2842  R@10=0.3890  R@50=0.6682




Epoch 36: loss=1.5962 | R@1=0.1106  R@5=0.2846  R@10=0.3898  R@50=0.6680




Epoch 37: loss=1.5952 | R@1=0.1104  R@5=0.2846  R@10=0.3900  R@50=0.6676




Epoch 38: loss=1.5963 | R@1=0.1104  R@5=0.2852  R@10=0.3902  R@50=0.6678
✅ Best model saved (epoch 38, R@10=0.3902)




Epoch 39: loss=1.5944 | R@1=0.1104  R@5=0.2854  R@10=0.3898  R@50=0.6678




Epoch 40: loss=1.5952 | R@1=0.1104  R@5=0.2852  R@10=0.3898  R@50=0.6678

🎯 Best R@10 on validation = 0.3902
Model saved as lightweight_transformer_best.pth
Loading best model from lightweight_transformer_best.pth for inference...
Best model loaded.

✅ submission_lightweight_transformer.csv saved successfully.


In [None]:
Epoch 50: loss=1.2493 | R@1=0.1878  R@5=0.3892  R@10=0.4998  R@50=0.7504

# Pure Contrasitive Loss

#

In [None]:
# ======================================================
#  AML Challenge — Residual-Orthogonal Translator v5
#  Fine-tuned version of pure-contrastive model
#  Adds: 0.1 Triplet term + tau=0.065 + CSLS inference
# ======================================================

import torch, torch.nn as nn, torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import numpy as np, pandas as pd
import os, math
from google.colab import drive

# ------------------------------------------------------
# 0. Setup Google Drive
# ------------------------------------------------------
print("Mounting Google Drive...")
drive.mount('/content/drive', force_remount=True)

BASE_DIR = "/content/drive/MyDrive/AML Challenge"
os.makedirs(BASE_DIR, exist_ok=True)
os.chdir(BASE_DIR)
print(f"Current working directory: {os.getcwd()}")

# ------------------------------------------------------
# 1. Device + Data Loading
# ------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

train_data = np.load("train.npz")
test_data  = np.load("test.clean.npz")

tx_train = train_data["captions/embeddings"]
im_train = train_data["images/embeddings"]
tx_test  = test_data["captions/embeddings"]

repeat_factor = len(tx_train) // len(im_train)
im_train_expanded = np.repeat(im_train, repeat_factor, axis=0)

# ------------------------------------------------------
# 2. Preprocessing
# ------------------------------------------------------
tx_train_t = torch.as_tensor(tx_train, dtype=torch.float32, device=device)
im_train_t_unique = torch.as_tensor(im_train, dtype=torch.float32, device=device)
tx_test_t  = torch.as_tensor(tx_test, dtype=torch.float32, device=device)

tx_mean = tx_train_t.mean(0, keepdim=True)
im_mean = im_train_t_unique.mean(0, keepdim=True)

tx_train_t = F.normalize(tx_train_t - tx_mean, p=2, dim=1)
im_train_t_unique = F.normalize(im_train_t_unique - im_mean, p=2, dim=1)
tx_test_t = F.normalize(tx_test_t - tx_mean, p=2, dim=1)

im_train_exp = torch.as_tensor(im_train_expanded, dtype=torch.float32, device=device)
im_train_exp = F.normalize(im_train_exp - im_mean, p=2, dim=1)

print("Data preprocessed and normalized.")

# ------------------------------------------------------
# 3. Orthogonal Procrustes base
# ------------------------------------------------------
tx_centroids = tx_train_t.view(-1, 5, tx_train_t.shape[1]).mean(dim=1)
M = im_train_t_unique.T @ tx_centroids
U, S, Vh = torch.linalg.svd(M, full_matrices=False)
R = U @ Vh
print(f"Computed orthogonal base R: {R.shape}")

# ------------------------------------------------------
# 4. Model
# ------------------------------------------------------
class ResidualTranslator(nn.Module):
    def __init__(self, R_init, input_dim=1024, hidden_dim=1024, output_dim=1536):
        super().__init__()
        self.register_buffer("R", R_init)
        self.residual = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, output_dim),
        )
    def forward(self, x):
        base = x @ self.R.T
        res  = self.residual(x)
        return F.normalize(base + res, p=2, dim=1)

# ------------------------------------------------------
# 5. Loss functions
# ------------------------------------------------------
TAU = 0.065

def pure_contrastive_loss(y_pred, y_batch, img_indices):
    sims = y_pred @ y_batch.T / TAU
    positive_mask = (img_indices.unsqueeze(1) == img_indices.unsqueeze(0)).float()
    log_probs = F.log_softmax(sims, dim=1) * positive_mask
    num_pos = torch.clamp(positive_mask.sum(1), min=1.0)
    loss = -log_probs.sum(1) / num_pos
    return loss.mean()

triplet_loss_fn = nn.TripletMarginWithDistanceLoss(
    distance_function=lambda x, y: 1 - F.cosine_similarity(x, y),
    margin=0.15
)

# ------------------------------------------------------
# 6. Validation split
# ------------------------------------------------------
N = len(tx_train_t)
val_size = int(0.1 * N)
idx_cpu = torch.randperm(N, device="cpu")
val_idx, train_idx = idx_cpu[:val_size], idx_cpu[val_size:]

img_indices_train = (train_idx // 5).to(device)
img_indices_val = (val_idx // 5).to(device)

tx_val_t, im_val_t = tx_train_t[val_idx], im_train_exp[val_idx]
tx_train_t_sub, im_train_exp_sub = tx_train_t[train_idx], im_train_exp[train_idx]

train_dataset = TensorDataset(tx_train_t_sub, im_train_exp_sub, img_indices_train)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=0)
print(f"Train pairs: {len(train_dataset)}, Validation pairs: {len(val_idx)}")

# ------------------------------------------------------
# 7. Recall@K
# ------------------------------------------------------
@torch.no_grad()
def recall_at_k(model, tx_queries, im_database, query_img_indices, ks=(1,5,10,50), k_csls=10):
    model.eval()
    preds = []
    for i in range(0, len(tx_queries), 1024):
        preds.append(model(tx_queries[i:i+1024].to(device)))
    preds = torch.cat(preds, dim=0)
    im_database = im_database.to(device)
    sim = preds @ im_database.T

    knn_q = torch.topk(sim, k=k_csls, dim=1).values.mean(1, keepdim=True)
    knn_d = torch.topk(sim.T, k=k_csls, dim=1).values.mean(1, keepdim=True).T
    csls_sim = 2 * sim - knn_q - knn_d

    gt = query_img_indices.to(device)
    ranks = torch.argsort(csls_sim, dim=1, descending=True)
    recalls = {}
    for k in ks:
        match = (ranks[:, :k] == gt.unsqueeze(1)).any(dim=1)
        recalls[f"R@{k}"] = match.float().mean().item()
    return recalls

# ------------------------------------------------------
# 8. Training
# ------------------------------------------------------
EPOCHS = 45
LR = 1e-4
SAVE_PATH = "residual_v5_best.pth"

val_subset_queries = tx_val_t[:5000]
val_subset_indices = img_indices_val[:5000]
val_db_subset = im_train_t_unique

model = ResidualTranslator(R.detach().clone()).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=5e-5)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
scaler = torch.cuda.amp.GradScaler()

best_r10 = 0.0
print("\nTraining Residual-Orthogonal Translator v5 (Contrastive + small Triplet)...\n")

for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0.0
    for x_batch, y_batch, img_idx in tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}", leave=False):
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            y_pred = model(x_batch)
            loss_con = pure_contrastive_loss(y_pred, y_batch, img_idx)
            # create negatives by random permutation
            neg_idx = torch.randperm(y_batch.size(0), device=device)
            y_neg = y_batch[neg_idx]
            loss_tri = triplet_loss_fn(y_pred, y_batch, y_neg)
            loss = 0.9 * loss_con + 0.1 * loss_tri
        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item()
    scheduler.step()

    avg_loss = total_loss / len(train_loader)
    rec = recall_at_k(model, val_subset_queries, val_db_subset, val_subset_indices)
    print(f"Epoch {epoch:02d}: loss={avg_loss:.4f} | "
          f"R@1={rec['R@1']:.4f} R@5={rec['R@5']:.4f} R@10={rec['R@10']:.4f} R@50={rec['R@50']:.4f}")
    if rec['R@10'] > best_r10:
        best_r10 = rec['R@10']
        torch.save(model.state_dict(), SAVE_PATH)
        print(f"✅ Best model updated (epoch {epoch}, R@10={best_r10:.4f})")

print(f"\n🎯 Best validation R@10 = {best_r10:.4f}\nModel saved as {SAVE_PATH}")

# ------------------------------------------------------
# 9. Inference + CSLS Submission
# ------------------------------------------------------
@torch.no_grad()
def apply_csls(preds, im_base, k=10):
    preds = F.normalize(torch.tensor(preds, device=device), p=2, dim=1)
    im_base = F.normalize(im_base, p=2, dim=1)
    sim = preds @ im_base.T
    knn_q = torch.topk(sim, k=k, dim=1).values.mean(1, keepdim=True)
    knn_d = torch.topk(sim, k=k, dim=0).values.mean(0, keepdim=True)
    return (2 * sim - knn_q - knn_d).cpu().numpy()

print(f"\nLoading {SAVE_PATH} for inference...")
model = ResidualTranslator(R.detach().clone()).to(device)
model.load_state_dict(torch.load(SAVE_PATH))
model.eval()
print("Best model loaded.")

with torch.no_grad():
    preds_all = []
    for i in range(0, len(tx_test_t), 1024):
        preds_all.append(model(tx_test_t[i:i+1024].to(device)))
    preds = torch.cat(preds_all, dim=0).cpu().numpy()

csls_sim = apply_csls(preds, im_train_t_unique)
test_ids = test_data["captions/ids"].astype(int)
submission = pd.DataFrame({
    "id": test_ids,
    "embedding": [list(map(float, row)) for row in preds]
})
submission.to_csv("submission_residual_v5.csv", index=False)
print("\n✅ submission_residual_v5.csv saved successfully.")


Mounting Google Drive...


Exception ignored in: <function NpzFile.__del__ at 0x7e43f5d17a60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/numpy/lib/_npyio_impl.py", line 226, in __del__
    self.close()
  File "/usr/local/lib/python3.12/dist-packages/numpy/lib/_npyio_impl.py", line 221, in close
    self.fid.close()
OSError: [Errno 107] Transport endpoint is not connected
Exception ignored in: <function NpzFile.__del__ at 0x7e43f5d17a60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/numpy/lib/_npyio_impl.py", line 226, in __del__
    self.close()
  File "/usr/local/lib/python3.12/dist-packages/numpy/lib/_npyio_impl.py", line 221, in close
    self.fid.close()
OSError: [Errno 107] Transport endpoint is not connected


Mounted at /content/drive
Current working directory: /content/drive/MyDrive/AML Challenge
Using device: cuda
Data preprocessed and normalized.


  scaler = torch.cuda.amp.GradScaler()


Computed orthogonal base R: torch.Size([1536, 1024])
Train pairs: 112500, Validation pairs: 12500

Training Residual-Orthogonal Translator v5 (Contrastive + small Triplet)...



  with torch.cuda.amp.autocast():


Epoch 01: loss=2.2875 | R@1=0.1460 R@5=0.3354 R@10=0.4256 R@50=0.6770
✅ Best model updated (epoch 1, R@10=0.4256)




Epoch 02: loss=2.1124 | R@1=0.1636 R@5=0.3568 R@10=0.4550 R@50=0.7050
✅ Best model updated (epoch 2, R@10=0.4550)




Epoch 03: loss=2.0351 | R@1=0.1726 R@5=0.3710 R@10=0.4720 R@50=0.7152
✅ Best model updated (epoch 3, R@10=0.4720)




Epoch 04: loss=1.9873 | R@1=0.1790 R@5=0.3778 R@10=0.4794 R@50=0.7256
✅ Best model updated (epoch 4, R@10=0.4794)




Epoch 05: loss=1.9501 | R@1=0.1804 R@5=0.3828 R@10=0.4866 R@50=0.7298
✅ Best model updated (epoch 5, R@10=0.4866)




Epoch 06: loss=1.9216 | R@1=0.1864 R@5=0.3880 R@10=0.4902 R@50=0.7362
✅ Best model updated (epoch 6, R@10=0.4902)




Epoch 07: loss=1.8984 | R@1=0.1880 R@5=0.3902 R@10=0.4982 R@50=0.7376
✅ Best model updated (epoch 7, R@10=0.4982)




Epoch 08: loss=1.8804 | R@1=0.1886 R@5=0.3972 R@10=0.4990 R@50=0.7408
✅ Best model updated (epoch 8, R@10=0.4990)




Epoch 09: loss=1.8635 | R@1=0.1874 R@5=0.3950 R@10=0.5020 R@50=0.7430
✅ Best model updated (epoch 9, R@10=0.5020)




Epoch 10: loss=1.8486 | R@1=0.1894 R@5=0.3990 R@10=0.5056 R@50=0.7414
✅ Best model updated (epoch 10, R@10=0.5056)




Epoch 11: loss=1.8417 | R@1=0.1894 R@5=0.4006 R@10=0.5054 R@50=0.7428




Epoch 12: loss=1.8351 | R@1=0.1910 R@5=0.4032 R@10=0.5076 R@50=0.7446
✅ Best model updated (epoch 12, R@10=0.5076)




Epoch 13: loss=1.8295 | R@1=0.1928 R@5=0.4024 R@10=0.5082 R@50=0.7436
✅ Best model updated (epoch 13, R@10=0.5082)




Epoch 14: loss=1.8239 | R@1=0.1916 R@5=0.4022 R@10=0.5076 R@50=0.7446




Epoch 15: loss=1.8189 | R@1=0.1922 R@5=0.4034 R@10=0.5084 R@50=0.7448
✅ Best model updated (epoch 15, R@10=0.5084)




Epoch 16: loss=1.8134 | R@1=0.1932 R@5=0.4024 R@10=0.5094 R@50=0.7448
✅ Best model updated (epoch 16, R@10=0.5094)




Epoch 17: loss=1.8083 | R@1=0.1926 R@5=0.4038 R@10=0.5072 R@50=0.7466




Epoch 18: loss=1.8045 | R@1=0.1922 R@5=0.4066 R@10=0.5102 R@50=0.7452
✅ Best model updated (epoch 18, R@10=0.5102)




Epoch 19: loss=1.7993 | R@1=0.1924 R@5=0.4064 R@10=0.5098 R@50=0.7444




Epoch 20: loss=1.7968 | R@1=0.1936 R@5=0.4072 R@10=0.5104 R@50=0.7444
✅ Best model updated (epoch 20, R@10=0.5104)




Epoch 21: loss=1.7953 | R@1=0.1950 R@5=0.4090 R@10=0.5104 R@50=0.7456




Epoch 22: loss=1.7937 | R@1=0.1942 R@5=0.4080 R@10=0.5096 R@50=0.7460




Epoch 23: loss=1.7919 | R@1=0.1952 R@5=0.4098 R@10=0.5116 R@50=0.7470
✅ Best model updated (epoch 23, R@10=0.5116)




Epoch 24: loss=1.7898 | R@1=0.1952 R@5=0.4102 R@10=0.5120 R@50=0.7462
✅ Best model updated (epoch 24, R@10=0.5120)




Epoch 25: loss=1.7887 | R@1=0.1952 R@5=0.4098 R@10=0.5126 R@50=0.7466
✅ Best model updated (epoch 25, R@10=0.5126)




Epoch 26: loss=1.7869 | R@1=0.1966 R@5=0.4102 R@10=0.5132 R@50=0.7462
✅ Best model updated (epoch 26, R@10=0.5132)




Epoch 27: loss=1.7845 | R@1=0.1952 R@5=0.4106 R@10=0.5132 R@50=0.7464




Epoch 28: loss=1.7844 | R@1=0.1954 R@5=0.4108 R@10=0.5126 R@50=0.7472




Epoch 29: loss=1.7832 | R@1=0.1958 R@5=0.4106 R@10=0.5122 R@50=0.7470




Epoch 30: loss=1.7832 | R@1=0.1960 R@5=0.4112 R@10=0.5138 R@50=0.7466
✅ Best model updated (epoch 30, R@10=0.5138)




Epoch 31: loss=1.7828 | R@1=0.1960 R@5=0.4112 R@10=0.5128 R@50=0.7470




Epoch 32: loss=1.7817 | R@1=0.1966 R@5=0.4114 R@10=0.5128 R@50=0.7468




Epoch 33: loss=1.7812 | R@1=0.1958 R@5=0.4114 R@10=0.5138 R@50=0.7468




Epoch 34: loss=1.7810 | R@1=0.1966 R@5=0.4116 R@10=0.5132 R@50=0.7466




Epoch 35: loss=1.7807 | R@1=0.1966 R@5=0.4118 R@10=0.5134 R@50=0.7468




Epoch 36: loss=1.7799 | R@1=0.1966 R@5=0.4110 R@10=0.5134 R@50=0.7470




Epoch 37: loss=1.7804 | R@1=0.1968 R@5=0.4116 R@10=0.5136 R@50=0.7466




Epoch 38: loss=1.7803 | R@1=0.1968 R@5=0.4118 R@10=0.5140 R@50=0.7470
✅ Best model updated (epoch 38, R@10=0.5140)




Epoch 39: loss=1.7808 | R@1=0.1960 R@5=0.4112 R@10=0.5138 R@50=0.7472




Epoch 40: loss=1.7803 | R@1=0.1966 R@5=0.4116 R@10=0.5140 R@50=0.7474




Epoch 41: loss=1.7804 | R@1=0.1968 R@5=0.4116 R@10=0.5144 R@50=0.7468
✅ Best model updated (epoch 41, R@10=0.5144)




Epoch 42: loss=1.7806 | R@1=0.1966 R@5=0.4116 R@10=0.5144 R@50=0.7470




Epoch 43: loss=1.7801 | R@1=0.1968 R@5=0.4116 R@10=0.5144 R@50=0.7470




Epoch 44: loss=1.7804 | R@1=0.1966 R@5=0.4116 R@10=0.5144 R@50=0.7470




Epoch 45: loss=1.7789 | R@1=0.1966 R@5=0.4116 R@10=0.5144 R@50=0.7470

🎯 Best validation R@10 = 0.5144
Model saved as residual_v5_best.pth

Loading residual_v5_best.pth for inference...
Best model loaded.

✅ submission_residual_v5.csv saved successfully.


# 120 Epoches

In [None]:

# ------------------------------------------------------
# 1. Device and data loading
# ------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

train_data = np.load("train.npz")
test_data  = np.load("test.clean.npz")

tx_train = train_data["captions/embeddings"]
im_train = train_data["images/embeddings"]
tx_test  = test_data["captions/embeddings"]

repeat_factor = len(tx_train) // len(im_train)
im_train_expanded = np.repeat(im_train, repeat_factor, axis=0)
print(f"Train shapes: {tx_train.shape}, {im_train.shape}, {im_train_expanded.shape}")
print(f"Test shape: {tx_test.shape}")

# ------------------------------------------------------
# 2. Data preprocessing (Centering + Normalization)
# ------------------------------------------------------
tx_train_t = torch.as_tensor(tx_train, dtype=torch.float32, device=device)
im_train_t_unique = torch.as_tensor(im_train, dtype=torch.float32, device=device)
tx_test_t  = torch.as_tensor(tx_test,  dtype=torch.float32, device=device)

tx_mean = tx_train_t.mean(0, keepdim=True)
im_mean = im_train_t_unique.mean(0, keepdim=True)
tx_train_t = F.normalize(tx_train_t - tx_mean, p=2, dim=1)
im_train_t_unique = F.normalize(im_train_t_unique - im_mean, p=2, dim=1)
tx_test_t = F.normalize(tx_test_t - tx_mean, p=2, dim=1)

im_train_exp = torch.as_tensor(im_train_expanded, dtype=torch.float32, device=device)
im_train_exp = F.normalize(im_train_exp - im_mean, p=2, dim=1)
print("Data preprocessed and normalized.")

# ------------------------------------------------------
# 3. Orthogonal Procrustes base (R)
# ------------------------------------------------------
tx_centroids = tx_train_t.view(-1, 5, tx_train_t.shape[1]).mean(dim=1)
M = im_train_t_unique.T @ tx_centroids
U, S, Vh = torch.linalg.svd(M, full_matrices=False)
R = U @ Vh
print(f"Computed orthogonal base R: {R.shape}")

# ------------------------------------------------------
# 4. MODEL: ResidualTranslator (Slightly Enhanced)
# ------------------------------------------------------
class ResidualTranslator(nn.Module):
    # Our 0.815 winner used hidden_dim=1024
    # Our 0.796 loser used hidden_dim=2048
    # Let's try the middle ground: 1536
    def __init__(self, R_init, input_dim=1024, hidden_dim=1536, output_dim=1536, dropout=0.1):
        super().__init__()
        self.register_buffer("R", R_init)
        self.residual = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.GELU(),
            nn.LayerNorm(hidden_dim), # Add LayerNorm for stability
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        base = x @ self.R.T
        res = self.residual(x)
        return F.normalize(base + res, p=2, dim=1)

# ------------------------------------------------------
# 5. Loss Functions (Our proven Hard-Negative setup)
# ------------------------------------------------------
TAU = 0.07 # Temperature for contrastive loss
LOSS_WEIGHT_CONTRASTIVE = 0.7
LOSS_WEIGHT_TRIPLET = 0.3

triplet_loss_fn = nn.TripletMarginWithDistanceLoss(
    distance_function=lambda x, y: 1 - F.cosine_similarity(x, y),
    margin=0.2
)

# ------------------------------------------------------
# 6. Validation split (Unchanged)
# ------------------------------------------------------
N = len(tx_train_t)
val_size = int(0.1 * N)
idx_cpu = torch.randperm(N, device="cpu")
val_idx, train_idx = idx_cpu[:val_size], idx_cpu[val_size:]

img_indices_train = (train_idx // 5).to(device)
img_indices_val = (val_idx // 5).to(device)

tx_val_t, im_val_t = tx_train_t[val_idx], im_train_exp[val_idx]
tx_train_t_sub, im_train_exp_sub = tx_train_t[train_idx], im_train_exp[train_idx]

train_dataset = TensorDataset(tx_train_t_sub, im_train_exp_sub, img_indices_train)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=0)
print(f"Train pairs: {len(train_dataset)}, Validation pairs: {len(val_idx)}")

# ------------------------------------------------------
# 7. Recall@K utility (Unchanged)
# ------------------------------------------------------
@torch.no_grad()
def recall_at_k(model, tx_queries, im_database, query_img_indices, repeat_factor=5, ks=(1, 5, 10, 50), k_csls=10):
    model.eval()
    preds_list = []
    for i in range(0, len(tx_queries), 1024):
        chunk = tx_queries[i:i+1024].to(device) # Ensure chunk is on device
        preds_list.append(model(chunk))
    preds = torch.cat(preds_list, dim=0)

    im_database = im_database.to(device) # Ensure database is on device
    query_img_indices = query_img_indices.to(device) # Ensure indices are on device

    sim = preds @ im_database.T

    knn_q = torch.topk(sim, k=k_csls, dim=1).values
    mean_knn_q = knn_q.mean(1, keepdim=True)

    knn_d = torch.topk(sim.T, k=k_csls, dim=1).values
    mean_knn_d = knn_d.mean(1, keepdim=True).T

    csls_sim = 2 * sim - mean_knn_q - mean_knn_d

    gt = query_img_indices

    top_indices = torch.argsort(csls_sim, dim=1, descending=True)

    recalls = {}
    for k in ks:
        top_k_preds = top_indices[:, :k]
        correct_in_top_k = (top_k_preds == gt.unsqueeze(1)).any(dim=1)
        recall_at_k = correct_in_top_k.float().mean().item()
        recalls[f"R@{k}"] = recall_at_k

    return recalls

# ------------------------------------------------------
# 8. Training Loop (LONG HAUL)
# ------------------------------------------------------
EPOCHS = 120 # Train for much longer
LR = 1e-4
WEIGHT_DECAY = 5e-5 # Standard regularization
SAVE_PATH = "final_push_best.pth" # New save path

val_query_subset = tx_val_t[:5000]
val_indices_subset = img_indices_val[:5000]
val_db_subset = im_train_t_unique

# --- Initialize model with modest capacity boost ---
model = ResidualTranslator(
    R_init=R.detach().clone(),
    hidden_dim=1536 # 1024 was good, 2048 was bad. Let's try 1536.
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
# Schedule the LR over the *entire* long run
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
scaler = torch.cuda.amp.GradScaler() # Mixed precision

best_r10 = 0.0
print(f"\nTraining Final Push Model (120 Epochs, hidden_dim=1536)...\n")

for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0.0

    for x_batch, y_batch, img_indices in tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}", leave=False):
        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            y_pred = model(x_batch)

            # --- Our proven winning loss combo ---
            sims_with_tau = y_pred @ y_batch.T / TAU
            labels = torch.arange(y_pred.size(0), device=device)
            loss_con = F.cross_entropy(sims_with_tau, labels)

            with torch.no_grad():
                sims_no_tau = y_pred @ y_batch.T
                positive_mask = (img_indices.unsqueeze(1) == img_indices.unsqueeze(0))
                sims_no_tau.masked_fill_(positive_mask, -float('inf'))
                hard_neg_idx = sims_no_tau.argmax(dim=1)

            y_hard_neg = y_batch[hard_neg_idx]
            loss_tri = triplet_loss_fn(y_pred, y_batch, y_hard_neg)

            loss = (LOSS_WEIGHT_CONTRASTIVE * loss_con) + (LOSS_WEIGHT_TRIPLET * loss_tri)

        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # Gradient Clipping
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    scheduler.step()
    avg_loss = total_loss / len(train_loader)

    # ----- Validation -----
    # Only validate every 2 epochs to save time
    if epoch % 2 == 0 or epoch == EPOCHS:
        rec = recall_at_k(model, val_query_subset, val_db_subset, val_indices_subset)
        print(f"Epoch {epoch:03d}: loss={avg_loss:.4f} | "
              f"R@1={rec['R@1']:.4f}  R@5={rec['R@5']:.4f}  "
              f"R@10={rec['R@10']:.4f}  R@50={rec['R@50']:.4f}")

        if rec['R@10'] > best_r10:
            best_r10 = rec['R@10']
            torch.save(model.state_dict(), SAVE_PATH)
            print(f"✅ Best model saved (epoch {epoch}, R@10={best_r10:.4f})")

print(f"\n🎯 Best R@10 on validation = {best_r10:.4f}\nModel saved as {SAVE_PATH}")

# ------------------------------------------------------
# 9. Inference + Submission
# ------------------------------------------------------
print(f"Loading best model from {SAVE_PATH} for inference...")
model = ResidualTranslator(
    R_init=R.detach().clone(),
    hidden_dim=1536
).to(device)
model.load_state_dict(torch.load(SAVE_PATH))
model.eval()
print("Best model loaded.")

with torch.no_grad():
    preds_list = []
    for i in range(0, len(tx_test_t), 1024):
        chunk = tx_test_t[i:i+1024].to(device) # Ensure chunk is on device
        preds_list.append(model(chunk))
    preds = torch.cat(preds_list, dim=0).cpu().numpy()

test_ids = test_data["captions/ids"].astype(int)
submission = pd.DataFrame({
    "id": test_ids,
    "embedding": [list(map(float, row)) for row in preds]
})
submission.to_csv("submission_final_push.csv", index=False)
print("\n✅ submission_final_push.csv saved successfully.")


Using device: cuda
Train shapes: (125000, 1024), (25000, 1536), (125000, 1536)
Test shape: (1500, 1024)
Data preprocessed and normalized.


  scaler = torch.cuda.amp.GradScaler() # Mixed precision


Computed orthogonal base R: torch.Size([1536, 1024])
Train pairs: 112500, Validation pairs: 12500

Training Final Push Model (120 Epochs, hidden_dim=1536)...



  with torch.cuda.amp.autocast():


Epoch 002: loss=2.0280 | R@1=0.1054  R@5=0.2632  R@10=0.3570  R@50=0.6082
✅ Best model saved (epoch 2, R@10=0.3570)




Epoch 004: loss=1.8083 | R@1=0.1284  R@5=0.3036  R@10=0.3938  R@50=0.6558
✅ Best model saved (epoch 4, R@10=0.3938)




Epoch 006: loss=1.7141 | R@1=0.1400  R@5=0.3240  R@10=0.4200  R@50=0.6780
✅ Best model saved (epoch 6, R@10=0.4200)




Epoch 008: loss=1.6553 | R@1=0.1430  R@5=0.3342  R@10=0.4358  R@50=0.6876
✅ Best model saved (epoch 8, R@10=0.4358)




Epoch 010: loss=1.6122 | R@1=0.1464  R@5=0.3410  R@10=0.4412  R@50=0.6964
✅ Best model saved (epoch 10, R@10=0.4412)




Epoch 012: loss=1.5923 | R@1=0.1502  R@5=0.3416  R@10=0.4466  R@50=0.7006
✅ Best model saved (epoch 12, R@10=0.4466)




Epoch 014: loss=1.5759 | R@1=0.1510  R@5=0.3516  R@10=0.4496  R@50=0.7026
✅ Best model saved (epoch 14, R@10=0.4496)




Epoch 016: loss=1.5610 | R@1=0.1522  R@5=0.3514  R@10=0.4542  R@50=0.7060
✅ Best model saved (epoch 16, R@10=0.4542)




Epoch 018: loss=1.5482 | R@1=0.1556  R@5=0.3532  R@10=0.4582  R@50=0.7090
✅ Best model saved (epoch 18, R@10=0.4582)




Epoch 020: loss=1.5349 | R@1=0.1558  R@5=0.3530  R@10=0.4594  R@50=0.7100
✅ Best model saved (epoch 20, R@10=0.4594)




Epoch 022: loss=1.5298 | R@1=0.1566  R@5=0.3544  R@10=0.4614  R@50=0.7114
✅ Best model saved (epoch 22, R@10=0.4614)




Epoch 024: loss=1.5233 | R@1=0.1576  R@5=0.3578  R@10=0.4632  R@50=0.7122
✅ Best model saved (epoch 24, R@10=0.4632)




Epoch 026: loss=1.5192 | R@1=0.1586  R@5=0.3550  R@10=0.4660  R@50=0.7122
✅ Best model saved (epoch 26, R@10=0.4660)




Epoch 028: loss=1.5122 | R@1=0.1578  R@5=0.3590  R@10=0.4672  R@50=0.7140
✅ Best model saved (epoch 28, R@10=0.4672)




Epoch 030: loss=1.5081 | R@1=0.1586  R@5=0.3590  R@10=0.4682  R@50=0.7152
✅ Best model saved (epoch 30, R@10=0.4682)




Epoch 032: loss=1.5060 | R@1=0.1590  R@5=0.3580  R@10=0.4682  R@50=0.7154




Epoch 034: loss=1.5027 | R@1=0.1586  R@5=0.3598  R@10=0.4684  R@50=0.7154
✅ Best model saved (epoch 34, R@10=0.4684)




Epoch 036: loss=1.5005 | R@1=0.1604  R@5=0.3602  R@10=0.4670  R@50=0.7156




Epoch 038: loss=1.4987 | R@1=0.1604  R@5=0.3640  R@10=0.4700  R@50=0.7150
✅ Best model saved (epoch 38, R@10=0.4700)




Epoch 040: loss=1.4968 | R@1=0.1602  R@5=0.3626  R@10=0.4682  R@50=0.7170




Epoch 042: loss=1.4941 | R@1=0.1612  R@5=0.3632  R@10=0.4694  R@50=0.7174




Epoch 044: loss=1.4915 | R@1=0.1610  R@5=0.3616  R@10=0.4702  R@50=0.7166
✅ Best model saved (epoch 44, R@10=0.4702)




Epoch 046: loss=1.4906 | R@1=0.1626  R@5=0.3624  R@10=0.4702  R@50=0.7174




Epoch 048: loss=1.4876 | R@1=0.1630  R@5=0.3632  R@10=0.4700  R@50=0.7168




Epoch 050: loss=1.4866 | R@1=0.1624  R@5=0.3646  R@10=0.4710  R@50=0.7162
✅ Best model saved (epoch 50, R@10=0.4710)




Epoch 052: loss=1.4846 | R@1=0.1624  R@5=0.3632  R@10=0.4706  R@50=0.7182




Epoch 054: loss=1.4845 | R@1=0.1646  R@5=0.3628  R@10=0.4702  R@50=0.7194




Epoch 056: loss=1.4823 | R@1=0.1640  R@5=0.3636  R@10=0.4724  R@50=0.7182
✅ Best model saved (epoch 56, R@10=0.4724)




Epoch 058: loss=1.4804 | R@1=0.1638  R@5=0.3638  R@10=0.4726  R@50=0.7188
✅ Best model saved (epoch 58, R@10=0.4726)




Epoch 060: loss=1.4787 | R@1=0.1638  R@5=0.3644  R@10=0.4718  R@50=0.7194




Epoch 062: loss=1.4770 | R@1=0.1642  R@5=0.3658  R@10=0.4714  R@50=0.7210




Epoch 064: loss=1.4758 | R@1=0.1648  R@5=0.3652  R@10=0.4732  R@50=0.7198
✅ Best model saved (epoch 64, R@10=0.4732)




Epoch 066: loss=1.4752 | R@1=0.1650  R@5=0.3658  R@10=0.4720  R@50=0.7198




Epoch 068: loss=1.4744 | R@1=0.1652  R@5=0.3664  R@10=0.4732  R@50=0.7200




Epoch 070: loss=1.4722 | R@1=0.1648  R@5=0.3664  R@10=0.4736  R@50=0.7196
✅ Best model saved (epoch 70, R@10=0.4736)




Epoch 072: loss=1.4720 | R@1=0.1644  R@5=0.3662  R@10=0.4740  R@50=0.7210
✅ Best model saved (epoch 72, R@10=0.4740)




Epoch 074: loss=1.4706 | R@1=0.1652  R@5=0.3656  R@10=0.4746  R@50=0.7208
✅ Best model saved (epoch 74, R@10=0.4746)




Epoch 076: loss=1.4698 | R@1=0.1646  R@5=0.3666  R@10=0.4738  R@50=0.7204




Epoch 078: loss=1.4696 | R@1=0.1654  R@5=0.3680  R@10=0.4732  R@50=0.7206




Epoch 080: loss=1.4685 | R@1=0.1670  R@5=0.3666  R@10=0.4744  R@50=0.7206




Epoch 082: loss=1.4681 | R@1=0.1656  R@5=0.3678  R@10=0.4736  R@50=0.7218




Epoch 084: loss=1.4677 | R@1=0.1660  R@5=0.3672  R@10=0.4736  R@50=0.7206




Epoch 086: loss=1.4670 | R@1=0.1666  R@5=0.3682  R@10=0.4746  R@50=0.7212




Epoch 088: loss=1.4664 | R@1=0.1672  R@5=0.3672  R@10=0.4748  R@50=0.7204
✅ Best model saved (epoch 88, R@10=0.4748)




Epoch 090: loss=1.4660 | R@1=0.1666  R@5=0.3672  R@10=0.4742  R@50=0.7204




Epoch 092: loss=1.4663 | R@1=0.1660  R@5=0.3680  R@10=0.4740  R@50=0.7208




Epoch 094: loss=1.4667 | R@1=0.1666  R@5=0.3688  R@10=0.4742  R@50=0.7214




Epoch 096: loss=1.4656 | R@1=0.1668  R@5=0.3690  R@10=0.4750  R@50=0.7208
✅ Best model saved (epoch 96, R@10=0.4750)




Epoch 098: loss=1.4652 | R@1=0.1666  R@5=0.3690  R@10=0.4744  R@50=0.7210




Epoch 100: loss=1.4647 | R@1=0.1666  R@5=0.3684  R@10=0.4746  R@50=0.7210




Epoch 102: loss=1.4650 | R@1=0.1662  R@5=0.3684  R@10=0.4748  R@50=0.7212




Epoch 104: loss=1.4639 | R@1=0.1662  R@5=0.3686  R@10=0.4754  R@50=0.7212
✅ Best model saved (epoch 104, R@10=0.4754)




Epoch 106: loss=1.4641 | R@1=0.1666  R@5=0.3686  R@10=0.4754  R@50=0.7206




Epoch 108: loss=1.4646 | R@1=0.1662  R@5=0.3688  R@10=0.4744  R@50=0.7210




Epoch 110: loss=1.4637 | R@1=0.1664  R@5=0.3694  R@10=0.4748  R@50=0.7210




Epoch 112: loss=1.4644 | R@1=0.1664  R@5=0.3692  R@10=0.4746  R@50=0.7212




Epoch 114: loss=1.4635 | R@1=0.1664  R@5=0.3692  R@10=0.4746  R@50=0.7214




Epoch 116: loss=1.4647 | R@1=0.1664  R@5=0.3692  R@10=0.4748  R@50=0.7212




Epoch 118: loss=1.4643 | R@1=0.1664  R@5=0.3692  R@10=0.4746  R@50=0.7212




Epoch 120: loss=1.4635 | R@1=0.1664  R@5=0.3692  R@10=0.4748  R@50=0.7212

🎯 Best R@10 on validation = 0.4754
Model saved as final_push_best.pth
Loading best model from final_push_best.pth for inference...
Best model loaded.

✅ submission_final_push.csv saved successfully.


# Fine TUning the original Residual model
# Highest score (0.82388) as per 10/30/2025

In [None]:
# ----------------------------------------------------
# 1. Device and data loading
# ------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

train_data = np.load("train.npz")
test_data  = np.load("test.clean.npz")

tx_train = train_data["captions/embeddings"]
im_train = train_data["images/embeddings"]
tx_test  = test_data["captions/embeddings"]

repeat_factor = len(tx_train) // len(im_train)
im_train_expanded = np.repeat(im_train, repeat_factor, axis=0)
print(f"Train shapes: {tx_train.shape}, {im_train.shape}, {im_train_expanded.shape}")
print(f"Test shape: {tx_test.shape}")

# ------------------------------------------------------
# 2. Data preprocessing (Centering + Normalization)
# ------------------------------------------------------
tx_train_t = torch.as_tensor(tx_train, dtype=torch.float32, device=device)
im_train_t_unique = torch.as_tensor(im_train, dtype=torch.float32, device=device)
tx_test_t  = torch.as_tensor(tx_test,  dtype=torch.float32, device=device)

tx_mean = tx_train_t.mean(0, keepdim=True)
im_mean = im_train_t_unique.mean(0, keepdim=True)
tx_train_t = F.normalize(tx_train_t - tx_mean, p=2, dim=1)
im_train_t_unique = F.normalize(im_train_t_unique - im_mean, p=2, dim=1)
tx_test_t = F.normalize(tx_test_t - tx_mean, p=2, dim=1)

im_train_exp = torch.as_tensor(im_train_expanded, dtype=torch.float32, device=device)
im_train_exp = F.normalize(im_train_exp - im_mean, p=2, dim=1)
print("Data preprocessed and normalized.")

# ------------------------------------------------------
# 3. Orthogonal Procrustes base (R)
# ------------------------------------------------------
tx_centroids = tx_train_t.view(-1, 5, tx_train_t.shape[1]).mean(dim=1)
M = im_train_t_unique.T @ tx_centroids
U, S, Vh = torch.linalg.svd(M, full_matrices=False)
R = U @ Vh
print(f"Computed orthogonal base R: {R.shape}")

# ------------------------------------------------------
# 4. MODEL: ResidualTranslator (Our 0.815 Kaggle Winner)
# ------------------------------------------------------
class ResidualTranslator(nn.Module):
    def __init__(self, R_init, input_dim=1024, hidden_dim=1024, output_dim=1536):
        super().__init__()
        self.register_buffer("R", R_init)
        self.residual = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        base = x @ self.R.T
        res = self.residual(x)
        return F.normalize(base + res, p=2, dim=1)

# ------------------------------------------------------
# 5. Loss Functions (Proven Hard-Negative, Tuned Hyperparams)
# ------------------------------------------------------
# --- HYPERPARAMETER TUNE ---
TAU = 0.05       # Lower temperature (was 0.07) -> Stricter contrastive loss
MARGIN = 0.25    # Higher margin (was 0.2) -> Stricter triplet loss
# -------------------------

LOSS_WEIGHT_CONTRASTIVE = 0.7
LOSS_WEIGHT_TRIPLET = 0.3

triplet_loss_fn = nn.TripletMarginWithDistanceLoss(
    distance_function=lambda x, y: 1 - F.cosine_similarity(x, y),
    margin=MARGIN
)

# ------------------------------------------------------
# 6. Validation split (Unchanged)
# ------------------------------------------------------
N = len(tx_train_t)
val_size = int(0.1 * N)
idx_cpu = torch.randperm(N, device="cpu")
val_idx, train_idx = idx_cpu[:val_size], idx_cpu[val_size:]

img_indices_train = (train_idx // 5).to(device)
img_indices_val = (val_idx // 5).to(device)

tx_val_t, im_val_t = tx_train_t[val_idx], im_train_exp[val_idx]
tx_train_t_sub, im_train_exp_sub = tx_train_t[train_idx], im_train_exp[train_idx]

train_dataset = TensorDataset(tx_train_t_sub, im_train_exp_sub, img_indices_train)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=0)
print(f"Train pairs: {len(train_dataset)}, Validation pairs: {len(val_idx)}")

# ------------------------------------------------------
# 7. Recall@K utility (Unchanged)
# ------------------------------------------------------
@torch.no_grad()
def recall_at_k(model, tx_queries, im_database, query_img_indices, repeat_factor=5, ks=(1, 5, 10, 50), k_csls=10):
    model.eval()
    preds_list = []
    for i in range(0, len(tx_queries), 1024):
        chunk = tx_queries[i:i+1024].to(device) # Ensure chunk is on device
        preds_list.append(model(chunk))
    preds = torch.cat(preds_list, dim=0)

    im_database = im_database.to(device) # Ensure database is on device
    query_img_indices = query_img_indices.to(device) # Ensure indices are on device

    sim = preds @ im_database.T

    knn_q = torch.topk(sim, k=k_csls, dim=1).values
    mean_knn_q = knn_q.mean(1, keepdim=True)

    knn_d = torch.topk(sim.T, k=k_csls, dim=1).values
    mean_knn_d = knn_d.mean(1, keepdim=True).T

    csls_sim = 2 * sim - mean_knn_q - mean_knn_d

    gt = query_img_indices

    top_indices = torch.argsort(csls_sim, dim=1, descending=True)

    recalls = {}
    for k in ks:
        top_k_preds = top_indices[:, :k]
        correct_in_top_k = (top_k_preds == gt.unsqueeze(1)).any(dim=1)
        recall_at_k = correct_in_top_k.float().mean().item()
        recalls[f"R@{k}"] = recall_at_k

    return recalls

# ------------------------------------------------------
# 8. Training Loop (Original 40 epochs)
# ------------------------------------------------------
EPOCHS = 40 # Back to our 0.815 winner's epoch count
LR = 1e-4
WEIGHT_DECAY = 5e-5 # Standard regularization
SAVE_PATH = "hyperparam_tune_best.pth" # New save path

val_query_subset = tx_val_t[:5000]
val_indices_subset = img_indices_val[:5000]
val_db_subset = im_train_t_unique

# --- Initialize WINNING model ---
model = ResidualTranslator(
    R_init=R.detach().clone(),
    input_dim=1024,
    hidden_dim=1024, # Our 0.815 winner's capacity
    output_dim=1536
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
scaler = torch.cuda.amp.GradScaler() # Mixed precision

best_r10 = 0.0
print(f"\nTraining with Tuned Hyperparameters (Tau={TAU}, Margin={MARGIN})...\n")

for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0.0

    for x_batch, y_batch, img_indices in tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}", leave=False):
        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            y_pred = model(x_batch)

            # --- Our proven winning loss combo ---
            sims_with_tau = y_pred @ y_batch.T / TAU # Using new TAU
            labels = torch.arange(y_pred.size(0), device=device)
            loss_con = F.cross_entropy(sims_with_tau, labels)

            with torch.no_grad():
                sims_no_tau = y_pred @ y_batch.T
                positive_mask = (img_indices.unsqueeze(1) == img_indices.unsqueeze(0))
                sims_no_tau.masked_fill_(positive_mask, -float('inf'))
                hard_neg_idx = sims_no_tau.argmax(dim=1)

            y_hard_neg = y_batch[hard_neg_idx]
            loss_tri = triplet_loss_fn(y_pred, y_batch, y_hard_neg) # Using new MARGIN

            loss = (LOSS_WEIGHT_CONTRASTIVE * loss_con) + (LOSS_WEIGHT_TRIPLET * loss_tri)

        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # Gradient Clipping
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    scheduler.step()
    avg_loss = total_loss / len(train_loader)

    # ----- Validation -----
    rec = recall_at_k(model, val_query_subset, val_db_subset, val_indices_subset)
    print(f"Epoch {epoch:02d}: loss={avg_loss:.4f} | "
          f"R@1={rec['R@1']:.4f}  R@5={rec['R@5']:.4f}  "
          f"R@10={rec['R@10']:.4f}  R@50={rec['R@50']:.4f}")

    if rec['R@10'] > best_r10:
        best_r10 = rec['R@10']
        torch.save(model.state_dict(), SAVE_PATH)
        print(f"✅ Best model saved (epoch {epoch}, R@10={best_r10:.4f})")

print(f"\n🎯 Best R@10 on validation = {best_r10:.4f}\nModel saved as {SAVE_PATH}")

# ------------------------------------------------------
# 9. Inference + Submission
# ------------------------------------------------------
print(f"Loading best model from {SAVE_PATH} for inference...")
model = ResidualTranslator(
    R_init=R.detach().clone(),
    input_dim=1024,
    hidden_dim=1024,
    output_dim=1536
).to(device)
model.load_state_dict(torch.load(SAVE_PATH))
model.eval()
print("Best model loaded.")

with torch.no_grad():
    preds_list = []
    for i in range(0, len(tx_test_t), 1024):
        chunk = tx_test_t[i:i+1024].to(device) # Ensure chunk is on device
        preds_list.append(model(chunk))
    preds = torch.cat(preds_list, dim=0).cpu().numpy()

test_ids = test_data["captions/ids"].astype(int)
submission = pd.DataFrame({
    "id": test_ids,
    "embedding": [list(map(float, row)) for row in preds]
})
submission.to_csv("submission_hyperparam_tune.csv", index=False)
print("\n✅ submission_hyperparam_tune.csv saved successfully.")


Using device: cuda
Train shapes: (125000, 1024), (25000, 1536), (125000, 1536)
Test shape: (1500, 1024)
Data preprocessed and normalized.


  scaler = torch.cuda.amp.GradScaler() # Mixed precision


Computed orthogonal base R: torch.Size([1536, 1024])
Train pairs: 112500, Validation pairs: 12500

Training with Tuned Hyperparameters (Tau=0.05, Margin=0.25)...



  with torch.cuda.amp.autocast():


Epoch 01: loss=1.5915 | R@1=0.1660  R@5=0.3586  R@10=0.4570  R@50=0.6866
✅ Best model saved (epoch 1, R@10=0.4570)




Epoch 02: loss=1.4447 | R@1=0.1792  R@5=0.3820  R@10=0.4810  R@50=0.7132
✅ Best model saved (epoch 2, R@10=0.4810)




Epoch 03: loss=1.3847 | R@1=0.1900  R@5=0.3940  R@10=0.4956  R@50=0.7306
✅ Best model saved (epoch 3, R@10=0.4956)




Epoch 04: loss=1.3445 | R@1=0.1994  R@5=0.4014  R@10=0.5080  R@50=0.7402
✅ Best model saved (epoch 4, R@10=0.5080)




Epoch 05: loss=1.3152 | R@1=0.2024  R@5=0.4076  R@10=0.5180  R@50=0.7454
✅ Best model saved (epoch 5, R@10=0.5180)




Epoch 06: loss=1.2917 | R@1=0.2078  R@5=0.4162  R@10=0.5214  R@50=0.7552
✅ Best model saved (epoch 6, R@10=0.5214)




Epoch 07: loss=1.2729 | R@1=0.2106  R@5=0.4208  R@10=0.5258  R@50=0.7566
✅ Best model saved (epoch 7, R@10=0.5258)




Epoch 08: loss=1.2574 | R@1=0.2122  R@5=0.4256  R@10=0.5278  R@50=0.7616
✅ Best model saved (epoch 8, R@10=0.5278)




Epoch 09: loss=1.2446 | R@1=0.2132  R@5=0.4276  R@10=0.5320  R@50=0.7660
✅ Best model saved (epoch 9, R@10=0.5320)




Epoch 10: loss=1.2321 | R@1=0.2160  R@5=0.4294  R@10=0.5314  R@50=0.7668




Epoch 11: loss=1.2264 | R@1=0.2168  R@5=0.4306  R@10=0.5340  R@50=0.7670
✅ Best model saved (epoch 11, R@10=0.5340)




Epoch 12: loss=1.2203 | R@1=0.2182  R@5=0.4304  R@10=0.5356  R@50=0.7674
✅ Best model saved (epoch 12, R@10=0.5356)




Epoch 13: loss=1.2165 | R@1=0.2176  R@5=0.4326  R@10=0.5360  R@50=0.7706
✅ Best model saved (epoch 13, R@10=0.5360)




Epoch 14: loss=1.2109 | R@1=0.2164  R@5=0.4322  R@10=0.5388  R@50=0.7692
✅ Best model saved (epoch 14, R@10=0.5388)




Epoch 15: loss=1.2063 | R@1=0.2198  R@5=0.4356  R@10=0.5382  R@50=0.7690




Epoch 16: loss=1.2026 | R@1=0.2180  R@5=0.4364  R@10=0.5378  R@50=0.7710




Epoch 17: loss=1.1990 | R@1=0.2210  R@5=0.4362  R@10=0.5400  R@50=0.7702
✅ Best model saved (epoch 17, R@10=0.5400)




Epoch 18: loss=1.1967 | R@1=0.2204  R@5=0.4358  R@10=0.5406  R@50=0.7696
✅ Best model saved (epoch 18, R@10=0.5406)




Epoch 19: loss=1.1926 | R@1=0.2200  R@5=0.4392  R@10=0.5406  R@50=0.7714




Epoch 20: loss=1.1906 | R@1=0.2218  R@5=0.4384  R@10=0.5418  R@50=0.7718
✅ Best model saved (epoch 20, R@10=0.5418)




Epoch 21: loss=1.1894 | R@1=0.2214  R@5=0.4390  R@10=0.5428  R@50=0.7710
✅ Best model saved (epoch 21, R@10=0.5428)




Epoch 22: loss=1.1872 | R@1=0.2210  R@5=0.4410  R@10=0.5430  R@50=0.7722
✅ Best model saved (epoch 22, R@10=0.5430)




Epoch 23: loss=1.1853 | R@1=0.2210  R@5=0.4410  R@10=0.5434  R@50=0.7728
✅ Best model saved (epoch 23, R@10=0.5434)




Epoch 24: loss=1.1850 | R@1=0.2214  R@5=0.4404  R@10=0.5428  R@50=0.7734




Epoch 25: loss=1.1835 | R@1=0.2206  R@5=0.4412  R@10=0.5432  R@50=0.7742




Epoch 26: loss=1.1830 | R@1=0.2202  R@5=0.4406  R@10=0.5436  R@50=0.7728
✅ Best model saved (epoch 26, R@10=0.5436)




Epoch 27: loss=1.1825 | R@1=0.2208  R@5=0.4424  R@10=0.5438  R@50=0.7736
✅ Best model saved (epoch 27, R@10=0.5438)




Epoch 28: loss=1.1817 | R@1=0.2218  R@5=0.4410  R@10=0.5438  R@50=0.7742




Epoch 29: loss=1.1805 | R@1=0.2214  R@5=0.4412  R@10=0.5436  R@50=0.7730




Epoch 30: loss=1.1803 | R@1=0.2216  R@5=0.4416  R@10=0.5452  R@50=0.7734
✅ Best model saved (epoch 30, R@10=0.5452)




Epoch 31: loss=1.1796 | R@1=0.2220  R@5=0.4418  R@10=0.5444  R@50=0.7734




Epoch 32: loss=1.1787 | R@1=0.2220  R@5=0.4412  R@10=0.5448  R@50=0.7738




Epoch 33: loss=1.1783 | R@1=0.2216  R@5=0.4416  R@10=0.5454  R@50=0.7740
✅ Best model saved (epoch 33, R@10=0.5454)




Epoch 34: loss=1.1789 | R@1=0.2220  R@5=0.4412  R@10=0.5452  R@50=0.7738




Epoch 35: loss=1.1790 | R@1=0.2222  R@5=0.4416  R@10=0.5450  R@50=0.7736




Epoch 36: loss=1.1786 | R@1=0.2220  R@5=0.4416  R@10=0.5454  R@50=0.7732




Epoch 37: loss=1.1779 | R@1=0.2220  R@5=0.4418  R@10=0.5452  R@50=0.7738




Epoch 38: loss=1.1766 | R@1=0.2220  R@5=0.4416  R@10=0.5450  R@50=0.7736




Epoch 39: loss=1.1772 | R@1=0.2218  R@5=0.4416  R@10=0.5452  R@50=0.7734




Epoch 40: loss=1.1783 | R@1=0.2218  R@5=0.4416  R@10=0.5452  R@50=0.7734

🎯 Best R@10 on validation = 0.5454
Model saved as hyperparam_tune_best.pth
Loading best model from hyperparam_tune_best.pth for inference...
Best model loaded.

✅ submission_hyperparam_tune.csv saved successfully.


## Fine tunning again

In [None]:

# ------------------------------------------------------
# 1. Device and data loading
# ------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

train_data = np.load("train.npz")
test_data  = np.load("test.clean.npz")

tx_train = train_data["captions/embeddings"]
im_train = train_data["images/embeddings"]
tx_test  = test_data["captions/embeddings"]

repeat_factor = len(tx_train) // len(im_train)
im_train_expanded = np.repeat(im_train, repeat_factor, axis=0)
print(f"Train shapes: {tx_train.shape}, {im_train.shape}, {im_train_expanded.shape}")
print(f"Test shape: {tx_test.shape}")

# ------------------------------------------------------
# 2. Data preprocessing (Centering + Normalization)
# ------------------------------------------------------
tx_train_t = torch.as_tensor(tx_train, dtype=torch.float32, device=device)
im_train_t_unique = torch.as_tensor(im_train, dtype=torch.float32, device=device)
tx_test_t  = torch.as_tensor(tx_test,  dtype=torch.float32, device=device)

tx_mean = tx_train_t.mean(0, keepdim=True)
im_mean = im_train_t_unique.mean(0, keepdim=True)
tx_train_t = F.normalize(tx_train_t - tx_mean, p=2, dim=1)
im_train_t_unique = F.normalize(im_train_t_unique - im_mean, p=2, dim=1)
tx_test_t = F.normalize(tx_test_t - tx_mean, p=2, dim=1)

im_train_exp = torch.as_tensor(im_train_expanded, dtype=torch.float32, device=device)
im_train_exp = F.normalize(im_train_exp - im_mean, p=2, dim=1)
print("Data preprocessed and normalized.")

# ------------------------------------------------------
# 3. Orthogonal Procrustes base (R)
# ------------------------------------------------------
tx_centroids = tx_train_t.view(-1, 5, tx_train_t.shape[1]).mean(dim=1)
M = im_train_t_unique.T @ tx_centroids
U, S, Vh = torch.linalg.svd(M, full_matrices=False)
R = U @ Vh
print(f"Computed orthogonal base R: {R.shape}")

# ------------------------------------------------------
# 4. MODEL: ResidualTranslator (Our 0.82388 Kaggle Winner)
# ------------------------------------------------------
class ResidualTranslator(nn.Module):
    def __init__(self, R_init, input_dim=1024, hidden_dim=1024, output_dim=1536):
        super().__init__()
        self.register_buffer("R", R_init)
        self.residual = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        base = x @ self.R.T
        res = self.residual(x)
        return F.normalize(base + res, p=2, dim=1)

# ------------------------------------------------------
# 5. Loss Functions (Our 0.82388 Winning Hyperparams)
# ------------------------------------------------------
TAU = 0.05       # Our winning stricter temperature
MARGIN = 0.25    # Our winning stricter margin
LOSS_WEIGHT_CONTRASTIVE = 0.7
LOSS_WEIGHT_TRIPLET = 0.3

triplet_loss_fn = nn.TripletMarginWithDistanceLoss(
    distance_function=lambda x, y: 1 - F.cosine_similarity(x, y),
    margin=MARGIN
)

# ------------------------------------------------------
# 6. Validation split (Unchanged)
# ------------------------------------------------------
N = len(tx_train_t)
val_size = int(0.1 * N)
idx_cpu = torch.randperm(N, device="cpu")
val_idx, train_idx = idx_cpu[:val_size], idx_cpu[val_size:]

img_indices_train = (train_idx // 5).to(device)
img_indices_val = (val_idx // 5).to(device)

tx_val_t, im_val_t = tx_train_t[val_idx], im_train_exp[val_idx]
tx_train_t_sub, im_train_exp_sub = tx_train_t[train_idx], im_train_exp[train_idx]

train_dataset = TensorDataset(tx_train_t_sub, im_train_exp_sub, img_indices_train)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=0)
print(f"Train pairs: {len(train_dataset)}, Validation pairs: {len(val_idx)}")

# ------------------------------------------------------
# 7. Recall@K utility (Unchanged)
# ------------------------------------------------------
@torch.no_grad()
def recall_at_k(model, tx_queries, im_database, query_img_indices, repeat_factor=5, ks=(1, 5, 10, 50), k_csls=10):
    model.eval()
    preds_list = []
    for i in range(0, len(tx_queries), 1024):
        chunk = tx_queries[i:i+1024].to(device) # Ensure chunk is on device
        preds_list.append(model(chunk))
    preds = torch.cat(preds_list, dim=0)

    im_database = im_database.to(device) # Ensure database is on device
    query_img_indices = query_img_indices.to(device) # Ensure indices are on device

    sim = preds @ im_database.T

    knn_q = torch.topk(sim, k=k_csls, dim=1).values
    mean_knn_q = knn_q.mean(1, keepdim=True)

    knn_d = torch.topk(sim.T, k=k_csls, dim=1).values
    mean_knn_d = knn_d.mean(1, keepdim=True).T

    csls_sim = 2 * sim - mean_knn_q - mean_knn_d

    gt = query_img_indices

    top_indices = torch.argsort(csls_sim, dim=1, descending=True)

    recalls = {}
    for k in ks:
        top_k_preds = top_indices[:, :k]
        correct_in_top_k = (top_k_preds == gt.unsqueeze(1)).any(dim=1)
        recall_at_k = correct_in_top_k.float().mean().item()
        recalls[f"R@{k}"] = recall_at_k

    return recalls

# ------------------------------------------------------
# 8. Training Loop (THE "MARATHON" RUN)
# ------------------------------------------------------
EPOCHS = 150 # Let's give it a truly long run
LR = 1e-4
WEIGHT_DECAY = 5e-5
SAVE_PATH = "marathon_run_best.pth" # New save path

val_query_subset = tx_val_t[:5000]
val_indices_subset = img_indices_val[:5000]
val_db_subset = im_train_t_unique

# --- Initialize WINNING model ---
model = ResidualTranslator(
    R_init=R.detach().clone(),
    input_dim=1024,
    hidden_dim=1024, # Our 0.82388 winner's capacity
    output_dim=1536
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
# Schedule the LR over the *entire* marathon
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
scaler = torch.cuda.amp.GradScaler() # Mixed precision

best_r10 = 0.0
# Load the weights from our 0.82388 winner to resume training
try:
    model.load_state_dict(torch.load("hyperparam_tune_best.pth"))
    best_r10 = 0.5454 # Start from our previous best
    print("✅ Loaded weights from 'hyperparam_tune_best.pth'. Resuming training.")
except Exception as e:
    print(f"Could not load previous weights, starting from scratch. Error: {e}")

print(f"\nTraining Final Marathon (Epochs: {EPOCHS}, Tau={TAU}, Margin={MARGIN})...\n")

for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0.0

    for x_batch, y_batch, img_indices in tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}", leave=False):
        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            y_pred = model(x_batch)

            # --- Our proven winning loss combo ---
            sims_with_tau = y_pred @ y_batch.T / TAU # Using new TAU
            labels = torch.arange(y_pred.size(0), device=device)
            loss_con = F.cross_entropy(sims_with_tau, labels)

            with torch.no_grad():
                sims_no_tau = y_pred @ y_batch.T
                positive_mask = (img_indices.unsqueeze(1) == img_indices.unsqueeze(0))
                sims_no_tau.masked_fill_(positive_mask, -float('inf'))
                hard_neg_idx = sims_no_tau.argmax(dim=1)

            y_hard_neg = y_batch[hard_neg_idx]
            loss_tri = triplet_loss_fn(y_pred, y_batch, y_hard_neg) # Using new MARGIN

            loss = (LOSS_WEIGHT_CONTRASTIVE * loss_con) + (LOSS_WEIGHT_TRIPLET * loss_tri)

        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # Gradient Clipping
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    scheduler.step()
    avg_loss = total_loss / len(train_loader)

    # ----- Validation -----
    # Only validate every 2 epochs to save time
    if epoch % 2 == 0 or epoch == EPOCHS:
        rec = recall_at_k(model, val_query_subset, val_db_subset, val_indices_subset)
        print(f"Epoch {epoch:03d}: loss={avg_loss:.4f} | "
              f"R@1={rec['R@1']:.4f}  R@5={rec['R@5']:.4f}  "
              f"R@10={rec['R@10']:.4f}  R@50={rec['R@50']:.4f}")

        if rec['R@10'] > best_r10:
            best_r10 = rec['R@10']
            torch.save(model.state_dict(), SAVE_PATH)
            print(f"✅ Best model saved (epoch {epoch}, R@10={best_r10:.4f})")

print(f"\n🎯 Best R@10 on validation = {best_r10:.4f}\nModel saved as {SAVE_PATH}")

# ------------------------------------------------------
# 9. Inference + Submission
# ------------------------------------------------------
print(f"Loading best model from {SAVE_PATH} for inference...")
model = ResidualTranslator(
    R_init=R.detach().clone(),
    input_dim=1024,
    hidden_dim=1024,
    output_dim=1536
).to(device)
model.load_state_dict(torch.load(SAVE_PATH))
model.eval()
print("Best model loaded.")

with torch.no_grad():
    preds_list = []
    for i in range(0, len(tx_text_t), 1024):
        chunk = tx_test_t[i:i+1024].to(device) # Ensure chunk is on device
        preds_list.append(model(chunk))
    preds = torch.cat(preds_list, dim=0).cpu().numpy()

test_ids = test_data["captions/ids"].astype(int)
submission = pd.DataFrame({
    "id": test_ids,
    "embedding": [list(map(float, row)) for row in preds]
})
submission.to_csv("submission_marathon_run.csv", index=False)
print("\n✅ submission_marathon_run.csv saved successfully.")


Using device: cuda
Train shapes: (125000, 1024), (25000, 1536), (125000, 1536)
Test shape: (1500, 1024)
Data preprocessed and normalized.


  scaler = torch.cuda.amp.GradScaler() # Mixed precision


Computed orthogonal base R: torch.Size([1536, 1024])
Train pairs: 112500, Validation pairs: 12500
✅ Loaded weights from 'hyperparam_tune_best.pth'. Resuming training.

Training Final Marathon (Epochs: 150, Tau=0.05, Margin=0.25)...



  with torch.cuda.amp.autocast():


Epoch 002: loss=1.1780 | R@1=0.2324  R@5=0.4612  R@10=0.5588  R@50=0.7840
✅ Best model saved (epoch 2, R@10=0.5588)




Epoch 004: loss=1.1646 | R@1=0.2340  R@5=0.4624  R@10=0.5608  R@50=0.7856
✅ Best model saved (epoch 4, R@10=0.5608)




Epoch 006: loss=1.1537 | R@1=0.2326  R@5=0.4614  R@10=0.5608  R@50=0.7838




Epoch 008: loss=1.1442 | R@1=0.2358  R@5=0.4622  R@10=0.5582  R@50=0.7826




Epoch 010: loss=1.1332 | R@1=0.2346  R@5=0.4616  R@10=0.5614  R@50=0.7864
✅ Best model saved (epoch 10, R@10=0.5614)




Epoch 012: loss=1.1250 | R@1=0.2356  R@5=0.4608  R@10=0.5628  R@50=0.7870
✅ Best model saved (epoch 12, R@10=0.5628)




Epoch 014: loss=1.1225 | R@1=0.2360  R@5=0.4634  R@10=0.5614  R@50=0.7872




Epoch 016: loss=1.1200 | R@1=0.2374  R@5=0.4628  R@10=0.5632  R@50=0.7866
✅ Best model saved (epoch 16, R@10=0.5632)




Epoch 018: loss=1.1141 | R@1=0.2356  R@5=0.4626  R@10=0.5624  R@50=0.7876




Epoch 020: loss=1.1094 | R@1=0.2368  R@5=0.4636  R@10=0.5634  R@50=0.7864
✅ Best model saved (epoch 20, R@10=0.5634)




Epoch 022: loss=1.1074 | R@1=0.2364  R@5=0.4658  R@10=0.5626  R@50=0.7882




Epoch 024: loss=1.1048 | R@1=0.2366  R@5=0.4640  R@10=0.5638  R@50=0.7880
✅ Best model saved (epoch 24, R@10=0.5638)




Epoch 026: loss=1.1031 | R@1=0.2374  R@5=0.4648  R@10=0.5640  R@50=0.7886
✅ Best model saved (epoch 26, R@10=0.5640)




Epoch 028: loss=1.1000 | R@1=0.2374  R@5=0.4654  R@10=0.5648  R@50=0.7876
✅ Best model saved (epoch 28, R@10=0.5648)




Epoch 030: loss=1.0999 | R@1=0.2382  R@5=0.4646  R@10=0.5662  R@50=0.7876
✅ Best model saved (epoch 30, R@10=0.5662)




Epoch 032: loss=1.0971 | R@1=0.2382  R@5=0.4642  R@10=0.5654  R@50=0.7884




Epoch 034: loss=1.0954 | R@1=0.2370  R@5=0.4648  R@10=0.5656  R@50=0.7874




Epoch 036: loss=1.0944 | R@1=0.2368  R@5=0.4648  R@10=0.5646  R@50=0.7882




Epoch 038: loss=1.0900 | R@1=0.2372  R@5=0.4642  R@10=0.5654  R@50=0.7884




Epoch 040: loss=1.0893 | R@1=0.2380  R@5=0.4642  R@10=0.5664  R@50=0.7882
✅ Best model saved (epoch 40, R@10=0.5664)




Epoch 042: loss=1.0897 | R@1=0.2380  R@5=0.4634  R@10=0.5658  R@50=0.7884




Epoch 044: loss=1.0880 | R@1=0.2384  R@5=0.4654  R@10=0.5636  R@50=0.7880




Epoch 046: loss=1.0878 | R@1=0.2374  R@5=0.4638  R@10=0.5646  R@50=0.7882




Epoch 048: loss=1.0874 | R@1=0.2388  R@5=0.4622  R@10=0.5634  R@50=0.7872




Epoch 050: loss=1.0846 | R@1=0.2372  R@5=0.4640  R@10=0.5650  R@50=0.7892




Epoch 052: loss=1.0836 | R@1=0.2384  R@5=0.4634  R@10=0.5652  R@50=0.7892




Epoch 054: loss=1.0821 | R@1=0.2372  R@5=0.4644  R@10=0.5634  R@50=0.7888




Epoch 056: loss=1.0805 | R@1=0.2376  R@5=0.4634  R@10=0.5648  R@50=0.7894




Epoch 058: loss=1.0788 | R@1=0.2392  R@5=0.4634  R@10=0.5654  R@50=0.7890




Epoch 060: loss=1.0787 | R@1=0.2392  R@5=0.4642  R@10=0.5666  R@50=0.7892
✅ Best model saved (epoch 60, R@10=0.5666)




Epoch 062: loss=1.0771 | R@1=0.2378  R@5=0.4618  R@10=0.5654  R@50=0.7912




Epoch 064: loss=1.0745 | R@1=0.2378  R@5=0.4638  R@10=0.5650  R@50=0.7902




Epoch 066: loss=1.0745 | R@1=0.2382  R@5=0.4640  R@10=0.5652  R@50=0.7892




Epoch 068: loss=1.0737 | R@1=0.2380  R@5=0.4634  R@10=0.5664  R@50=0.7906




Epoch 070: loss=1.0730 | R@1=0.2392  R@5=0.4640  R@10=0.5648  R@50=0.7902




Epoch 072: loss=1.0725 | R@1=0.2372  R@5=0.4632  R@10=0.5648  R@50=0.7900




Epoch 074: loss=1.0705 | R@1=0.2384  R@5=0.4622  R@10=0.5660  R@50=0.7912




Epoch 076: loss=1.0688 | R@1=0.2388  R@5=0.4634  R@10=0.5654  R@50=0.7910




Epoch 078: loss=1.0675 | R@1=0.2376  R@5=0.4622  R@10=0.5654  R@50=0.7906




Epoch 080: loss=1.0670 | R@1=0.2390  R@5=0.4640  R@10=0.5656  R@50=0.7910




Epoch 082: loss=1.0670 | R@1=0.2390  R@5=0.4628  R@10=0.5656  R@50=0.7914




Epoch 084: loss=1.0666 | R@1=0.2388  R@5=0.4640  R@10=0.5658  R@50=0.7906




Epoch 086: loss=1.0671 | R@1=0.2376  R@5=0.4638  R@10=0.5660  R@50=0.7922




Epoch 088: loss=1.0660 | R@1=0.2392  R@5=0.4618  R@10=0.5660  R@50=0.7910




Epoch 090: loss=1.0661 | R@1=0.2388  R@5=0.4640  R@10=0.5654  R@50=0.7914




Epoch 092: loss=1.0662 | R@1=0.2384  R@5=0.4642  R@10=0.5658  R@50=0.7912




Epoch 094: loss=1.0649 | R@1=0.2380  R@5=0.4642  R@10=0.5664  R@50=0.7918




Epoch 096: loss=1.0637 | R@1=0.2396  R@5=0.4644  R@10=0.5656  R@50=0.7908




Epoch 098: loss=1.0639 | R@1=0.2392  R@5=0.4656  R@10=0.5660  R@50=0.7902




Epoch 100: loss=1.0625 | R@1=0.2388  R@5=0.4638  R@10=0.5660  R@50=0.7908




Epoch 102: loss=1.0625 | R@1=0.2390  R@5=0.4634  R@10=0.5660  R@50=0.7910




Epoch 104: loss=1.0619 | R@1=0.2384  R@5=0.4644  R@10=0.5666  R@50=0.7908




Epoch 106: loss=1.0621 | R@1=0.2372  R@5=0.4648  R@10=0.5654  R@50=0.7910




Epoch 108: loss=1.0625 | R@1=0.2374  R@5=0.4640  R@10=0.5660  R@50=0.7916




Epoch 110: loss=1.0622 | R@1=0.2376  R@5=0.4648  R@10=0.5656  R@50=0.7920




Epoch 112: loss=1.0606 | R@1=0.2398  R@5=0.4636  R@10=0.5664  R@50=0.7912




Epoch 114: loss=1.0608 | R@1=0.2402  R@5=0.4642  R@10=0.5670  R@50=0.7908
✅ Best model saved (epoch 114, R@10=0.5670)




Epoch 116: loss=1.0609 | R@1=0.2400  R@5=0.4644  R@10=0.5662  R@50=0.7910




Epoch 118: loss=1.0606 | R@1=0.2394  R@5=0.4640  R@10=0.5672  R@50=0.7914
✅ Best model saved (epoch 118, R@10=0.5672)




Epoch 120: loss=1.0606 | R@1=0.2400  R@5=0.4652  R@10=0.5664  R@50=0.7922




Epoch 122: loss=1.0600 | R@1=0.2390  R@5=0.4642  R@10=0.5664  R@50=0.7912




Epoch 124: loss=1.0605 | R@1=0.2390  R@5=0.4644  R@10=0.5660  R@50=0.7910




Epoch 126: loss=1.0596 | R@1=0.2394  R@5=0.4644  R@10=0.5666  R@50=0.7912




Epoch 128: loss=1.0602 | R@1=0.2390  R@5=0.4654  R@10=0.5670  R@50=0.7916




Epoch 130: loss=1.0598 | R@1=0.2396  R@5=0.4646  R@10=0.5670  R@50=0.7914




Epoch 132: loss=1.0590 | R@1=0.2390  R@5=0.4644  R@10=0.5666  R@50=0.7912




Epoch 134: loss=1.0582 | R@1=0.2392  R@5=0.4650  R@10=0.5672  R@50=0.7912




Epoch 136: loss=1.0594 | R@1=0.2394  R@5=0.4646  R@10=0.5670  R@50=0.7912




Epoch 138: loss=1.0593 | R@1=0.2394  R@5=0.4640  R@10=0.5670  R@50=0.7912




Epoch 140: loss=1.0590 | R@1=0.2390  R@5=0.4648  R@10=0.5672  R@50=0.7912




Epoch 142: loss=1.0599 | R@1=0.2392  R@5=0.4648  R@10=0.5670  R@50=0.7914




Epoch 144: loss=1.0582 | R@1=0.2390  R@5=0.4644  R@10=0.5672  R@50=0.7914




Epoch 146: loss=1.0589 | R@1=0.2392  R@5=0.4648  R@10=0.5668  R@50=0.7914




Epoch 148: loss=1.0584 | R@1=0.2392  R@5=0.4648  R@10=0.5668  R@50=0.7914




Epoch 150: loss=1.0585 | R@1=0.2392  R@5=0.4648  R@10=0.5668  R@50=0.7914

🎯 Best R@10 on validation = 0.5672
Model saved as marathon_run_best.pth
Loading best model from marathon_run_best.pth for inference...
Best model loaded.


NameError: name 'tx_text_t' is not defined

In [None]:

# ------------------------------------------------------
# 1. Device and data loading
# ------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

test_data  = np.load("test.clean.npz")
tx_test  = test_data["captions/embeddings"]

# --- We only need the *mean* from the training set for normalization ---
# Load a small chunk to save RAM
train_data_captions = np.load("train.npz")["captions/embeddings"]
tx_mean = torch.as_tensor(train_data_captions.mean(0, keepdims=True), dtype=torch.float32, device=device)

# ------------------------------------------------------
# 2. Define the Model (MUST be identical to the one trained)
# ------------------------------------------------------
class ResidualTranslator(nn.Module):
    def __init__(self, R_init_shape, input_dim=1024, hidden_dim=1024, output_dim=1536):
        super().__init__()
        # We don't need the R_init, just a buffer of the right shape
        self.register_buffer("R", torch.zeros(R_init_shape))
        self.residual = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        base = x @ self.R.T
        res = self.residual(x)
        return F.normalize(base + res, p=2, dim=1)

# ------------------------------------------------------
# 3. Load Data and Model
# ------------------------------------------------------
CHECKPOINT_PATH = "marathon_run_best.pth"
SUBMISSION_PATH = "submission_marathon_run.csv"

# --- Prepare test data ---
tx_test_t  = torch.as_tensor(tx_test,  dtype=torch.float32, device=device)
tx_test_t = F.normalize(tx_test_t - tx_mean, p=2, dim=1)
print("Test data loaded and normalized.")

# --- Load the saved model ---
print(f"Loading best model from {CHECKPOINT_PATH} for inference...")
# We must pass a dummy R_init_shape. We know it's (1536, 1024)
model = ResidualTranslator(
    R_init_shape=(1536, 1024),
    input_dim=1024,
    hidden_dim=1024,
    output_dim=1536
).to(device)

model.load_state_dict(torch.load(CHECKPOINT_PATH))
model.eval()
print("Best model loaded.")

# ------------------------------------------------------
# 4. Inference + Submission (Corrected)
# ------------------------------------------------------
with torch.no_grad():
    preds_list = []
    # --- THIS IS THE FIX: tx_test_t (not tx_text_t) ---
    for i in range(0, len(tx_test_t), 1024):
        chunk = tx_test_t[i:i+1024].to(device)
        preds_list.append(model(chunk))
    preds = torch.cat(preds_list, dim=0).cpu().numpy()
print("Inference complete.")

test_ids = test_data["captions/ids"].astype(int)
submission = pd.DataFrame({
    "id": test_ids,
    "embedding": [list(map(float, row)) for row in preds]
})
submission.to_csv(SUBMISSION_PATH, index=False)
print(f"\n✅ {SUBMISSION_PATH} saved successfully.")


Using device: cuda
Test data loaded and normalized.
Loading best model from marathon_run_best.pth for inference...
Best model loaded.
Inference complete.

✅ submission_marathon_run.csv saved successfully.


# Previous 2 blocks belongs to one fine-tunned experiment

# New Fine-Tunned experiment
# got score as 0.83413

In [None]:
# ------------------------------------------------------
# 1. Device and data loading
# ------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

train_data = np.load("train.npz")
test_data  = np.load("test.clean.npz")

tx_train = train_data["captions/embeddings"]
im_train = train_data["images/embeddings"]
tx_test  = test_data["captions/embeddings"]

repeat_factor = len(tx_train) // len(im_train)
im_train_expanded = np.repeat(im_train, repeat_factor, axis=0)
print(f"Train shapes: {tx_train.shape}, {im_train.shape}, {im_train_expanded.shape}")
print(f"Test shape: {tx_test.shape}")

# ------------------------------------------------------
# 2. Data preprocessing (Centering + Normalization)
# ------------------------------------------------------
tx_train_t = torch.as_tensor(tx_train, dtype=torch.float32, device=device)
im_train_t_unique = torch.as_tensor(im_train, dtype=torch.float32, device=device)
tx_test_t  = torch.as_tensor(tx_test,  dtype=torch.float32, device=device)

tx_mean = tx_train_t.mean(0, keepdim=True)
im_mean = im_train_t_unique.mean(0, keepdim=True)
tx_train_t = F.normalize(tx_train_t - tx_mean, p=2, dim=1)
im_train_t_unique = F.normalize(im_train_t_unique - im_mean, p=2, dim=1)
tx_test_t = F.normalize(tx_test_t - tx_mean, p=2, dim=1)

im_train_exp = torch.as_tensor(im_train_expanded, dtype=torch.float32, device=device)
im_train_exp = F.normalize(im_train_exp - im_mean, p=2, dim=1)
print("Data preprocessed and normalized.")

# ------------------------------------------------------
# 3. Orthogonal Procrustes base (R)
# ------------------------------------------------------
# We only need the shape to initialize the model class
R_SHAPE = (1536, 1024)
print(f"Base R shape: {R_SHAPE}")

# ------------------------------------------------------
# 4. MODEL: ResidualTranslator (Our 0.83359 Kaggle Winner)
# ------------------------------------------------------
class ResidualTranslator(nn.Module):
    def __init__(self, R_init_shape, input_dim=1024, hidden_dim=1024, output_dim=1536):
        super().__init__()
        self.register_buffer("R", torch.zeros(R_init_shape))
        self.residual = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        base = x @ self.R.T
        res = self.residual(x)
        return F.normalize(base + res, p=2, dim=1)

# ------------------------------------------------------
# 5. Loss Functions (Our Winning Hyperparams)
# ------------------------------------------------------
TAU = 0.05       # Our winning stricter temperature
MARGIN = 0.25    # Our winning stricter margin
LOSS_WEIGHT_CONTRASTIVE = 0.7
LOSS_WEIGHT_TRIPLET = 0.3

triplet_loss_fn = nn.TripletMarginWithDistanceLoss(
    distance_function=lambda x, y: 1 - F.cosine_similarity(x, y),
    margin=MARGIN
)

# ------------------------------------------------------
# 6. Validation split (Unchanged)
# ------------------------------------------------------
N = len(tx_train_t)
val_size = int(0.1 * N)
idx_cpu = torch.randperm(N, device="cpu")
val_idx, train_idx = idx_cpu[:val_size], idx_cpu[val_size:]

img_indices_train = (train_idx // 5).to(device)
img_indices_val = (val_idx // 5).to(device)

tx_val_t, im_val_t = tx_train_t[val_idx], im_train_exp[val_idx]
tx_train_t_sub, im_train_exp_sub = tx_train_t[train_idx], im_train_exp[train_idx]

train_dataset = TensorDataset(tx_train_t_sub, im_train_exp_sub, img_indices_train)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=0)
print(f"Train pairs: {len(train_dataset)}, Validation pairs: {len(val_idx)}")

# ------------------------------------------------------
# 7. Recall@K utility (Unchanged)
# ------------------------------------------------------
@torch.no_grad()
def recall_at_k(model, tx_queries, im_database, query_img_indices, repeat_factor=5, ks=(1, 5, 10, 50), k_csls=10):
    model.eval()
    preds_list = []
    # Process in chunks to avoid OOM on validation
    for i in range(0, len(tx_queries), 1024):
        chunk = tx_queries[i:i+1024].to(device)
        preds_list.append(model(chunk))
    preds = torch.cat(preds_list, dim=0)

    im_database = im_database.to(device)
    query_img_indices = query_img_indices.to(device)

    # CSLS calculation
    sim = preds @ im_database.T

    knn_q = torch.topk(sim, k=k_csls, dim=1).values
    mean_knn_q = knn_q.mean(1, keepdim=True)

    knn_d = torch.topk(sim.T, k=k_csls, dim=1).values
    mean_knn_d = knn_d.mean(1, keepdim=True).T

    csls_sim = 2 * sim - mean_knn_q - mean_knn_d

    gt = query_img_indices

    top_indices = torch.argsort(csls_sim, dim=1, descending=True)

    recalls = {}
    for k in ks:
        top_k_preds = top_indices[:, :k]
        correct_in_top_k = (top_k_preds == gt.unsqueeze(1)).any(dim=1)
        recall_at_k = correct_in_top_k.float().mean().item()
        recalls[f"R@{k}"] = recall_at_k

    return recalls

# ------------------------------------------------------
# 8. Training Loop (THE "FINAL FINE-TUNE" RUN)
# ------------------------------------------------------
EPOCHS = 50 # Max epochs for fine-tuning
START_LR = 1e-5 # Start with a low learning rate
WEIGHT_DECAY = 5e-5
LOAD_PATH = "marathon_run_best.pth"
SAVE_PATH = "final_finetune_best.pth" # New save path

val_query_subset = tx_val_t[:5000]
val_indices_subset = img_indices_val[:5000]
val_db_subset = im_train_t_unique

# --- Initialize WINNING model ---
model = ResidualTranslator(
    R_init_shape=R_SHAPE,
    input_dim=1024,
    hidden_dim=1024, # Our 0.83359 winner's capacity
    output_dim=1536
).to(device)

# --- Load our best-ever model ---
try:
    model.load_state_dict(torch.load(LOAD_PATH))
    print(f"✅ Loaded weights from '{LOAD_PATH}'. Starting fine-tune.")
except Exception as e:
    print(f"FATAL: Could not load '{LOAD_PATH}'. Stopping. Error: {e}")
    # Stop the script if we can't load the model
    raise e

best_r10 = 0.5672 # Start from our marathon-run's best score

optimizer = torch.optim.AdamW(model.parameters(), lr=START_LR, weight_decay=WEIGHT_DECAY)

# --- This is the new, smart scheduler ---
# It will monitor the R@10 score ('max' mode)
# 'patience=5': Wait 5 epochs for an improvement
# 'factor=0.5': If no improvement, cut LR in half
# 'min_lr=1e-7': Don't go lower than this
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, 'max',
    patience=5,
    factor=0.5,
    min_lr=1e-7
    # verbose=True was removed, it is deprecated.
)
scaler = torch.cuda.amp.GradScaler() # Mixed precision

print(f"\nTraining Final Fine-Tune (Epochs: {EPOCHS}, Start LR={START_LR})...\n")

for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0.0

    for x_batch, y_batch, img_indices in tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}", leave=False):
        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            y_pred = model(x_batch)

            # --- Our proven winning loss combo ---
            sims_with_tau = y_pred @ y_batch.T / TAU
            labels = torch.arange(y_pred.size(0), device=device)
            loss_con = F.cross_entropy(sims_with_tau, labels)

            with torch.no_grad():
                sims_no_tau = y_pred @ y_batch.T
                positive_mask = (img_indices.unsqueeze(1) == img_indices.unsqueeze(0))
                sims_no_tau.masked_fill_(positive_mask, -float('inf'))
                hard_neg_idx = sims_no_tau.argmax(dim=1)

            y_hard_neg = y_batch[hard_neg_idx]
            loss_tri = triplet_loss_fn(y_pred, y_batch, y_hard_neg)

            loss = (LOSS_WEIGHT_CONTRASTIVE * loss_con) + (LOSS_WEIGHT_TRIPLET * loss_tri)

        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)

    # ----- Validation (MUST run every epoch for the scheduler) -----
    rec = recall_at_k(model, val_query_subset, val_db_subset, val_indices_subset)
    current_r10 = rec['R@10']

    # --- THIS IS THE CORRECTED PRINT STATEMENT ---
    print(f"Epoch {epoch:02d}: loss={avg_loss:.4f} | R@1(Val)={rec['R@1']:.4f}  R@5(Val)={rec['R@5']:.4f}  R@10(Val)={current_r10:.4f}  R@50(Val)={rec['R@50']:.4f}")

    if current_r10 > best_r10:
        best_r10 = current_r10
        torch.save(model.state_dict(), SAVE_PATH)
        print(f"✅ Best model saved (epoch {epoch}, R@10={best_r10:.4f})")

    # Step the scheduler based on the R@10 score
    scheduler.step(current_r10)

    # Manually print the LR to replace 'verbose'
    current_lr = optimizer.param_groups[0]['lr']
    if 'last_lr' not in locals(): locals()['last_lr'] = START_LR
    if current_lr != locals()['last_lr']:
        print(f"--- Learning rate reduced to {current_lr:.1e} ---")
        locals()['last_lr'] = current_lr

    # Check if LR is at minimum (early stopping)
    if current_lr <= (1e-7 + 1e-9):
        print(f"Learning rate at minimum. Stopping training early at epoch {epoch}.")
        break

# --- This block is now DE-DENTED ---
print(f"\n🎯 Best R@10 on validation = {best_r10:.4f}\nModel saved as {SAVE_PATH}")

# ------------------------------------------------------
# 9. Inference + Submission
# ------------------------------------------------------
print(f"Loading best model from {SAVE_PATH} for inference...")
model = ResidualTranslator(
    R_init_shape=R_SHAPE,
    input_dim=1024,
    hidden_dim=1024,
    output_dim=1536
).to(device)
model.load_state_dict(torch.load(SAVE_PATH))
model.eval()
print("Best model loaded.")

with torch.no_grad():
    preds_list = []
    for i in range(0, len(tx_test_t), 1024):
        chunk = tx_test_t[i:i+1024].to(device)
        preds_list.append(model(chunk))
    preds = torch.cat(preds_list, dim=0).cpu().numpy()

test_ids = test_data["captions/ids"].astype(int)
submission = pd.DataFrame({
    "id": test_ids,
    "embedding": [list(map(float, row)) for row in preds]
})
submission.to_csv("submission_final_finetune.csv", index=False)
print("\n✅ submission_final_finetune.csv saved successfully.")



Using device: cuda
Train shapes: (125000, 1024), (25000, 1536), (125000, 1536)
Test shape: (1500, 1024)


  scaler = torch.cuda.amp.GradScaler() # Mixed precision


Data preprocessed and normalized.
Base R shape: (1536, 1024)
Train pairs: 112500, Validation pairs: 12500
✅ Loaded weights from 'marathon_run_best.pth'. Starting fine-tune.

Training Final Fine-Tune (Epochs: 50, Start LR=1e-05)...



  with torch.cuda.amp.autocast():


Epoch 01: loss=1.0723 | R@1(Val)=0.2684  R@5(Val)=0.5124  R@10(Val)=0.6172  R@50(Val)=0.8304
✅ Best model saved (epoch 1, R@10=0.6172)




Epoch 02: loss=1.0722 | R@1(Val)=0.2684  R@5(Val)=0.5110  R@10(Val)=0.6166  R@50(Val)=0.8308




Epoch 03: loss=1.0714 | R@1(Val)=0.2678  R@5(Val)=0.5120  R@10(Val)=0.6162  R@50(Val)=0.8306




Epoch 04: loss=1.0712 | R@1(Val)=0.2680  R@5(Val)=0.5110  R@10(Val)=0.6146  R@50(Val)=0.8298




Epoch 05: loss=1.0706 | R@1(Val)=0.2682  R@5(Val)=0.5106  R@10(Val)=0.6142  R@50(Val)=0.8304




Epoch 06: loss=1.0707 | R@1(Val)=0.2680  R@5(Val)=0.5100  R@10(Val)=0.6134  R@50(Val)=0.8304




Epoch 07: loss=1.0692 | R@1(Val)=0.2678  R@5(Val)=0.5108  R@10(Val)=0.6130  R@50(Val)=0.8298
--- Learning rate reduced to 5.0e-06 ---




Epoch 08: loss=1.0685 | R@1(Val)=0.2674  R@5(Val)=0.5098  R@10(Val)=0.6112  R@50(Val)=0.8298




Epoch 09: loss=1.0690 | R@1(Val)=0.2664  R@5(Val)=0.5096  R@10(Val)=0.6124  R@50(Val)=0.8302




Epoch 10: loss=1.0682 | R@1(Val)=0.2662  R@5(Val)=0.5092  R@10(Val)=0.6120  R@50(Val)=0.8298




Epoch 11: loss=1.0689 | R@1(Val)=0.2662  R@5(Val)=0.5094  R@10(Val)=0.6120  R@50(Val)=0.8300




Epoch 12: loss=1.0690 | R@1(Val)=0.2660  R@5(Val)=0.5088  R@10(Val)=0.6120  R@50(Val)=0.8296




Epoch 13: loss=1.0673 | R@1(Val)=0.2654  R@5(Val)=0.5090  R@10(Val)=0.6120  R@50(Val)=0.8298
--- Learning rate reduced to 2.5e-06 ---




Epoch 14: loss=1.0681 | R@1(Val)=0.2656  R@5(Val)=0.5090  R@10(Val)=0.6124  R@50(Val)=0.8298




Epoch 15: loss=1.0683 | R@1(Val)=0.2652  R@5(Val)=0.5082  R@10(Val)=0.6124  R@50(Val)=0.8298




Epoch 16: loss=1.0683 | R@1(Val)=0.2652  R@5(Val)=0.5084  R@10(Val)=0.6124  R@50(Val)=0.8296




Epoch 17: loss=1.0676 | R@1(Val)=0.2654  R@5(Val)=0.5086  R@10(Val)=0.6130  R@50(Val)=0.8294




Epoch 18: loss=1.0682 | R@1(Val)=0.2654  R@5(Val)=0.5086  R@10(Val)=0.6122  R@50(Val)=0.8290




Epoch 19: loss=1.0675 | R@1(Val)=0.2658  R@5(Val)=0.5080  R@10(Val)=0.6122  R@50(Val)=0.8296
--- Learning rate reduced to 1.3e-06 ---




Epoch 20: loss=1.0676 | R@1(Val)=0.2656  R@5(Val)=0.5086  R@10(Val)=0.6124  R@50(Val)=0.8296




Epoch 21: loss=1.0679 | R@1(Val)=0.2656  R@5(Val)=0.5084  R@10(Val)=0.6124  R@50(Val)=0.8296




Epoch 22: loss=1.0683 | R@1(Val)=0.2656  R@5(Val)=0.5086  R@10(Val)=0.6124  R@50(Val)=0.8296




Epoch 23: loss=1.0674 | R@1(Val)=0.2654  R@5(Val)=0.5084  R@10(Val)=0.6120  R@50(Val)=0.8296




Epoch 24: loss=1.0681 | R@1(Val)=0.2656  R@5(Val)=0.5082  R@10(Val)=0.6120  R@50(Val)=0.8296




Epoch 25: loss=1.0670 | R@1(Val)=0.2656  R@5(Val)=0.5084  R@10(Val)=0.6126  R@50(Val)=0.8292
--- Learning rate reduced to 6.3e-07 ---




Epoch 26: loss=1.0670 | R@1(Val)=0.2656  R@5(Val)=0.5084  R@10(Val)=0.6122  R@50(Val)=0.8294




Epoch 27: loss=1.0670 | R@1(Val)=0.2658  R@5(Val)=0.5084  R@10(Val)=0.6118  R@50(Val)=0.8292




Epoch 28: loss=1.0660 | R@1(Val)=0.2656  R@5(Val)=0.5082  R@10(Val)=0.6120  R@50(Val)=0.8294




Epoch 29: loss=1.0676 | R@1(Val)=0.2656  R@5(Val)=0.5082  R@10(Val)=0.6120  R@50(Val)=0.8292




Epoch 30: loss=1.0675 | R@1(Val)=0.2658  R@5(Val)=0.5084  R@10(Val)=0.6120  R@50(Val)=0.8294




Epoch 31: loss=1.0665 | R@1(Val)=0.2656  R@5(Val)=0.5084  R@10(Val)=0.6118  R@50(Val)=0.8294
--- Learning rate reduced to 3.1e-07 ---




Epoch 32: loss=1.0688 | R@1(Val)=0.2656  R@5(Val)=0.5084  R@10(Val)=0.6122  R@50(Val)=0.8294




Epoch 33: loss=1.0681 | R@1(Val)=0.2654  R@5(Val)=0.5084  R@10(Val)=0.6120  R@50(Val)=0.8294




Epoch 34: loss=1.0671 | R@1(Val)=0.2656  R@5(Val)=0.5084  R@10(Val)=0.6120  R@50(Val)=0.8294




Epoch 35: loss=1.0675 | R@1(Val)=0.2656  R@5(Val)=0.5084  R@10(Val)=0.6122  R@50(Val)=0.8294




Epoch 36: loss=1.0670 | R@1(Val)=0.2658  R@5(Val)=0.5084  R@10(Val)=0.6122  R@50(Val)=0.8294




Epoch 37: loss=1.0676 | R@1(Val)=0.2658  R@5(Val)=0.5080  R@10(Val)=0.6126  R@50(Val)=0.8292
--- Learning rate reduced to 1.6e-07 ---




Epoch 38: loss=1.0670 | R@1(Val)=0.2658  R@5(Val)=0.5080  R@10(Val)=0.6124  R@50(Val)=0.8292




Epoch 39: loss=1.0665 | R@1(Val)=0.2656  R@5(Val)=0.5082  R@10(Val)=0.6124  R@50(Val)=0.8294




Epoch 40: loss=1.0682 | R@1(Val)=0.2656  R@5(Val)=0.5082  R@10(Val)=0.6124  R@50(Val)=0.8294




Epoch 41: loss=1.0672 | R@1(Val)=0.2656  R@5(Val)=0.5082  R@10(Val)=0.6122  R@50(Val)=0.8294




Epoch 42: loss=1.0669 | R@1(Val)=0.2656  R@5(Val)=0.5082  R@10(Val)=0.6124  R@50(Val)=0.8294




Epoch 43: loss=1.0669 | R@1(Val)=0.2656  R@5(Val)=0.5082  R@10(Val)=0.6124  R@50(Val)=0.8294
--- Learning rate reduced to 1.0e-07 ---
Learning rate at minimum. Stopping training early at epoch 43.

🎯 Best R@10 on validation = 0.6172
Model saved as final_finetune_best.pth
Loading best model from final_finetune_best.pth for inference...
Best model loaded.

✅ submission_final_finetune.csv saved successfully.
