# Experiment 1

In [3]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import os

BASE_DIR = "/content/drive/MyDrive/AML Challenge"
os.chdir(BASE_DIR)
print("Current working directory:", os.getcwd())


Current working directory: /content/drive/MyDrive/AML Challenge


In [None]:
!pip -q install torch torchvision torchaudio
!pip -q install faiss-cpu numpy pandas scikit-learn matplotlib tqdm


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# ======================================================
#   STEP 1: Setup and data loading
# ======================================================
import numpy as np, torch, torch.nn as nn, torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import os

train_data = np.load("train.npz")
test_data  = np.load("test.clean.npz")

tx_train = train_data["captions/embeddings"]   # (125000, 1024)
im_train = train_data["images/embeddings"]     # (25000, 1536)
tx_test  = test_data["captions/embeddings"]    # (1500, 1024)

print(tx_train.shape, im_train.shape, tx_test.shape)

# match 1 caption → 1 image
# every 5 captions share the same image index
repeat_factor = len(tx_train) // len(im_train)    # 5
im_train_expanded = np.repeat(im_train, repeat_factor, axis=0)
print("Expanded image embeddings:", im_train_expanded.shape)




(125000, 1024) (25000, 1536) (1500, 1024)
Expanded image embeddings: (125000, 1536)


In [None]:
# ======================================================
#   STEP 2: Dataset + model
# ======================================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class PairDataset(Dataset):
    def __init__(self, x, y):
        self.x = torch.from_numpy(x).float()
        self.y = torch.from_numpy(y).float()
    def __len__(self): return len(self.x)
    def __getitem__(self, i): return self.x[i], self.y[i]

train_ds = PairDataset(tx_train, im_train_expanded)
train_dl = DataLoader(train_ds, batch_size=1024, shuffle=True)

d_text, d_image = tx_train.shape[1], im_train.shape[1]

class Translator(nn.Module):
    def __init__(self, d_in, d_out):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d_in, 2048),
            nn.ReLU(),
            nn.Linear(2048, d_out)
        )
    def forward(self, x):
        return self.net(x)

model = Translator(d_text, d_image).to(device)
opt = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)


In [None]:
# ======================================================
#   STEP 3: Training loop (cosine loss)
# ======================================================
epochs = 30
for ep in range(epochs):
    model.train()
    total = 0
    for xb, yb in tqdm(train_dl, desc=f"Epoch {ep+1}/{epochs}"):
        xb, yb = xb.to(device), yb.to(device)
        pred = model(xb)
        loss = 1 - F.cosine_similarity(pred, yb).mean()
        opt.zero_grad(); loss.backward(); opt.step()
        total += loss.item() * xb.size(0)
    print(f"Epoch {ep+1}: loss={total/len(train_ds):.4f}")


Epoch 1/30: 100%|██████████| 123/123 [00:02<00:00, 50.72it/s]


Epoch 1: loss=0.1461


Epoch 2/30: 100%|██████████| 123/123 [00:02<00:00, 57.61it/s]


Epoch 2: loss=0.1448


Epoch 3/30: 100%|██████████| 123/123 [00:02<00:00, 57.58it/s]


Epoch 3: loss=0.1436


Epoch 4/30: 100%|██████████| 123/123 [00:02<00:00, 51.52it/s]


Epoch 4: loss=0.1424


Epoch 5/30: 100%|██████████| 123/123 [00:02<00:00, 52.02it/s]


Epoch 5: loss=0.1413


Epoch 6/30: 100%|██████████| 123/123 [00:02<00:00, 60.95it/s]


Epoch 6: loss=0.1401


Epoch 7/30: 100%|██████████| 123/123 [00:02<00:00, 56.82it/s]


Epoch 7: loss=0.1391


Epoch 8/30: 100%|██████████| 123/123 [00:02<00:00, 60.57it/s]


Epoch 8: loss=0.1382


Epoch 9/30: 100%|██████████| 123/123 [00:02<00:00, 61.42it/s]


Epoch 9: loss=0.1372


Epoch 10/30: 100%|██████████| 123/123 [00:02<00:00, 51.05it/s]


Epoch 10: loss=0.1364


Epoch 11/30: 100%|██████████| 123/123 [00:02<00:00, 46.68it/s]


Epoch 11: loss=0.1355


Epoch 12/30: 100%|██████████| 123/123 [00:02<00:00, 47.08it/s]


Epoch 12: loss=0.1347


Epoch 13/30: 100%|██████████| 123/123 [00:02<00:00, 56.89it/s]


Epoch 13: loss=0.1339


Epoch 14/30: 100%|██████████| 123/123 [00:02<00:00, 56.73it/s]


Epoch 14: loss=0.1332


Epoch 15/30: 100%|██████████| 123/123 [00:02<00:00, 55.51it/s]


Epoch 15: loss=0.1324


Epoch 16/30: 100%|██████████| 123/123 [00:02<00:00, 46.56it/s]


Epoch 16: loss=0.1318


Epoch 17/30: 100%|██████████| 123/123 [00:02<00:00, 55.32it/s]


Epoch 17: loss=0.1311


Epoch 18/30: 100%|██████████| 123/123 [00:02<00:00, 53.41it/s]


Epoch 18: loss=0.1304


Epoch 19/30: 100%|██████████| 123/123 [00:02<00:00, 56.89it/s]


Epoch 19: loss=0.1298


Epoch 20/30: 100%|██████████| 123/123 [00:02<00:00, 55.98it/s]


Epoch 20: loss=0.1293


Epoch 21/30: 100%|██████████| 123/123 [00:02<00:00, 48.88it/s]


Epoch 21: loss=0.1287


Epoch 22/30: 100%|██████████| 123/123 [00:02<00:00, 49.30it/s]


Epoch 22: loss=0.1281


Epoch 23/30: 100%|██████████| 123/123 [00:02<00:00, 57.07it/s]


Epoch 23: loss=0.1276


Epoch 24/30: 100%|██████████| 123/123 [00:02<00:00, 53.70it/s]


Epoch 24: loss=0.1271


Epoch 25/30: 100%|██████████| 123/123 [00:02<00:00, 56.24it/s]


Epoch 25: loss=0.1265


Epoch 26/30: 100%|██████████| 123/123 [00:02<00:00, 57.06it/s]


Epoch 26: loss=0.1261


Epoch 27/30: 100%|██████████| 123/123 [00:02<00:00, 48.51it/s]


Epoch 27: loss=0.1256


Epoch 28/30: 100%|██████████| 123/123 [00:02<00:00, 53.02it/s]


Epoch 28: loss=0.1251


Epoch 29/30: 100%|██████████| 123/123 [00:02<00:00, 57.13it/s]


Epoch 29: loss=0.1248


Epoch 30/30: 100%|██████████| 123/123 [00:02<00:00, 52.45it/s]

Epoch 30: loss=0.1244





In [None]:
# ======================================================
#   STEP 4: Generate predictions for test captions
# ======================================================
model.eval()
with torch.no_grad():
    preds = model(torch.from_numpy(tx_test).float().to(device))
    preds = F.normalize(preds, dim=1)           # normalize for cosine similarity
    preds = preds.cpu().numpy().astype("float32")

np.save("preds_test_image_space.npy", preds)
print("Saved predicted test embeddings:", preds.shape)


Saved predicted test embeddings: (1500, 1536)


In [None]:
# ======================================================
#   STEP 5: Optional – quick retrieval sanity check
# ======================================================
import faiss

gallery = im_train.astype("float32")
faiss.normalize_L2(gallery)

index = faiss.IndexFlatIP(gallery.shape[1])
index.add(gallery)

faiss.normalize_L2(preds)
D, I = index.search(preds, 5)
print("Sample top-5 image indices for first test caption:", I[0])


Sample top-5 image indices for first test caption: [ 3123 15463  9288 17203 24769]


In [None]:
# ======================================================
#   STEP 6: Build Kaggle-style submission file
# ======================================================
import faiss, pandas as pd, numpy as np, os, torch.nn.functional as F

# 1️⃣ Load gallery (train) and predicted test embeddings
gallery = im_train.astype("float32")
faiss.normalize_L2(gallery)                      # normalize for cosine similarity

preds = np.load("preds_test_image_space.npy").astype("float32")
faiss.normalize_L2(preds)

# 2️⃣ Create FAISS index
index = faiss.IndexFlatIP(gallery.shape[1])
index.add(gallery)

# 3️⃣ Search top-K most similar images for each test caption
TOP_K = 100
D, I = index.search(preds, TOP_K)   # I: (num_test, TOP_K)

print(f"Retrieved {TOP_K} nearest images for {len(I)} test captions")

# 4️⃣ Recover the corresponding image names (gallery IDs)
image_names = train_data["images/names"]  # same order as im_train
image_ids   = [os.path.splitext(n)[0] for n in image_names]  # drop .jpg

# 5️⃣ Test caption IDs (for the left column in submission)
test_ids = test_data["captions/ids"]

# 6️⃣ Build submission rows: each test_id with 100 image IDs space-separated
rows = []
for row_idx, test_id in enumerate(test_ids):
    retrieved_ids = " ".join(str(image_ids[idx]) for idx in I[row_idx])
    rows.append({"Id": int(test_id), "Predicted": retrieved_ids})

submission = pd.DataFrame(rows)
submission.rename(columns={"Id": "id", "Predicted": "preds"}, inplace=True)
submission.to_csv("submission.csv", index=False)
print("✅ Saved submission.csv at:", os.path.abspath("submission.csv"))
submission.head(3)



Retrieved 100 nearest images for 1500 test captions
✅ submission.csv created: /content/drive/MyDrive/AML Challenge/submission.csv


Unnamed: 0,id,embedding
0,,
1,,


In [None]:
# ======================================================
#  FINAL: Kaggle submission format (embedding JSON array)
# ======================================================
import pandas as pd
import numpy as np
import json
import os

# preds_test_image_space.npy → (num_test, D)
preds = np.load("preds_test_image_space.npy")

# Load the test caption IDs
test_ids = test_data["captions/ids"]

# Build submission rows
rows = []
for idx, test_id in enumerate(test_ids):
    embedding_json = json.dumps(preds[idx].tolist())  # convert numpy array → JSON string
    rows.append({"id": int(test_id), "embedding": embedding_json})

# Create DataFrame
submission = pd.DataFrame(rows, columns=["id", "embedding"])

# Save CSV
submission.to_csv("submission2.csv", index=False)
print("✅ submission.csv created:", os.path.abspath("submission.csv"))
submission.head(2)


✅ submission.csv created: /content/drive/MyDrive/AML Challenge/submission.csv


Unnamed: 0,id,embedding
0,1,"[0.03023066371679306, 0.009858203120529652, 0...."
1,2,"[0.022392412647604942, -0.0035140616819262505,..."


# Experiment 2

In [None]:
import os

BASE_DIR = "/content/drive/MyDrive/AML Challenge"
os.chdir(BASE_DIR)
print("Current working directory:", os.getcwd())


# ======================================================
#   STEP 1: Setup and data loading
# ======================================================
import numpy as np, torch, torch.nn as nn, torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import os

train_data = np.load("train.npz")
test_data  = np.load("test.clean.npz")

tx_train = train_data["captions/embeddings"]   # (125000, 1024)
im_train = train_data["images/embeddings"]     # (25000, 1536)
tx_test  = test_data["captions/embeddings"]    # (1500, 1024)

print(tx_train.shape, im_train.shape, tx_test.shape)

# match 1 caption → 1 image
# every 5 captions share the same image index
repeat_factor = len(tx_train) // len(im_train)    # 5
im_train_expanded = np.repeat(im_train, repeat_factor, axis=0)
print("Expanded image embeddings:", im_train_expanded.shape)




Current working directory: /content/drive/MyDrive/AML Challenge
(125000, 1024) (25000, 1536) (1500, 1024)
Expanded image embeddings: (125000, 1536)


In [None]:
# ======================================================
#   STEP 2: Prepare Tensors and Dataloaders
# ======================================================

# Convert numpy arrays to torch tensors
tx_train = torch.tensor(tx_train, dtype=torch.float32)
im_train_expanded = torch.tensor(im_train_expanded, dtype=torch.float32)

# Normalize embeddings to unit sphere
tx_train = F.normalize(tx_train, p=2, dim=1)
im_train_expanded = F.normalize(im_train_expanded, p=2, dim=1)

# Create TensorDataset and DataLoader
batch_size = 500
train_ds = torch.utils.data.TensorDataset(tx_train, im_train_expanded)
train_loader = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


# ======================================================
#   STEP 3: Orthogonal Mapping + Contrastive Loss
# ======================================================

class OrthogonalMapping(nn.Module):
    def __init__(self, in_dim, out_dim):
        super().__init__()
        # Initialize W orthogonally (via QR)
        W = torch.randn(out_dim, in_dim)
        W, _ = torch.linalg.qr(W)
        self.W = nn.Parameter(W)
        self.bias = nn.Parameter(torch.zeros(out_dim))  # allows affine shift

    def forward(self, x):
        return F.linear(x, self.W, self.bias)


# Instantiate model
model = OrthogonalMapping(in_dim=tx_train.shape[1], out_dim=im_train_expanded.shape[1]).to(device)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training hyperparameters
temperature = 0.07
orth_lambda = 0.1
epochs = 10


# ======================================================
#   STEP 4 (FIXED): Training Loop
# ======================================================

temperature = 0.05
orth_lambda = 1e-6       # much smaller!
epochs = 50              # train longer since gradients are weaker
clip_grad_norm = 1

for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    for tx_batch, im_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
        tx_batch, im_batch = tx_batch.to(device), im_batch.to(device)

        tx_mapped = F.normalize(model(tx_batch), p=2, dim=1)
        im_batch = F.normalize(im_batch, p=2, dim=1)

        # similarity matrix
        sims = tx_mapped @ im_batch.T   # values in [-1,1]
        logits = sims / temperature
        targets = torch.arange(sims.size(0), device=device)

        # contrastive loss (InfoNCE)
        loss_contrastive = F.cross_entropy(logits, targets)

        # orthogonality regularization (small weight!)
        W = model.W
        I = torch.eye(W.size(0), device=device)
        orth_penalty = torch.norm(W @ W.T - I, p='fro')**2

        loss = loss_contrastive + orth_lambda * orth_penalty

        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip_grad_norm)
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs} | Avg Loss: {total_loss/len(train_loader):.4f}")



# ======================================================
#   STEP 5: Evaluation — Text→Image Retrieval
# ======================================================

# Normalize all test text embeddings
tx_test = torch.tensor(tx_test, dtype=torch.float32).to(device)
tx_test = F.normalize(tx_test, p=2, dim=1)
im_train_full = F.normalize(torch.tensor(im_train, dtype=torch.float32).to(device), p=2, dim=1)

model.eval()
with torch.no_grad():
    tx_test_mapped = F.normalize(model(tx_test), p=2, dim=1)
    sims = tx_test_mapped @ im_train_full.T  # (1500, 25000)

# Compute simple retrieval metric — Recall@K
def recall_at_k(sims, K=10):
    topk = sims.topk(K, dim=1).indices
    # image index for each caption = floor(idx/5)
    true_img_idx = torch.arange(sims.size(0), device=device) // 5
    preds = topk // 5
    correct = (preds == true_img_idx.unsqueeze(1)).any(dim=1).float()
    return correct.mean().item()

for K in [1, 5, 10, 50]:
    print(f"Recall@{K}: {recall_at_k(sims, K):.4f}")


  tx_train = torch.tensor(tx_train, dtype=torch.float32)
  im_train_expanded = torch.tensor(im_train_expanded, dtype=torch.float32)


Using device: cuda


Epoch 1/50: 100%|██████████| 250/250 [00:02<00:00, 88.21it/s]


Epoch 1/50 | Avg Loss: 2.8782


Epoch 2/50: 100%|██████████| 250/250 [00:03<00:00, 82.67it/s]


Epoch 2/50 | Avg Loss: 2.4403


Epoch 3/50: 100%|██████████| 250/250 [00:02<00:00, 94.01it/s]


Epoch 3/50 | Avg Loss: 2.3454


Epoch 4/50: 100%|██████████| 250/250 [00:02<00:00, 92.35it/s]


Epoch 4/50 | Avg Loss: 2.2954


Epoch 5/50: 100%|██████████| 250/250 [00:02<00:00, 97.58it/s]


Epoch 5/50 | Avg Loss: 2.2673


Epoch 6/50: 100%|██████████| 250/250 [00:02<00:00, 90.29it/s]


Epoch 6/50 | Avg Loss: 2.2454


Epoch 7/50: 100%|██████████| 250/250 [00:03<00:00, 80.68it/s]


Epoch 7/50 | Avg Loss: 2.2309


Epoch 8/50: 100%|██████████| 250/250 [00:02<00:00, 96.22it/s]


Epoch 8/50 | Avg Loss: 2.2221


Epoch 9/50: 100%|██████████| 250/250 [00:02<00:00, 92.18it/s]


Epoch 9/50 | Avg Loss: 2.2131


Epoch 10/50: 100%|██████████| 250/250 [00:02<00:00, 91.62it/s]


Epoch 10/50 | Avg Loss: 2.2077


Epoch 11/50: 100%|██████████| 250/250 [00:02<00:00, 93.29it/s]


Epoch 11/50 | Avg Loss: 2.2009


Epoch 12/50: 100%|██████████| 250/250 [00:03<00:00, 81.25it/s]


Epoch 12/50 | Avg Loss: 2.1985


Epoch 13/50: 100%|██████████| 250/250 [00:02<00:00, 92.67it/s]


Epoch 13/50 | Avg Loss: 2.1942


Epoch 14/50: 100%|██████████| 250/250 [00:02<00:00, 96.73it/s]


Epoch 14/50 | Avg Loss: 2.1926


Epoch 15/50: 100%|██████████| 250/250 [00:02<00:00, 92.57it/s]


Epoch 15/50 | Avg Loss: 2.1889


Epoch 16/50: 100%|██████████| 250/250 [00:02<00:00, 90.34it/s]


Epoch 16/50 | Avg Loss: 2.1868


Epoch 17/50: 100%|██████████| 250/250 [00:02<00:00, 83.38it/s]


Epoch 17/50 | Avg Loss: 2.1868


Epoch 18/50: 100%|██████████| 250/250 [00:02<00:00, 91.60it/s]


Epoch 18/50 | Avg Loss: 2.1836


Epoch 19/50: 100%|██████████| 250/250 [00:02<00:00, 92.02it/s]


Epoch 19/50 | Avg Loss: 2.1836


Epoch 20/50: 100%|██████████| 250/250 [00:02<00:00, 97.22it/s]


Epoch 20/50 | Avg Loss: 2.1813


Epoch 21/50: 100%|██████████| 250/250 [00:02<00:00, 88.01it/s]


Epoch 21/50 | Avg Loss: 2.1825


Epoch 22/50: 100%|██████████| 250/250 [00:03<00:00, 80.57it/s]


Epoch 22/50 | Avg Loss: 2.1802


Epoch 23/50: 100%|██████████| 250/250 [00:02<00:00, 96.47it/s]


Epoch 23/50 | Avg Loss: 2.1794


Epoch 24/50: 100%|██████████| 250/250 [00:02<00:00, 91.23it/s]


Epoch 24/50 | Avg Loss: 2.1806


Epoch 25/50: 100%|██████████| 250/250 [00:02<00:00, 97.01it/s]


Epoch 25/50 | Avg Loss: 2.1779


Epoch 26/50: 100%|██████████| 250/250 [00:02<00:00, 85.23it/s]


Epoch 26/50 | Avg Loss: 2.1793


Epoch 27/50: 100%|██████████| 250/250 [00:02<00:00, 84.44it/s]


Epoch 27/50 | Avg Loss: 2.1779


Epoch 28/50: 100%|██████████| 250/250 [00:02<00:00, 97.05it/s]


Epoch 28/50 | Avg Loss: 2.1786


Epoch 29/50: 100%|██████████| 250/250 [00:02<00:00, 92.77it/s]


Epoch 29/50 | Avg Loss: 2.1768


Epoch 30/50: 100%|██████████| 250/250 [00:02<00:00, 98.06it/s]


Epoch 30/50 | Avg Loss: 2.1769


Epoch 31/50: 100%|██████████| 250/250 [00:02<00:00, 83.74it/s]


Epoch 31/50 | Avg Loss: 2.1780


Epoch 32/50: 100%|██████████| 250/250 [00:02<00:00, 85.84it/s]


Epoch 32/50 | Avg Loss: 2.1762


Epoch 33/50: 100%|██████████| 250/250 [00:02<00:00, 92.85it/s]


Epoch 33/50 | Avg Loss: 2.1768


Epoch 34/50: 100%|██████████| 250/250 [00:02<00:00, 97.22it/s]


Epoch 34/50 | Avg Loss: 2.1749


Epoch 35/50: 100%|██████████| 250/250 [00:02<00:00, 97.42it/s]


Epoch 35/50 | Avg Loss: 2.1757


Epoch 36/50: 100%|██████████| 250/250 [00:03<00:00, 77.47it/s]


Epoch 36/50 | Avg Loss: 2.1763


Epoch 37/50: 100%|██████████| 250/250 [00:02<00:00, 94.55it/s]


Epoch 37/50 | Avg Loss: 2.1744


Epoch 38/50: 100%|██████████| 250/250 [00:02<00:00, 92.76it/s]


Epoch 38/50 | Avg Loss: 2.1756


Epoch 39/50: 100%|██████████| 250/250 [00:02<00:00, 96.64it/s]


Epoch 39/50 | Avg Loss: 2.1744


Epoch 40/50: 100%|██████████| 250/250 [00:02<00:00, 96.45it/s]


Epoch 40/50 | Avg Loss: 2.1751


Epoch 41/50: 100%|██████████| 250/250 [00:03<00:00, 76.55it/s]


Epoch 41/50 | Avg Loss: 2.1760


Epoch 42/50: 100%|██████████| 250/250 [00:02<00:00, 92.43it/s]


Epoch 42/50 | Avg Loss: 2.1748


Epoch 43/50: 100%|██████████| 250/250 [00:02<00:00, 97.01it/s]


Epoch 43/50 | Avg Loss: 2.1761


Epoch 44/50: 100%|██████████| 250/250 [00:02<00:00, 97.05it/s]


Epoch 44/50 | Avg Loss: 2.1765


Epoch 45/50: 100%|██████████| 250/250 [00:02<00:00, 87.07it/s]


Epoch 45/50 | Avg Loss: 2.1743


Epoch 46/50: 100%|██████████| 250/250 [00:03<00:00, 82.53it/s]


Epoch 46/50 | Avg Loss: 2.1758


Epoch 47/50: 100%|██████████| 250/250 [00:02<00:00, 92.64it/s]


Epoch 47/50 | Avg Loss: 2.1769


Epoch 48/50: 100%|██████████| 250/250 [00:02<00:00, 96.93it/s]


Epoch 48/50 | Avg Loss: 2.1752


Epoch 49/50: 100%|██████████| 250/250 [00:02<00:00, 95.66it/s]


Epoch 49/50 | Avg Loss: 2.1748


Epoch 50/50: 100%|██████████| 250/250 [00:02<00:00, 83.70it/s]
  tx_test = torch.tensor(tx_test, dtype=torch.float32).to(device)


Epoch 50/50 | Avg Loss: 2.1735
Recall@1: 0.0000
Recall@5: 0.0013
Recall@10: 0.0013
Recall@50: 0.0067


# Experiment 3


In [None]:
import os

BASE_DIR = "/content/drive/MyDrive/AML Challenge"
os.chdir(BASE_DIR)
print("Current working directory:", os.getcwd())


# ======================================================
#   STEP 1: Setup and data loading
# ======================================================
import numpy as np, torch, torch.nn as nn, torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import os

train_data = np.load("train.npz")
test_data  = np.load("test.clean.npz")

tx_train = train_data["captions/embeddings"]   # (125000, 1024)
im_train = train_data["images/embeddings"]     # (25000, 1536)
tx_test  = test_data["captions/embeddings"]    # (1500, 1024)

print(tx_train.shape, im_train.shape, tx_test.shape)

# match 1 caption → 1 image
# every 5 captions share the same image index
repeat_factor = len(tx_train) // len(im_train)    # 5
im_train_expanded = np.repeat(im_train, repeat_factor, axis=0)
print("Expanded image embeddings:", im_train_expanded.shape)




Current working directory: /content/drive/MyDrive/AML Challenge
(125000, 1024) (25000, 1536) (1500, 1024)
Expanded image embeddings: (125000, 1536)


In [None]:
# ======================================================
#   EXPERIMENT 3 — Projection MLP + Bidirectional CLIP Loss
# ======================================================
import torch, torch.nn as nn, torch.nn.functional as F
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Normalize and convert to torch tensors
tx_train = torch.as_tensor(tx_train, dtype=torch.float32)
im_train_expanded = torch.as_tensor(im_train_expanded, dtype=torch.float32)
tx_train = F.normalize(tx_train, p=2, dim=1)
im_train_expanded = F.normalize(im_train_expanded, p=2, dim=1)

# Dataloader
batch_size = 512
train_ds = torch.utils.data.TensorDataset(tx_train, im_train_expanded)
train_loader = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=True)

# ======================================================
#   Projection network (non-linear aligner)
# ======================================================
class ProjectionMLP(nn.Module):
    def __init__(self, in_dim, out_dim, hidden=1024):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, out_dim)
        )
    def forward(self, x):
        return self.net(x)

model = ProjectionMLP(in_dim=tx_train.shape[1], out_dim=im_train_expanded.shape[1]).to(device)

# ======================================================
#   CLIP-style bidirectional contrastive loss
# ======================================================
def clip_loss(tx_emb, im_emb, temperature=0.05):
    # cosine similarities scaled by temperature
    logits_per_text  = tx_emb @ im_emb.T / temperature
    logits_per_image = im_emb @ tx_emb.T / temperature
    targets = torch.arange(tx_emb.size(0), device=tx_emb.device)
    loss_t = F.cross_entropy(logits_per_text, targets)
    loss_i = F.cross_entropy(logits_per_image, targets)
    return (loss_t + loss_i) / 2


# ======================================================
#   Group-aware CLIP-style Loss  (5 captions per image)
# ======================================================
def group_clip_loss(tx_emb, im_emb, temperature=0.05, group_size=5):
    B = tx_emb.size(0)
    assert B % group_size == 0, "Batch must be multiple of group_size"
    num_imgs = B // group_size

    # average captions belonging to same image
    im_grouped = im_emb[::group_size]                     # one per image
    tx_grouped = tx_emb.view(num_imgs, group_size, -1).mean(1)

    logits_text_to_img = tx_grouped @ im_grouped.T / temperature
    logits_img_to_text = im_grouped @ tx_grouped.T / temperature
    targets = torch.arange(num_imgs, device=tx_emb.device)

    loss_t = F.cross_entropy(logits_text_to_img, targets)
    loss_i = F.cross_entropy(logits_img_to_text, targets)
    return (loss_t + loss_i) / 2



# ======================================================
#   Training
# ======================================================
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
epochs = 30
temperature = 0.05

for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    for tx_batch, im_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
        tx_batch, im_batch = tx_batch.to(device), im_batch.to(device)
        tx_proj = F.normalize(model(tx_batch), p=2, dim=1)
        im_proj = F.normalize(im_batch, p=2, dim=1)
        loss = group_clip_loss(tx_proj, im_proj, temperature, group_size=5)

        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs} | Avg Loss: {total_loss/len(train_loader):.4f}")

# ======================================================
#   Evaluation — Text→Image Retrieval
# ======================================================
tx_test_t = torch.as_tensor(tx_test, dtype=torch.float32).to(device)
tx_test_t = F.normalize(tx_test_t, p=2, dim=1)
im_train_t = F.normalize(torch.as_tensor(im_train, dtype=torch.float32).to(device), p=2, dim=1)

model.eval()
with torch.no_grad():
    tx_test_proj = F.normalize(model(tx_test_t), p=2, dim=1)
    sims = tx_test_proj @ im_train_t.T  # (1500, 25000)

def recall_at_k(sims, K=10):
    topk = sims.topk(K, dim=1).indices
    true_img_idx = torch.arange(sims.size(0), device=device) // 5
    preds = topk // 5
    correct = (preds == true_img_idx.unsqueeze(1)).any(dim=1).float()
    return correct.mean().item()

for K in [1, 5, 10, 50]:
    print(f"Recall@{K}: {recall_at_k(sims, K):.4f}")


Using device: cuda


Epoch 1/30:   0%|          | 0/244 [00:00<?, ?it/s]


AssertionError: Batch must be multiple of group_size

In [None]:

# ======================================================
#   Save submission.csv for Kaggle
# ======================================================
import pandas as pd, numpy as np
top_indices = torch.argsort(sims, dim=1, descending=True).cpu().numpy()
preds_str = [" ".join(map(str, row)) for row in top_indices]
df_sub = pd.DataFrame({"Id": np.arange(len(preds_str)), "Predicted": preds_str})
df_sub.to_csv("submission.csv", index=False)
print("✅ Saved submission.csv")


# Experiments based on paper ideas

## EXPERIMENT 5A — Orthogonal Procrustes (Centroid Alignment)

In [None]:
import torch, torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

# --- Build per-image text centroids (average of 5 captions) ---
tx_train_t = torch.as_tensor(tx_train, dtype=torch.float32, device=device)
im_train_t = torch.as_tensor(im_train, dtype=torch.float32, device=device)
tx_centroids = tx_train_t.view(-1, 5, tx_train_t.shape[1]).mean(dim=1)   # (25000,1024)

# Normalize both sides
tx_c = F.normalize(tx_centroids, p=2, dim=1)
im_c = F.normalize(im_train_t,  p=2, dim=1)

# --- Orthogonal Procrustes: find rotation R (1536×1024) ---
M = im_c.T @ tx_c                     # (1536×1024)
U, S, Vh = torch.linalg.svd(M, full_matrices=False)
R = U @ Vh                            # orthogonal map

# --- Apply to test captions ---
tx_test_t = torch.as_tensor(tx_test, dtype=torch.float32, device=device)
tx_test_n = F.normalize(tx_test_t, p=2, dim=1)
mapped = F.normalize(tx_test_n @ R.T, p=2, dim=1)     # (1500×1536)
im_base = F.normalize(im_train_t, p=2, dim=1)

# --- Retrieval ---
sims = mapped @ im_base.T
def recall_at_k(sims, K=10):
    topk = sims.topk(K, dim=1).indices
    true_idx = torch.arange(sims.size(0), device=device) // 5
    preds = topk // 5
    return (preds == true_idx.unsqueeze(1)).any(dim=1).float().mean().item()

for K in [1,5,10,50]:
    print(f"Procrustes Recall@{K}: {recall_at_k(sims,K):.4f}")

Using device: cuda
Procrustes Recall@1: 0.0000
Procrustes Recall@5: 0.0007
Procrustes Recall@10: 0.0020
Procrustes Recall@50: 0.0093


# EXPERIMENT 5B — Whitening–Coloring Transform (Affine Alignment)

In [None]:
# ======================================================
#   EXPERIMENT 5B (fixed for unequal dims) — Rectangular WCT / CORAL
# ======================================================
import torch, torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

tx_train_t = torch.as_tensor(tx_train, dtype=torch.float32, device=device)
im_train_t = torch.as_tensor(im_train, dtype=torch.float32, device=device)

# --- Build per-image text centroids ---
tx_centroids = tx_train_t.view(-1, 5, tx_train_t.shape[1]).mean(dim=1)  # (25000,1024)

# Center both spaces
mu_tx = tx_centroids.mean(0, keepdim=True)
mu_im = im_train_t.mean(0, keepdim=True)
X = tx_centroids - mu_tx
Y = im_train_t  - mu_im

# --- Compute covariances ---
eps = 1e-5
Cx = (X.T @ X) / X.shape[0] + eps * torch.eye(X.shape[1], device=device)   # 1024×1024
Cy = (Y.T @ Y) / Y.shape[0] + eps * torch.eye(Y.shape[1], device=device)   # 1536×1536

# --- Compute sqrt inverses ---
Ex, Lx, _ = torch.linalg.svd(Cx, full_matrices=False)
Ey, Ly, _ = torch.linalg.svd(Cy, full_matrices=False)
Cx_m12 = Ex @ torch.diag(1.0 / torch.sqrt(Lx)) @ Ex.T      # 1024×1024
Cy_p12 = Ey @ torch.diag(torch.sqrt(Ly)) @ Ey.T            # 1536×1536

# --- Compute cross-covariance (rectangular 1536×1024) ---
Cxy = (Y.T @ X) / X.shape[0]

# --- Build rectangular W: 1536×1024 ---
W_wct = Cy_p12 @ Cxy @ Cx_m12
b_wct = mu_im - mu_tx @ W_wct.T

# --- Apply to test captions ---
tx_test_t = torch.as_tensor(tx_test, dtype=torch.float32, device=device)
mapped = F.normalize((tx_test_t - mu_tx) @ W_wct.T + b_wct, p=2, dim=1)
im_base = F.normalize(im_train_t, p=2, dim=1)
sims = mapped @ im_base.T

def recall_at_k(sims, K=10):
    topk = sims.topk(K, dim=1).indices
    true_idx = torch.arange(sims.size(0), device=device) // 5
    preds = topk // 5
    return (preds == true_idx.unsqueeze(1)).any(dim=1).float().mean().item()

for K in [1,5,10,50]:
    print(f"Rectangular-WCT Recall@{K}: {recall_at_k(sims,K):.4f}")


Using device: cuda
Rectangular-WCT Recall@1: 0.0007
Rectangular-WCT Recall@5: 0.0027
Rectangular-WCT Recall@10: 0.0040
Rectangular-WCT Recall@50: 0.0153


#  EXPERIMENT 5B — Whitening–Coloring Transform (Affine)

In [None]:
# ======================================================
#   EXPERIMENT 5B — Rectangular Whitening–Coloring Transform
# ======================================================
import torch, torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

tx_train_t = torch.as_tensor(tx_train, dtype=torch.float32, device=device)
im_train_t = torch.as_tensor(im_train, dtype=torch.float32, device=device)
tx_test_t  = torch.as_tensor(tx_test,  dtype=torch.float32, device=device)

# Average 5 captions per image
tx_centroids = tx_train_t.view(-1,5,tx_train_t.shape[1]).mean(dim=1)

# Center
mu_tx = tx_centroids.mean(0, keepdim=True)
mu_im = im_train_t.mean(0, keepdim=True)
X = tx_centroids - mu_tx
Y = im_train_t - mu_im

# Compute rectangular cross-covariance
M = Y.T @ X / X.shape[0]     # (1536×1024)

# SVD decomposition of M
U, S, Vh = torch.linalg.svd(M, full_matrices=False)

# Optional scaling of singular values (acts like whitening/coloring)
S_clipped = torch.clamp(S, min=1e-5)
W = (U * S_clipped) @ Vh     # (1536×1024) mapping from text→image space
b = mu_im - mu_tx @ W.T      # bias term

# Apply to test set
mapped = F.normalize((tx_test_t - mu_tx) @ W.T + b, p=2, dim=1)
im_base = F.normalize(im_train_t, p=2, dim=1)
sims = mapped @ im_base.T

def recall_at_k(sims, K=10):
    topk = sims.topk(K, dim=1).indices
    true_idx = torch.arange(sims.size(0), device=device) // 5
    preds = topk // 5
    return (preds == true_idx.unsqueeze(1)).any(dim=1).float().mean().item()

for K in [1,5,10,50]:
    print(f"Rectangular-WCT Recall@{K}: {recall_at_k(sims,K):.4f}")


Using device: cuda
Rectangular-WCT Recall@1: 0.0000
Rectangular-WCT Recall@5: 0.0020
Rectangular-WCT Recall@10: 0.0033
Rectangular-WCT Recall@50: 0.0093


# EXPERIMENT 5C — Ridge Regression → Polar Projection

In [None]:
# ======================================================
#   EXPERIMENT 5C — Ridge Regression → Polar Projection
# ======================================================
import torch, torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

tx_train_t = torch.as_tensor(tx_train, dtype=torch.float32, device=device)
im_train_t = torch.as_tensor(im_train, dtype=torch.float32, device=device)
tx_test_t  = torch.as_tensor(tx_test,  dtype=torch.float32, device=device)

# --- Build per-image text centroids ---
tx_centroids = tx_train_t.view(-1, 5, tx_train_t.shape[1]).mean(dim=1)
X = F.normalize(tx_centroids, p=2, dim=1)
Y = F.normalize(im_train_t,  p=2, dim=1)

# --- Ridge closed-form solution (1536×1024) ---
lam = 1e-1
A = (Y.T @ X) @ torch.linalg.inv(X.T @ X + lam * torch.eye(X.shape[1], device=device))

# --- Project to nearest orthogonal via SVD ---
U, _, Vh = torch.linalg.svd(A, full_matrices=False)
W_polar = U @ Vh  # (1536×1024)

# --- Apply to test set ---
tx_test_n = F.normalize(tx_test_t, p=2, dim=1)
mapped = F.normalize(tx_test_n @ W_polar.T, p=2, dim=1)
im_base = F.normalize(im_train_t, p=2, dim=1)
sims = mapped @ im_base.T

def recall_at_k(sims, K=10):
    topk = sims.topk(K, dim=1).indices
    true_idx = torch.arange(sims.size(0), device=device) // 5
    preds = topk // 5
    return (preds == true_idx.unsqueeze(1)).any(dim=1).float().mean().item()

for K in [1,5,10,50]:
    print(f"Ridge→Polar Recall@{K}: {recall_at_k(sims,K):.4f}")


Using device: cuda
Ridge→Polar Recall@1: 0.0000
Ridge→Polar Recall@5: 0.0013
Ridge→Polar Recall@10: 0.0027
Ridge→Polar Recall@50: 0.0133


## EXPERIMENT 5D — Relative Representations (RR Anchors)

In [None]:
# ======================================================
#   EXPERIMENT 5D — Relative Representations (RR Anchors)
# ======================================================
import torch, torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

# --- Prepare normalized embeddings ---
tx_train_t = torch.as_tensor(tx_train, dtype=torch.float32, device=device)
im_train_t = torch.as_tensor(im_train, dtype=torch.float32, device=device)
tx_test_t  = torch.as_tensor(tx_test,  dtype=torch.float32, device=device)

# Average 5 captions per image
tx_centroids = tx_train_t.view(-1, 5, tx_train_t.shape[1]).mean(dim=1)

# Normalize
tx_c = F.normalize(tx_centroids, p=2, dim=1)
im_c = F.normalize(im_train_t,  p=2, dim=1)
tx_test_n = F.normalize(tx_test_t, p=2, dim=1)

# --- Choose m anchors (images) ---
m = 2000
idx = torch.randperm(im_c.size(0), device=device)[:m]
A_img = im_c[idx]        # (m,1536)
A_txt = tx_c[idx]        # corresponding text anchors (m,1024)

# --- Define RR projections ---
def rr_image(Z):
    Z = F.normalize(Z, p=2, dim=1)
    return Z @ A_img.T     # (N, m)

def rr_text(Z):
    Z = F.normalize(Z, p=2, dim=1)
    return Z @ A_txt.T     # (N, m)

# RR representations
RR_im_train = rr_image(im_c)
RR_tx_test  = rr_text(tx_test_n)

# Normalize and compare
RR_im_train_n = F.normalize(RR_im_train, p=2, dim=1)
RR_tx_test_n  = F.normalize(RR_tx_test,  p=2, dim=1)

sims = RR_tx_test_n @ RR_im_train_n.T

def recall_at_k(sims, K=10):
    topk = sims.topk(K, dim=1).indices
    true_idx = torch.arange(sims.size(0), device=device) // 5
    preds = topk // 5
    return (preds == true_idx.unsqueeze(1)).any(dim=1).float().mean().item()

for K in [1,5,10,50]:
    print(f"RR Recall@{K}: {recall_at_k(sims,K):.4f}")


Using device: cuda
RR Recall@1: 0.0000
RR Recall@5: 0.0000
RR Recall@10: 0.0000
RR Recall@50: 0.0107


#  EXPERIMENT 5E — Cycle-Consistent Two-Head Translator

In [None]:
# ======================================================
#   EXPERIMENT 5G — Cycle-Consistent Deep MLP Translator
# ======================================================

import torch, torch.nn as nn, torch.nn.functional as F
from torch.utils.data import DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

# --- Prepare data ---
tx_train_t = torch.as_tensor(tx_train, dtype=torch.float32, device=device)
im_train_t = torch.as_tensor(im_train, dtype=torch.float32, device=device)
tx_test_t  = torch.as_tensor(tx_test,  dtype=torch.float32, device=device)

# --- Preprocessing Fixes: Normalize and Center ---
tx_train_t = F.normalize(tx_train_t - tx_train_t.mean(0, keepdim=True), p=2, dim=1)
im_train_t = F.normalize(im_train_t - im_train_t.mean(0, keepdim=True), p=2, dim=1)
tx_test_t  = F.normalize(tx_test_t  - tx_train_t.mean(0, keepdim=True), p=2, dim=1)

# Average 5 captions per image
tx_centroids = tx_train_t.view(-1, 5, tx_train_t.shape[1]).mean(dim=1)
tx_c = F.normalize(tx_centroids, p=2, dim=1)
im_c = F.normalize(im_train_t,  p=2, dim=1)

# --- Define deeper MLP translators ---
f_ti = nn.Sequential(
    nn.Linear(1024, 1024),
    nn.LayerNorm(1024),
    nn.GELU(),
    nn.Dropout(0.1),
    nn.Linear(1024, 1024),
    nn.LayerNorm(1024),
    nn.GELU(),
    nn.Dropout(0.1),
    nn.Linear(1024, 1536)
).to(device)

f_it = nn.Sequential(
    nn.Linear(1536, 1024),
    nn.LayerNorm(1024),
    nn.GELU(),
    nn.Dropout(0.1),
    nn.Linear(1024, 1024),
    nn.LayerNorm(1024),
    nn.GELU(),
    nn.Dropout(0.1),
    nn.Linear(1024, 1024)
).to(device)

opt = torch.optim.AdamW(list(f_ti.parameters()) + list(f_it.parameters()), lr=3e-4)

# --- Training data loader ---
loader = DataLoader(
    torch.utils.data.TensorDataset(tx_c, im_c),
    batch_size=1024, shuffle=True, drop_last=True
)

tau = 0.07
epoch = 50
for ep in range(epoch):
    for t_b, i_b in loader:
        t_b, i_b = t_b.to(device), i_b.to(device)
        t2i = F.normalize(f_ti(t_b), p=2, dim=1)
        i2t = F.normalize(f_it(i_b), p=2, dim=1)

        # CLIP-style symmetric contrastive loss
        L_clip = (
            F.cross_entropy((t2i @ i_b.T) / tau, torch.arange(t_b.size(0), device=device)) +
            F.cross_entropy((i2t @ t_b.T) / tau, torch.arange(t_b.size(0), device=device))
        ) / 2

        # Cycle consistency (stability)
        L_cyc = (
            ((F.normalize(f_it(t2i), p=2, dim=1) - t_b)**2).mean() +
            ((F.normalize(f_ti(i2t), p=2, dim=1) - i_b)**2).mean()
        )

        loss = L_clip + 0.1 * L_cyc
        opt.zero_grad(); loss.backward(); opt.step()

    print(f"Epoch {ep+1}/{epoch} | Loss: {loss.item():.4f}")

Using device: cuda
Epoch 1/100 | Loss: 3.2586
Epoch 2/100 | Loss: 2.6308
Epoch 3/100 | Loss: 2.2705
Epoch 4/100 | Loss: 2.0498
Epoch 5/100 | Loss: 1.8912
Epoch 6/100 | Loss: 1.7731
Epoch 7/100 | Loss: 1.6175
Epoch 8/100 | Loss: 1.5381
Epoch 9/100 | Loss: 1.4557
Epoch 10/100 | Loss: 1.3716
Epoch 11/100 | Loss: 1.3191
Epoch 12/100 | Loss: 1.2391
Epoch 13/100 | Loss: 1.1889
Epoch 14/100 | Loss: 1.1138
Epoch 15/100 | Loss: 1.0827
Epoch 16/100 | Loss: 1.0357
Epoch 17/100 | Loss: 0.9929
Epoch 18/100 | Loss: 0.9346
Epoch 19/100 | Loss: 0.9381
Epoch 20/100 | Loss: 0.8852
Epoch 21/100 | Loss: 0.8610
Epoch 22/100 | Loss: 0.8329
Epoch 23/100 | Loss: 0.8036
Epoch 24/100 | Loss: 0.7978
Epoch 25/100 | Loss: 0.7424
Epoch 26/100 | Loss: 0.7166
Epoch 27/100 | Loss: 0.7359
Epoch 28/100 | Loss: 0.6921
Epoch 29/100 | Loss: 0.6817
Epoch 30/100 | Loss: 0.6452
Epoch 31/100 | Loss: 0.6452
Epoch 32/100 | Loss: 0.6175
Epoch 33/100 | Loss: 0.6136
Epoch 34/100 | Loss: 0.5826
Epoch 35/100 | Loss: 0.5767
Epoch 36/1

In [None]:
# ======================================================
#   Final Evaluation with Logit Scale Tuning
# ======================================================
import torch, torch.nn.functional as F

# --- Step 1: Normalize and map test captions ---
tx_test_t = torch.as_tensor(tx_test, dtype=torch.float32, device=device)
im_train_t = torch.as_tensor(im_train, dtype=torch.float32, device=device)

tx_test_n = F.normalize(tx_test_t, p=2, dim=1)
im_base = F.normalize(im_train_t, p=2, dim=1)

with torch.no_grad():
    mapped = F.normalize(f_ti(tx_test_n), p=2, dim=1)

# --- Step 2: Define similarity matrix as a function of scale ---
def compute_logits(logit_scale):
    return (mapped @ im_base.T) * torch.exp(logit_scale)

# --- Step 3: Optimize the logit scale (temperature) ---
logit_scale = torch.nn.Parameter(torch.tensor(0.0, device=device))  # log(1.0)
optimizer = torch.optim.LBFGS([logit_scale], max_iter=50)
targets = torch.arange(mapped.size(0), device=device) // 5

def closure():
    optimizer.zero_grad()
    logits = compute_logits(logit_scale)
    loss = F.cross_entropy(logits, targets)
    loss.backward()
    return loss

optimizer.step(closure)
print("Fitted logit scale:", float(torch.exp(logit_scale)))

# --- Step 4: Compute scaled similarities ---
sims = compute_logits(logit_scale)

# --- Step 5: Recall@K ---
def recall_at_k(sims, K=10):
    topk = sims.topk(K, dim=1).indices
    true_idx = torch.arange(sims.size(0), device=device) // 5
    preds = topk // 5
    return (preds == true_idx.unsqueeze(1)).any(dim=1).float().mean().item()

print("\nCycle + Temp Recall:")
for K in [1, 5, 10, 50]:
    print(f"Recall@{K}: {recall_at_k(sims, K):.4f}")


Fitted logit scale: 0.00015294417971745133

Cycle + Temp Recall:
Recall@1: 0.0000
Recall@5: 0.0013
Recall@10: 0.0013
Recall@50: 0.0113


In [None]:
# sims = final similarity matrix (e.g., from deep model or ensemble)
topk = sims.topk(10, dim=1).indices.cpu().numpy()  # (1500, 10)
image_ids = topk // 5                              # map back to image indices

# Write to file
import pandas as pd
df = pd.DataFrame(image_ids, columns=[f"rank_{i}" for i in range(10)])
df.to_csv("submission.csv", index=False)
print("✅ submission.csv generated.")


In [None]:
# sims = final similarity matrix (e.g., from deep model or ensemble)
topk = sims.topk(10, dim=1).indices.cpu().numpy()  # (1500, 10)
image_ids = topk // 5                              # map back to image indices

# Write to file
import pandas as pd
df = pd.DataFrame(image_ids, columns=[f"rank_{i}" for i in range(10)])
df.to_csv("submissionUpdated.csv", index=False)
print("✅ submissionUpdated.csv generated.")


✅ submissionUpdated.csv generated.


# EXPERIMENT 5F — Preprocessing Fix + Procrustes & WCT

In [None]:
# ======================================================
#   EXPERIMENT 5F — Preprocessing Fix + Procrustes & WCT
# ======================================================

import torch
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# --- Load and preprocess embeddings ---
tx_train_t = torch.as_tensor(tx_train, dtype=torch.float32, device=device)
im_train_t = torch.as_tensor(im_train, dtype=torch.float32, device=device)
tx_test_t  = torch.as_tensor(tx_test,  dtype=torch.float32, device=device)

# --- Preprocessing Fixes: Normalize and Center ---
tx_train_t = F.normalize(tx_train_t - tx_train_t.mean(0, keepdim=True), p=2, dim=1)
im_train_t = F.normalize(im_train_t - im_train_t.mean(0, keepdim=True), p=2, dim=1)
tx_test_t  = F.normalize(tx_test_t  - tx_train_t.mean(0, keepdim=True), p=2, dim=1)

# --- Build per-image centroids (average 5 captions per image) ---
tx_centroids = tx_train_t.view(-1, 5, tx_train_t.shape[1]).mean(dim=1)

# --- Procrustes Alignment ---
M = im_train_t.T @ tx_centroids
U, S, Vh = torch.linalg.svd(M, full_matrices=False)
R = U @ Vh  # (1536 x 1024)

mapped_proc = F.normalize(tx_test_t @ R.T, p=2, dim=1)
sims_proc = mapped_proc @ im_train_t.T

# --- Rectangular WCT Alignment ---
mu_tx = tx_centroids.mean(0, keepdim=True)
mu_im = im_train_t.mean(0, keepdim=True)
X = tx_centroids - mu_tx
Y = im_train_t - mu_im

M = Y.T @ X / X.shape[0]
U, S, Vh = torch.linalg.svd(M, full_matrices=False)
S_clipped = torch.clamp(S, min=1e-5)
W = (U * S_clipped) @ Vh
b = mu_im - mu_tx @ W.T

mapped_wct = F.normalize((tx_test_t - mu_tx) @ W.T + b, p=2, dim=1)
sims_wct = mapped_wct @ im_train_t.T

# --- Recall@K Function ---
def recall_at_k(sims, K=10):
    topk = sims.topk(K, dim=1).indices
    true_idx = torch.arange(sims.size(0), device=device) // 5
    preds = topk // 5
    return (preds == true_idx.unsqueeze(1)).any(dim=1).float().mean().item()

# --- Report Results ---
print("\n5F Results — After Proper Preprocessing")
print("Procrustes Recall:")
for K in [1, 5, 10, 50]:
    print(f"Recall@{K}: {recall_at_k(sims_proc, K):.4f}")

print("\nRectangular WCT Recall:")
for K in [1, 5, 10, 50]:
    print(f"Recall@{K}: {recall_at_k(sims_wct, K):.4f}")

Using device: cuda

5F Results — After Proper Preprocessing
Procrustes Recall:
Recall@1: 0.0000
Recall@5: 0.0007
Recall@10: 0.0013
Recall@50: 0.0113

Rectangular WCT Recall:
Recall@1: 0.0000
Recall@5: 0.0000
Recall@10: 0.0000
Recall@50: 0.0053


In [None]:
import pandas as pd
import numpy as np

# 1. Translate test embeddings
tx_test_n = F.normalize(tx_test_t, p=2, dim=1)
with torch.no_grad():
    preds = f_ti(tx_test_n).cpu().numpy()  # shape: (1500, 1536)

# 2. Load actual test IDs
test_ids = test_data["captions/ids"]  # shape (1500,)

# 3. Ensure shape alignment
assert preds.shape[0] == len(test_ids), "Mismatch between predictions and test IDs"

# 4. Build submission DataFrame
submission = pd.DataFrame({
    "id": test_ids.astype(int),
    "embedding": [list(map(float, row)) for row in preds]  # plain Python-style list
})

# 5. Save CSV — no quotes around list
submission.to_csv("submissionFinal.csv", index=False)

print("✅ submissionFinal.csv saved — with plain list format, no quotes.")
submission.head(3)


✅ submissionFinal.csv saved — with plain list format, no quotes.


Unnamed: 0,id,embedding
0,1,"[0.10622628033161163, 0.23988360166549683, 0.2..."
1,2,"[-0.4222858250141144, -0.37915998697280884, -0..."
2,3,"[0.09091868996620178, -0.5292689204216003, 0.1..."
