In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
import os

BASE_DIR = "/content/drive/MyDrive/AML Challenge"
os.chdir(BASE_DIR)
print("Current working directory:", os.getcwd())


Current working directory: /content/drive/MyDrive/AML Challenge


# Pre-Processing

In [5]:
import torch, torch.nn as nn, torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import numpy as np, pandas as pd
# ------------------------------------------------------
# 1. Device and data loading
# ------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

train_data = np.load("train.npz")
test_data  = np.load("test.clean.npz")

tx_train = train_data["captions/embeddings"]   # (125000, 1024)
im_train = train_data["images/embeddings"]     # (25000, 1536)
tx_test  = test_data["captions/embeddings"]    # (1500, 1024)

print("Train shapes:", tx_train.shape, im_train.shape)
print("Test shape:", tx_test.shape)

# ------------------------------------------------------
# 2. Match each caption to its corresponding image
# ------------------------------------------------------
repeat_factor = len(tx_train) // len(im_train)   # 5 captions per image
im_train_expanded = np.repeat(im_train, repeat_factor, axis=0)

Using device: cuda
Train shapes: (125000, 1024) (25000, 1536)
Test shape: (1500, 1024)


# Experiment 1: got score as 0.81780
# Residual-Orthogonal + Contrastive version

In [7]:
# ------------------------------------------------------
# 2. Match each caption to its corresponding image
# ------------------------------------------------------
repeat_factor = len(tx_train) // len(im_train)   # 5 captions per image
im_train_expanded = np.repeat(im_train, repeat_factor, axis=0)

# ------------------------------------------------------
# 3. Convert to tensors + center + normalize
# ------------------------------------------------------
tx_train_t = torch.as_tensor(tx_train, dtype=torch.float32, device=device)
im_train_t = torch.as_tensor(im_train, dtype=torch.float32, device=device)
tx_test_t  = torch.as_tensor(tx_test,  dtype=torch.float32, device=device)

# Center + normalize (same mean for test)
tx_mean = tx_train_t.mean(0, keepdim=True)
im_mean = im_train_t.mean(0, keepdim=True)

tx_train_t = F.normalize(tx_train_t - tx_mean, p=2, dim=1)
im_train_t = F.normalize(im_train_t - im_mean, p=2, dim=1)
tx_test_t  = F.normalize(tx_test_t  - tx_mean, p=2, dim=1)

# Expand image embeddings for each caption
im_train_exp = torch.as_tensor(im_train_expanded, dtype=torch.float32, device=device)
im_train_exp = F.normalize(im_train_exp - im_mean, p=2, dim=1)

# ------------------------------------------------------
# 4. Compute Orthogonal Procrustes base (R)
# ------------------------------------------------------
tx_centroids = tx_train_t.view(-1, 5, tx_train_t.shape[1]).mean(dim=1)
M = im_train_t.T @ tx_centroids
U, S, Vh = torch.linalg.svd(M, full_matrices=False)
R = U @ Vh  # (1536 × 1024)

print("Computed orthogonal base R:", R.shape)

# ------------------------------------------------------
# 5. Define Residual-Orthogonal Translator
# ------------------------------------------------------
class ResidualTranslator(nn.Module):
    def __init__(self, R_init, input_dim=1024, hidden_dim=1024, output_dim=1536):
        super().__init__()
        self.register_buffer("R", R_init)
        self.residual = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        base = x @ self.R.T
        res = self.residual(x)
        out = F.normalize(base + res, p=2, dim=1)
        return out

# ------------------------------------------------------
# 6. Define losses
# ------------------------------------------------------
def contrastive_loss(pred, target, tau=0.07):
    sims = pred @ target.T / tau
    labels = torch.arange(pred.size(0), device=device)
    return F.cross_entropy(sims, labels)

triplet_loss_fn = nn.TripletMarginWithDistanceLoss(
    distance_function=lambda x, y: 1 - F.cosine_similarity(x, y),
    margin=0.2
)

# ------------------------------------------------------
# 7. DataLoader
# ------------------------------------------------------
train_dataset = TensorDataset(tx_train_t, im_train_exp)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=0)

# ------------------------------------------------------
# 8. Initialize model + optimizer
# ------------------------------------------------------
model = ResidualTranslator(R.detach().clone()).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-5)

# ------------------------------------------------------
# 9. Training loop
# ------------------------------------------------------
EPOCHS = 30
print("\nTraining Residual-Orthogonal Translator...\n")

for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0.0

    for x_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}", leave=False):
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()

        y_pred = model(x_batch)
        idx = torch.randperm(x_batch.size(0), device=device)
        y_neg = y_batch[idx]

        loss_cos = contrastive_loss(y_pred, y_batch)
        loss_tri = triplet_loss_fn(y_pred, y_batch, y_neg)
        loss = 0.7 * loss_cos + 0.3 * loss_tri

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch:02d}: avg loss = {avg_loss:.4f}")

torch.save(model.state_dict(), "residual_orthogonal.pth")
print("\n✅ Training completed and model saved as residual_orthogonal.pth")

# ------------------------------------------------------
# 10. Inference for submission
# ------------------------------------------------------
model.eval()
with torch.no_grad():
    tx_test_n = F.normalize(tx_test_t, p=2, dim=1)
    preds = model(tx_test_n).cpu().numpy()

# ------------------------------------------------------
# 11. Save submission file
# ------------------------------------------------------
test_ids = test_data["captions/ids"].astype(int)
submission = pd.DataFrame({
    "id": test_ids,
    "embedding": [list(map(float, row)) for row in preds]
})
submission.to_csv("submission_residual.csv", index=False)

print("✅ Saved submission_residual.csv")
print(submission.head(3))

Computed orthogonal base R: torch.Size([1536, 1024])

Training Residual-Orthogonal Translator...





Epoch 01: avg loss = 1.7665




Epoch 02: avg loss = 1.6145




Epoch 03: avg loss = 1.5555




Epoch 04: avg loss = 1.5218




Epoch 05: avg loss = 1.4972




Epoch 06: avg loss = 1.4781




Epoch 07: avg loss = 1.4637




Epoch 08: avg loss = 1.4508




Epoch 09: avg loss = 1.4397




Epoch 10: avg loss = 1.4303




Epoch 11: avg loss = 1.4221




Epoch 12: avg loss = 1.4162




Epoch 13: avg loss = 1.4093




Epoch 14: avg loss = 1.4035




Epoch 15: avg loss = 1.3994




Epoch 16: avg loss = 1.3924




Epoch 17: avg loss = 1.3882




Epoch 18: avg loss = 1.3842




Epoch 19: avg loss = 1.3798




Epoch 20: avg loss = 1.3767




Epoch 21: avg loss = 1.3730




Epoch 22: avg loss = 1.3689




Epoch 23: avg loss = 1.3669




Epoch 24: avg loss = 1.3636




Epoch 25: avg loss = 1.3608




Epoch 26: avg loss = 1.3575




Epoch 27: avg loss = 1.3553




Epoch 28: avg loss = 1.3527




Epoch 29: avg loss = 1.3505




Epoch 30: avg loss = 1.3500

✅ Training completed and model saved as residual_orthogonal.pth
✅ Saved submission_residual.csv
   id                                          embedding
0   1  [-0.002457899274304509, 0.01370843406766653, 0...
1   2  [-0.03886483237147331, -0.031252775341272354, ...
2   3  [-0.006630855146795511, -0.019226159900426865,...


# Experiment 2: got Score as: 0.73571

# Residual-Orthogonal + Contrastive version
# Deep Residual Translator (ResMLP-style)

In [8]:
# ======================================================
#  EXPERIMENT 7 — Deep Residual Translator (ResMLP-style)
# ======================================================

import torch, torch.nn as nn, torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import numpy as np, pandas as pd

# ------------------------------------------------------
# 1. Device and data loading
# ------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

train_data = np.load("train.npz")
test_data  = np.load("test.clean.npz")

tx_train = train_data["captions/embeddings"]
im_train = train_data["images/embeddings"]
tx_test  = test_data["captions/embeddings"]

repeat_factor = len(tx_train) // len(im_train)
im_train_expanded = np.repeat(im_train, repeat_factor, axis=0)

print("Train shapes:", tx_train.shape, im_train.shape, "Test:", tx_test.shape)

# ------------------------------------------------------
# 2. Convert to tensors + center + normalize
# ------------------------------------------------------
tx_train_t = torch.as_tensor(tx_train, dtype=torch.float32, device=device)
im_train_t = torch.as_tensor(im_train, dtype=torch.float32, device=device)
tx_test_t  = torch.as_tensor(tx_test,  dtype=torch.float32, device=device)

tx_mean = tx_train_t.mean(0, keepdim=True)
im_mean = im_train_t.mean(0, keepdim=True)

tx_train_t = F.normalize(tx_train_t - tx_mean, p=2, dim=1)
im_train_t = F.normalize(im_train_t - im_mean, p=2, dim=1)
tx_test_t  = F.normalize(tx_test_t  - tx_mean, p=2, dim=1)

im_train_exp = torch.as_tensor(im_train_expanded, dtype=torch.float32, device=device)
im_train_exp = F.normalize(im_train_exp - im_mean, p=2, dim=1)

# ------------------------------------------------------
# 3. Compute Orthogonal Procrustes base (R)
# ------------------------------------------------------
tx_centroids = tx_train_t.view(-1, 5, tx_train_t.shape[1]).mean(dim=1)
M = im_train_t.T @ tx_centroids
U, S, Vh = torch.linalg.svd(M, full_matrices=False)
R = U @ Vh
print("Computed orthogonal base R:", R.shape)

# ------------------------------------------------------
# 4. Residual building blocks (ResMLP-style)
# ------------------------------------------------------
class ResidualBlock(nn.Module):
    def __init__(self, dim, dropout=0.1):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fc1 = nn.Linear(dim, dim)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(dim, dim)
        self.drop = nn.Dropout(dropout)
    def forward(self, x):
        out = self.fc2(self.act(self.fc1(self.norm(x))))
        return x + self.drop(out)

# ------------------------------------------------------
# 5. Deep Residual Translator
# ------------------------------------------------------
class DeepResidualTranslator(nn.Module):
    def __init__(self, R_init, input_dim=1024, hidden_dim=1024, output_dim=1536, depth=3):
        super().__init__()
        self.R = nn.Parameter(R_init.clone(), requires_grad=False)   # frozen at start
        blocks = []
        for _ in range(depth):
            blocks.append(ResidualBlock(hidden_dim))
        self.residual = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.GELU(),
            *blocks,
            nn.LayerNorm(hidden_dim),
            nn.Linear(hidden_dim, output_dim)
        )
    def forward(self, x):
        base = x @ self.R.T
        res = self.residual(x)
        return F.normalize(base + res, p=2, dim=1)

# ------------------------------------------------------
# 6. Loss functions
# ------------------------------------------------------
def contrastive_loss(pred, target, tau=0.07):
    sims = pred @ target.T / tau
    labels = torch.arange(pred.size(0), device=device)
    return F.cross_entropy(sims, labels)

triplet_loss_fn = nn.TripletMarginWithDistanceLoss(
    distance_function=lambda x, y: 1 - F.cosine_similarity(x, y),
    margin=0.25
)

# ------------------------------------------------------
# 7. Dataloader
# ------------------------------------------------------
train_dataset = TensorDataset(tx_train_t, im_train_exp)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=0)

# ------------------------------------------------------
# 8. Model + optimizer
# ------------------------------------------------------
model = DeepResidualTranslator(R, depth=3).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=40)

# ------------------------------------------------------
# 9. Training loop (2-phase: frozen R then fine-tune)
# ------------------------------------------------------
EPOCHS = 40
print("\nTraining Deep Residual Translator...\n")

for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0.0
    for x_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}", leave=False):
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()

        y_pred = model(x_batch)
        idx = torch.randperm(x_batch.size(0), device=device)
        y_neg = y_batch[idx]

        loss_cos = contrastive_loss(y_pred, y_batch, tau=0.05)
        loss_tri = triplet_loss_fn(y_pred, y_batch, y_neg)
        loss = 0.7 * loss_cos + 0.3 * loss_tri

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    scheduler.step()
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch:02d}: avg loss = {avg_loss:.4f}")

    # unfreeze R after 20 epochs
    if epoch == 20:
        model.R.requires_grad = True
        print("→ Unfroze R for fine-tuning.")

torch.save(model.state_dict(), "deep_resmlp_translator.pth")
print("\n✅ Training completed and model saved as deep_resmlp_translator.pth")

# ------------------------------------------------------
# 10. Inference and submission
# ------------------------------------------------------
model.eval()
with torch.no_grad():
    tx_test_n = F.normalize(tx_test_t, p=2, dim=1)
    preds = model(tx_test_n).cpu().numpy()

test_ids = test_data["captions/ids"].astype(int)
submission = pd.DataFrame({
    "id": test_ids,
    "embedding": [list(map(float, row)) for row in preds]
})
submission.to_csv("submission_ResMLP.csv", index=False)
print("✅ Saved submission_ResMLP.csv")
print(submission.head(3))


Using device: cuda
Train shapes: (125000, 1024) (25000, 1536) Test: (1500, 1024)
Computed orthogonal base R: torch.Size([1536, 1024])

Training Deep Residual Translator...





Epoch 01: avg loss = 1.8413




Epoch 02: avg loss = 1.3359




Epoch 03: avg loss = 1.1679




Epoch 04: avg loss = 1.0494




Epoch 05: avg loss = 0.9493




Epoch 06: avg loss = 0.8607




Epoch 07: avg loss = 0.7788




Epoch 08: avg loss = 0.7037




Epoch 09: avg loss = 0.6339




Epoch 10: avg loss = 0.5697




Epoch 11: avg loss = 0.5124




Epoch 12: avg loss = 0.4607




Epoch 13: avg loss = 0.4200




Epoch 14: avg loss = 0.3812




Epoch 15: avg loss = 0.3486




Epoch 16: avg loss = 0.3203




Epoch 17: avg loss = 0.2970




Epoch 18: avg loss = 0.2745




Epoch 19: avg loss = 0.2578




Epoch 20: avg loss = 0.2416
→ Unfroze R for fine-tuning.




Epoch 21: avg loss = 0.2270




Epoch 22: avg loss = 0.2155




Epoch 23: avg loss = 0.2036




Epoch 24: avg loss = 0.1944




Epoch 25: avg loss = 0.1862




Epoch 26: avg loss = 0.1787




Epoch 27: avg loss = 0.1713




Epoch 28: avg loss = 0.1654




Epoch 29: avg loss = 0.1596




Epoch 30: avg loss = 0.1551




Epoch 31: avg loss = 0.1512




Epoch 32: avg loss = 0.1468




Epoch 33: avg loss = 0.1447




Epoch 34: avg loss = 0.1426




Epoch 35: avg loss = 0.1392




Epoch 36: avg loss = 0.1386




Epoch 37: avg loss = 0.1378




Epoch 38: avg loss = 0.1359




Epoch 39: avg loss = 0.1354




Epoch 40: avg loss = 0.1360

✅ Training completed and model saved as deep_resmlp_translator.pth
✅ Saved submission_ResMLP.csv
   id                                          embedding
0   1  [0.01420063991099596, -0.005876442883163691, 0...
1   2  [0.00700219115242362, 7.112888852134347e-05, 0...
2   3  [0.006470625754445791, -0.02336675114929676, 0...


# Experiment 3: got score as 0
# Structure-Aware Contrastive Fine-Tune

In [22]:
# ======================================================
#  EXPERIMENT 9 — Structure-Aware Residual-Orthogonal Translator
# ======================================================

import torch, torch.nn.functional as F
from tqdm import tqdm
import numpy as np, pandas as pd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# ------------------------------------------------------
# 1. Load data
# ------------------------------------------------------
train_data = np.load("train.npz")
test_data  = np.load("test.clean.npz")
tx_train = train_data["captions/embeddings"]
im_train = train_data["images/embeddings"]
tx_test  = test_data["captions/embeddings"]

repeat_factor = len(tx_train) // len(im_train)
im_train_expanded = np.repeat(im_train, repeat_factor, axis=0)

tx_train_t = torch.as_tensor(tx_train, dtype=torch.float32, device=device)
im_train_t = torch.as_tensor(im_train, dtype=torch.float32, device=device)
tx_test_t  = torch.as_tensor(tx_test,  dtype=torch.float32, device=device)
im_train_exp = torch.as_tensor(im_train_expanded, dtype=torch.float32, device=device)

# normalize
tx_mean = tx_train_t.mean(0, keepdim=True)
im_mean = im_train_t.mean(0, keepdim=True)
tx_train_t = F.normalize(tx_train_t - tx_mean, p=2, dim=1)
im_train_t = F.normalize(im_train_t - im_mean, p=2, dim=1)
tx_test_t  = F.normalize(tx_test_t  - tx_mean, p=2, dim=1)
im_train_exp = F.normalize(im_train_exp - im_mean, p=2, dim=1)

# ------------------------------------------------------
# 2. Reload your previous trained model
# ------------------------------------------------------
model = ResidualTranslator(torch.zeros(1536,1024)).to(device)
model.load_state_dict(torch.load("residual_orthogonal.pth"))
print("Loaded previous 0.8180 checkpoint.")
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=1e-5)


class ResidualTranslator(nn.Module):
    def __init__(self, R_init, input_dim=1024, hidden_dim=1024, output_dim=1536):
        super().__init__()
        self.register_buffer("R", R_init)
        self.residual = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        base = x @ self.R.T
        res = self.residual(x)
        out = F.normalize(base + res, p=2, dim=1)
        return out


# ------------------------------------------------------
# 3. New structure-aware losses
# ------------------------------------------------------
def contrastive_loss(pred, target, tau=0.07):
    sims = pred @ target.T / tau
    labels = torch.arange(pred.size(0), device=device)
    return F.cross_entropy(sims, labels)

triplet_loss_fn = torch.nn.TripletMarginWithDistanceLoss(
    distance_function=lambda x, y: 1 - F.cosine_similarity(x, y), margin=0.2
)

def knn_preservation_loss(src, mapped, k=10):
    # preserve relative neighbors
    with torch.no_grad():
        sims_src = src @ src.T
        _, nn_src = sims_src.topk(k, dim=1)
    sims_map = mapped @ mapped.T
    gathered = sims_map.gather(1, nn_src)
    return 1 - gathered.mean()  # smaller is better

def caption_consistency_loss(pred, group=5):
    # Ensure divisible by 5
    bsz = pred.shape[0] - (pred.shape[0] % group)
    if bsz == 0:
        return torch.tensor(0.0, device=pred.device)
    pred = pred[:bsz]
    pred_g = pred.view(-1, group, pred.shape[1])
    centroid = pred_g.mean(1, keepdim=True)
    return (1 - F.cosine_similarity(pred_g, centroid, dim=2)).mean()

# ------------------------------------------------------
# 4. DataLoader
# ------------------------------------------------------
from torch.utils.data import DataLoader, TensorDataset
train_dataset = TensorDataset(tx_train_t, im_train_exp)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=0)

# ------------------------------------------------------
# 5. Fine-tune 10–15 epochs
# ------------------------------------------------------
EPOCHS = 20
for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0.0
    for x_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}", leave=False):
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()

        y_pred = model(x_batch)
        idx = torch.randperm(x_batch.size(0), device=device)
        y_neg = y_batch[idx]

        loss_cos = contrastive_loss(y_pred, y_batch)
        loss_tri = triplet_loss_fn(y_pred, y_batch, y_neg)
        loss_knn = knn_preservation_loss(x_batch, y_pred)
        loss_cap = caption_consistency_loss(y_pred)

        loss = 0.6*loss_cos + 0.2*loss_tri + 0.1*loss_knn + 0.1*loss_cap
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch:02d}: avg loss = {total_loss/len(train_loader):.4f}")

torch.save(model.state_dict(), "residual_structureAware.pth")
print("✅ Saved fine-tuned structure-aware model.")

# ------------------------------------------------------
# 6. Inference with CSLS re-scoring
# ------------------------------------------------------
def csls_similarity(x, y, k=10):
    x_norm = F.normalize(x, p=2, dim=1)
    y_norm = F.normalize(y, p=2, dim=1)
    sim = x_norm @ y_norm.T
    r_x = sim.topk(k, dim=1).values.mean(1, keepdim=True)
    r_y = sim.topk(k, dim=0).values.mean(0, keepdim=True)
    csls = 2 * sim - r_x - r_y
    return csls

model.eval()
with torch.no_grad():
    tx_test_n = F.normalize(tx_test_t, p=2, dim=1)
    pred_t = model(tx_test_n)
    im_train_n = F.normalize(im_train_t, p=2, dim=1)
    csls_sims = csls_similarity(pred_t, im_train_n)

# (optional local recall check)
top1 = csls_sims.topk(1, dim=1).indices
print("Top-1 indices sample:", top1[:10].T)

# save final embeddings for Kaggle submission
preds = pred_t.cpu().numpy()
submission = pd.DataFrame({
    "id": test_data["captions/ids"].astype(int),
    "embedding": [list(map(float, row)) for row in preds]
})
submission.to_csv("submission_structureAware.csv", index=False)
print("✅ Saved submission_structureAware.csv")


Using device: cuda
Loaded previous 0.8180 checkpoint.




Epoch 01: avg loss = 1.2627




Epoch 02: avg loss = 1.2615




Epoch 03: avg loss = 1.2604




Epoch 04: avg loss = 1.2595




Epoch 05: avg loss = 1.2589




Epoch 06: avg loss = 1.2584




Epoch 07: avg loss = 1.2577




Epoch 08: avg loss = 1.2570




Epoch 09: avg loss = 1.2555




Epoch 10: avg loss = 1.2549




Epoch 11: avg loss = 1.2540




Epoch 12: avg loss = 1.2543




Epoch 13: avg loss = 1.2530




Epoch 14: avg loss = 1.2523




Epoch 15: avg loss = 1.2515




Epoch 16: avg loss = 1.2516




Epoch 17: avg loss = 1.2507




Epoch 18: avg loss = 1.2490




Epoch 19: avg loss = 1.2499




Epoch 20: avg loss = 1.2488
✅ Saved fine-tuned structure-aware model.
Top-1 indices sample: tensor([[ 7337,  2192,  1924, 17138, 19291, 13162,  8291,  9711, 16323, 19278]],
       device='cuda:0')
✅ Saved submission_structureAware.csv
