In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import AutoTokenizer, AutoModel
import timm  
from PIL import Image
import pandas as pd
import numpy as np
import requests
from io import BytesIO
from tqdm import tqdm
import re
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

For this project Effecientnet_b0 has been used as it is light-weight.

In [49]:
from torchvision.models import efficientnet_b0

In [50]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cpu


Data Loading 

In [51]:
train_df = pd.read_csv("D:\\Amazon ML\\68e8d1d70b66d_student_resource\\student_resource\\dataset\\train.csv")
test_df = pd.read_csv("D:\\Amazon ML\\test1.csv")

assert all(col in train_df.columns for col in ["sample_id", "catalog_content", "image_link", "price"])
assert all(col in test_df.columns for col in ["sample_id", "catalog_content", "image_link", "price"])

train_df["price_per_unit"] = train_df["price"] / (train_df["catalog_content"].str.len() + 1)
test_df["price_per_unit"] = test_df["price"] / (test_df["catalog_content"].str.len() + 1)

train_df["text_length"] = train_df["catalog_content"].apply(lambda x: len(str(x)))
test_df["text_length"] = test_df["catalog_content"].apply(lambda x: len(str(x)))

feature_cols = ["price_per_unit", "text_length"]

In [None]:
for col in feature_cols:
    train_df[col] = pd.to_numeric(train_df[col], errors='coerce')
    test_df[col] = pd.to_numeric(test_df[col], errors='coerce')

train_df[feature_cols] = train_df[feature_cols].fillna(0)
test_df[feature_cols] = test_df[feature_cols].fillna(0)

# Split 
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

# Scale features
scaler = StandardScaler()
train_df[feature_cols] = scaler.fit_transform(train_df[feature_cols])
val_df[feature_cols] = scaler.transform(val_df[feature_cols])
test_df[feature_cols] = scaler.transform(test_df[feature_cols])

# Force numeric dtype explicitly (fix for np.object_ error)
for df in [train_df, val_df, test_df]:
    df[feature_cols] = df[feature_cols].apply(pd.to_numeric, errors='coerce').astype(np.float32)

In [53]:
print(train_df[feature_cols].dtypes)
print(test_df[feature_cols].dtypes)

price_per_unit    float32
text_length       float32
dtype: object
price_per_unit    float32
text_length       float32
dtype: object


In [54]:
print(f"Test DataFrame size (rows): {len(test_df)}")

Test DataFrame size (rows): 100


Transforming Images according to the ImageNet specifications and using reberta-base as a tokenizer.

In [55]:
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [None]:
class TriModalDataset(Dataset):
    def __init__(self, dataframe, transform=None, feature_cols=None):
        self.df = dataframe.reset_index(drop=True)
        self.transform = transform
        self.feature_cols = feature_cols

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_url = row["image_link"]
        text = row["catalog_content"]

        #  Safely handle numeric features
        feats_np = row[self.feature_cols].astype(np.float32).values
        features = torch.tensor(feats_np, dtype=torch.float32)
        price = torch.tensor(float(row["price"]), dtype=torch.float32)

        # Image 
        try:
            response = requests.get(img_url, timeout=5)
            img = Image.open(BytesIO(response.content)).convert("RGB")
        except:
            img = Image.new("RGB", (224, 224), color="white")
        if self.transform:
            img = self.transform(img)

        # Text
        encoding = tokenizer(
            text, truncation=True, padding='max_length',
            max_length=128, return_tensors='pt'
        )

        return img, encoding, features, price

In [57]:
train_dataset = TriModalDataset(train_df, image_transform, feature_cols)
val_dataset = TriModalDataset(val_df, image_transform, feature_cols)
test_dataset = TriModalDataset(test_df, image_transform, feature_cols)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)


In [None]:
class TriModalPricePredictor(nn.Module):
    def __init__(self, feature_dim):
        super().__init__()
        # Image branch
        self.img_model = efficientnet_b0(pretrained=True)
        for param in self.img_model.parameters():
            param.requires_grad = False
        num_ftrs = self.img_model.classifier[1].in_features
        self.img_model.classifier = nn.Identity()

        # Text branch
        self.text_model = AutoModel.from_pretrained("roberta-base")
        for param in self.text_model.parameters():
            param.requires_grad = False
        self.text_fc = nn.Sequential(
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.Dropout(0.3)
        )

        # Feature branch
        self.feature_fc = nn.Sequential(
            nn.Linear(feature_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.2)
        )

        # Combined head
        self.fc = nn.Sequential(
            nn.Linear(num_ftrs + 256 + 64, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 1)
        )

    def forward(self, img, text_inputs, feats):
        img_feat = self.img_model(img)
        text_out = self.text_model(**{k: v.squeeze(1).to(device) for k, v in text_inputs.items()})
        text_feat = text_out.last_hidden_state.mean(dim=1)
        text_feat = self.text_fc(text_feat)
        feat_feat = self.feature_fc(feats)
        combined = torch.cat((img_feat, text_feat, feat_feat), dim=1)
        return self.fc(combined)


In [59]:
model = TriModalPricePredictor(feature_dim=len(feature_cols)).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.fc.parameters(), lr=1e-4)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
EPOCHS = 5
for epoch in range(EPOCHS):
    model.train()
    train_loss = 0.0
    for imgs, text_enc, feats, prices in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        imgs, feats, prices = imgs.to(device), feats.to(device), prices.to(device).unsqueeze(1)
        optimizer.zero_grad()
        outputs = model(imgs, text_enc, feats)
        loss = criterion(outputs, prices)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    print(f"Epoch [{epoch+1}/{EPOCHS}] | Train Loss: {train_loss/len(train_loader):.4f}")

    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for imgs, text_enc, feats, prices in val_loader:
            imgs, feats, prices = imgs.to(device), feats.to(device), prices.to(device).unsqueeze(1)
            outputs = model(imgs, text_enc, feats)
            loss = criterion(outputs, prices)
            val_loss += loss.item()
    print(f"Epoch [{epoch+1}/{EPOCHS}] | Val Loss: {val_loss/len(val_loader):.4f}\n")

Saving the model and the weights of the model in .pth format.

In [None]:
torch.save(model.state_dict(), "tri_modal_price_predictor.pth")
print("Model training complete and saved as 'tri_modal_price_predictor.pth'")