In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from transformers import AutoTokenizer, AutoModel
from PIL import Image
import pandas as pd
import requests
from io import BytesIO
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cpu


DATA LOADING

In [3]:
df = pd.read_csv("D:\Amazon ML\\68e8d1d70b66d_student_resource\\student_resource\\dataset\\train.csv")
print(df.head())

assert all(col in df.columns for col in ["sample_id", "catalog_content", "image_link", "price"])

train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

  df = pd.read_csv("D:\Amazon ML\\68e8d1d70b66d_student_resource\\student_resource\\dataset\\train.csv")


   sample_id                                    catalog_content  \
0      33127  Item Name: La Victoria Green Taco Sauce Mild, ...   
1     198967  Item Name: Salerno Cookies, The Original Butte...   
2     261251  Item Name: Bear Creek Hearty Soup Bowl, Creamy...   
3      55858  Item Name: Judee’s Blue Cheese Powder 11.25 oz...   
4     292686  Item Name: kedem Sherry Cooking Wine, 12.7 Oun...   

                                          image_link  price  
0  https://m.media-amazon.com/images/I/51mo8htwTH...   4.89  
1  https://m.media-amazon.com/images/I/71YtriIHAA...  13.12  
2  https://m.media-amazon.com/images/I/51+PFEe-w-...   1.97  
3  https://m.media-amazon.com/images/I/41mu0HAToD...  30.34  
4  https://m.media-amazon.com/images/I/41sA037+Qv...  66.49  


Transforming Images according to the ImageNet specifications and using distilbert as a tokenizer.

In [None]:
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
text_model = AutoModel.from_pretrained("distilbert-base-uncased")

In [None]:
class MultimodalDataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.df = dataframe.reset_index(drop=True)
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_url = row["image_link"]
        text = row["catalog_content"]
        price = torch.tensor(row["price"], dtype=torch.float32)

        # Image
        try:
            response = requests.get(img_url, timeout=5)
            img = Image.open(BytesIO(response.content)).convert("RGB")
        except:
            img = Image.new("RGB", (224, 224), color="white")
        if self.transform:
            img = self.transform(img)

        # Text
        encoding = tokenizer(text, truncation=True, padding='max_length',
                             max_length=128, return_tensors='pt')
        return img, encoding, price

In [6]:
train_dataset = MultimodalDataset(train_df, transform=image_transform)
val_dataset = MultimodalDataset(val_df, transform=image_transform)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

In [None]:
class MultimodalRegressor(nn.Module):
    def __init__(self):
        super().__init__()
        # Image branch 
        self.img_model = models.resnet18(pretrained=True)
        for param in self.img_model.parameters():
            param.requires_grad = False
        num_ftrs = self.img_model.fc.in_features
        self.img_model.fc = nn.Identity() 

        # Text branch 
        self.text_model = text_model
        for param in self.text_model.parameters():
            param.requires_grad = False
        self.text_fc = nn.Sequential(
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.Dropout(0.3)
        )

        # Combined head 
        self.fc = nn.Sequential(
            nn.Linear(num_ftrs + 256, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 1)
        )

    def forward(self, img, text_inputs):
        # Image features
        img_feat = self.img_model(img)

        # Text features
        outputs = self.text_model(**{k: v.squeeze(1).to(device) for k, v in text_inputs.items()})
        text_feat = outputs.last_hidden_state.mean(dim=1)
        text_feat = self.text_fc(text_feat)

        # Concatenate
        combined = torch.cat((img_feat, text_feat), dim=1)
        price = self.fc(combined)
        return price

In [8]:
model = MultimodalRegressor().to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.fc.parameters(), lr=1e-4)



In [None]:
EPOCHS = 5
for epoch in range(EPOCHS):
    model.train()
    train_loss = 0.0
    for imgs, text_enc, prices in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        imgs, prices = imgs.to(device), prices.to(device).unsqueeze(1)

        optimizer.zero_grad()
        outputs = model(imgs, text_enc)
        loss = criterion(outputs, prices)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{EPOCHS}] | Train Loss: {avg_train_loss:.4f}")

    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for imgs, text_enc, prices in val_loader:
            imgs, prices = imgs.to(device), prices.to(device).unsqueeze(1)
            outputs = model(imgs, text_enc)
            loss = criterion(outputs, prices)
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch [{epoch+1}/{EPOCHS}] | Validation Loss: {avg_val_loss:.4f}\n")

Epoch 1/5: 100%|██████████| 8438/8438 [2:17:07<00:00,  1.03it/s]   


Epoch [1/5] | Train Loss: 1126.3453
Epoch [1/5] | Validation Loss: 1029.4116



Epoch 2/5: 100%|██████████| 8438/8438 [1:41:24<00:00,  1.39it/s]


Epoch [2/5] | Train Loss: 1118.0524
Epoch [2/5] | Validation Loss: 1028.4057



Epoch 3/5: 100%|██████████| 8438/8438 [1:41:17<00:00,  1.39it/s]


Epoch [3/5] | Train Loss: 1112.7200
Epoch [3/5] | Validation Loss: 1015.9745



Epoch 4/5: 100%|██████████| 8438/8438 [1:40:46<00:00,  1.40it/s]


Epoch [4/5] | Train Loss: 1105.0151
Epoch [4/5] | Validation Loss: 1008.9846



Epoch 5/5: 100%|██████████| 8438/8438 [2:17:44<00:00,  1.02it/s]  


Epoch [5/5] | Train Loss: 1098.6335
Epoch [5/5] | Validation Loss: 1766.7521



Saving the model and the weights of the model in .pth format.

In [None]:
torch.save(model.state_dict(), "multimodal_price_predictor.pth")
print(" Model training complete and saved as 'multimodal_price_predictor.pth'")