In [1]:
import os
import pandas as pd
import requests
from io import BytesIO
from PIL import Image
import numpy as np
import torch
import torch.nn as nn
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from tqdm import tqdm
from urllib.parse import urlparse
from collections import OrderedDict
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.nn.functional as F
from torch.optim import Adam

In [3]:
data = pd.read_csv('/content/train_amazon_ml.csv')

In [4]:
data

Unnamed: 0,image_link,group_id,entity_name,entity_value
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram
...,...,...,...,...
263854,https://m.media-amazon.com/images/I/612J1R1xHl...,558806,height,5.0 centimetre
263855,https://m.media-amazon.com/images/I/61Blzh2+28...,470067,height,8.5 inch
263856,https://m.media-amazon.com/images/I/51MsegDL9V...,204245,height,43.2 centimetre
263857,https://m.media-amazon.com/images/I/510KhVw4VS...,752266,height,9.1 centimetre


In [5]:
def download_images(image_links, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    for i, link in enumerate(image_links):
        try:
            response = requests.get(link)
            response.raise_for_status()
            parsed_url = urlparse(link)
            image_name = os.path.basename(parsed_url.path)
            image_path = os.path.join(output_dir, image_name)
            image = Image.open(BytesIO(response.content))
            image.save(image_path)
            print(f"Downloaded and saved image: {image_path}")
        except Exception as e:
            print(f"Error downloading or saving image from {link}: {e}")

In [6]:
unique_values = OrderedDict()
b = []
for index, value in enumerate(data['entity_name']):
    if value not in unique_values:
        unique_values[value] = index
        b.append(index)

In [7]:
b

[0, 1, 25, 36, 197, 536, 670, 3205]

In [8]:
d_image1, group_id2, entity_name2, entity_value2 = [], [], [], []
for i in range(0, len(b)):
    d_image1.append(data['image_link'][b[i]])
    group_id2.append(data['group_id'][b[i]])
    entity_name2.append(data['entity_name'][b[i]])
    entity_value2.append(data['entity_value'][b[i]])

In [9]:
df = pd.DataFrame({'image_link': d_image1, 'group_id': group_id2, 'entity_name': entity_name2, 'entity_value': entity_value2})

In [12]:
df

Unnamed: 0,image_link,group_id,entity_name,entity_value
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup
2,https://m.media-amazon.com/images/I/71nywfWZUw...,179080,voltage,48.0 volt
3,https://m.media-amazon.com/images/I/61o2ntPNNg...,179080,wattage,800.0 watt
4,https://m.media-amazon.com/images/I/71d+dz7ogk...,150913,maximum_weight_recommendation,15 kilogram
5,https://m.media-amazon.com/images/I/610bLFQIS3...,442321,height,95.0 centimetre
6,https://m.media-amazon.com/images/I/51k7GMS8dg...,630869,depth,21.0 centimetre
7,https://m.media-amazon.com/images/I/51GzV0nG31...,675317,width,22.0 millimetre


In [10]:
df.to_csv('unique1.csv', index=False)

In [11]:
download_images(df['image_link'], 'train_images')

Downloaded and saved image: train_images/61I9XdN6OFL.jpg
Downloaded and saved image: train_images/71gSRbyXmoL.jpg
Downloaded and saved image: train_images/71nywfWZUwL.jpg
Downloaded and saved image: train_images/61o2ntPNNgL.jpg
Downloaded and saved image: train_images/71d+dz7ogkL.jpg
Downloaded and saved image: train_images/610bLFQIS3L.jpg
Downloaded and saved image: train_images/51k7GMS8dgL.jpg
Downloaded and saved image: train_images/51GzV0nG31L.jpg


In [14]:
class ImageCaptionDataset(Dataset):
    def __init__(self, csv_file, root_dir, transform=None):
        self.data = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform
        self.label_mapping = {name: idx for idx, name in enumerate(self.data['entity_name'].unique())}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_name = os.path.basename(self.data.iloc[idx, 0])
        img_path = os.path.join(self.root_dir, img_name)
        if not os.path.isfile(img_path):
            print(f"File {img_path} not found.")
            return None, None

        image = Image.open(img_path).convert('RGB')
        label = self.data.iloc[idx, 2]
        label = self.label_mapping[label]

        if self.transform:
            image = self.transform(image)

        return image, torch.tensor(label)

In [15]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [16]:
def fine_tune_resnet(model):
    for param in model.cnn_model.parameters():
        param.requires_grad = False
    for param in model.cnn_model.layer4.parameters():
        param.requires_grad = True

In [17]:
class Identity(nn.Module):
    def __init__(self):
        super(Identity, self).__init__()

    def forward(self, x):
        return x

In [18]:
class Attention(nn.Module):
    def __init__(self, embed_size, hidden_size):
        super(Attention, self).__init__()
        self.attn = nn.Linear(embed_size + hidden_size, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))

    def forward(self, hidden, encoder_outputs):
        timestep = encoder_outputs.size(1)
        h = hidden.repeat(timestep, 1, 1).transpose(0, 1)
        attn_energies = self.score(h, encoder_outputs)
        return F.softmax(attn_energies, dim=1)

    def score(self, hidden, encoder_outputs):
        energy = torch.tanh(self.attn(torch.cat([hidden, encoder_outputs], 2)))
        energy = energy.transpose(1, 2)
        v = self.v.repeat(encoder_outputs.size(0), 1).unsqueeze(1)
        energy = torch.bmm(v, energy)
        return energy.squeeze(1)

In [54]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torchvision.models import vit_b_16

In [55]:
class ViTRegressor(nn.Module):
    def __init__(self, num_outputs):
        super(ViTRegressor, self).__init__()
        self.vit = vit_b_16(pretrained=True)
        self.vit.heads = nn.Identity()
        self.fc = nn.Linear(768, num_outputs)
    def forward(self, x):
        x = self.vit(x)
        x = self.fc(x)
        return x

In [20]:
def custom_loss(outputs, labels):
    class_loss = nn.CrossEntropyLoss()(outputs, labels)
    return class_loss

In [21]:
def collate_fn(batch):
    images = [item[0] for item in batch if item[0] is not None]
    labels = [item[1] for item in batch if item[1] is not None]
    if images:
        images = torch.stack(images, dim=0)
    else:
        images = torch.empty(0)

    if labels:
        labels = torch.tensor(labels)
    else:
        labels = torch.empty(0)

    return images, labels

In [71]:
train_dataset = ImageCaptionDataset('/content/unique1.csv', 'train_images', transform=transform)
test_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=2, collate_fn=collate_fn)

In [59]:
model = ViTRegressor(num_outputs=1)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=2, factor=0.5, verbose=True)

Downloading: "https://download.pytorch.org/models/vit_b_16-c867db91.pth" to /root/.cache/torch/hub/checkpoints/vit_b_16-c867db91.pth
100%|██████████| 330M/330M [00:07<00:00, 47.3MB/s]


In [62]:
def train_vit_regressor(model, train_loader, criterion, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for images, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(images)
            targets = targets.float()
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

In [63]:
train_vit_regressor(model, test_loader, criterion, optimizer, num_epochs=10)

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [1/10], Loss: 18.0972
Epoch [2/10], Loss: 14.0245
Epoch [3/10], Loss: 22.1829
Epoch [4/10], Loss: 5.4012
Epoch [5/10], Loss: 5.4116
Epoch [6/10], Loss: 6.1541
Epoch [7/10], Loss: 6.5047
Epoch [8/10], Loss: 5.9014
Epoch [9/10], Loss: 5.2637
Epoch [10/10], Loss: 5.6123


In [64]:
from difflib import get_close_matches

In [73]:
def predict_values(model, data_loader, df):
    model.eval()
    predictions = []
    idx_offset = 0

    with torch.no_grad():
        for images, _ in data_loader: # Assuming your data loader returns a tuple of (images, labels)
            if isinstance(images, tuple): # Check if images is a tuple
                images = images[0] # If it is, extract the first element (assuming it's the image tensor)
            outputs = model(images)
            outputs = outputs.squeeze(1)
            for i, output in enumerate(outputs):
                predicted_value = output.item()
                entity_name = df.iloc[idx_offset + i]['entity_name']
                actual_value = df.iloc[idx_offset + i]['entity_value']

                print(f"Entity Name: {entity_name}, Actual Value: {actual_value}, Predicted Value: {predicted_value}")
                predictions.append((entity_name, actual_value, predicted_value))

            idx_offset += len(images)
    return predictions

In [74]:
predictions = predict_values(model, test_loader, df)

Entity Name: item_weight, Actual Value: 500.0 gram, Predicted Value: 0.5642597079277039
Entity Name: item_volume, Actual Value: 1.0 cup, Predicted Value: 1.1091495752334595
Entity Name: voltage, Actual Value: 48.0 volt, Predicted Value: 0.6148980259895325
Entity Name: wattage, Actual Value: 800.0 watt, Predicted Value: 0.7674131989479065
Entity Name: maximum_weight_recommendation, Actual Value: 15 kilogram, Predicted Value: 0.5704300999641418
Entity Name: height, Actual Value: 95.0 centimetre, Predicted Value: 0.9870798587799072
Entity Name: depth, Actual Value: 21.0 centimetre, Predicted Value: 1.0076121091842651
Entity Name: width, Actual Value: 22.0 millimetre, Predicted Value: 1.0357557535171509
