In [1]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize

In [3]:
data = pd.read_csv('/content/train_amazon_ml.csv')

In [4]:
data

Unnamed: 0,image_link,group_id,entity_name,entity_value
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram
...,...,...,...,...
263854,https://m.media-amazon.com/images/I/612J1R1xHl...,558806,height,5.0 centimetre
263855,https://m.media-amazon.com/images/I/61Blzh2+28...,470067,height,8.5 inch
263856,https://m.media-amazon.com/images/I/51MsegDL9V...,204245,height,43.2 centimetre
263857,https://m.media-amazon.com/images/I/510KhVw4VS...,752266,height,9.1 centimetre


In [5]:
def download_images(image_links, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    for i, link in enumerate(image_links):
        try:
            response = requests.get(link)
            response.raise_for_status()
            parsed_url = urlparse(link)
            image_name = os.path.basename(parsed_url.path)
            image_path = os.path.join(output_dir, image_name)
            image = Image.open(BytesIO(response.content))
            image.save(image_path)
            print(f"Downloaded and saved image: {image_path}")
        except Exception as e:
            print(f"Error downloading or saving image from {link}: {e}")

In [20]:
import os
import pandas as pd
import requests
from io import BytesIO
from PIL import Image
import numpy as np
import torch
import torch.nn as nn
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from tqdm import tqdm
from urllib.parse import urlparse
from collections import OrderedDict
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.nn.functional as F
from torch.optim import Adam

In [10]:
unique_values = OrderedDict()
b = []
for index, value in enumerate(data['entity_name']):
    if value not in unique_values:
        unique_values[value] = index
        b.append(index)

In [11]:
b

[0, 1, 25, 36, 197, 536, 670, 3205]

In [12]:
d_image1, group_id2, entity_name2, entity_value2 = [], [], [], []
for i in range(0, len(b)):
    d_image1.append(data['image_link'][b[i]])
    group_id2.append(data['group_id'][b[i]])
    entity_name2.append(data['entity_name'][b[i]])
    entity_value2.append(data['entity_value'][b[i]])

In [13]:
df = pd.DataFrame({'image_link': d_image1, 'group_id': group_id2, 'entity_name': entity_name2, 'entity_value': entity_value2})

In [14]:
df

Unnamed: 0,image_link,group_id,entity_name,entity_value
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup
2,https://m.media-amazon.com/images/I/71nywfWZUw...,179080,voltage,48.0 volt
3,https://m.media-amazon.com/images/I/61o2ntPNNg...,179080,wattage,800.0 watt
4,https://m.media-amazon.com/images/I/71d+dz7ogk...,150913,maximum_weight_recommendation,15 kilogram
5,https://m.media-amazon.com/images/I/610bLFQIS3...,442321,height,95.0 centimetre
6,https://m.media-amazon.com/images/I/51k7GMS8dg...,630869,depth,21.0 centimetre
7,https://m.media-amazon.com/images/I/51GzV0nG31...,675317,width,22.0 millimetre


In [15]:
df.to_csv('unique.csv', index=False)

In [21]:
download_images(df['image_link'], 'train_images')

Downloaded and saved image: train_images/61I9XdN6OFL.jpg
Downloaded and saved image: train_images/71gSRbyXmoL.jpg
Downloaded and saved image: train_images/71nywfWZUwL.jpg
Downloaded and saved image: train_images/61o2ntPNNgL.jpg
Downloaded and saved image: train_images/71d+dz7ogkL.jpg
Downloaded and saved image: train_images/610bLFQIS3L.jpg
Downloaded and saved image: train_images/51k7GMS8dgL.jpg
Downloaded and saved image: train_images/51GzV0nG31L.jpg


In [2]:
class ImageCaptionDataset(Dataset):
    def __init__(self, csv_file, root_dir, transform=None):
        self.data = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform
        self.label_mapping = {name: idx for idx, name in enumerate(self.data['entity_name'].unique())}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_name = os.path.basename(self.data.iloc[idx, 0])
        img_path = os.path.join(self.root_dir, img_name)
        if not os.path.isfile(img_path):
            print(f"File {img_path} not found.")
            return None, None

        image = Image.open(img_path).convert('RGB')
        label = self.data.iloc[idx, 2]
        label = self.label_mapping[label]

        if self.transform:
            image = self.transform(image)

        return image, torch.tensor(label)

In [22]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [53]:
class VisionEncoder(nn.Module):
    def __init__(self):
        super(VisionEncoder, self).__init__()
        self.resnet = models.resnet18(pretrained=False)
        self.resnet.fc = nn.Linear(512, 128)

    def forward(self, x):
        return self.resnet(x)

class SimpleTextEncoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc = nn.Linear(embedding_dim, 128) # Changed the input features to embedding_dim

    def forward(self, x):
        embedded = self.embedding(x)
        pooled = embedded.mean(dim=1)
        return self.fc(pooled)

class CLIPModel(nn.Module):
    def __init__(self, vision_encoder, text_encoder):
        super(CLIPModel, self).__init__()
        self.vision_encoder = vision_encoder
        self.text_encoder = text_encoder

    def forward(self, images, texts):
        image_features = self.vision_encoder(images)
        text_features = self.text_encoder(texts)
        return image_features, text_features

In [54]:
class SimpleTokenizer:
    def __init__(self, texts):
        self.vectorizer = CountVectorizer()
        self.vectorizer.fit(texts)
        self.vocab_size = len(self.vectorizer.vocabulary_)

    def tokenize(self, texts):
        tokens = self.vectorizer.transform(texts).toarray()
        return torch.tensor(tokens, dtype=torch.long)

def normalize_features(features):
    return normalize(features)

In [55]:
def train_clip_model(model, dataloader, criterion, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for images, labels in dataloader:
            if images is None:
                continue
            optimizer.zero_grad()
            image_features, text_features = model(images, labels)
            loss = criterion(image_features, text_features)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(dataloader):.4f}")

def predict(model, image, text_queries, tokenizer):
    model.eval()
    with torch.no_grad():
        image = transform(image).unsqueeze(0)
        image_features = model.vision_encoder(image)

        tokens = tokenizer.tokenize(text_queries)
        text_features = model.text_encoder(tokens)

        image_features = normalize_features(image_features.numpy())
        text_features = normalize_features(text_features.numpy())

        similarities = np.dot(image_features, text_features.T)
        top_k_indices = np.argsort(similarities.flatten())[-5:]

        return top_k_indices

In [56]:
train_dataset = ImageCaptionDataset('/content/unique.csv', 'train_images', transform=transform)
test_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=2)

In [57]:
text_data = train_dataset.data['entity_name'].unique()
tokenizer = SimpleTokenizer(text_data)

In [58]:
vision_encoder = VisionEncoder()
text_encoder = SimpleTextEncoder(vocab_size=tokenizer.vocab_size, embedding_dim=128)
model = CLIPModel(vision_encoder, text_encoder)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)



In [59]:
train_clip_model(model, test_loader, criterion, optimizer, num_epochs=10)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x8 and 128x128)