In [91]:
import pandas as pd
import os
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image


In [94]:
# Load both splits (tab-separated, despite the .csv extension) and preview them
train_df = pd.read_csv('train.csv', sep='\t')
test_df = pd.read_csv('test.csv', sep='\t')

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each split keeps its own default row labels, so the same label would appear
# once per split and label-based lookups (df.loc[...]) would be ambiguous.
df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
display(df.head())

Unnamed: 0,imageid,label,productname
0,2653,Bags,Murcia Women Leather Office Grey Bag
1,55997,Others,Colorbar Velvet Matte Temptation Lipstick 24MA
2,2640,Shoes,Carlton London Men Brown Formal Shoes
3,40565,Topwear,W Women Maroon Kurta
4,38932,Bottomwear,Gini and Jony Girls Pink Leggings


### Preprocessing

In [93]:
# Make sure these are the only categories that appear in the dataset
labels = {"Topwear", "Bottomwear", "Innerwear", "Bags", "Watches", "Jewellery", "Eyewear", "Wallets", "Shoes", "Sandal", "Makeup", "Fragrance", "Others"}
unique_labels = df["label"].unique()
found_labels = set(unique_labels)

# On failure, report exactly which categories are unexpected or absent instead
# of a bare AssertionError. key=str keeps sorting safe if a non-string value
# (e.g. NaN from an empty cell) shows up among the labels.
assert labels == found_labels, (
    f"label mismatch: unexpected={sorted(found_labels - labels, key=str)}, "
    f"missing={sorted(labels - found_labels, key=str)}"
)

print("* FREQUENCY BY CATEGORY *")
print(df["label"].value_counts(ascending=False))

* FREQUENCY BY CATEGORY *
label
Topwear       15401
Shoes          7344
Others         6230
Bags           3055
Bottomwear     2693
Watches        2542
Innerwear      1808
Jewellery      1080
Eyewear        1073
Fragrance      1012
Sandal          963
Wallets         933
Makeup          307
Name: count, dtype: int64


### Load the data

In [99]:
class FashionDataset(Dataset):
    """Image dataset driven by a tab-separated metadata file.

    Each row of the metadata file describes one sample: an image id (the
    image's file name without the ``.jpg`` extension) and its class label.
    Indexing the dataset returns an ``(image, label)`` pair.
    """

    def __init__(self, csv_file, images_dir, transform=None):
        """
        Args:
            csv_file (string): path to the tab-separated file with `imageid` (file name) and `label`.
            images_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied on a sample.
        """
        self.df = pd.read_csv(csv_file, sep='\t')
        self.images_dir = images_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the metadata file."""
        return len(self.df)

    def _cell(self, idx, name, fallback_pos):
        """Return row `idx` of column `name`, or of column `fallback_pos` if `name` is absent."""
        if name in self.df.columns:
            return self.df[name].iloc[idx]
        return self.df.iloc[idx, fallback_pos]

    def __getitem__(self, idx):
        """Load the sample at integer position `idx`.

        Returns:
            tuple: ``(image, label)``. ``image`` is an RGB PIL image, or the
            output of ``transform`` when one was given; ``label`` is the raw
            value stored in the metadata file.
        """
        # Resolve fields by column name so an extra leading column (such as a
        # saved index, which pandas reads as "Unnamed: 0") cannot shift the
        # id/label lookup. Files without these names keep the positional layout.
        image_id = self._cell(idx, "imageid", 0)
        label = self._cell(idx, "label", 1)

        img_path = os.path.join(self.images_dir, f"{image_id}.jpg")

        # Image.open is lazy and keeps the file open; convert() forces the
        # pixel data to load, so the handle can be released right away rather
        # than piling up across thousands of samples. Converting to RGB also
        # gives every sample the same number of channels.
        with Image.open(img_path) as img:
            image = img.convert("RGB")

        if self.transform is not None:
            image = self.transform(image)

        return image, label

# Build one dataset per split; both read their images from the same folder
# and no transform is attached yet (transform defaults to None).
train_data = FashionDataset(csv_file="train.csv", images_dir="images")
test_data = FashionDataset(csv_file="test.csv", images_dir="images")

# Report how many samples each split contains.
for split_name, split_data in (("training", train_data), ("testing", test_data)):
    print(len(split_data), f"{split_name} samples")

40441 training samples
4000 testing samples
