In [61]:
import pandas as pd


### Exploration 
- How many images per label?
- What other information is available?

In [62]:
# Read the styles metadata, restricted to its first ten columns, then show the top rows
df = pd.read_csv(
    'data/styles.csv',
    usecols=range(10),
)
display(df.head(n=5))

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt


In [63]:
# Number of distinct values in each column
print("* UNIQUE VALUES PER COLUMN *", df.nunique(), sep="\n")

# Row count per subcategory, largest first; subcategories are the categories the model will learn
print("\n* FREQUENCY BY SUBCATEGORY *", df["subCategory"].value_counts(ascending=False), sep="\n")


* UNIQUE VALUES PER COLUMN *
id                    44446
gender                    5
masterCategory            7
subCategory              45
articleType             143
baseColour               46
season                    4
year                     13
usage                     8
productDisplayName    31135
dtype: int64

* FREQUENCY BY SUBCATEGORY *
subCategory
Topwear                     15405
Shoes                        7344
Bags                         3055
Bottomwear                   2694
Watches                      2542
Innerwear                    1808
Jewellery                    1080
Eyewear                      1073
Fragrance                    1012
Sandal                        963
Wallets                       933
Flip Flops                    915
Belts                         811
Socks                         698
Lips                          527
Dress                         478
Loungewear and Nightwear      470
Saree                         427
Nails                   

### Preprocessing

In [64]:
# Subcategories kept as their own class; every other subcategory is pooled into "Other"
categories = ["Topwear", "Bottomwear", "Innerwear", "Bags", "Watches", "Jewellery", "Eyewear", "Wallets", "Shoes", "Sandal", "Makeup", "Fragrance"]

# Vectorised relabelling: keep the subcategory where it appears in `categories`,
# otherwise substitute "Other" (missing values are not in the list, so they also become "Other").
is_kept = df["subCategory"].isin(categories)

# Build the result as a new frame through a method chain instead of mutating in place:
# add `category`, keep only id + category, index by id, and sort by id.
df = (
    df.assign(category=df["subCategory"].where(is_kept, "Other"))
    .loc[:, ["id", "category"]]
    .set_index("id")
    .sort_index()
)

display(df.head())
print("* FREQUENCY BY CATEGORY *")
print(df["category"].value_counts(ascending=False))

Unnamed: 0_level_0,category
id,Unnamed: 1_level_1
1163,Topwear
1164,Topwear
1165,Topwear
1525,Bags
1526,Bags


* FREQUENCY BY CATEGORY *
category
Topwear       15405
Shoes          7344
Other          6230
Bags           3055
Bottomwear     2694
Watches        2542
Innerwear      1808
Jewellery      1080
Eyewear        1073
Fragrance      1012
Sandal          963
Wallets         933
Makeup          307
Name: count, dtype: int64


In [70]:
# Spot-check one row by its `id` label (`id` became the index in the preprocessing cell)
df.loc[1163]

category    Topwear
Name: 1163, dtype: object

In [None]:
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os

class FashionDataset(Dataset):
    """Dataset that pairs each product image with its category label.

    Samples are addressed by position in ``df``. The product id found at that
    position names the image file ``<root_dir>/<id>.jpg``.
    """

    def __init__(self, df: pd.DataFrame, root_dir, transform=None):
        """
        Args:
            df (pandas.DataFrame): DataFrame with a `category` column. Product ids (file names)
                are read from an `id` column when present, otherwise from the index.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied on a sample.
        """
        self.df = df
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``.

        Args:
            idx (int): Positional row number in ``self.df``.

        Returns:
            tuple: The RGB image (after ``transform``, if one was given) and the
            value of the `category` column for that row.
        """
        # The image file is named after the product id, not after the row position.
        # Support both layouts: `id` as a regular column, or `id` as the index.
        if "id" in self.df.columns:
            img_id = self.df["id"].iloc[idx]
        else:
            img_id = self.df.index[idx]

        # Read the label by column name so the column order of the frame does not matter.
        label = self.df["category"].iloc[idx]

        img_name = os.path.join(self.root_dir, f"{img_id}.jpg")
        # Convert to RGB to ensure consistency; the context manager releases the file handle.
        with Image.open(img_name) as img:
            image = img.convert('RGB')

        if self.transform is not None:
            image = self.transform(image)

        return image, label
