In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
import torch
from torchvision import models, transforms
from PIL import Image
from transformers import CLIPModel
from transformers import CLIPProcessor
import time
import faiss

  from .autonotebook import tqdm as notebook_tqdm
  Referenced from: /Users/chiara/opt/anaconda3/envs/myenv/lib/python3.10/site-packages/torchvision/image.so
  warn(


In [4]:
filename = "./archive/myntradataset/styles.csv"
df = pd.read_csv(filename, on_bad_lines="skip") #some lines of the dataset fail due to excess commas

available_ids = os.listdir("./archive/myntradataset/images")
available_ids = [int(x.replace(".jpg","")) for x in available_ids]
df = df[df.id.isin(available_ids)] #some images are not actually available
df=df.dropna(subset='productDisplayName')

df.head()


Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt


### Data Exploration

In [5]:
#Find all unique categories from the master category column and the number of time each appears
value_counts_master = df['masterCategory'].value_counts()
print(value_counts_master)

masterCategory
Apparel           21392
Accessories       11272
Footwear           9219
Personal Care      2398
Free Items          105
Sporting Goods       25
Home                  1
Name: count, dtype: int64


In [6]:
#Find all unique categories from the sub category column and the number of time each appears
value_counts_sub = df['subCategory'].value_counts()
print(value_counts_sub)

subCategory
Topwear                     15398
Shoes                        7343
Bags                         3053
Bottomwear                   2693
Watches                      2542
Innerwear                    1808
Jewellery                    1079
Eyewear                      1073
Fragrance                    1006
Sandal                        963
Wallets                       933
Flip Flops                    913
Belts                         811
Socks                         698
Lips                          527
Dress                         478
Loungewear and Nightwear      470
Saree                         427
Nails                         329
Makeup                        307
Headwear                      293
Ties                          258
Accessories                   129
Scarves                       118
Cufflinks                     108
Apparel Set                   106
Free Gifts                    104
Stoles                         90
Skin Care                      77
Sk

In [7]:
#Find all unique categories from the article type column and the number of time each appears
value_counts_article = df['articleType'].value_counts()
value_counts_article.head(20)
#value_counts_article.tail(50)

articleType
Tshirts                  7066
Shirts                   3215
Casual Shoes             2845
Watches                  2542
Sports Shoes             2036
Kurtas                   1844
Tops                     1762
Handbags                 1759
Heels                    1323
Sunglasses               1073
Wallets                   936
Flip Flops                914
Sandals                   897
Briefs                    849
Belts                     813
Backpacks                 724
Socks                     686
Formal Shoes              637
Jeans                     608
Perfume and Body Mist     608
Name: count, dtype: int64

In [8]:
#Find all unique categories from the gender column and the number of time each appears
value_counts_gender = df['gender'].value_counts()
value_counts_gender.head()

gender
Men       22139
Women     18627
Unisex     2161
Boys        830
Girls       655
Name: count, dtype: int64

In [9]:
#List of unique colors in baseColour column
unique_values = df['baseColour'].unique()
print(unique_values)
np.shape(unique_values) #47 colors

['Navy Blue' 'Blue' 'Silver' 'Black' 'Grey' 'Green' 'Purple' 'White'
 'Beige' 'Brown' 'Bronze' 'Teal' 'Copper' 'Pink' 'Off White' 'Maroon'
 'Red' 'Khaki' 'Orange' 'Coffee Brown' 'Yellow' 'Charcoal' 'Gold' 'Steel'
 'Tan' 'Multi' 'Magenta' 'Lavender' 'Sea Green' 'Cream' 'Peach' 'Olive'
 'Skin' 'Burgundy' 'Grey Melange' 'Rust' 'Rose' 'Lime Green' 'Mauve'
 'Turquoise Blue' 'Metallic' 'Mustard' 'Taupe' 'Nude' 'Mushroom Brown' nan
 'Fluorescent Green']


(47,)

In [10]:
#Find all unique categories from the base colour column and the number of time each appears
value_counts_color = df['baseColour'].value_counts()
value_counts_color.head()

baseColour
Black    9727
White    5538
Blue     4917
Brown    3493
Grey     2741
Name: count, dtype: int64

In [11]:
shirts = df.loc[df['articleType'] == 'Shirts']
np.shape(shirts)
shirts.head()

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt
6,30805,Men,Apparel,Topwear,Shirts,Green,Summer,2012.0,Ethnic,Fabindia Men Striped Green Shirt
7,26960,Women,Apparel,Topwear,Shirts,Purple,Summer,2012.0,Casual,Jealous 21 Women Purple Shirt
15,12369,Men,Apparel,Topwear,Shirts,Purple,Fall,2011.0,Formal,Reid & Taylor Men Check Purple Shirts
30,37812,Men,Apparel,Topwear,Shirts,Navy Blue,Summer,2012.0,Formal,John Players Men Navy Blue Shirt


In [12]:
#Sort by red and shirt
red_shirts = df.loc[(df['articleType'] == 'Shirts') & (df['baseColour'] == 'Red')]
red_shirts.head()
#np.shape(red_shirts) #270

#Sort by purple and jacket
purple_jacket = df.loc[(df['articleType'] == 'Jacket') & (df['baseColour'] == 'Purple')]
np.shape(purple_jacket) #10

(0, 10)

In [17]:
#Search of leather packbacks
search_string = "leather"
result = df[df.apply(lambda row: row.astype(str).str.contains(search_string, case=False).any(), axis=1)]
result[result['articleType'] == 'Backpacks'] #only 2 products

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName
14794,20938,Men,Accessories,Bags,Backpacks,Black,Winter,2016.0,Casual,Fastrack Men Leatherette Black Backpack
28213,25206,Women,Accessories,Bags,Backpacks,Beige,Winter,2015.0,Casual,Lino Perros Women Leatherite Beige Backpack
