# Data Analysis for the Fashion Compatibility Datathon

We perform an initial exploratory analysis of the two datasets, `product_data.csv` and `outfit_data.csv`.

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load both tables and explore their features.
# Column names of each CSV, listed here for reference.
product_columns = [
    "cod_modelo_color",
    "cod_color_code",
    "des_color_specification_esp",
    "des_agrup_color_eng",
    "des_sex",
    "des_age",
    "des_line",
    "des_fabric",
    "des_product_category",
    "des_product_aggregated_family",
    "des_product_family",
    "des_product_type",
    "des_filename",
]
outfit_columns = ["cod_outfit", "cod_modelo_color"]

# Paths are relative to the directory the notebook is run from.
product_data = pd.read_csv('./datathon/dataset/product_data.csv')
outfit_data = pd.read_csv('./datathon/dataset/outfit_data.csv')

In [4]:
# Summarise the product table: column names, non-null counts and dtypes,
# to check for missing values before any further analysis.
product_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9222 entries, 0 to 9221
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   cod_modelo_color               9222 non-null   object
 1   cod_color_code                 9222 non-null   object
 2   des_color_specification_esp    9222 non-null   object
 3   des_agrup_color_eng            9222 non-null   object
 4   des_sex                        9222 non-null   object
 5   des_age                        9222 non-null   object
 6   des_line                       9222 non-null   object
 7   des_fabric                     9222 non-null   object
 8   des_product_category           9222 non-null   object
 9   des_product_aggregated_family  9222 non-null   object
 10  des_product_family             9222 non-null   object
 11  des_product_type               9222 non-null   object
 12  des_filename                   9222 non-null   object
dtypes: 

In [5]:
# Number of products for each (line, sex, age) combination, most frequent first.
product_data.value_counts(
    subset=['des_line', 'des_sex', 'des_age'],
)

des_line  des_sex  des_age
SHE       Female   Adult      9144
HOME      Unisex   Adult        50
KIDS      Female   Kids         19
VIOLETA   Female   Adult         7
HE        Male     Adult         2
Name: count, dtype: int64

In [6]:
# Number of products per fabric type, most frequent first.
product_data.value_counts(
    subset=['des_fabric'],
)

des_fabric    
P-PLANA           3041
C-COMPLEMENTOS    2569
K-CIRCULAR        1564
T-TRICOT          1274
J-JEANS            606
O-POLIPIEL          97
L-PIEL              71
Name: count, dtype: int64

In [8]:
# Attach the product attributes to every (outfit, product) row.
# The join key is stated explicitly: without `on`, pandas joins on every
# column name the two frames share, so the result would silently change if
# either CSV ever gained another common column. `how="inner"` (the pandas
# default) keeps only outfit rows whose product exists in product_data.
merged_data = pd.merge(
    outfit_data,
    product_data,
    on="cod_modelo_color",
    how="inner",
)
merged_data

Unnamed: 0,cod_outfit,cod_modelo_color,cod_color_code,des_color_specification_esp,des_agrup_color_eng,des_sex,des_age,des_line,des_fabric,des_product_category,des_product_aggregated_family,des_product_family,des_product_type,des_filename
0,1,51000622-02,02,OFFWHITE,WHITE,Female,Adult,SHE,K-CIRCULAR,Tops,Tops,Tops,Top,datathon/images/2019_51000622_02.jpg
1,2086,51000622-02,02,OFFWHITE,WHITE,Female,Adult,SHE,K-CIRCULAR,Tops,Tops,Tops,Top,datathon/images/2019_51000622_02.jpg
2,1,43067759-01,01,BLANCO,WHITE,Female,Adult,SHE,P-PLANA,Bottoms,Skirts and shorts,Skirts,Skirt,datathon/images/2019_43067759_01.jpg
3,145,43067759-01,01,BLANCO,WHITE,Female,Adult,SHE,P-PLANA,Bottoms,Skirts and shorts,Skirts,Skirt,datathon/images/2019_43067759_01.jpg
4,626,43067759-01,01,BLANCO,WHITE,Female,Adult,SHE,P-PLANA,Bottoms,Skirts and shorts,Skirts,Skirt,datathon/images/2019_43067759_01.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43577,7497,57041183-06,06,PIEDRA,WHITE,Unisex,Adult,HOME,P-PLANA,Home,Decor,Deco Textiles,Curtain,datathon/images/2023_57041183_06.jpg
43578,7085,57074425-08,08,BEIGE,WHITE,Unisex,Adult,HOME,P-PLANA,Home,Decor,Deco Textiles,Carpet Yarn,datathon/images/2023_57074425_08.jpg
43579,7497,57074425-08,08,BEIGE,WHITE,Unisex,Adult,HOME,P-PLANA,Home,Decor,Deco Textiles,Carpet Yarn,datathon/images/2023_57074425_08.jpg
43580,7373,57089203-99,99,NEGRO,GREY,Female,Adult,SHE,P-PLANA,Tops,Tops,Tops,Top,datathon/images/2019_57089203_99.jpg


In [9]:
# Distribution of outfit sizes: count the products in each outfit, then count
# how many outfits share each size, ordered by size.
(
    outfit_data
    .groupby('cod_outfit')
    .count()
    .value_counts()
    .sort_index()
)

cod_modelo_color
2                     30
3                    142
4                    758
5                   3875
6                   1737
7                    674
8                    309
9                    174
10                    95
11                    26
12                    13
13                     8
15                     1
Name: count, dtype: int64

### Trying to apply ARL (Association Rule Learning)

Reference article: https://medium.com/mlearning-ai/recommendation-systems-arl-association-rule-learning-bed1a07b5d9a

In [18]:
# Build the outfit x colour-group presence matrix used as apriori input:
# one row per outfit, one column per colour group, True when the outfit
# contains at least one product of that colour group.
# (Despite the name, the frame holds booleans, not counts; the name is kept
# because later cells refer to it.)
#
# `.size()` counts rows per (outfit, colour) pair, `unstack(fill_value=0)`
# pivots colours into columns with 0 for absent pairs, and `.gt(0)` is a
# vectorised comparison -- this avoids a per-element Python lambda and does
# not depend on `DataFrame.map`, which only exists in pandas >= 2.1.
outfit_color_counts = (
    merged_data
    .groupby(['cod_outfit', 'des_agrup_color_eng'])
    .size()
    .unstack(fill_value=0)
    .gt(0)
)
outfit_color_counts

des_agrup_color_eng,BLUE,BROWN,GREEN,GREY,ORANGE,PINK,PURPLE,RED,WHITE,YELLOW
cod_outfit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,False,False,False,False,False,False,False,False,True,False
2,False,False,False,False,False,False,False,False,True,False
3,True,False,False,True,False,False,False,True,True,False
4,True,False,False,False,False,True,False,False,True,True
5,False,True,False,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...
7838,False,True,False,True,False,False,False,False,True,False
7839,False,True,False,True,False,False,False,False,True,False
7840,False,False,False,True,False,False,False,False,True,False
7841,False,False,True,True,False,False,False,False,True,False


In [34]:
from mlxtend.frequent_patterns import apriori, association_rules

# Mine frequent colour-group itemsets from the outfit presence matrix
# (minimum support 0.01) and list them from most to least frequent.
outfit_frequent_colors = (
    apriori(
        outfit_color_counts,
        min_support=0.01,
        use_colnames=True,
    )
    .sort_values(by="support", ascending=False)
)
outfit_frequent_colors

Unnamed: 0,support,itemsets
8,0.849656,(WHITE)
3,0.768426,(GREY)
31,0.627008,"(GREY, WHITE)"
1,0.308977,(BROWN)
0,0.284239,(BLUE)
...,...,...
35,0.011859,"(PURPLE, WHITE)"
40,0.011222,"(BLUE, GREY, GREEN)"
61,0.010967,"(BLUE, GREY, WHITE, PINK)"
13,0.010457,"(BLUE, ORANGE)"


In [44]:
# Derive association rules from the frequent colour itemsets, using support
# as the selection metric with a threshold of 0.01.
outfit_color_rules = association_rules(
    outfit_frequent_colors,
    metric="support",
    min_threshold=0.01,
)

# Keep only rules whose support and confidence both exceed 0.1, ordered by
# confidence, then support, then lift (all descending).
outfit_interesting_rules = (
    outfit_color_rules
    .loc[lambda rules: (rules["support"] > 0.1) & (rules["confidence"] > 0.1)]
    .sort_values(by=['confidence', 'support', 'lift'], ascending=False)
)
outfit_interesting_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
2,(BROWN),(WHITE),0.308977,0.849656,0.281943,0.912505,1.073971,0.019419,1.718322,0.099672
10,"(BROWN, GREY)",(WHITE),0.199821,0.849656,0.175083,0.876197,1.031237,0.005303,1.214379,0.037855
4,(BLUE),(WHITE),0.284239,0.849656,0.244453,0.860027,1.012206,0.002948,1.074094,0.016848
16,"(BLUE, GREY)",(WHITE),0.210916,0.849656,0.174318,0.826481,0.972725,-0.004888,0.866444,-0.034315
0,(GREY),(WHITE),0.768426,0.849656,0.627008,0.815964,0.960347,-0.025889,0.816929,-0.151323
6,(BLUE),(GREY),0.284239,0.768426,0.210916,0.742037,0.965658,-0.007501,0.8977,-0.047335
1,(WHITE),(GREY),0.849656,0.768426,0.627008,0.737956,0.960347,-0.025889,0.88372,-0.215464
17,"(BLUE, WHITE)",(GREY),0.244453,0.768426,0.174318,0.713093,0.927992,-0.013526,0.807139,-0.093136
8,(BROWN),(GREY),0.308977,0.768426,0.199821,0.646719,0.841615,-0.037605,0.655494,-0.214046
11,"(BROWN, WHITE)",(GREY),0.281943,0.768426,0.175083,0.620986,0.808127,-0.04157,0.610989,-0.248491
