# Data Analysis for the Fashion Compatibility Datathon

We will perform some initial data analysis of both datasets `product_data.csv` and `outfit_data.csv`:

In [1]:
import pandas as pd
import numpy as np

In [2]:
# load both tables and explore its features
product_columns = ["cod_modelo_color","cod_color_code","des_color_specification_esp","des_agrup_color_eng","des_sex","des_age","des_line","des_fabric","des_product_category","des_product_aggregated_family","des_product_family","des_product_type","des_filename"]
outfit_columns = ["cod_outfit", "cod_modelo_color"]

product_data = pd.read_csv('./datathon/dataset/product_data.csv')
outfit_data = pd.read_csv('./datathon/dataset/outfit_data.csv')

In [3]:
product_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9222 entries, 0 to 9221
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   cod_modelo_color               9222 non-null   object
 1   cod_color_code                 9222 non-null   object
 2   des_color_specification_esp    9222 non-null   object
 3   des_agrup_color_eng            9222 non-null   object
 4   des_sex                        9222 non-null   object
 5   des_age                        9222 non-null   object
 6   des_line                       9222 non-null   object
 7   des_fabric                     9222 non-null   object
 8   des_product_category           9222 non-null   object
 9   des_product_aggregated_family  9222 non-null   object
 10  des_product_family             9222 non-null   object
 11  des_product_type               9222 non-null   object
 12  des_filename                   9222 non-null   object
dtypes: 

In [4]:
product_data.value_counts(['des_line', 'des_sex', 'des_age'])

des_line  des_sex  des_age
SHE       Female   Adult      9144
HOME      Unisex   Adult        50
KIDS      Female   Kids         19
VIOLETA   Female   Adult         7
HE        Male     Adult         2
Name: count, dtype: int64

In [5]:
product_data.value_counts(['des_fabric'])

des_fabric    
P-PLANA           3041
C-COMPLEMENTOS    2569
K-CIRCULAR        1564
T-TRICOT          1274
J-JEANS            606
O-POLIPIEL          97
L-PIEL              71
Name: count, dtype: int64

In [6]:
merged_data = pd.merge(outfit_data, product_data)
merged_data

Unnamed: 0,cod_outfit,cod_modelo_color,cod_color_code,des_color_specification_esp,des_agrup_color_eng,des_sex,des_age,des_line,des_fabric,des_product_category,des_product_aggregated_family,des_product_family,des_product_type,des_filename
0,1,51000622-02,02,OFFWHITE,WHITE,Female,Adult,SHE,K-CIRCULAR,Tops,Tops,Tops,Top,datathon/images/2019_51000622_02.jpg
1,2086,51000622-02,02,OFFWHITE,WHITE,Female,Adult,SHE,K-CIRCULAR,Tops,Tops,Tops,Top,datathon/images/2019_51000622_02.jpg
2,1,43067759-01,01,BLANCO,WHITE,Female,Adult,SHE,P-PLANA,Bottoms,Skirts and shorts,Skirts,Skirt,datathon/images/2019_43067759_01.jpg
3,145,43067759-01,01,BLANCO,WHITE,Female,Adult,SHE,P-PLANA,Bottoms,Skirts and shorts,Skirts,Skirt,datathon/images/2019_43067759_01.jpg
4,626,43067759-01,01,BLANCO,WHITE,Female,Adult,SHE,P-PLANA,Bottoms,Skirts and shorts,Skirts,Skirt,datathon/images/2019_43067759_01.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43577,7497,57041183-06,06,PIEDRA,WHITE,Unisex,Adult,HOME,P-PLANA,Home,Decor,Deco Textiles,Curtain,datathon/images/2023_57041183_06.jpg
43578,7085,57074425-08,08,BEIGE,WHITE,Unisex,Adult,HOME,P-PLANA,Home,Decor,Deco Textiles,Carpet Yarn,datathon/images/2023_57074425_08.jpg
43579,7497,57074425-08,08,BEIGE,WHITE,Unisex,Adult,HOME,P-PLANA,Home,Decor,Deco Textiles,Carpet Yarn,datathon/images/2023_57074425_08.jpg
43580,7373,57089203-99,99,NEGRO,GREY,Female,Adult,SHE,P-PLANA,Tops,Tops,Tops,Top,datathon/images/2019_57089203_99.jpg


In [7]:
outfit_data.groupby('cod_outfit').count().value_counts().sort_index()

cod_modelo_color
2                     30
3                    142
4                    758
5                   3875
6                   1737
7                    674
8                    309
9                    174
10                    95
11                    26
12                    13
13                     8
15                     1
Name: count, dtype: int64

### Trying to apply ARL
https://medium.com/mlearning-ai/recommendation-systems-arl-association-rule-learning-bed1a07b5d9a

In [11]:
# Get the COLOR counts for each outfit
pivot_outfit_colors = merged_data.groupby(['cod_outfit', 'des_agrup_color_eng'])['des_agrup_color_eng'].count().unstack().fillna(0).apply(lambda x: 1 if x >= 0.0 else 0)
# pivot_outfit_colors = pd.pivot_table(merged_data[['cod_outfit', 'cod_color_code', 'des_agrup_color_eng']], index=['cod_outfit'], columns=['des_agrup_color_eng'], aggfunc="count", fill_value = 0.0)

pivot_outfit_colors

des_agrup_color_eng,BLUE,BROWN,GREEN,GREY,ORANGE,PINK,PURPLE,RED,WHITE,YELLOW
cod_outfit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,4.0,0.0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,7.0,1.0
5,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,0.0
...,...,...,...,...,...,...,...,...,...,...
7838,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,2.0,0.0
7839,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,2.0,0.0
7840,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,2.0,0.0
7841,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0


In [12]:
from mlxtend.frequent_patterns import apriori, association_rules

apriori(pivot_outfit_colors, min_support=0.01, use_colnames=True)



ValueError: The allowed values for a DataFrame are True, False, 0, 1. Found value 7.0

In [None]:
frequent_itemsets = apriori(fr_inv_pro_df,
                            min_support=0.01,
                            use_colnames=True)
frequent_itemsets.sort_values("support", ascending=False)