In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import LabelEncoder
import os
# Global plotting defaults for the notebook: seaborn's "whitegrid" theme first,
# then the default figure size (set afterwards so the theme call cannot
# overwrite it).
sns.set_theme(style="whitegrid")
plt.rcParams.update({"figure.figsize": (8, 4)})

In [None]:
# Load the product metadata table.
df = pd.read_csv('styles.csv')

# Strip stray whitespace from the header names *before* any column is accessed
# by name, so the "year" lookup below cannot miss on a padded header.
df.columns = df.columns.str.strip()

# Nullable integer dtype: keeps years as integers even when some are missing.
df["year"] = df["year"].astype("Int64")
df.head()

In [None]:
# Normalise the header names (drop leading/trailing whitespace), then report
# the overall size of the table.
df.columns = df.columns.str.strip()
print("Data shape:", df.shape)

# Number of distinct values in each column.
unique_counts = df.nunique()
print("\nUnique values per column:")
print(unique_counts)

# Rows that are exact copies of an earlier row.
duplicate_row_count = df.duplicated().sum()
print("\nDuplicate rows:", duplicate_row_count)

# Repeated values in the "id" column (0 means every id is unique).
duplicate_id_count = df["id"].duplicated().sum()
print("\nID duplicated:", duplicate_id_count)

In [None]:
# Drop every row that has a missing value in ANY column (dropna's default
# how="any"). Rebinding the name instead of mutating in place keeps the step
# explicit and avoids the discouraged `inplace=True` pattern.
df = df.dropna()

In [None]:
# Partition the column names by dtype: object columns are treated as
# categorical, (nullable) integer and float columns as numerical.
categorical_cols = list(df.select_dtypes(include="object").columns)
numerical_cols = list(
    df.select_dtypes(include=["int64", "float64", "Int64"]).columns
)

print("Categorical columns:", categorical_cols)
print("Numerical columns:", numerical_cols)

In [None]:
# One count plot per categorical attribute, bars ordered from the most to the
# least frequent value.
for col in ("gender", "masterCategory", "usage"):
    frequency_order = df[col].value_counts().index
    plt.figure()
    sns.countplot(data=df, x=col, order=frequency_order)
    plt.title(f"Distribution of {col}")
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# Temporal study: number of products recorded for each year.
sns.countplot(data=df, x="year")
plt.xticks(rotation=45)
plt.title("Product count per year")
plt.show()

# Season vs year matrix: contingency table with years as rows and seasons as
# columns, drawn as a heatmap annotated with the integer counts.
pivot = pd.crosstab(index=df["year"], columns=df["season"])
sns.heatmap(pivot, annot=True, fmt="d", cmap="coolwarm")
plt.title("Season vs Year Heatmap")
plt.show()

In [None]:
# Correlation matrix over a label-encoded copy of the data.
# Each categorical column is cast to str and replaced by integer codes; the
# original `df` is left untouched.
# NOTE(review): the integer codes carry no inherent order for nominal
# categories, so these coefficients are only a rough exploratory signal.
enc = LabelEncoder()
encoded = df.copy()
for col in categorical_cols:
    values_as_text = encoded[col].astype(str)
    encoded[col] = enc.fit_transform(values_as_text)

corr = encoded.corr()

plt.figure(figsize=(6, 4))
sns.heatmap(corr, center=0, cmap="coolwarm")
plt.title("Encoded Feature Correlation Matrix")
plt.show()

In [None]:
# Cross distributions
cat_cross = pd.crosstab(df["gender"], df["masterCategory"], normalize="index")
cat_cross.plot(kind="bar", stacked=True, colormap="viridis")
plt.title("Gender vs Master Category (Proportions)")
plt.show()



Decisions based on this EDA, aimed at a balanced dataset without a dominant category or year:
- We restrict "gender" to Women and Men, since Boys, Girls and Unisex represent only a small proportion of the articles.
- For "masterCategory" we keep only 'Apparel', 'Accessories', 'Footwear' and 'Personal Care'.
- As the heatmap and the year distribution show, the 2011 and 2012 data are representative enough.

We go from a dataset of 44446 rows to a restricted one with only 27144 rows.

In [None]:
# Restrictive dataset
df_restricted = df[df["gender"].isin(["Men", "Women"]) 
                   & df["masterCategory"].isin(["Apparel", "Accessories", "Footwear", "Personal care"])
                   & df["year"].isin([2011, 2012])]
print("Restricted Data shape:", df_restricted.shape)
df_restricted.head()

In [None]:
# Persist the restricted table next to the notebook; the DataFrame index is
# not written as a column.
df_restricted.to_csv(path_or_buf="styles_cleaned.csv", index=False)

In [None]:
# removed ids
removed_ids = set(df['id']) - set(df_restricted['id'])
print("Number of removed IDs:", len(removed_ids))

image_files = os.listdir("images")
print("Total images in 'images/' folder:", len(image_files))

In [None]:
# remove from the "images/" folder the images called removed_ids.jpg
for i in list(removed_ids):
    path = os.path.join("images", f"{i}.jpg")
    if os.path.exists(path):
        os.remove(path)
image_files = os.listdir("images")
print("Total images in 'images/' folder:", len(image_files))