## In this notebook, we'll try to get to know about our H&M dataset.
> 📝 **Note:** The EDA strategies are from my instructor. Because of my limited knowledge, feel free to add more !

## Set up

In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
import math
import sys

import datasets
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from datasets import load_dataset
from evidently.metric_preset import DataQualityPreset
from evidently.report import Report
from plotly.subplots import make_subplots
from pydantic import BaseModel

datasets.logging.set_verbosity_error()

sys.path.insert(0, "..")

from src.utils.data_prep import parse_dt

# Set up plot styling
from src.visualization.setup import color_scheme

## Configuration

In [3]:
from src.config.dataset_config import dataset_config

class Args(BaseModel):
    hf_dataset_path: str = dataset_config.HF_DATASET_PATH
    hf_dataset_token: str = dataset_config.HF_TOKEN
    report_sample_num_rows: int = 10000  #sampling data in order to simulate real distribution
    random_seed: int = 41


args = Args()

[32m2024-12-09 15:02:34.145[0m | [1mINFO    [0m | [36msrc.config.dataset_config[0m:[36m<module>[0m:[36m29[0m - [1mDataset Path: dinhlnd1610/HM-Personalized-Fashion-Recommendations[0m


## Load data

In [4]:
article_metadata = load_dataset(
    args.hf_dataset_path,
    token= args.hf_dataset_token,
    name="articles",
    split="train",
    trust_remote_code=True,
)

In [5]:
article_metadata[0]

{'article_id': 108775015,
 'product_code': 108775,
 'prod_name': 'Strap top',
 'product_type_no': 253,
 'product_type_name': 'Vest top',
 'product_group_name': 'Garment Upper body',
 'graphical_appearance_no': 1010016,
 'graphical_appearance_name': 'Solid',
 'colour_group_code': 9,
 'colour_group_name': 'Black',
 'perceived_colour_value_id': 4,
 'perceived_colour_value_name': 'Dark',
 'perceived_colour_master_id': 5,
 'perceived_colour_master_name': 'Black',
 'department_no': 1676,
 'department_name': 'Jersey Basic',
 'index_code': 'A',
 'index_name': 'Ladieswear',
 'index_group_no': 1,
 'index_group_name': 'Ladieswear',
 'section_no': 16,
 'section_name': 'Womens Everyday Basics',
 'garment_group_no': 1002,
 'garment_group_name': 'Jersey Basic',
 'detail_desc': 'Jersey top with narrow shoulder straps.'}

- Small experiment : See the impact of num_proc

In [6]:
%%time

transactions = load_dataset(
    args.hf_dataset_path,
    token= args.hf_dataset_token,
    name="transactions",
    split="train",
    trust_remote_code=True,
    num_proc = 8
)

CPU times: user 976 ms, sys: 2.32 s, total: 3.3 s
Wall time: 6.15 s


In [7]:
%%time

transactions = load_dataset(
    args.hf_dataset_path,
    token= args.hf_dataset_token,
    name="transactions",
    split="train",
    trust_remote_code=True,
    num_proc = 1
)

CPU times: user 901 ms, sys: 1.65 s, total: 2.55 s
Wall time: 6.42 s


In [10]:
transactions

Dataset({
    features: ['t_dat', 'customer_id', 'article_id', 'price', 'sales_channel_id'],
    num_rows: 31788324
})

In [12]:
transactions[0]

{'t_dat': '2018-09-20',
 'customer_id': '000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318',
 'article_id': 663713001,
 'price': 0.0508305084745762,
 'sales_channel_id': 2}

In [13]:
customer_metadata = load_dataset(
    args.hf_dataset_path,
    token= args.hf_dataset_token,
    name="customers",
    split="train",
    trust_remote_code=True,
    num_proc = 8
)

Downloading data:   0%|          | 0.00/175M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1371980 [00:00<?, ? examples/s]

In [14]:
customer_metadata[0]

{'customer_id': '00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657',
 'FN': None,
 'Active': None,
 'club_member_status': 'ACTIVE',
 'fashion_news_frequency': 'NONE',
 'age': 49.0,
 'postal_code': '52043ee2162cf5aa7ee79974281641c6f11a68d276429a91f8ca0d4b6efa8100'}

In [21]:
train_raw_df = transactions.to_pandas()

In [22]:
train_raw_df.head(5)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [17]:
article_metadata_raw_df = article_metadata.to_pandas()

In [20]:
article_metadata_raw_df.head(5)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [25]:
# Convert timpestamp column (series) to pandas date-time format
train_df = train_raw_df.pipe(parse_dt)

In [29]:
train_df.head(5)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


## Basic data quality report with Eviently AI

In [30]:
# Sampling data without replacing original data
train_sample_df = train_df.sample(
    args.report_sample_num_rows, replace=False, random_state=args.random_seed
)

In [31]:
data_quality_report = Report(
    metrics=[
        DataQualityPreset(),
    ],
    options=[color_scheme],
)

data_quality_report.run(reference_data=None, current_data=train_sample_df)
data_quality_report.save_html("./data_quality_report.html")

## Distribution per time

In [35]:
def plot_interaction_by_dayofweek(df):
    plot_df = df.assign(
        day_of_week=df["t_dat"]
        .dt.day_name()                 # Get the day in week
    )

    interaction_count_by_day = (
        plot_df.groupby("day_of_week")
        .size()
        .reindex(
            [
                "Monday",
                "Tuesday",
                "Wednesday",
                "Thursday",
                "Friday",
                "Saturday",
                "Sunday",
            ]
        )
    )

    # Create the plot
    fig = px.bar(
        x=interaction_count_by_day.index,
        y=interaction_count_by_day.values,
        labels={"x": "Day of the Week", "y": "Number of Interactions"},
        title=f"Interaction Count by Day of the Week",
        text=[f"{val:,.0f}" for val in interaction_count_by_day.values],
        height=500,
        width=700,
    )

    fig.update_layout()

    fig.show()

plot_interaction_by_dayofweek(train_sample_df)

In [39]:
def plot_interaction_by_month(df):
    # Extract the month from the timestamp
    plot_df = df.assign(
        month_of_year=df["t_dat"]
        .dt.month
    )

    # Group by the month and count the number of interactions
    interaction_count_by_month = (
        plot_df.groupby("month_of_year").size().reindex(range(1, 13), fill_value=0)
    )

    # Create the plot
    fig = px.bar(
        x=interaction_count_by_month.index,
        y=interaction_count_by_month.values,
        labels={"x": "Month", "y": "Number of Interactions"},
        title=f"Interaction Count by Month",
        text=[f"{val:,.0f}" for val in interaction_count_by_month.values],
        height=500,
        width=1200,
    )

    fig.update_layout(
        xaxis_tickmode="linear",
        xaxis_tickvals=interaction_count_by_month.index,
        xaxis_ticktext=[
            pd.to_datetime(month, format="%m").strftime("%B")
            for month in interaction_count_by_month.index
        ],
    )

    fig.show()


plot_interaction_by_month(train_sample_df)

## Sparsity

In [40]:
def calculate_sparsity(df, user_col="customer_id", item_col="article_id"):
    return 1 - df.shape[0] / (df[user_col].nunique() * df[item_col].nunique())


print(f"Sparsity: {calculate_sparsity(train_df):,.4%}")

Sparsity: 99.9777%


## User-Item distribution

In [52]:
def plot_user_rating_distribution(df, col = "customer_id", width = 12000):
    user_rating_counts = df.groupby(col).size().reset_index(name = "rating_count")

    fig = px.histogram(
        user_rating_counts,
        x = "rating_count",
        labels={"rating_count": "Number of Buying"},
        title=f"Number of Ratings per {col}",
        text_auto=True,
        height=500,
        width=width,
    )

    fig.update_yaxes(title_text=f"Number of {col}")

    fig.show()

plot_user_rating_distribution(train_sample_df, col="customer_id", width=None)
plot_user_rating_distribution(train_sample_df, col="article_id", width=None)

## Most popular items

In [68]:
popular_items_df = (
    train_sample_df.groupby(["article_id"], as_index = False)      # do not create article_id as index, so when .size(), we can obtain DataFrame
    .size()
    .pipe(lambda df : pd.merge(df, article_metadata_raw_df, how = "left", on = "article_id"))
    .sort_values("size", ascending = False)[["article_id", "product_type_name", "prod_name"]]
    .head(5)
)

with pd.option_context("display.max_colwidth", None):
    display(popular_items_df)

Unnamed: 0,article_id,product_type_name,prod_name
3429,706016001,Trousers,Jade HW Skinny Denim TRS
165,372860002,Socks,7p Basic Shaftless
3430,706016002,Trousers,Jade HW Skinny Denim TRS
4994,759871002,Vest top,Tilda tank
723,554450001,Trousers,Julia RW Skinny Denim TRS
