In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import entropy

import utils.utils_features_stores as utils_features_stores
import utils.utils_features_seasonality as utils_features_seasonality

In [2]:
cols_numerical = ["quantity", "pvp"]
cols_categoprical = ["product_id", "store", "seasonality", "brand", "family"]
cols_date = ["date"]

In [3]:
df_sales = pd.read_parquet("data/sales.parquet")
df_products = pd.read_parquet("data/products.parquet")
df = df_sales.merge(df_products, on="product_id")

In [4]:
df.head()

Unnamed: 0,product_id,fecha,store,quantity,seasonality,brand,family,pvp
0,883A4A2507184C5483FDC9FC7E48240B,2021-03-22,FR,1.0,N-A,adidas,Zapatillas Trail Running,130.0
1,883A4A2507184C5483FDC9FC7E48240B,2021-03-24,ES,1.0,N-A,adidas,Zapatillas Trail Running,130.0
2,883A4A2507184C5483FDC9FC7E48240B,2021-08-27,ES,1.0,N-A,adidas,Zapatillas Trail Running,130.0
3,883A4A2507184C5483FDC9FC7E48240B,2021-06-24,ES,1.0,N-A,adidas,Zapatillas Trail Running,130.0
4,883A4A2507184C5483FDC9FC7E48240B,2021-03-05,IT,1.0,N-A,adidas,Zapatillas Trail Running,130.0


In [5]:
# Rename columns
df = df.rename(columns={"fecha": "date"})

# Types
# for col in cols_categoprical:
#     df[col] = df[col].astype("category")
#     df[col] = df[col].cat.remove_unused_categories()
for col in cols_numerical:
    df[col] = df[col].astype("float32")
for col in cols_date:
    df[col] = pd.to_datetime(df[col])

In [6]:
df

Unnamed: 0,product_id,date,store,quantity,seasonality,brand,family,pvp
0,883A4A2507184C5483FDC9FC7E48240B,2021-03-22,FR,1.0,N-A,adidas,Zapatillas Trail Running,130.000000
1,883A4A2507184C5483FDC9FC7E48240B,2021-03-24,ES,1.0,N-A,adidas,Zapatillas Trail Running,130.000000
2,883A4A2507184C5483FDC9FC7E48240B,2021-08-27,ES,1.0,N-A,adidas,Zapatillas Trail Running,130.000000
3,883A4A2507184C5483FDC9FC7E48240B,2021-06-24,ES,1.0,N-A,adidas,Zapatillas Trail Running,130.000000
4,883A4A2507184C5483FDC9FC7E48240B,2021-03-05,IT,1.0,N-A,adidas,Zapatillas Trail Running,130.000000
...,...,...,...,...,...,...,...,...
62809,F438E725631D4B72819E4B2E567B7C14,2023-03-29,ES,1.0,N-A,Columbia,Zapatillas de montaña,99.949997
62810,70732AD67F17451387FC89F1E576A525,2021-07-05,ES,1.0,N-A,Columbia,Zapatillas de montaña,80.000000
62811,622EF83B0C284654A91A8DBFD176471D,2021-07-10,FR,1.0,N-A,Columbia,Zapatillas de montaña,74.949997
62812,8349E74BA6BC4940A06DBD3088332A4C,2023-03-03,ES,1.0,N-A,Columbia,Zapatillas de montaña,35.000000


In [7]:
# Aggregate sales to Brand + Family Level
df_grouped = df.groupby(['brand', 'family', 'date']).agg(
    total_quantity=('quantity', 'sum'), 
    total_revenue=('quantity', lambda x: np.sum(x * df.loc[x.index, 'pvp'])) # Weighted revenue
).reset_index()

# Compute average weighted PVP per brand+family
df_grouped['avg_pvp'] = df_grouped['total_revenue'] / df_grouped['total_quantity']

In [None]:
# Store related features
df_grouped = utils_features_stores.add_num_stores(df, df_grouped)
df_grouped = utils_features_stores.add_store_sales_concentration(df, df_grouped)
df_grouped = utils_features_stores.add_top_store_sales_ratio(df, df_grouped)
df_grouped = utils_features_stores.add_top_3_store_sales_ratio(df, df_grouped)
df_grouped = utils_features_stores.add_avg_sales_per_store(df_grouped)

# Seasonality related features
df_grouped = utils_features_seasonality.add_mode_seasonality(df, df_grouped)
df_grouped = utils_features_seasonality.add_seasonality_sales_concentration(df, df_grouped)
df_grouped = utils_features_seasonality.add_seasonality_change(df_grouped)
df_grouped = utils_features_seasonality.add_seasonality_ratios(df, df_grouped)

In [12]:
df_grouped

Unnamed: 0,brand,family,date,total_quantity,total_revenue,avg_pvp
0,Columbia,Bañadores Natación,2021-09-14,1.0,99.949997,99.949997
1,Columbia,Bañadores Natación,2024-07-02,1.0,180.000000,180.000000
2,Columbia,Botas Urban,2023-12-03,1.0,130.000000,130.000000
3,Columbia,Botas Urban,2023-12-29,1.0,130.000000,130.000000
4,Columbia,Botas Urban,2024-01-01,1.0,130.000000,130.000000
...,...,...,...,...,...,...
23619,adidas,Zuecos Urban,2023-11-10,1.0,99.949997,99.949997
23620,adidas,Zuecos Urban,2024-05-24,1.0,99.949997,99.949997
23621,adidas,Zuecos Urban,2024-05-30,1.0,99.949997,99.949997
23622,adidas,Zuecos Urban,2024-06-12,1.0,99.949997,99.949997


In [14]:
# Aggregate daily data to weekly level
df_weekly = df.groupby(['brand', 'family', pd.Grouper(key='date', freq='W-SUN')]).agg(
    total_quantity=('quantity', 'sum'),
    total_revenue=('quantity', lambda x: np.sum(x * df.loc[x.index, 'pvp']))
).reset_index()

In [15]:
df_weekly.head()

Unnamed: 0,brand,family,date,total_quantity,total_revenue
0,Columbia,Bañadores Natación,2021-09-19,1.0,99.949997
1,Columbia,Bañadores Natación,2024-07-07,1.0,180.0
2,Columbia,Botas Urban,2023-12-03,1.0,130.0
3,Columbia,Botas Urban,2023-12-31,1.0,130.0
4,Columbia,Botas Urban,2024-01-07,1.0,130.0


In [16]:
# Compute store-level features before resampling
df_store_features = df.groupby(['brand', 'family', 'date', 'store']).agg(
    store_sales=('quantity', 'sum')
).reset_index()

In [17]:
df_store_features

Unnamed: 0,brand,family,date,store,store_sales
0,Columbia,Bañadores Natación,2021-09-14,ES,1.0
1,Columbia,Bañadores Natación,2024-07-02,PT,1.0
2,Columbia,Botas Urban,2023-12-03,DE,1.0
3,Columbia,Botas Urban,2023-12-29,DE,1.0
4,Columbia,Botas Urban,2024-01-01,ES,1.0
...,...,...,...,...,...
36886,adidas,Zuecos Urban,2023-11-10,PT,1.0
36887,adidas,Zuecos Urban,2024-05-24,ES,1.0
36888,adidas,Zuecos Urban,2024-05-30,DE,1.0
36889,adidas,Zuecos Urban,2024-06-12,ES,1.0


In [57]:
import pandas as pd
import numpy as np
from scipy.stats import entropy

def aggregate_data(df, frequency='D'):
    """Aggregates total quantity and revenue at the specified time frequency."""
    df['date'] = pd.to_datetime(df['date'])
    
    df_grouped = df.groupby(['brand', 'family', pd.Grouper(key='date', freq=frequency)]).agg(
        total_quantity=('quantity', 'sum'),
        total_revenue=('quantity', lambda x: np.sum(x * df.loc[x.index, 'pvp']))
    ).reset_index()

    return df_grouped

def compute_store_features(df, frequency='D'):
    """Computes store-related features including store concentration and entropy."""
    df_store_features = df.groupby(['brand', 'family', 'date', 'store']).agg(
        store_sales=('quantity', 'sum')
    ).reset_index()

    df_store_features['date'] = pd.to_datetime(df_store_features['date'])

    df_store_grouped = df_store_features.groupby(['brand', 'family', pd.Grouper(key='date', freq=frequency)]).agg(
        num_stores=('store', 'nunique'),
        store_entropy=('store', lambda x: entropy(x.value_counts(normalize=True)) if len(x.unique()) > 1 else 0),
        # store_entropy=('store_sales', lambda x: entropy(x.value_counts(normalize=True)) if len(x) > 1 else 0),
        avg_sales_per_store=('store_sales', 'mean'),
        top_store_sales=('store_sales', 'max'),
        top_3_store_sales=('store_sales', lambda x: x.nlargest(3).sum()),
        top_store_sales_ratio=('store_sales', lambda x: x.max() / x.sum() if x.sum() > 0 else 0)
    ).reset_index()

    return df_store_grouped

def compute_seasonality_features(df, frequency='D'):
    """Computes seasonality-related features including entropy and mode seasonality."""
    df_seasonality_features = df.groupby(['brand', 'family', 'date', 'seasonality']).agg(
        season_sales=('quantity', 'sum')
    ).reset_index()

    df_seasonality_features['date'] = pd.to_datetime(df_seasonality_features['date'])

    # Compute seasonality ratios
    df_pivot = df_seasonality_features.pivot_table(
        index=['brand', 'family', 'date'],
        columns='seasonality',
        values='season_sales',
        aggfunc='sum',
        fill_value=0
    )

    df_pivot = df_pivot.div(df_pivot.sum(axis=1), axis=0).fillna(0)
    df_pivot.columns = [f'seasonality_ratio_{col}' for col in df_pivot.columns]

    df_seasonality_grouped = df_seasonality_features.groupby(['brand', 'family', pd.Grouper(key='date', freq=frequency)]).agg(
        seasonality_entropy=('season_sales', lambda x: entropy(x.value_counts(normalize=True)) if len(x) > 1 else 0),
        mode_seasonality=('seasonality', lambda x: x.mode()[0] if not x.mode().empty else 'N-A')
    ).reset_index()

    return df_seasonality_grouped.merge(df_pivot.reset_index(), on=['brand', 'family', 'date'], how='left')

def compute_seasonality_change(df_grouped):
    """Computes how frequently seasonality changes over time."""
    df_grouped['seasonality_change'] = (
        df_grouped.groupby(['brand', 'family'])['mode_seasonality']
        .apply(lambda x: x.ne(x.shift()).astype(int))
        .reset_index(drop=True)
    )
    return df_grouped

def feature_engineering_pipeline(df, frequency='D'):
    """
    Generalized feature engineering pipeline for both daily and weekly models.
    """
    df_grouped = aggregate_data(df, frequency)
    df_store_features = compute_store_features(df, frequency)
    df_seasonality_features = compute_seasonality_features(df, frequency)

    df_final = df_grouped \
        .merge(df_store_features, on=['brand', 'family', 'date'], how='left') \
        .merge(df_seasonality_features, on=['brand', 'family', 'date'], how='left')

    df_final = compute_seasonality_change(df_final)

    return df_final

In [58]:
df_daily = feature_engineering_pipeline(df, frequency='D')

In [29]:
df_grouped.head()

Unnamed: 0,brand,family,date,total_quantity,total_revenue,avg_pvp,num_stores,store_entropy,top_store_sales,top_store_sales_ratio,...,avg_sales_per_store,mode_seasonality,seasonality_entropy,seasonality_change,seasonality_FS_ratio,seasonality_FW_ratio,seasonality_M_ratio,seasonality_N-A_ratio,seasonality_S_ratio,seasonality_W_ratio
0,Columbia,Bañadores Natación,2021-09-14,1.0,99.949997,99.949997,1,0.0,1.0,1.0,...,1.0,FS,0.0,1,1.0,0.0,0.0,0.0,0.0,0.0
1,Columbia,Bañadores Natación,2024-07-02,1.0,180.0,180.0,1,0.0,1.0,1.0,...,1.0,FS,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0
2,Columbia,Botas Urban,2023-12-03,1.0,130.0,130.0,1,0.0,1.0,1.0,...,1.0,N-A,0.0,1,0.0,0.0,0.0,1.0,0.0,0.0
3,Columbia,Botas Urban,2023-12-29,1.0,130.0,130.0,1,0.0,1.0,1.0,...,1.0,N-A,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0
4,Columbia,Botas Urban,2024-01-01,1.0,130.0,130.0,1,0.0,1.0,1.0,...,1.0,N-A,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0


In [30]:
df_grouped.drop("avg_pvp", axis=1).equals(df_daily)

False

In [32]:
df_grouped.drop("avg_pvp", axis=1)

Unnamed: 0,brand,family,date,total_quantity,total_revenue,num_stores,store_entropy,top_store_sales,top_store_sales_ratio,top_3_store_sales,...,avg_sales_per_store,mode_seasonality,seasonality_entropy,seasonality_change,seasonality_FS_ratio,seasonality_FW_ratio,seasonality_M_ratio,seasonality_N-A_ratio,seasonality_S_ratio,seasonality_W_ratio
0,Columbia,Bañadores Natación,2021-09-14,1.0,99.949997,1,0.0,1.0,1.0,1.0,...,1.0,FS,0.0,1,1.0,0.0,0.0,0.0,0.0,0.0
1,Columbia,Bañadores Natación,2024-07-02,1.0,180.000000,1,0.0,1.0,1.0,1.0,...,1.0,FS,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0
2,Columbia,Botas Urban,2023-12-03,1.0,130.000000,1,0.0,1.0,1.0,1.0,...,1.0,N-A,0.0,1,0.0,0.0,0.0,1.0,0.0,0.0
3,Columbia,Botas Urban,2023-12-29,1.0,130.000000,1,0.0,1.0,1.0,1.0,...,1.0,N-A,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0
4,Columbia,Botas Urban,2024-01-01,1.0,130.000000,1,0.0,1.0,1.0,1.0,...,1.0,N-A,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23619,adidas,Zuecos Urban,2023-11-10,1.0,99.949997,1,0.0,1.0,1.0,1.0,...,1.0,N-A,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0
23620,adidas,Zuecos Urban,2024-05-24,1.0,99.949997,1,0.0,1.0,1.0,1.0,...,1.0,N-A,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0
23621,adidas,Zuecos Urban,2024-05-30,1.0,99.949997,1,0.0,1.0,1.0,1.0,...,1.0,N-A,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0
23622,adidas,Zuecos Urban,2024-06-12,1.0,99.949997,1,0.0,1.0,1.0,1.0,...,1.0,N-A,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0


In [40]:
df_daily

Unnamed: 0,brand,family,date,total_quantity,total_revenue,num_stores,store_entropy,top_store_sales_ratio,seasonality_entropy,mode_seasonality,top_3_store_sales_ratio,seasonality_change
0,Columbia,Bañadores Natación,2021-09-14,1.0,99.949997,1,0.0,1.0,0.0,FS,1.0,1
1,Columbia,Bañadores Natación,2024-07-02,1.0,180.000000,1,0.0,1.0,0.0,FS,1.0,0
2,Columbia,Botas Urban,2023-12-03,1.0,130.000000,1,0.0,1.0,0.0,N-A,1.0,1
3,Columbia,Botas Urban,2023-12-29,1.0,130.000000,1,0.0,1.0,0.0,N-A,1.0,0
4,Columbia,Botas Urban,2024-01-01,1.0,130.000000,1,0.0,1.0,0.0,N-A,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
23619,adidas,Zuecos Urban,2023-11-10,1.0,99.949997,1,0.0,1.0,0.0,N-A,1.0,0
23620,adidas,Zuecos Urban,2024-05-24,1.0,99.949997,1,0.0,1.0,0.0,N-A,1.0,0
23621,adidas,Zuecos Urban,2024-05-30,1.0,99.949997,1,0.0,1.0,0.0,N-A,1.0,0
23622,adidas,Zuecos Urban,2024-06-12,1.0,99.949997,1,0.0,1.0,0.0,N-A,1.0,0


In [44]:
set(df_grouped.drop("avg_pvp", axis=1).columns) - set(df_daily.columns)

{'seasonality_FS_ratio',
 'seasonality_FW_ratio',
 'seasonality_M_ratio',
 'seasonality_N-A_ratio',
 'seasonality_S_ratio',
 'seasonality_W_ratio',
 'top_3_store_sales_ratio'}

In [47]:
cols = list(set(df_daily.columns).intersection(set(df_grouped.drop("avg_pvp", axis=1).columns)))

In [48]:
df_grouped[cols].equals(df_daily[cols])

False

In [59]:
(df_grouped[cols]!=(df_daily[cols])).sum()

seasonality_change        677
mode_seasonality          501
avg_sales_per_store      1492
seasonality_entropy      1638
brand                       0
num_stores                  0
top_3_store_sales           0
date                        0
total_quantity              0
family                      0
total_revenue               0
top_store_sales             0
store_entropy            5241
top_store_sales_ratio       0
dtype: int64

In [56]:
df_grouped.drop("avg_pvp", axis=1)[df_grouped["store_entropy"]!=(df_daily["store_entropy"])]

Unnamed: 0,brand,family,date,total_quantity,total_revenue,num_stores,store_entropy,top_store_sales,top_store_sales_ratio,top_3_store_sales,...,avg_sales_per_store,mode_seasonality,seasonality_entropy,seasonality_change,seasonality_FS_ratio,seasonality_FW_ratio,seasonality_M_ratio,seasonality_N-A_ratio,seasonality_S_ratio,seasonality_W_ratio
28,Columbia,Botas de montaña,2020-11-04,6.0,209.699997,2,0.450561,5.0,0.833333,6.0,...,3.000000,N-A,0.000000,0,0.0,0.0,0.0,1.0,0.0,0.0
34,Columbia,Botas de montaña,2020-11-26,3.0,103.900002,2,0.636514,2.0,0.666667,3.0,...,1.500000,N-A,0.000000,0,0.0,0.0,0.0,1.0,0.0,0.0
35,Columbia,Botas de montaña,2020-11-27,3.0,149.899994,2,0.636514,2.0,0.666667,3.0,...,1.500000,N-A,0.000000,0,0.0,0.0,0.0,1.0,0.0,0.0
41,Columbia,Botas de montaña,2020-12-09,2.0,180.000000,2,0.693147,1.0,0.500000,2.0,...,1.000000,N-A,0.000000,0,0.0,0.0,0.0,1.0,0.0,0.0
42,Columbia,Botas de montaña,2020-12-11,5.0,173.800003,2,0.500402,4.0,0.800000,5.0,...,2.500000,N-A,0.000000,0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23595,adidas,Zapatillas de montaña,2024-08-28,3.0,210.000000,2,0.636514,2.0,0.666667,3.0,...,1.500000,N-A,0.000000,0,0.0,0.0,0.0,1.0,0.0,0.0
23596,adidas,Zapatillas de montaña,2024-08-29,5.0,440.000000,3,1.054920,2.0,0.400000,5.0,...,1.666667,N-A,0.500402,0,0.0,0.0,0.0,0.8,0.0,0.2
23597,adidas,Zapatillas de montaña,2024-08-30,5.0,529.950012,2,0.500402,4.0,0.800000,5.0,...,2.500000,N-A,0.000000,0,0.0,0.0,0.0,1.0,0.0,0.0
23598,adidas,Zapatillas de montaña,2024-08-31,6.0,517.000000,4,1.242453,3.0,0.500000,5.0,...,1.500000,N-A,0.000000,0,0.0,0.0,0.0,1.0,0.0,0.0


In [55]:
df_daily[df_grouped["store_entropy"]!=(df_daily["store_entropy"])]

Unnamed: 0,brand,family,date,total_quantity,total_revenue,num_stores,store_entropy,avg_sales_per_store,top_store_sales,top_3_store_sales,top_store_sales_ratio,seasonality_entropy,mode_seasonality,seasonality_ratio_FS,seasonality_ratio_FW,seasonality_ratio_M,seasonality_ratio_N-A,seasonality_ratio_S,seasonality_ratio_W,seasonality_change
28,Columbia,Botas de montaña,2020-11-04,6.0,209.699997,2,0.693147,3.000000,5.0,6.0,0.833333,0.000000,N-A,0.0,0.0,0.0,1.0,0.0,0.0,0
34,Columbia,Botas de montaña,2020-11-26,3.0,103.900002,2,0.693147,1.500000,2.0,3.0,0.666667,0.000000,N-A,0.0,0.0,0.0,1.0,0.0,0.0,0
35,Columbia,Botas de montaña,2020-11-27,3.0,149.899994,2,0.693147,1.500000,2.0,3.0,0.666667,0.000000,N-A,0.0,0.0,0.0,1.0,0.0,0.0,0
41,Columbia,Botas de montaña,2020-12-09,2.0,180.000000,2,0.000000,1.000000,1.0,2.0,0.500000,0.000000,N-A,0.0,0.0,0.0,1.0,0.0,0.0,0
42,Columbia,Botas de montaña,2020-12-11,5.0,173.800003,2,0.693147,2.500000,4.0,5.0,0.800000,0.000000,N-A,0.0,0.0,0.0,1.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23595,adidas,Zapatillas de montaña,2024-08-28,3.0,210.000000,2,0.693147,1.500000,2.0,3.0,0.666667,0.000000,N-A,0.0,0.0,0.0,1.0,0.0,0.0,0
23596,adidas,Zapatillas de montaña,2024-08-29,5.0,440.000000,3,0.636514,1.666667,2.0,5.0,0.400000,0.693147,N-A,0.0,0.0,0.0,0.8,0.0,0.2,0
23597,adidas,Zapatillas de montaña,2024-08-30,5.0,529.950012,2,0.693147,2.500000,4.0,5.0,0.800000,0.000000,N-A,0.0,0.0,0.0,1.0,0.0,0.0,0
23598,adidas,Zapatillas de montaña,2024-08-31,6.0,517.000000,4,0.562335,1.500000,3.0,5.0,0.500000,0.000000,N-A,0.0,0.0,0.0,1.0,0.0,0.0,0
