In [8]:
import pandas as pd
import numpy as np

from utils.utils_preprocessing import preprocess_data
from utils.utils_features import FeatureEngineeringPipeline

In [9]:
df_sales = pd.read_parquet("data/sales.parquet")
df_products = pd.read_parquet("data/products.parquet")
df = df_sales.merge(df_products, on="product_id")

In [10]:
df = preprocess_data(df)

In [11]:
pipeline = FeatureEngineeringPipeline(df, frequency='D')
df_daily = pipeline.run()

In [12]:
df_daily

Unnamed: 0,brand,family,date,total_quantity,total_revenue,avg_pvp,num_stores,store_sales_concentration,avg_sales_per_store,top_store_sales,...,top_store_sales_ratio,seasonality_sales_concentration,mode_seasonality,seasonality_ratio_FS,seasonality_ratio_FW,seasonality_ratio_M,seasonality_ratio_N-A,seasonality_ratio_S,seasonality_ratio_W,seasonality_change
0,Columbia,Bañadores Natación,2021-09-14,1.0,99.949997,99.949997,1,0.0,1.0,1.0,...,1.0,0.0,FS,1.0,0.0,0.0,0.0,0.0,0.0,1
1,Columbia,Bañadores Natación,2024-07-02,1.0,180.000000,180.000000,1,0.0,1.0,1.0,...,1.0,0.0,FS,1.0,0.0,0.0,0.0,0.0,0.0,0
2,Columbia,Botas Urban,2023-12-03,1.0,130.000000,130.000000,1,0.0,1.0,1.0,...,1.0,0.0,N-A,0.0,0.0,0.0,1.0,0.0,0.0,1
3,Columbia,Botas Urban,2023-12-29,1.0,130.000000,130.000000,1,0.0,1.0,1.0,...,1.0,0.0,N-A,0.0,0.0,0.0,1.0,0.0,0.0,0
4,Columbia,Botas Urban,2024-01-01,1.0,130.000000,130.000000,1,0.0,1.0,1.0,...,1.0,0.0,N-A,0.0,0.0,0.0,1.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23619,adidas,Zuecos Urban,2023-11-10,1.0,99.949997,99.949997,1,0.0,1.0,1.0,...,1.0,0.0,N-A,0.0,0.0,0.0,1.0,0.0,0.0,0
23620,adidas,Zuecos Urban,2024-05-24,1.0,99.949997,99.949997,1,0.0,1.0,1.0,...,1.0,0.0,N-A,0.0,0.0,0.0,1.0,0.0,0.0,0
23621,adidas,Zuecos Urban,2024-05-30,1.0,99.949997,99.949997,1,0.0,1.0,1.0,...,1.0,0.0,N-A,0.0,0.0,0.0,1.0,0.0,0.0,0
23622,adidas,Zuecos Urban,2024-06-12,1.0,99.949997,99.949997,1,0.0,1.0,1.0,...,1.0,0.0,N-A,0.0,0.0,0.0,1.0,0.0,0.0,0


In [13]:
pipeline = FeatureEngineeringPipeline(df, frequency='W-SUN')
df_weekly = pipeline.run()

In [14]:
pipeline = FeatureEngineeringPipeline(df, frequency='ME')
df_monthly = pipeline.run()

In [7]:
# Aggregate sales to Brand + Family Level
df_grouped = df.groupby(['brand', 'family', 'date']).agg(
    total_quantity=('quantity', 'sum'), 
    total_revenue=('quantity', lambda x: np.sum(x * df.loc[x.index, 'pvp'])) # Weighted revenue
).reset_index()

# Compute average weighted PVP per brand+family
df_grouped['avg_pvp'] = df_grouped['total_revenue'] / df_grouped['total_quantity']

In [None]:
# Store related features
df_grouped = utils_features.add_num_stores(df, df_grouped)
df_grouped = utils_features.add_store_sales_concentration(df, df_grouped)
df_grouped = utils_features.add_top_store_sales_ratio(df, df_grouped)
df_grouped = utils_features.add_top_3_store_sales_ratio(df, df_grouped)
df_grouped = utils_features.add_avg_sales_per_store(df_grouped)

# Seasonality related features
df_grouped = utils_features_seasonality.add_mode_seasonality(df, df_grouped)
df_grouped = utils_features_seasonality.add_seasonality_sales_concentration(df, df_grouped)
df_grouped = utils_features_seasonality.add_seasonality_change(df_grouped)
df_grouped = utils_features_seasonality.add_seasonality_ratios(df, df_grouped)

In [9]:
df_grouped

Unnamed: 0,brand,family,date,total_quantity,total_revenue,avg_pvp,num_stores,store_entropy,top_store_sales,top_store_sales_ratio,...,avg_sales_per_store,mode_seasonality,seasonality_entropy,seasonality_change,seasonality_FS_ratio,seasonality_FW_ratio,seasonality_M_ratio,seasonality_N-A_ratio,seasonality_S_ratio,seasonality_W_ratio
0,Columbia,Bañadores Natación,2021-09-14,1.0,99.949997,99.949997,1,0.0,1.0,1.0,...,1.0,FS,0.0,1,1.0,0.0,0.0,0.0,0.0,0.0
1,Columbia,Bañadores Natación,2024-07-02,1.0,180.000000,180.000000,1,0.0,1.0,1.0,...,1.0,FS,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0
2,Columbia,Botas Urban,2023-12-03,1.0,130.000000,130.000000,1,0.0,1.0,1.0,...,1.0,N-A,0.0,1,0.0,0.0,0.0,1.0,0.0,0.0
3,Columbia,Botas Urban,2023-12-29,1.0,130.000000,130.000000,1,0.0,1.0,1.0,...,1.0,N-A,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0
4,Columbia,Botas Urban,2024-01-01,1.0,130.000000,130.000000,1,0.0,1.0,1.0,...,1.0,N-A,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23619,adidas,Zuecos Urban,2023-11-10,1.0,99.949997,99.949997,1,0.0,1.0,1.0,...,1.0,N-A,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0
23620,adidas,Zuecos Urban,2024-05-24,1.0,99.949997,99.949997,1,0.0,1.0,1.0,...,1.0,N-A,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0
23621,adidas,Zuecos Urban,2024-05-30,1.0,99.949997,99.949997,1,0.0,1.0,1.0,...,1.0,N-A,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0
23622,adidas,Zuecos Urban,2024-06-12,1.0,99.949997,99.949997,1,0.0,1.0,1.0,...,1.0,N-A,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
import pandas as pd
import numpy as np
from scipy.stats import entropy

def aggregate_data(df, frequency='D'):
    """Aggregates total quantity and revenue at the specified time frequency."""
    df['date'] = pd.to_datetime(df['date'])
    
    df_grouped = df.groupby(['brand', 'family', pd.Grouper(key='date', freq=frequency)]).agg(
        total_quantity=('quantity', 'sum'),
        total_revenue=('quantity', lambda x: np.sum(x * df.loc[x.index, 'pvp']))
    ).reset_index()

    df_grouped['avg_pvp'] = df_grouped['total_revenue'] / df_grouped['total_quantity']

    return df_grouped

def compute_store_features(df, frequency='D'):
    """Computes store-related features including store concentration and entropy."""
    df_store_features = df.groupby(['brand', 'family', 'date', 'store']).agg(
        store_sales=('quantity', 'sum')
    ).reset_index()

    df_store_features['date'] = pd.to_datetime(df_store_features['date'])

    df_store_grouped = df_store_features.groupby(['brand', 'family', pd.Grouper(key='date', freq=frequency)]).agg(
        num_stores=('store', 'nunique'),
        store_entropy=('store_sales', lambda x: entropy(x / x.sum()) if len(x) > 1 else 0),
        avg_sales_per_store=('store_sales', 'mean'),
        top_store_sales=('store_sales', 'max'),
        top_3_store_sales=('store_sales', lambda x: x.nlargest(3).sum()),
        top_store_sales_ratio=('store_sales', lambda x: x.max() / x.sum() if x.sum() > 0 else 0)
    ).reset_index()
    df_store_grouped['avg_sales_per_store'] = df_store_grouped['avg_sales_per_store'].round(2)
    df_store_grouped['store_entropy'] = df_store_grouped['store_entropy'].round(2)

    return df_store_grouped

def compute_seasonality_features(df, frequency='D'):
    """Computes seasonality-related features including entropy and mode seasonality."""
    df_seasonality_features = df.groupby(['brand', 'family', 'date', 'seasonality']).agg(
        season_sales=('quantity', 'sum')
    ).reset_index()

    df_seasonality_features['date'] = pd.to_datetime(df_seasonality_features['date'])

    # Compute seasonality ratios
    df_pivot = df_seasonality_features.pivot_table(
        index=['brand', 'family', 'date'],
        columns='seasonality',
        values='season_sales',
        aggfunc='sum',
        fill_value=0
    )

    df_pivot = df_pivot.div(df_pivot.sum(axis=1), axis=0).fillna(0)
    df_pivot.columns = [f'seasonality_ratio_{col}' for col in df_pivot.columns]

    # Function to safely determine mode_seasonality
    def most_frequent_seasonality(group):
        """Returns the seasonality with the highest total sales."""
        counts = group.groupby('seasonality')['season_sales'].sum()
        return counts.idxmax() if len(counts) > 0 else 'N-A'  # Get seasonality with highest sales

    # Aggregate by the specified frequency
    df_seasonality_grouped = df_seasonality_features.groupby(['brand', 'family', pd.Grouper(key='date', freq=frequency)]).agg(
        seasonality_entropy=('season_sales', lambda x: entropy(x / x.sum()) if x.sum() > 0 and len(x) > 1 else 0)
    ).reset_index()

    # Compute mode_seasonality separately using `apply`
    mode_seasonality_df = df_seasonality_features.groupby(['brand', 'family', 'date']).apply(
        most_frequent_seasonality
    ).reset_index(name='mode_seasonality')

    df_seasonality_grouped = df_seasonality_grouped.merge(
        mode_seasonality_df, on=['brand', 'family', 'date'], how='left'
    ).merge(df_pivot.reset_index(), on=['brand', 'family', 'date'], how='left')

    return df_seasonality_grouped

def compute_seasonality_change(df_grouped):
    """Computes how frequently seasonality changes over time."""
    df_grouped['seasonality_change'] = (
        df_grouped.groupby(['brand', 'family'])['mode_seasonality']
        .apply(lambda x: x.ne(x.shift()).astype(int))
        .reset_index(drop=True)
    )
    return df_grouped

def feature_engineering_pipeline(df, frequency='D'):
    """
    Generalized feature engineering pipeline for both daily and weekly models.
    """
    df_grouped = aggregate_data(df, frequency)
    df_store_features = compute_store_features(df, frequency)
    df_seasonality_features = compute_seasonality_features(df, frequency)

    df_final = df_grouped \
        .merge(df_store_features, on=['brand', 'family', 'date'], how='left') \
        .merge(df_seasonality_features, on=['brand', 'family', 'date'], how='left')

    df_final = compute_seasonality_change(df_final)

    return df_final

In [48]:
df_daily = feature_engineering_pipeline(df, frequency='D')

  mode_seasonality_df = df_seasonality_features.groupby(['brand', 'family', 'date']).apply(


In [49]:
df_weekly = feature_engineering_pipeline(df, frequency='W-SUN')

  mode_seasonality_df = df_seasonality_features.groupby(['brand', 'family', 'date']).apply(


In [50]:
df_monthly = feature_engineering_pipeline(df, frequency='M')

  df_grouped = df.groupby(['brand', 'family', pd.Grouper(key='date', freq=frequency)]).agg(
  df_store_grouped = df_store_features.groupby(['brand', 'family', pd.Grouper(key='date', freq=frequency)]).agg(
  df_seasonality_grouped = df_seasonality_features.groupby(['brand', 'family', pd.Grouper(key='date', freq=frequency)]).agg(
  mode_seasonality_df = df_seasonality_features.groupby(['brand', 'family', 'date']).apply(


In [53]:
df_monthly.set_index("date").sort_index().index.unique()

DatetimeIndex(['2020-09-30', '2020-10-31', '2020-11-30', '2020-12-31',
               '2021-01-31', '2021-02-28', '2021-03-31', '2021-04-30',
               '2021-05-31', '2021-06-30', '2021-07-31', '2021-08-31',
               '2021-09-30', '2021-10-31', '2021-11-30', '2021-12-31',
               '2022-01-31', '2022-02-28', '2022-03-31', '2022-04-30',
               '2022-05-31', '2022-06-30', '2022-07-31', '2022-08-31',
               '2022-09-30', '2022-10-31', '2022-11-30', '2022-12-31',
               '2023-01-31', '2023-02-28', '2023-03-31', '2023-04-30',
               '2023-05-31', '2023-06-30', '2023-07-31', '2023-08-31',
               '2023-09-30', '2023-10-31', '2023-11-30', '2023-12-31',
               '2024-01-31', '2024-02-29', '2024-03-31', '2024-04-30',
               '2024-05-31', '2024-06-30', '2024-07-31', '2024-08-31',
               '2024-09-30'],
              dtype='datetime64[ns]', name='date', freq=None)