In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import entropy

import utils.utils_features_stores as utils_features_stores
import utils.utils_features_seasonality as utils_features_seasonality

In [2]:
cols_numerical = ["quantity", "pvp"]
cols_categoprical = ["product_id", "store", "seasonality", "brand", "family"]
cols_date = ["date"]

In [3]:
df_sales = pd.read_parquet("data/sales.parquet")
df_products = pd.read_parquet("data/products.parquet")
df = df_sales.merge(df_products, on="product_id")

In [4]:
df.head()

Unnamed: 0,product_id,fecha,store,quantity,seasonality,brand,family,pvp
0,883A4A2507184C5483FDC9FC7E48240B,2021-03-22,FR,1.0,N-A,adidas,Zapatillas Trail Running,130.0
1,883A4A2507184C5483FDC9FC7E48240B,2021-03-24,ES,1.0,N-A,adidas,Zapatillas Trail Running,130.0
2,883A4A2507184C5483FDC9FC7E48240B,2021-08-27,ES,1.0,N-A,adidas,Zapatillas Trail Running,130.0
3,883A4A2507184C5483FDC9FC7E48240B,2021-06-24,ES,1.0,N-A,adidas,Zapatillas Trail Running,130.0
4,883A4A2507184C5483FDC9FC7E48240B,2021-03-05,IT,1.0,N-A,adidas,Zapatillas Trail Running,130.0


In [5]:
# Rename columns
df = df.rename(columns={"fecha": "date"})

# Types
# for col in cols_categoprical:
#     df[col] = df[col].astype("category")
#     df[col] = df[col].cat.remove_unused_categories()
for col in cols_numerical:
    df[col] = df[col].astype("float32")
for col in cols_date:
    df[col] = pd.to_datetime(df[col])

In [6]:
df

Unnamed: 0,product_id,date,store,quantity,seasonality,brand,family,pvp
0,883A4A2507184C5483FDC9FC7E48240B,2021-03-22,FR,1.0,N-A,adidas,Zapatillas Trail Running,130.000000
1,883A4A2507184C5483FDC9FC7E48240B,2021-03-24,ES,1.0,N-A,adidas,Zapatillas Trail Running,130.000000
2,883A4A2507184C5483FDC9FC7E48240B,2021-08-27,ES,1.0,N-A,adidas,Zapatillas Trail Running,130.000000
3,883A4A2507184C5483FDC9FC7E48240B,2021-06-24,ES,1.0,N-A,adidas,Zapatillas Trail Running,130.000000
4,883A4A2507184C5483FDC9FC7E48240B,2021-03-05,IT,1.0,N-A,adidas,Zapatillas Trail Running,130.000000
...,...,...,...,...,...,...,...,...
62809,F438E725631D4B72819E4B2E567B7C14,2023-03-29,ES,1.0,N-A,Columbia,Zapatillas de montaña,99.949997
62810,70732AD67F17451387FC89F1E576A525,2021-07-05,ES,1.0,N-A,Columbia,Zapatillas de montaña,80.000000
62811,622EF83B0C284654A91A8DBFD176471D,2021-07-10,FR,1.0,N-A,Columbia,Zapatillas de montaña,74.949997
62812,8349E74BA6BC4940A06DBD3088332A4C,2023-03-03,ES,1.0,N-A,Columbia,Zapatillas de montaña,35.000000


In [7]:
# Aggregate sales to Brand + Family Level
df_grouped = df.groupby(['brand', 'family', 'date']).agg(
    total_quantity=('quantity', 'sum'), 
    total_revenue=('quantity', lambda x: np.sum(x * df.loc[x.index, 'pvp'])) # Weighted revenue
).reset_index()

# Compute average weighted PVP per brand+family
df_grouped['avg_pvp'] = df_grouped['total_revenue'] / df_grouped['total_quantity']

In [8]:
# Store related features
df_grouped = utils_features_stores.add_num_stores(df, df_grouped)
df_grouped = utils_features_stores.add_store_sales_concentration(df, df_grouped)
df_grouped = utils_features_stores.add_top_store_sales_ratio(df, df_grouped)
df_grouped = utils_features_stores.add_top_3_store_sales_ratio(df, df_grouped)
df_grouped = utils_features_stores.add_avg_sales_per_store(df_grouped)

# Seasonality related features
df_grouped = utils_features_seasonality.add_mode_seasonality(df, df_grouped)
df_grouped = utils_features_seasonality.add_seasonality_sales_concentration(df, df_grouped)
df_grouped = utils_features_seasonality.add_seasonality_change(df_grouped)
df_grouped = utils_features_seasonality.add_seasonality_ratios(df, df_grouped)

In [9]:
df_grouped

Unnamed: 0,brand,family,date,total_quantity,total_revenue,avg_pvp,num_stores,store_entropy,top_store_sales,top_store_sales_ratio,...,avg_sales_per_store,mode_seasonality,seasonality_entropy,seasonality_change,seasonality_FS_ratio,seasonality_FW_ratio,seasonality_M_ratio,seasonality_N-A_ratio,seasonality_S_ratio,seasonality_W_ratio
0,Columbia,Bañadores Natación,2021-09-14,1.0,99.949997,99.949997,1,0.0,1.0,1.0,...,1.0,FS,0.0,1,1.0,0.0,0.0,0.0,0.0,0.0
1,Columbia,Bañadores Natación,2024-07-02,1.0,180.000000,180.000000,1,0.0,1.0,1.0,...,1.0,FS,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0
2,Columbia,Botas Urban,2023-12-03,1.0,130.000000,130.000000,1,0.0,1.0,1.0,...,1.0,N-A,0.0,1,0.0,0.0,0.0,1.0,0.0,0.0
3,Columbia,Botas Urban,2023-12-29,1.0,130.000000,130.000000,1,0.0,1.0,1.0,...,1.0,N-A,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0
4,Columbia,Botas Urban,2024-01-01,1.0,130.000000,130.000000,1,0.0,1.0,1.0,...,1.0,N-A,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23619,adidas,Zuecos Urban,2023-11-10,1.0,99.949997,99.949997,1,0.0,1.0,1.0,...,1.0,N-A,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0
23620,adidas,Zuecos Urban,2024-05-24,1.0,99.949997,99.949997,1,0.0,1.0,1.0,...,1.0,N-A,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0
23621,adidas,Zuecos Urban,2024-05-30,1.0,99.949997,99.949997,1,0.0,1.0,1.0,...,1.0,N-A,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0
23622,adidas,Zuecos Urban,2024-06-12,1.0,99.949997,99.949997,1,0.0,1.0,1.0,...,1.0,N-A,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0


In [37]:
import pandas as pd
import numpy as np
from scipy.stats import entropy

def aggregate_data(df, frequency='D'):
    """Aggregates total quantity and revenue at the specified time frequency."""
    df['date'] = pd.to_datetime(df['date'])
    
    df_grouped = df.groupby(['brand', 'family', pd.Grouper(key='date', freq=frequency)]).agg(
        total_quantity=('quantity', 'sum'),
        total_revenue=('quantity', lambda x: np.sum(x * df.loc[x.index, 'pvp']))
    ).reset_index()

    return df_grouped

def compute_store_features(df, frequency='D'):
    """Computes store-related features including store concentration and entropy."""
    df_store_features = df.groupby(['brand', 'family', 'date', 'store']).agg(
        store_sales=('quantity', 'sum')
    ).reset_index()

    df_store_features['date'] = pd.to_datetime(df_store_features['date'])

    df_store_grouped = df_store_features.groupby(['brand', 'family', pd.Grouper(key='date', freq=frequency)]).agg(
        num_stores=('store', 'nunique'),
        store_entropy=('store_sales', lambda x: entropy(x / x.sum()) if len(x) > 1 else 0),
        #store_entropy=('store_sales', lambda x: entropy(x.value_counts(normalize=True)) if len(x) > 1 else 0),
        avg_sales_per_store=('store_sales', 'mean'),
        top_store_sales=('store_sales', 'max'),
        top_3_store_sales=('store_sales', lambda x: x.nlargest(3).sum()),
        top_store_sales_ratio=('store_sales', lambda x: x.max() / x.sum() if x.sum() > 0 else 0)
    ).reset_index()

    return df_store_grouped

def compute_seasonality_features(df, frequency='D'):
    """Computes seasonality-related features including entropy and mode seasonality."""
    df_seasonality_features = df.groupby(['brand', 'family', 'date', 'seasonality']).agg(
        season_sales=('quantity', 'sum')
    ).reset_index()

    df_seasonality_features['date'] = pd.to_datetime(df_seasonality_features['date'])

    # Compute seasonality ratios
    df_pivot = df_seasonality_features.pivot_table(
        index=['brand', 'family', 'date'],
        columns='seasonality',
        values='season_sales',
        aggfunc='sum',
        fill_value=0
    )

    df_pivot = df_pivot.div(df_pivot.sum(axis=1), axis=0).fillna(0)
    df_pivot.columns = [f'seasonality_ratio_{col}' for col in df_pivot.columns]

    # Function to safely determine mode_seasonality
    def most_frequent_seasonality(group):
        """Returns the seasonality with the highest total sales."""
        counts = group.groupby('seasonality')['season_sales'].sum()
        return counts.idxmax() if len(counts) > 0 else 'N-A'  # Get seasonality with highest sales

    # Aggregate by the specified frequency
    df_seasonality_grouped = df_seasonality_features.groupby(['brand', 'family', pd.Grouper(key='date', freq=frequency)]).agg(
        seasonality_entropy=('season_sales', lambda x: entropy(x / x.sum()) if x.sum() > 0 and len(x) > 1 else 0)
    ).reset_index()

    # Compute mode_seasonality separately using `apply`
    mode_seasonality_df = df_seasonality_features.groupby(['brand', 'family', 'date']).apply(
        most_frequent_seasonality
    ).reset_index(name='mode_seasonality')

    df_seasonality_grouped = df_seasonality_grouped.merge(
        mode_seasonality_df, on=['brand', 'family', 'date'], how='left'
    ).merge(df_pivot.reset_index(), on=['brand', 'family', 'date'], how='left')

    return df_seasonality_grouped

    # df_seasonality_grouped = df_seasonality_features.groupby(['brand', 'family', pd.Grouper(key='date', freq=frequency)]).agg(
    #     seasonality_entropy=('season_sales', lambda x: entropy(x / x.sum()) if x.sum() > 0 and len(x) > 1 else 0),
    #     mode_seasonality=('seasonality', lambda x: x.loc[x.index[x.groupby(x).transform('sum').idxmax()]] if not x.empty else 'N-A')
    # ).reset_index()

    # df_seasonality_grouped = df_seasonality_features.groupby(['brand', 'family', pd.Grouper(key='date', freq=frequency)]).agg(
    #     seasonality_entropy=('season_sales', lambda x: entropy(x / x.sum()) if len(x) > 1 else 0),
    #     mode_seasonality=('seasonality', lambda x: x.mode()[0] if not x.mode().empty else 'N-A')
    # ).reset_index()

    # return df_seasonality_grouped.merge(df_pivot.reset_index(), on=['brand', 'family', 'date'], how='left')

def compute_seasonality_change(df_grouped):
    """Computes how frequently seasonality changes over time."""
    df_grouped['seasonality_change'] = (
        df_grouped.groupby(['brand', 'family'])['mode_seasonality']
        .apply(lambda x: x.ne(x.shift()).astype(int))
        .reset_index(drop=True)
    )
    return df_grouped

def feature_engineering_pipeline(df, frequency='D'):
    """
    Generalized feature engineering pipeline for both daily and weekly models.
    """
    df_grouped = aggregate_data(df, frequency)
    df_store_features = compute_store_features(df, frequency)
    df_seasonality_features = compute_seasonality_features(df, frequency)

    df_final = df_grouped \
        .merge(df_store_features, on=['brand', 'family', 'date'], how='left') \
        .merge(df_seasonality_features, on=['brand', 'family', 'date'], how='left')

    df_final = compute_seasonality_change(df_final)

    return df_final

In [38]:
df_daily = feature_engineering_pipeline(df, frequency='D')

  mode_seasonality_df = df_seasonality_features.groupby(['brand', 'family', 'date']).apply(


In [46]:
df_daily["store_entropy"] = round(df_daily["store_entropy"], 2)
df_grouped["store_entropy"] = round(df_grouped["store_entropy"], 2)

df_daily["avg_sales_per_store"] = round(df_daily["avg_sales_per_store"], 2)
df_grouped["avg_sales_per_store"] = round(df_grouped["avg_sales_per_store"], 2)

In [13]:
df_grouped = df_grouped.drop("avg_pvp", axis=1)

In [14]:
df_grouped

Unnamed: 0,brand,family,date,total_quantity,total_revenue,num_stores,store_entropy,top_store_sales,top_store_sales_ratio,top_3_store_sales,...,avg_sales_per_store,mode_seasonality,seasonality_entropy,seasonality_change,seasonality_FS_ratio,seasonality_FW_ratio,seasonality_M_ratio,seasonality_N-A_ratio,seasonality_S_ratio,seasonality_W_ratio
0,Columbia,Bañadores Natación,2021-09-14,1.0,99.949997,1,0.0,1.0,1.0,1.0,...,1.0,FS,0.0,1,1.0,0.0,0.0,0.0,0.0,0.0
1,Columbia,Bañadores Natación,2024-07-02,1.0,180.000000,1,0.0,1.0,1.0,1.0,...,1.0,FS,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0
2,Columbia,Botas Urban,2023-12-03,1.0,130.000000,1,0.0,1.0,1.0,1.0,...,1.0,N-A,0.0,1,0.0,0.0,0.0,1.0,0.0,0.0
3,Columbia,Botas Urban,2023-12-29,1.0,130.000000,1,0.0,1.0,1.0,1.0,...,1.0,N-A,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0
4,Columbia,Botas Urban,2024-01-01,1.0,130.000000,1,0.0,1.0,1.0,1.0,...,1.0,N-A,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23619,adidas,Zuecos Urban,2023-11-10,1.0,99.949997,1,0.0,1.0,1.0,1.0,...,1.0,N-A,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0
23620,adidas,Zuecos Urban,2024-05-24,1.0,99.949997,1,0.0,1.0,1.0,1.0,...,1.0,N-A,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0
23621,adidas,Zuecos Urban,2024-05-30,1.0,99.949997,1,0.0,1.0,1.0,1.0,...,1.0,N-A,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0
23622,adidas,Zuecos Urban,2024-06-12,1.0,99.949997,1,0.0,1.0,1.0,1.0,...,1.0,N-A,0.0,0,0.0,0.0,0.0,1.0,0.0,0.0


In [15]:
df_daily

Unnamed: 0,brand,family,date,total_quantity,total_revenue,num_stores,store_entropy,avg_sales_per_store,top_store_sales,top_3_store_sales,top_store_sales_ratio,seasonality_entropy,mode_seasonality,seasonality_ratio_FS,seasonality_ratio_FW,seasonality_ratio_M,seasonality_ratio_N-A,seasonality_ratio_S,seasonality_ratio_W,seasonality_change
0,Columbia,Bañadores Natación,2021-09-14,1.0,99.949997,1,0.0,1.0,1.0,1.0,1.0,0.0,FS,1.0,0.0,0.0,0.0,0.0,0.0,1
1,Columbia,Bañadores Natación,2024-07-02,1.0,180.000000,1,0.0,1.0,1.0,1.0,1.0,0.0,FS,1.0,0.0,0.0,0.0,0.0,0.0,0
2,Columbia,Botas Urban,2023-12-03,1.0,130.000000,1,0.0,1.0,1.0,1.0,1.0,0.0,N-A,0.0,0.0,0.0,1.0,0.0,0.0,1
3,Columbia,Botas Urban,2023-12-29,1.0,130.000000,1,0.0,1.0,1.0,1.0,1.0,0.0,N-A,0.0,0.0,0.0,1.0,0.0,0.0,0
4,Columbia,Botas Urban,2024-01-01,1.0,130.000000,1,0.0,1.0,1.0,1.0,1.0,0.0,N-A,0.0,0.0,0.0,1.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23619,adidas,Zuecos Urban,2023-11-10,1.0,99.949997,1,0.0,1.0,1.0,1.0,1.0,0.0,N-A,0.0,0.0,0.0,1.0,0.0,0.0,0
23620,adidas,Zuecos Urban,2024-05-24,1.0,99.949997,1,0.0,1.0,1.0,1.0,1.0,0.0,N-A,0.0,0.0,0.0,1.0,0.0,0.0,0
23621,adidas,Zuecos Urban,2024-05-30,1.0,99.949997,1,0.0,1.0,1.0,1.0,1.0,0.0,N-A,0.0,0.0,0.0,1.0,0.0,0.0,0
23622,adidas,Zuecos Urban,2024-06-12,1.0,99.949997,1,0.0,1.0,1.0,1.0,1.0,0.0,N-A,0.0,0.0,0.0,1.0,0.0,0.0,0


In [16]:
df[(df["date"] == pd.to_datetime("2020-11-04")) & (df["brand"] == "Columbia") & (df["family"] == "Botas de montaña")]

Unnamed: 0,product_id,date,store,quantity,seasonality,brand,family,pvp
52904,EC45F7C2686A41D894863717DADD37B9,2020-11-04,ES,1.0,N-A,Columbia,Botas de montaña,34.950001
52907,EC45F7C2686A41D894863717DADD37B9,2020-11-04,ES,1.0,N-A,Columbia,Botas de montaña,34.950001
52908,EC45F7C2686A41D894863717DADD37B9,2020-11-04,ES,1.0,N-A,Columbia,Botas de montaña,34.950001
52924,EC45F7C2686A41D894863717DADD37B9,2020-11-04,ES,1.0,N-A,Columbia,Botas de montaña,34.950001
52926,EC45F7C2686A41D894863717DADD37B9,2020-11-04,FR,1.0,N-A,Columbia,Botas de montaña,34.950001
52929,EC45F7C2686A41D894863717DADD37B9,2020-11-04,ES,1.0,N-A,Columbia,Botas de montaña,34.950001


In [17]:
df["date"].iloc[0] == pd.to_datetime("2021-03-22")

True

In [18]:
cols = list(set(df_daily.columns).intersection(set(df_grouped.columns)))

In [19]:
df_grouped[cols].equals(df_daily[cols])

False

In [50]:
(df_grouped[cols]!=(df_daily[cols])).sum()

mode_seasonality           13
store_entropy               0
total_quantity              0
seasonality_entropy         0
family                      0
top_3_store_sales           0
brand                       0
top_store_sales             0
total_revenue               0
num_stores                  0
seasonality_change         25
date                        0
top_store_sales_ratio       0
avg_sales_per_store      1492
dtype: int64

In [51]:
df_grouped[df_grouped["avg_sales_per_store"]!=(df_daily["avg_sales_per_store"])]

Unnamed: 0,brand,family,date,total_quantity,total_revenue,num_stores,store_entropy,top_store_sales,top_store_sales_ratio,top_3_store_sales,...,avg_sales_per_store,mode_seasonality,seasonality_entropy,seasonality_change,seasonality_FS_ratio,seasonality_FW_ratio,seasonality_M_ratio,seasonality_N-A_ratio,seasonality_S_ratio,seasonality_W_ratio
261,Columbia,Botas de montaña,2022-11-26,4.0,489.950012,3,1.04,2.0,0.500000,4.0,...,1.33,N-A,0.000000,0,0.0,0.0,0.0,1.0,0.0,0.0
292,Columbia,Botas de montaña,2023-02-06,5.0,924.950012,3,0.95,3.0,0.600000,5.0,...,1.67,N-A,0.000000,0,0.0,0.0,0.0,1.0,0.0,0.0
319,Columbia,Botas de montaña,2023-04-04,4.0,204.850006,3,1.04,2.0,0.500000,4.0,...,1.33,N-A,0.000000,0,0.0,0.0,0.0,1.0,0.0,0.0
349,Columbia,Botas de montaña,2023-06-05,4.0,324.950012,3,1.04,2.0,0.500000,4.0,...,1.33,N-A,0.000000,0,0.0,0.0,0.0,1.0,0.0,0.0
395,Columbia,Botas de montaña,2023-10-19,4.0,419.950012,3,1.04,2.0,0.500000,4.0,...,1.33,N-A,0.000000,0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23574,adidas,Zapatillas de montaña,2024-08-06,11.0,1099.949951,3,0.76,8.0,0.727273,11.0,...,3.67,N-A,0.000000,0,0.0,0.0,0.0,1.0,0.0,0.0
23586,adidas,Zapatillas de montaña,2024-08-19,14.0,1255.000000,5,1.22,8.0,0.571429,12.0,...,2.80,N-A,0.000000,0,0.0,0.0,0.0,1.0,0.0,0.0
23590,adidas,Zapatillas de montaña,2024-08-23,4.0,429.950012,3,1.04,2.0,0.500000,4.0,...,1.33,N-A,0.000000,0,0.0,0.0,0.0,1.0,0.0,0.0
23594,adidas,Zapatillas de montaña,2024-08-27,5.0,649.900024,3,1.05,2.0,0.400000,5.0,...,1.67,N-A,0.000000,0,0.0,0.0,0.0,1.0,0.0,0.0


In [52]:
df_daily[df_grouped["avg_sales_per_store"]!=(df_daily["avg_sales_per_store"])]

Unnamed: 0,brand,family,date,total_quantity,total_revenue,num_stores,store_entropy,avg_sales_per_store,top_store_sales,top_3_store_sales,top_store_sales_ratio,seasonality_entropy,mode_seasonality,seasonality_ratio_FS,seasonality_ratio_FW,seasonality_ratio_M,seasonality_ratio_N-A,seasonality_ratio_S,seasonality_ratio_W,seasonality_change
261,Columbia,Botas de montaña,2022-11-26,4.0,489.950012,3,1.04,1.33,2.0,4.0,0.500000,0.000000,N-A,0.0,0.0,0.0,1.0,0.0,0.0,0
292,Columbia,Botas de montaña,2023-02-06,5.0,924.950012,3,0.95,1.67,3.0,5.0,0.600000,0.000000,N-A,0.0,0.0,0.0,1.0,0.0,0.0,0
319,Columbia,Botas de montaña,2023-04-04,4.0,204.850006,3,1.04,1.33,2.0,4.0,0.500000,0.000000,N-A,0.0,0.0,0.0,1.0,0.0,0.0,0
349,Columbia,Botas de montaña,2023-06-05,4.0,324.950012,3,1.04,1.33,2.0,4.0,0.500000,0.000000,N-A,0.0,0.0,0.0,1.0,0.0,0.0,0
395,Columbia,Botas de montaña,2023-10-19,4.0,419.950012,3,1.04,1.33,2.0,4.0,0.500000,0.000000,N-A,0.0,0.0,0.0,1.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23574,adidas,Zapatillas de montaña,2024-08-06,11.0,1099.949951,3,0.76,3.67,8.0,11.0,0.727273,0.000000,N-A,0.0,0.0,0.0,1.0,0.0,0.0,0
23586,adidas,Zapatillas de montaña,2024-08-19,14.0,1255.000000,5,1.22,2.80,8.0,12.0,0.571429,0.000000,N-A,0.0,0.0,0.0,1.0,0.0,0.0,0
23590,adidas,Zapatillas de montaña,2024-08-23,4.0,429.950012,3,1.04,1.33,2.0,4.0,0.500000,0.000000,N-A,0.0,0.0,0.0,1.0,0.0,0.0,0
23594,adidas,Zapatillas de montaña,2024-08-27,5.0,649.900024,3,1.05,1.67,2.0,5.0,0.400000,0.000000,N-A,0.0,0.0,0.0,1.0,0.0,0.0,0


In [44]:
df[(df["brand"] == "Columbia") & (df["family"] == "Camisetas Outdoor") & (df["date"] == pd.to_datetime("2021-08-23"))]

Unnamed: 0,product_id,date,store,quantity,seasonality,brand,family,pvp
20811,E715FFB987B94EDBBEAC265CBA137AEC,2021-08-23,IT,1.0,N-A,Columbia,Camisetas Outdoor,109.949997
20819,E715FFB987B94EDBBEAC265CBA137AEC,2021-08-23,IT,1.0,N-A,Columbia,Camisetas Outdoor,109.949997
37002,BE65C86F56FA4A23A7A976E9271C806B,2021-08-23,FR,1.0,N-A,Columbia,Camisetas Outdoor,110.0
40197,3D77A000997740E396761579C26FE40E,2021-08-23,PT,1.0,N-A,Columbia,Camisetas Outdoor,99.989998


In [26]:
from scipy.stats import entropy

In [54]:
entropy([0.833, 0.167])

np.float64(0.451097288383559)

In [29]:
df[(df["brand"] == "Columbia") & (df["family"] == "Camisetas Outdoor") & (df["date"] == pd.to_datetime("2021-04-09"))].groupby(['brand', 'family', 'date', 'store']).agg(
        store_sales=('quantity', 'sum')
    ).reset_index()

Unnamed: 0,brand,family,date,store,store_sales
0,Columbia,Camisetas Outdoor,2021-04-09,ES,1.0
1,Columbia,Camisetas Outdoor,2021-04-09,FR,2.0


In [33]:
entropy([1/3,2/3])

np.float64(0.6365141682948128)

In [57]:
entropy([1, 1000])

np.float64(0.007900354757641078)