In [2]:
import numpy as np
import pandas as pd

# Report the library versions this notebook was executed with.
print(
    f'numpy version: {np.__version__}',
    f'pandas version: {pd.__version__}',
    sep='\n',
)

numpy version: 2.0.0
pandas version: 2.2.2


In [16]:
# Sales-material (SKU) lookup table, written one record per row:
# (material code, brand, pack type, segment).
skus = pd.DataFrame(
    [
        (12345, 'heineken', 'RGB', 'premium'),
        (12346, 'heineken', 'OW', 'premium'),
        (12347, 'amstel', 'RGB', 'mainstream'),
        (12348, 'amstel', 'OW', 'mainstream'),
        (12349, 'devassa', 'OW', 'mainstream'),
        (12350, 'schin', 'OW', 'economy'),
    ],
    columns=['material', 'brand', 'packtype', 'segmento'],
)
skus

Unnamed: 0,material,brand,packtype,segmento
0,12345,heineken,RGB,premium
1,12346,heineken,OW,premium
2,12347,amstel,RGB,mainstream
3,12348,amstel,OW,mainstream
4,12349,devassa,OW,mainstream
5,12350,schin,OW,economy


In [43]:
# Synthetic sales ledger: 50 random (month, material, volume) rows drawn from a
# seeded generator, so re-running the cell reproduces the same table.
generator = np.random.default_rng(seed=123)

# NOTE: the dict entries are evaluated top to bottom and each draw advances the
# generator state, so the month -> material -> volume order must be kept.
sales = (
    pd.DataFrame(
        {
            # month labels carry a numeric prefix — presumably so that sorting
            # them as text yields calendar order; confirm before relying on it
            'month': pd.Series(
                generator.choice(['06 jun', '07 jul', '08 ago', '09 set', '10 out'], size=50),
                dtype='string',
            ),
            # materials are sampled from the SKU table, so every sale has a matching SKU
            'material': generator.choice(skus['material'], size=50),
            # uniform volumes in [0.001, 10.0), rounded to 3 decimals
            'volume': np.round(generator.uniform(low=0.001, high=10.000, size=50), 3),
        }
    )
    .sort_values(by='month')
    .reset_index(drop=True)
)
sales.head(10)

Unnamed: 0,month,material,volume
0,06 jun,12347,1.278
1,06 jun,12347,1.516
2,06 jun,12349,1.378
3,06 jun,12345,2.288
4,06 jun,12348,0.06
5,06 jun,12348,4.857
6,06 jun,12348,8.11
7,06 jun,12347,0.304
8,06 jun,12347,3.811
9,06 jun,12350,6.165


In [50]:
# Structural overview first (`info` prints its report), then the summary
# statistics of the volume column as the cell's displayed value.
sales.info()
sales.loc[:, 'volume'].describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   month     50 non-null     string 
 1   material  50 non-null     int64  
 2   volume    50 non-null     float64
dtypes: float64(1), int64(1), string(1)
memory usage: 1.3 KB


count    50.000000
mean      4.673380
std       2.714445
min       0.033000
25%       2.753750
50%       4.534500
75%       6.473750
max       9.505000
Name: volume, dtype: float64

In [68]:
# Let's replicate `describe()` by hand with `DataFrame.agg`, in two styles.
#
# A notebook cell renders only its LAST expression. The list-style table used to be
# computed and silently discarded, so it is now bound to a name and shown explicitly
# with `display` (available as a builtin inside IPython/Jupyter kernels).


def quantile_25(column):
    """Return the first quartile (25th percentile) of ``column``."""
    return column.quantile(0.25)


# Style 1 — list of aggregations: row labels come from the function names, so a
# named function is used here (a lambda would be labelled '<lambda>').
volume_stats_list = sales.loc[:, ['volume']].agg(
    ['count', 'mean', 'median', 'std', 'min', 'max', quantile_25]
)
display(volume_stats_list)

# Style 2 — named aggregation: each keyword becomes the row label, so lambdas are
# fine here. `sum` is an extra statistic that `describe()` itself does not report.
sales.loc[:, ['volume']].agg(
    count=('volume', 'count'),
    mean=('volume', 'mean'),
    std=('volume', 'std'),
    min=('volume', 'min'),
    quantile_25=('volume', lambda c: c.quantile(0.25)),
    quantile_50=('volume', lambda c: c.quantile(0.5)),
    quantile_75=('volume', lambda c: c.quantile(0.75)),
    max=('volume', 'max'),
    sum=('volume', 'sum'),
)

Unnamed: 0,volume
count,50.0
mean,4.67338
std,2.714445
min,0.033
quantile_25,2.75375
quantile_50,4.5345
quantile_75,6.47375
max,9.505
sum,233.669


In [70]:
# Enrich every sale with its SKU attributes, then total the volume per brand.
#
# validate='many_to_one' makes the merge raise `pd.errors.MergeError` if `skus` ever
# contains a duplicated 'material' key; without it such a duplicate would silently
# duplicate sales rows and inflate the per-brand totals.
sales_full = pd.merge(
    left=sales,
    right=skus,
    how='left',
    on='material',
    validate='many_to_one',
)

# dropna=False keeps sales whose material has no SKU record (brand is NaN after the
# left join) as their own group instead of silently dropping their volume.
sales_bybrand = sales_full.groupby('brand', dropna=False)[['volume']].agg('sum')
sales_bybrand

Unnamed: 0_level_0,volume
brand,Unnamed: 1_level_1
amstel,77.141
devassa,28.896
heineken,80.26
schin,47.372
