# pandas - groupby
- Reference: [pandas tutorial: How to calculate summary statistics](https://pandas.pydata.org/docs/getting_started/intro_tutorials/06_calculate_statistics.html)

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from pydataset import data

In [2]:
# Load the 'mtcars' dataset through pydataset's data() helper; it is used as a DataFrame below
mtcars = data('mtcars')

In [3]:
# Preview the first rows (the car names form the index)
mtcars.head()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [4]:
# Column dtypes as loaded: every column is int64 or float64
mtcars.dtypes

mpg     float64
cyl       int64
disp    float64
hp        int64
drat    float64
wt      float64
qsec    float64
vs        int64
am        int64
gear      int64
carb      int64
dtype: object

In [5]:
# Recast the discrete-valued columns (cyl, carb, am, vs, gear) as pandas categoricals
mtcars = mtcars.astype({col: 'category' for col in ('cyl', 'carb', 'am', 'vs', 'gear')})

In [6]:
# Re-check the dtypes: cyl, vs, am, gear and carb should now report 'category'
mtcars.dtypes

mpg      float64
cyl     category
disp     float64
hp         int64
drat     float64
wt       float64
qsec     float64
vs      category
am      category
gear    category
carb    category
dtype: object

In [9]:
# groupby: number of rows for each gear value
mtcars.groupby('gear', as_index=False, observed=True).size()
# 'gear' is a categorical column, so some declared categories may have no rows;
# observed=True restricts the result to the categories that actually occur in the data

Unnamed: 0,gear,size
0,3,15
1,4,12
2,5,5


In [10]:
# Column means over the numeric columns only; the categorical columns are left out by select_dtypes
mtcars.select_dtypes('number').mean()

mpg      20.090625
disp    230.721875
hp      146.687500
drat      3.596563
wt        3.217250
qsec     17.848750
dtype: float64

In [11]:
# Aggregation with .agg (alias of .aggregate): average mpg within each gear group
(
    mtcars
    .groupby('gear', as_index=False, observed=True)
    .agg({'mpg': 'mean'})
)

Unnamed: 0,gear,mpg
0,3,16.106667
1,4,24.533333
2,5,21.38


In [12]:
#import warnings
#warnings.filterwarnings('ignore')

In [14]:
# A different function per column: average mpg and highest hp for each gear group
(
    mtcars
    .groupby('gear', as_index=False, observed=True)
    .agg({'mpg': 'mean', 'hp': 'max'})
)

Unnamed: 0,gear,mpg,hp
0,3,16.106667,245
1,4,24.533333,123
2,5,21.38,335


In [16]:
# Per gear group, the number of non-null entries in every other column (gear becomes the index)
mtcars.groupby('gear', observed=True).count()

Unnamed: 0_level_0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,carb
gear,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
3,15,15,15,15,15,15,15,15,15,15
4,12,12,12,12,12,12,12,12,12,12
5,5,5,5,5,5,5,5,5,5,5


In [19]:
# Named aggregation: keyword = output column name, tuple = (source column, function)
mtcars.groupby('gear', as_index=False, observed=True).agg(
    meanMPG=('mpg', 'mean'),
    maxHP=('hp', 'max'),
)

Unnamed: 0,gear,meanMPG,maxHP
0,3,16.106667,245
1,4,24.533333,123
2,5,21.38,335


In [22]:
# Group by two keys (gear, am) and name the aggregated columns.
# reset_index() moves the group keys back into ordinary columns; passing as_index=False
# to groupby is the alternative spelling. String function names ('mean', 'max') are used
# here instead of the NumPy callables np.mean / np.max.
mtcars.groupby(['gear','am'], observed=True).agg(meanMPG = ('mpg','mean'), maxHP = ('hp','max')).reset_index()

Unnamed: 0,gear,am,meanMPG,maxHP
0,3,0,16.106667,245
1,4,0,21.05,123
2,4,1,26.275,110
3,5,1,21.38,335


In [23]:
# Keep the rows of those gear groups whose average mpg is above 20, ordered by gear
(
    mtcars
    .groupby('gear', observed=True)
    .filter(lambda grp: grp['mpg'].mean() > 20)
    .sort_values('gear')
)

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Fiat X1-9,27.3,4,79.0,66,4.08,1.935,18.9,1,1,4,1
Toyota Corolla,33.9,4,71.1,65,4.22,1.835,19.9,1,1,4,1
Fiat 128,32.4,4,78.7,66,4.08,2.2,19.47,1,1,4,1
Merc 280C,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4
Honda Civic,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2
Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4


In [27]:
# Named aggregation: pd.NamedAgg bundles the source column and the aggregation function;
# the name of the resulting column is chosen by the keyword used when it is passed to .agg()
# NOTE(review): despite the "Min" in its name, this spec computes the MEAN of mpg;
# consider renaming it (e.g. aggMeanMpg) together with its use in the following cell
aggMinMpg = pd.NamedAgg(column ='mpg', aggfunc='mean')
#mtcars.groupby('gear').agg(aggMinMpg)

In [28]:
# The keyword (MM) becomes the name of the aggregated column; reset_index() restores gear as a column
(
    mtcars
    .groupby('gear', observed=True)
    .agg(MM=aggMinMpg)
    .reset_index()
)

Unnamed: 0,gear,MM
0,3,16.106667
1,4,24.533333
2,5,21.38


In [29]:
# Reusable named-aggregation spec: output column name -> (source column, function)
myagg = dict(
    minMPG=('mpg', 'min'),
    maxMPG=('mpg', 'max'),
    meanHP=('hp', 'mean'),
)

In [31]:
# Unpack the spec dict so that each key is passed to .agg() as a keyword (= output column name)
(
    mtcars
    .groupby('gear', observed=True)
    .agg(**myagg)
    .reset_index()
)

Unnamed: 0,gear,minMPG,maxMPG,meanHP
0,3,10.4,21.5,176.133333
1,4,17.8,33.9,89.5
2,5,15.0,30.4,195.6


In [33]:
# Select only the mpg column of each gear group, then compute its sum and count
(
    mtcars
    .groupby('gear', observed=True)['mpg']
    .agg(['sum', 'count'])
    .reset_index()
)

Unnamed: 0,gear,sum,count
0,3,241.6,15
1,4,294.4,12
2,5,106.9,5


In [40]:
# Dict form (column -> function): the output columns keep the source column names
mtcars.groupby('gear', observed=True).agg({
    'mpg': 'mean',
    'hp': 'mean',
})

Unnamed: 0_level_0,mpg,hp
gear,Unnamed: 1_level_1,Unnamed: 2_level_1
3,16.106667,176.133333
4,24.533333,89.5
5,21.38,195.6


In [42]:
# pd.NamedAgg form: explicit output names (meanMPG, meanHP) for the per-gear averages
mtcars.groupby('gear', observed=True).agg(
    meanMPG=pd.NamedAgg(column='mpg', aggfunc='mean'),
    meanHP=pd.NamedAgg(column='hp', aggfunc='mean'),
)

Unnamed: 0_level_0,meanMPG,meanHP
gear,Unnamed: 1_level_1,Unnamed: 2_level_1
3,16.106667,176.133333
4,24.533333,89.5
5,21.38,195.6


In [55]:
# Several functions per column yield two-level column labels: (source column, function)
mtgpby2 = (
    mtcars
    .groupby('gear', observed=True)
    .agg({'mpg': ['std', 'max', 'min'], 'hp': ['min', 'max']})
)
mtgpby2

Unnamed: 0_level_0,mpg,mpg,mpg,hp,hp
Unnamed: 0_level_1,std,max,min,min,max
gear,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
3,3.371618,21.5,10.4,97,245
4,5.276764,33.9,17.8,52,123
5,6.658979,30.4,15.0,91,335


In [56]:
# Move gear out of the index into a regular column; the two-level column labels are kept
mtgpby2.reset_index()

Unnamed: 0_level_0,gear,mpg,mpg,mpg,hp,hp
Unnamed: 0_level_1,Unnamed: 1_level_1,std,max,min,min,max
0,3,3.371618,21.5,10.4,97,245
1,4,5.276764,33.9,17.8,52,123
2,5,6.658979,30.4,15.0,91,335


# summary stats
- https://pandas.pydata.org/docs/getting_started/intro_tutorials/06_calculate_statistics.html#min-tut-06-stats

In [58]:
# Titanic data as shipped with pydataset: text-valued columns class, age, sex and survived
pytitanic = data('titanic')
pytitanic.head()

Unnamed: 0,class,age,sex,survived
1,1st class,adults,man,yes
2,1st class,adults,man,yes
3,1st class,adults,man,yes
4,1st class,adults,man,yes
5,1st class,adults,man,yes


In [59]:
# Titanic data as shipped with seaborn: includes numeric columns such as age and fare.
# seaborn is imported in the notebook's top import cell so that all dependencies are declared in one place.
# Note: sns.load_dataset fetches the file from seaborn's online example-data repository
# (and caches it), so the first run needs network access.
snstitanic = sns.load_dataset('titanic')
snstitanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [61]:
# Confirm that seaborn handed back a pandas DataFrame
type(snstitanic)

pandas.core.frame.DataFrame

In [64]:
# Mean age; missing ages are skipped by default (age has fewer non-null values than fare, see describe() below)
snstitanic['age'].mean()

29.69911764705882

In [67]:
# Median of several columns at once: one value per selected column
snstitanic[['age','fare']].median()

age     28.0000
fare    14.4542
dtype: float64

In [69]:
# Standard summary (count, mean, std, min, quartiles, max) for the two numeric columns
snstitanic[['age','fare']].describe()

Unnamed: 0,age,fare
count,714.0,891.0
mean,29.699118,32.204208
std,14.526497,49.693429
min,0.42,0.0
25%,20.125,7.9104
50%,28.0,14.4542
75%,38.0,31.0
max,80.0,512.3292


In [71]:
# DataFrame.agg with a dict: choose a separate list of statistics for each column.
# The result has one row per statistic; a statistic not requested for a column shows NaN
# (e.g. 'skew' for fare, 'mean' for age).
snstitanic.agg(
    {
        "age": ["min", "max", "median", "skew"],
        "fare": ["min", "max", "median", "mean"],
    }
)

Unnamed: 0,age,fare
min,0.42,0.0
max,80.0,512.3292
median,28.0,14.4542
skew,0.389108,
mean,,32.204208


In [72]:
# Average age for each sex (group first, then pick the column to aggregate)
snstitanic.groupby('sex')[['age']].mean()

Unnamed: 0_level_0,age
sex,Unnamed: 1_level_1
female,27.915709
male,30.726645


# split - apply function - combine