In [1]:
import numpy as np
import pandas as pd

# Render floats with two decimal places in displayed frames.
# The fully qualified key is required: since pandas 1.4 the short pattern
# 'precision' also matches 'styler.format.precision', so set_option raises
# OptionError ("Pattern matched multiple keys") instead of setting anything.
pd.set_option('display.precision', 2)

In [2]:
# Load the passenger table. The path is relative, so 'titanic.csv' has to be
# in the directory the notebook kernel is running from.
dat = pd.read_csv('titanic.csv')

In [3]:
# Overview of the frame: row count, column names, non-null counts and dtypes.
dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887 entries, 0 to 886
Data columns (total 8 columns):
survived            887 non-null int64
pclass              887 non-null int64
name                887 non-null object
sex                 887 non-null object
age                 887 non-null float64
siblings_spouses    887 non-null int64
parents_children    887 non-null int64
fare                887 non-null float64
dtypes: float64(2), int64(4), object(2)
memory usage: 55.6+ KB


In [4]:
# Peek at the first rows to see what the values in each column look like.
dat.head()

Unnamed: 0,survived,pclass,name,sex,age,siblings_spouses,parents_children,fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.28
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.92
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


In [5]:
# Group the rows by the `pclass` column. `grp` is the GroupBy object that the
# following cells iterate over and aggregate.
grp = dat.groupby('pclass')

## Split

Conceptually, `groupby` splits the `DataFrame` into one sub-`DataFrame` per group, and the resulting object can be iterated over group by group.

In [6]:
# Iterating over a GroupBy yields (group key, sub-DataFrame) pairs.
for cls_name, cls_df in grp:
    print(f'class={cls_name}')
    break # stop after the first group; cls_name / cls_df keep that group's values

class=1


In [7]:
# `cls_df` is the sub-DataFrame captured by the loop before it hit `break`
# (the pclass == 1 group); show its first rows.
cls_df.head()

Unnamed: 0,survived,pclass,name,sex,age,siblings_spouses,parents_children,fare
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.28
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
6,0,1,Mr. Timothy J McCarthy,male,54.0,0,0,51.86
11,1,1,Miss. Elizabeth Bonnell,female,58.0,0,0,26.55
23,1,1,Mr. William Thompson Sloper,male,28.0,0,0,35.5


## Apply

With no columns selected, the function is applied to all numeric columns (non-numeric ones such as `name` and `sex` are left out).

In [None]:
# Per-class mean of every numeric column.
# `numeric_only=True` is spelled out because pandas >= 2.0 no longer drops
# non-numeric columns silently: without it the object columns `name` and `sex`
# make this call raise a TypeError.
grp.mean(numeric_only=True)

## Single column apply

Apply to a single column.

In [None]:
# Selecting a single column with a plain label and then aggregating
# returns a Series (one value per pclass).
grp['age'].mean()

In [None]:
# Selecting with a list of labels (note the double brackets), even a
# one-element list, returns a DataFrame instead of a Series.
grp[['age']].mean()

## One function, multiple columns

Apply the same function to multiple *selected* columns.

In [None]:
# Restrict the aggregation to the listed columns: per-class mean of `age` and `fare`.
grp[['age', 'fare']].mean()

## Multiple functions, one or more columns

Apply different functions to a single column and give the
result `DataFrame` custom names.

Use the `agg` method (short for `aggregate`).

Passing a list of functions applies each of them to the column. The result
columns are named after the functions (here `mean`, `median` and `std`).

In [None]:
# Several statistics of `age` per class; each entry in the list becomes one
# result column named after the function.
# String names are used rather than np.mean / np.median: recent pandas emits a
# FutureWarning when NumPy reducers are passed to `agg`, and the strings give
# the same column labels.
grp['age'].agg(['mean', 'median', 'std'])

## Multiple functions, multiple columns

Again, use the `agg` method.

In [None]:
# Named aggregation: each keyword is the name of an output column and each
# value is a (source column, function) pair.
grp.agg(
    # Any callable that reduces the group's column to a scalar is accepted.
    mean_age=('age', lambda x: x.mean()),
    # This column holds a standard deviation, so it is named `std_fare`
    # (it was previously mislabelled `mean_fare`). The string 'std' replaces
    # np.std, which recent pandas deprecates as an `agg` argument.
    std_fare=('fare', 'std')
)

### Pandas < 0.25

In [None]:
# Without named aggregation: map each source column to its function, then
# rename the result columns afterwards.
grp.agg({
    'age': lambda x: x.mean(),
    # String name instead of np.std, which recent pandas deprecates as an
    # `agg` argument.
    'fare': 'std'
}).rename(columns={
    'age': 'mean_age',
    # The fare column holds a standard deviation, so label it accordingly
    # (it was previously mislabelled `mean_fare`).
    'fare': 'std_fare'
})

## Grouping by Multiple Variables

Same idea as before, except our results now have a MultiIndex.

In [None]:
# Group on two keys at once: every (pclass, sex) combination is its own group.
grp2 = dat.groupby(['pclass', 'sex'])
# Mean `age` and `fare` per combination; the row index has one level per grouping key.
grp2[['age', 'fare']].mean()

## Flexible Apply

Use `apply` to operate on each grouped subset of the `DataFrame`

In [None]:
# A fresh GroupBy on `pclass`, built here so this cell does not rely on `grp`.
grp3 = dat.groupby('pclass')
# `apply` passes each group's sub-DataFrame to the function. Here the function
# wraps the group's shape in a Series labelled nrow / ncol, one per group.
# NOTE(review): newer pandas versions changed whether the grouping column is
# passed to the function (see `include_groups`), which would change `ncol`
# - confirm against the installed version.
grp3.apply(lambda df: pd.Series(df.shape, index=['nrow', 'ncol']))