In [1]:
import numpy as np
import pandas as pd

In [2]:
# group

In [3]:
# Load the example dataset, then take the median Height within each Gender group.
df = pd.read_csv('data/learn_pandas.csv')
df.groupby('Gender')['Height'].median()

Gender
Female    159.6
Male      173.4
Name: Height, dtype: float64

In [4]:
# A list of keys groups by every (School, Gender) combination.
df.groupby(['School', 'Gender'])['Height'].mean()  # several grouping keys -> two-level index in the result

School                         Gender
Fudan University               Female    158.776923
                               Male      174.212500
Peking University              Female    158.666667
                               Male      172.030000
Shanghai Jiao Tong University  Female    159.122500
                               Male      176.760000
Tsinghua University            Female    159.753333
                               Male      171.638889
Name: Height, dtype: float64

In [5]:
# Boolean grouping key: True where a row's Weight exceeds the overall mean Weight.
condition = df['Weight'] > df['Weight'].mean()
# Group by the mask itself and compare the mean Height of the two resulting groups.
df.groupby(condition)['Height'].mean()

Weight
False    159.034646
True     172.705357
Name: Height, dtype: float64

In [6]:
# The raw (School, Gender) pair of every row, duplicates included.
df[['School', 'Gender']]

Unnamed: 0,School,Gender
0,Shanghai Jiao Tong University,Female
1,Peking University,Male
2,Shanghai Jiao Tong University,Male
3,Fudan University,Female
4,Fudan University,Male
...,...,...
195,Fudan University,Female
196,Tsinghua University,Female
197,Shanghai Jiao Tong University,Female
198,Shanghai Jiao Tong University,Male


In [7]:
# De-duplicate the pairs: each remaining row is one distinct (School, Gender) combination.
df[['School', 'Gender']].drop_duplicates()

Unnamed: 0,School,Gender
0,Shanghai Jiao Tong University,Female
1,Peking University,Male
2,Shanghai Jiao Tong University,Male
3,Fudan University,Female
4,Fudan University,Male
5,Tsinghua University,Female
9,Peking University,Female
16,Tsinghua University,Male


In [8]:
# groupby objects

In [9]:
# Keep the groupby object itself; displaying it only shows its repr, not any data.
gb = df.groupby(['School', 'Grade'])
gb

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000027092B8CAC8>

In [10]:
# Number of distinct groups in gb.
gb.ngroups

16

In [12]:
# The group labels: one key tuple per group.
gb.groups.keys()

dict_keys([('Fudan University', 'Freshman'), ('Fudan University', 'Junior'), ('Fudan University', 'Senior'), ('Fudan University', 'Sophomore'), ('Peking University', 'Freshman'), ('Peking University', 'Junior'), ('Peking University', 'Senior'), ('Peking University', 'Sophomore'), ('Shanghai Jiao Tong University', 'Freshman'), ('Shanghai Jiao Tong University', 'Junior'), ('Shanghai Jiao Tong University', 'Senior'), ('Shanghai Jiao Tong University', 'Sophomore'), ('Tsinghua University', 'Freshman'), ('Tsinghua University', 'Junior'), ('Tsinghua University', 'Senior'), ('Tsinghua University', 'Sophomore')])

In [13]:
# Row count of each group.
gb.size()

School                         Grade    
Fudan University               Freshman      9
                               Junior       12
                               Senior       11
                               Sophomore     8
Peking University              Freshman     13
                               Junior        8
                               Senior        8
                               Sophomore     5
Shanghai Jiao Tong University  Freshman     13
                               Junior       17
                               Senior       22
                               Sophomore     5
Tsinghua University            Freshman     17
                               Junior       22
                               Senior       14
                               Sophomore    16
dtype: int64

In [14]:
# Pull out a single group by its key tuple and preview its first 3 rows and 3 columns.
gb.get_group(('Fudan University', 'Freshman')).iloc[:3, :3]

Unnamed: 0,School,Grade,Name
15,Fudan University,Freshman,Changqiang Yang
28,Fudan University,Freshman,Gaoqiang Qin
63,Fudan University,Freshman,Gaofeng Zhao


In [15]:
# aggregate functions

In [16]:
# Rebind gb to the Height column grouped by Gender (replaces the groupby object from earlier cells).
gb = df.groupby('Gender')['Height']
# Index label of the smallest Height within each group.
gb.idxmin()

Gender
Female    143
Male      199
Name: Height, dtype: int64

In [17]:
# 0.95 quantile of each group.
gb.quantile(0.95)

Gender
Female    166.8
Male      185.9
Name: Height, dtype: float64

In [19]:
# Rebind gb again, now selecting two columns, so each aggregation returns one column per variable.
gb = df.groupby('Gender')[['Height', 'Weight']]
gb.max()

Unnamed: 0_level_0,Height,Weight
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,170.2,63.0
Male,193.9,89.0


In [20]:
# A list of function names applies every function to every selected column.
gb.agg(['sum', 'idxmax', 'skew'])  # use multiple functions

Unnamed: 0_level_0,Height,Height,Height,Weight,Weight,Weight
Unnamed: 0_level_1,sum,idxmax,skew,sum,idxmax,skew
Gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Female,21014.0,28,-0.219253,6469.0,28,-0.268482
Male,8854.9,193,0.437535,3929.0,2,-0.332393


In [21]:
# A dict maps each column to its own aggregation(s).
gb.agg({'Height':['mean','max'], 'Weight':'count'})

Unnamed: 0_level_0,Height,Height,Weight
Unnamed: 0_level_1,mean,max,count
Gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Female,159.19697,170.2,135
Male,173.62549,193.9,54


In [22]:
# Each (name, function) tuple sets the label of the resulting column.
gb.agg([('range', lambda x: x.max()-x.min()), ('my_sum', 'sum')])  # rename the columns in the result

Unnamed: 0_level_0,Height,Height,Weight,Weight
Unnamed: 0_level_1,range,my_sum,range,my_sum
Gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Female,24.8,21014.0,29.0,6469.0
Male,38.2,8854.9,38.0,3929.0


In [23]:
# transform

In [24]:
# Cumulative maximum within each group; unlike an aggregation, the result has one row per original row.
gb.cummax().head()

Unnamed: 0,Height,Weight
0,158.9,46.0
1,166.5,70.0
2,188.9,89.0
3,,46.0
4,188.9,89.0


In [25]:
# Standardize each column within its group: subtract the group mean, divide by the group std.
gb.transform(lambda x: (x-x.mean())/x.std()).head()

Unnamed: 0,Height,Weight
0,-0.05876,-0.354888
1,-1.010925,-0.355
2,2.167063,2.089498
3,,-1.279789
4,0.053133,0.159631


In [26]:
# Passing a function name broadcasts the group-level mean back onto every row of that group.
gb.transform('mean').head()

Unnamed: 0,Height,Weight
0,159.19697,47.918519
1,173.62549,72.759259
2,173.62549,72.759259
3,159.19697,47.918519
4,173.62549,72.759259


In [27]:
# Keep only the rows belonging to groups with more than 100 rows.
gb.filter(lambda x: x.shape[0] > 100).head()

Unnamed: 0,Height,Weight
0,158.9,46.0
3,,41.0
5,158.0,51.0
6,162.5,52.0
7,161.9,50.0


In [28]:
# apply

In [29]:
def BMI(x):
    """Return the mean of Weight / (Height/100)**2 over the rows of ``x``.

    ``x`` must support ``x['Height']`` and ``x['Weight']`` (e.g. a DataFrame
    holding one group). Height is divided by 100 before squaring --
    presumably a centimetre-to-metre conversion; the data's units are not
    stated here, so confirm against the dataset.
    """
    height_scaled = x['Height'] / 100
    bmi_per_row = x['Weight'] / height_scaled ** 2
    return bmi_per_row.mean()
# apply passes each group's sub-frame to BMI; a scalar return gives one value per group.
gb.apply(BMI)

Gender
Female    18.860930
Male      24.318654
dtype: float64

In [30]:
# Rebind gb to a two-key grouping; a scalar return from apply yields a Series indexed by those keys.
gb = df.groupby(['Gender','Test_Number'])[['Height','Weight']]
gb.apply(lambda x: 0)

Gender  Test_Number
Female  1              0
        2              0
        3              0
Male    1              0
        2              0
        3              0
dtype: int64

In [31]:
# A list return is not expanded: each group's entry holds the whole list (object dtype).
gb.apply(lambda x: [0, 0])

Gender  Test_Number
Female  1              [0, 0]
        2              [0, 0]
        3              [0, 0]
Male    1              [0, 0]
        2              [0, 0]
        3              [0, 0]
dtype: object

In [32]:
# A Series return is expanded: its index labels become the columns of the result.
gb.apply(lambda x: pd.Series([0,0],index=['a','b']))

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
Gender,Test_Number,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,1,0,0
Female,2,0,0
Female,3,0,0
Male,1,0,0
Male,2,0,0
Male,3,0,0


In [33]:
# A DataFrame return is stacked per group: its row index becomes an extra
# index level under the group keys and its columns become the result's columns.
gb.apply(
    lambda x: pd.DataFrame(
        np.ones((2,2)),
        index=['a','b'],
        columns=pd.Index([('w','x'),('y','z')]),
    )
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,w,y
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,x,z
Gender,Test_Number,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Female,1,a,1.0,1.0
Female,1,b,1.0,1.0
Female,2,a,1.0,1.0
Female,2,b,1.0,1.0
Female,3,a,1.0,1.0
Female,3,b,1.0,1.0
Male,1,a,1.0,1.0
Male,1,b,1.0,1.0
Male,2,a,1.0,1.0
Male,2,b,1.0,1.0
