# Pandas - GroupBy

### Table of Contents

 - [Group By](#groupby)
 - [Aggregations](#aggregations)
 - [Multi Indexing](#multi-index)
 - [Transformations](#transformations)

In [1]:
from IPython.core.display import HTML

# Load the notebook stylesheets. Each file is read inside a context manager so
# its handle is closed as soon as the content has been read.
css_parts = []
for css_path in ('styles/style-table.css', 'styles/style-notebook.css'):
    with open(css_path) as css_file:
        css_parts.append(css_file.read())
css = ''.join(css_parts)

# Inject the combined CSS into the page; the HTML object is the cell's output.
HTML('<style>{}</style>'.format(css))

### Imports

In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns

<a id="groupby"></a>

### GroupBy

In [3]:
# Sample account data: the four lists are aligned by position, one entry per user.
names = ['John', 'Matt', 'Sara', 'Jim', 'Ashley']
ids = [23, 34, 83, 86, 12]
balance = [10.2, 84.3, 72.9, 27.1, 223.1]
department = ['A', 'A', 'B', 'B', 'B']

# Assemble the frame with one column per list.
users = pd.DataFrame(dict(id=ids, name=names, bal=balance, dept=department))
users.head()


Unnamed: 0,bal,dept,id,name
0,10.2,A,23,John
1,84.3,A,34,Matt
2,72.9,B,83,Sara
3,27.1,B,86,Jim
4,223.1,B,12,Ashley


In [4]:
# Change the column order to name, id, dept, bal.
# reindex returns a new frame, which is bound back to `users`.
users = users.reindex(columns=['name','id','dept','bal'])
users

Unnamed: 0,name,id,dept,bal
0,John,23,A,10.2
1,Matt,34,A,84.3
2,Sara,83,B,72.9
3,Jim,86,B,27.1
4,Ashley,12,B,223.1


In [5]:
# What is the total balance by department?
# Group the rows by `dept`, then sum the `bal` column within each group.
grp = users.groupby('dept')
grp['bal'].sum()

dept
A     94.5
B    323.1
Name: bal, dtype: float64

In [6]:
# Same per-department total, computed on a two-column subset of `users`
# so that only `dept` and `bal` take part in the grouping.
dept = users[['dept','bal']]
grps = dept.groupby('dept')
grps['bal'].sum()


dept
A     94.5
B    323.1
Name: bal, dtype: float64

In [7]:
from IPython.display import display

# What do the groups look like?
# `groups` maps each group key to the row labels that belong to it.
print("\n Groups:{}".format(grps.groups))

# Only the last expression of a cell is rendered automatically, so both results
# are displayed explicitly instead of silently discarding the first one.
display(grps.size())   # rows per group -> Series
display(grps.count())  # non-null values per column and group -> DataFrame


 Groups:{'A': [0, 1], 'B': [2, 3, 4]}


Unnamed: 0_level_0,bal
dept,Unnamed: 1_level_1
A,2
B,3


In [8]:
# Iterating over a groupby object yields (key, value) pairs where
#  - `key` is one of the groupby column values
#  - `value` is the corresponding `DataFrame` holding that group's rows.

for key, value in grps:
    print("({}){}".format(key, value))
    print("\n")


(A)  dept   bal
0    A  10.2
1    A  84.3


(B)  dept    bal
2    B   72.9
3    B   27.1
4    B  223.1




<a id="aggregations"></a>

### Aggregations

In [9]:
# Group the full `users` frame by department.
# head() on a groupby object returns the first rows of every group
# (up to 5 per group by default), keeping the original row order.
grp1 = users.groupby('dept')
grp1.head()

Unnamed: 0,name,id,dept,bal
0,John,23,A,10.2
1,Matt,34,A,84.3
2,Sara,83,B,72.9
3,Jim,86,B,27.1
4,Ashley,12,B,223.1


#### Apply SUM() to group

In [10]:
# Sum the numeric columns of each department (returns a DataFrame).
# numeric_only=True keeps the string column `name` out of the result; since
# pandas 2.0 the default is False, which would aggregate `name` as well.
grp1.sum(numeric_only=True)
# grp1.sum(numeric_only=True)['bal']    # returns a Series
# grp1.sum(numeric_only=True)[['bal']]  # returns a DataFrame

Unnamed: 0_level_0,id,bal
dept,Unnamed: 1_level_1,Unnamed: 2_level_1
A,57,94.5
B,181,323.1


In [11]:
# Show the first rows of each group again.
grp1.head()

Unnamed: 0,name,id,dept,bal
0,John,23,A,10.2
1,Matt,34,A,84.3
2,Sara,83,B,72.9
3,Jim,86,B,27.1
4,Ashley,12,B,223.1


#### Apply multiple functions (aggregations) to a group

In [12]:
# Apply several aggregations to the balance of each department.
grp1 = users.groupby('dept')

# Select `bal` *before* aggregating. Aggregating the whole frame first would also
# run mean/std/var on the string column `name`, which raises a TypeError in
# current pandas and computes statistics that are thrown away anyway.
# The string 'size' replaces the np.size callable, which pandas has deprecated
# as an aggregator.
desc = grp1['bal'].agg(['sum','count','mean','median','min','max','std','var','size'])
desc


Unnamed: 0_level_0,sum,count,mean,median,min,max,std,var,size
dept,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A,94.5,2,47.25,47.25,10.2,84.3,52.396612,2745.405,2.0
B,323.1,3,107.7,72.9,27.1,223.1,102.52941,10512.28,3.0


#### Group by department and then apply multiple functions on the dept balance.

In [13]:
# Per-department sum and row count of the balance: one result column per function.
dept.groupby('dept')['bal'].agg(['sum','count'])
# Alternative spelling that groups the `bal` Series by the `dept` column:
#dept['bal'].groupby(dept['dept']).agg(['sum','count'])

Unnamed: 0_level_0,sum,count
dept,Unnamed: 1_level_1,Unnamed: 2_level_1
A,94.5,2
B,323.1,3


<a id="multi-index"></a>

### Multi Indexing

In [14]:
# Small frame with two numeric columns and two key columns, used to
# demonstrate grouping on more than one key (multi-index results).
df = pd.DataFrame(dict(
    data1=[10, 5, 25, 30, 10],
    data2=[4, 2, 8, 6, 4],
    key1=['a', 'b', 'a', 'a', 'b'],
    key2=['x', 'x', 'x', 'z', 'z'],
))
df

Unnamed: 0,data1,data2,key1,key2
0,10,4,a,x
1,5,2,b,x
2,25,8,a,x
3,30,6,a,z
4,10,4,b,z


#### as_index=True --> Multi-index created from the group keys

In [15]:

# Group on two keys. With as_index=True the group keys become a (key1, key2)
# MultiIndex on the result; sort=True orders the groups by key.
grp2 = df.groupby(['key1','key2'], sort=True, as_index=True)
grp2.sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,x,35,12
a,z,30,6
b,x,5,2
b,z,10,4


#### Add prefix to column names

In [16]:

# add_prefix prepends the given string to every column label of the aggregated frame.
grp2 = df.groupby(['key1','key2'], sort=True, as_index=True)
grp2.sum().add_prefix("pre_")

Unnamed: 0_level_0,Unnamed: 1_level_0,pre_data1,pre_data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,x,35,12
a,z,30,6
b,x,5,2
b,z,10,4


#### multiple grouping functions & custom labels to resulting columns

In [17]:
grp2 = df.groupby(['key1','key2'], sort=True, as_index=True)

# Each (label, function) tuple applies `function` to every data column and
# names the resulting sub-column `label`.
grp2.agg([('result1','mean'), ('result2', 'sum')])


Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data1,data2,data2
Unnamed: 0_level_1,Unnamed: 1_level_1,result1,result2,result1,result2
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,x,17.5,35,6,12
a,z,30.0,30,6,6
b,x,5.0,5,2,2
b,z,10.0,10,4,4


#### Different functions for different columns

In [18]:
# Different functions for different columns: the dict maps a column name
# to the aggregation applied to that column only.
grp2 = df.groupby(['key1','key2'], sort=True, as_index=True)
grp3 = grp2.agg({'data1': 'mean', 'data2': 'sum'})
grp3


Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,x,17.5,12
a,z,30.0,6
b,x,5.0,2
b,z,10.0,4


#### Reset Index 

In [19]:
# reset_index turns the (key1, key2) index levels back into ordinary columns.
grp2 = df.groupby(['key1','key2'], sort=True, as_index=True)
grp2.sum().reset_index()

Unnamed: 0,key1,key2,data1,data2
0,a,x,35,12
1,a,z,30,6
2,b,x,5,2
3,b,z,10,4


#### Group by with as_index=False

In [20]:
# With as_index=False the group keys are returned as regular columns instead of an index.
grp2 = df.groupby(['key1','key2'], sort=True, as_index=False)
grp2.sum()

Unnamed: 0,key1,key2,data1,data2
0,a,x,35,12
1,a,z,30,6
2,b,x,5,2
3,b,z,10,4


#### Group by with as_index=True

In [21]:
# size() counts the rows in each (key1, key2) group and returns a Series indexed by the keys.
grp2 = df.groupby(['key1','key2'], sort=True, as_index=True)
grp2.size()


key1  key2
a     x       2
      z       1
b     x       1
      z       1
dtype: int64

#### Enumerating GroupBy Object

In [22]:
# Walk over the groupby object: every iteration yields the group key
# (a (key1, key2) tuple here) together with the rows of that group.
grp2 = df.groupby(['key1', 'key2'], sort=True, as_index=True)
for group_key, groupdata in grp2:
    k1, k2 = group_key
    print(k1, k2)
    print(groupdata)
    

('a', 'x')
   data1  data2 key1 key2
0     10      4    a    x
2     25      8    a    x
('a', 'z')
   data1  data2 key1 key2
3     30      6    a    z
('b', 'x')
   data1  data2 key1 key2
1      5      2    b    x
('b', 'z')
   data1  data2 key1 key2
4     10      4    b    z


In [23]:
# `groups` maps every (key1, key2) group key to the row labels in that group.
# A single group's rows can be fetched with grp2.get_group(key).
grp2 = df.groupby(['key1','key2'], sort=True, as_index=True)
grp2.groups


{('a', 'x'): [0, 2], ('a', 'z'): [3], ('b', 'x'): [1], ('b', 'z'): [4]}

<a id="transformations"></a>

### Transformations

In [24]:
# Show the frame used in the following examples.
df

Unnamed: 0,data1,data2,key1,key2
0,10,4,a,x
1,5,2,b,x
2,25,8,a,x
3,30,6,a,z
4,10,4,b,z


#### Re-arrange columns in a dataframe

In [25]:
# Desired column order: the key columns first, then the data columns.
cols = ['key1', 'key2', 'data1', 'data2']

# Fetch the rows of the ('a', 'x') group and re-arrange its columns in one step.
grp2 = df.groupby(['key1', 'key2'])
df1 = grp2.get_group(('a', 'x'))[cols]
df1

Unnamed: 0,key1,key2,data1,data2
0,a,x,10,4
2,a,x,25,8


#### Re-arrange rows in a data frame

In [26]:

# Reverse the row order by reindexing with the index read backwards.
# reindex returns a new frame; `df1` itself is not modified.
df1 = df.copy()
df1.reindex(index=df1.index[::-1])


Unnamed: 0,data1,data2,key1,key2
4,10,4,b,z
3,30,6,a,z
2,25,8,a,x
1,5,2,b,x
0,10,4,a,x


#### Sort a data frame by its row index

In [27]:
# sort_index returns a new frame ordered by row label; `df1` itself is left as it was.
df1 = df.copy()
df1.sort_index()


Unnamed: 0,data1,data2,key1,key2
0,10,4,a,x
1,5,2,b,x
2,25,8,a,x
3,30,6,a,z
4,10,4,b,z


#### Sort a data frame by column values

In [28]:
# sort_values returns a new frame ordered by the `data2` column (ascending by default).
df1 = df.copy()
df1.sort_values(by='data2')


Unnamed: 0,data1,data2,key1,key2
1,5,2,b,x
0,10,4,a,x
4,10,4,b,z
3,30,6,a,z
2,25,8,a,x


### GroupBy with functions

In [29]:
# transform('mean') returns a frame with the same rows as `df`, where every value
# is replaced by the mean of its (key1, key2) group.
grp2 = df.groupby(['key1','key2'])
grp2.transform('mean')

Unnamed: 0,data1,data2
0,17.5,6.0
1,5.0,2.0
2,17.5,6.0
3,30.0,6.0
4,10.0,4.0


In [30]:
grp2 = df.groupby(['key1','key2'])
# print() emits the plain-text rendering of the first rows of each group;
# the quantile below is the cell's displayed result.
print(grp2.head())

# 90th percentile of `data1` within each (key1, key2) group.
grp2['data1'].quantile(0.9)

   data1  data2 key1 key2
0     10      4    a    x
1      5      2    b    x
2     25      8    a    x
3     30      6    a    z
4     10      4    b    z


key1  key2
a     x       23.5
      z       30.0
b     x        5.0
      z       10.0
Name: data1, dtype: float64

In [31]:
# Standardise every value within its group: subtract the group mean and divide
# by the group standard deviation.
# Groups with a single row yield NaN, because the sample standard deviation of
# one value is undefined.
grp2.transform(lambda x: (x - x.mean()) / x.std())

Unnamed: 0,data1,data2
0,-0.707107,-0.707107
1,,
2,0.707107,0.707107
3,,
4,,


In [32]:
# Summary statistics (count, mean, std, min, quartiles, max) of each data column per group.
grp2.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,x,count,2.0,2.0
a,x,mean,17.5,6.0
a,x,std,10.606602,2.828427
a,x,min,10.0,4.0
a,x,25%,13.75,5.0
a,x,50%,17.5,6.0
a,x,75%,21.25,7.0
a,x,max,25.0,8.0
a,z,count,1.0,1.0
a,z,mean,30.0,6.0


### Group By Index Levels

In [33]:
# Two-level column index: country (`cty`) on the outer level, `tenor` on the inner level.
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],
                                    [1, 3, 5, 1, 3]], names=['cty', 'tenor'])

# Draw the sample values from a locally seeded generator so that a
# Restart & Run All reproduces exactly the same frame.
hier_df = pd.DataFrame(np.random.RandomState(42).randn(4, 5), columns=columns)

print(hier_df)

# Count the columns per country for every row. Grouping along the columns with
# groupby(axis=1) is deprecated in recent pandas, so transpose, group the
# (now row) level `cty`, and transpose the result back.
cty_counts = hier_df.T.groupby(level='cty').count().T
cty_counts

cty          US                            JP          
tenor         1         3         5         1         3
0     -0.959323  1.091423 -0.225822 -1.311137 -0.561703
1      0.575470  0.335324  1.148607 -1.831557  1.209336
2      0.334796  0.280211 -1.570277 -1.183617 -0.101507
3     -0.625042 -0.303548  1.206250  1.630169  0.198739


cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3
