# 10.1 GroupBy Mechanics

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Build the small example frame used throughout this section: two string
# key columns and two columns of standard-normal noise.
# A locally seeded generator keeps the values identical on every
# Restart & Run All without touching NumPy's global random state.
# NOTE: the outputs recorded in this notebook predate the seed; re-run to refresh them.
rng = np.random.RandomState(42)
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'three', 'one'],
    'd1': rng.randn(6),  # drawn first
    'd2': rng.randn(6),  # drawn second, from the same generator
})
df

Unnamed: 0,key1,key2,d1,d2
0,a,one,-0.314729,0.926007
1,a,two,1.281365,-0.614541
2,b,one,0.017899,-0.274321
3,b,two,0.88721,-0.692912
4,a,three,-0.905074,-2.547363
5,a,one,1.500063,0.52222


In [3]:
# Pull the key columns out once; they are reused as grouping keys below.
key1_col = df['key1']
key2_col = df['key2']

# Series grouped by a single key column (still un-aggregated GroupBy objects).
grouped = df['d1'].groupby(key1_col)
grouped1 = df['d2'].groupby(key2_col)

# Grouping by two keys and aggregating right away: grouped2 holds the
# result of .mean(), not a GroupBy object.
grouped2 = df['d1'].groupby([key1_col, key2_col]).mean()

# The whole frame grouped by key1.
grouped4 = df.groupby(key1_col)


In [4]:
# Mean of d1 within each key1 group.
grouped.mean()

key1
a    0.390406
b    0.452555
Name: d1, dtype: float64

In [5]:
# Mean of d2 within each key2 group.
grouped1.mean()

key2
one      0.391302
three   -2.547363
two     -0.653727
Name: d2, dtype: float64

In [6]:
# grouped2 was aggregated when it was created, so it displays directly:
# mean of d1 for each (key1, key2) pair.
grouped2

key1  key2 
a     one      0.592667
      three   -0.905074
      two      1.281365
b     one      0.017899
      two      0.887210
Name: d1, dtype: float64

In [8]:
# Mean of the numeric columns (d1, d2) within each key1 group.
# grouped4 groups the whole frame, which still contains the string column
# key2; numeric_only=True excludes it explicitly instead of relying on it
# being dropped silently (newer pandas raises TypeError otherwise).
grouped4.mean(numeric_only=True)

Unnamed: 0_level_0,d1,d2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.390406,-0.428419
b,0.452555,-0.483617


In [7]:
# Print the full frame, a gap of blank lines, then one sub-frame per
# (key1, key2) combination.
print(df)
print('\n\n\n')  # same four newlines as two consecutive print('\n') calls
for (_k1, _k2), subframe in df.groupby(['key1', 'key2']):
    print(subframe)

  key1   key2        d1        d2
0    a    one -0.314729  0.926007
1    a    two  1.281365 -0.614541
2    b    one  0.017899 -0.274321
3    b    two  0.887210 -0.692912
4    a  three -0.905074 -2.547363
5    a    one  1.500063  0.522220




  key1 key2        d1        d2
0    a  one -0.314729  0.926007
5    a  one  1.500063  0.522220
  key1   key2        d1        d2
4    a  three -0.905074 -2.547363
  key1 key2        d1        d2
1    a  two  1.281365 -0.614541
  key1 key2        d1        d2
2    b  one  0.017899 -0.274321
  key1 key2       d1        d2
3    b  two  0.88721 -0.692912


In [9]:
# Materialize the groups as a plain dict: key1 value -> sub-frame of its rows.
pieces = {key: frame for key, frame in df.groupby('key1')}

In [14]:
# Sub-frame containing the rows whose key1 is 'a'.
pieces['a']

Unnamed: 0,key1,key2,d1,d2
0,a,one,-0.314729,0.926007
1,a,two,1.281365,-0.614541
4,a,three,-0.905074,-2.547363
5,a,one,1.500063,0.52222


In [12]:
# Sub-frame containing the rows whose key1 is 'b'.
pieces['b']

Unnamed: 0,key1,key2,d1,d2
2,b,one,0.017899,-0.274321
3,b,two,0.88721,-0.692912


In [15]:
# By default groupby groups rows; we can also group the columns of our example df, here by dtype

In [17]:
# One dtype per column; this Series is used as the column-grouping key.
df.dtypes

key1     object
key2     object
d1      float64
d2      float64
dtype: object

In [21]:
# Group the columns (axis=1) by their dtype rather than grouping rows.
# NOTE(review): groupby(..., axis=1) is reported as deprecated in pandas 2.1+;
# the documented alternative is to operate on the transposed frame
# (e.g. df.T.groupby(df.dtypes)) — confirm against the pandas version in use.
groupByType = df.groupby(df.dtypes, axis=1)

In [23]:
# Displaying the GroupBy object shows only its repr, not the grouped data.
groupByType

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x7fe110b8df98>

In [26]:
# Walk the dtype groups: print each dtype followed by the columns of that dtype.
for column_dtype, columns_frame in groupByType:
    print(column_dtype, columns_frame, sep='\n')

float64
         d1        d2
0 -0.314729  0.926007
1  1.281365 -0.614541
2  0.017899 -0.274321
3  0.887210 -0.692912
4 -0.905074 -2.547363
5  1.500063  0.522220
object
  key1   key2
0    a    one
1    a    two
2    b    one
3    b    two
4    a  three
5    a    one


In [28]:
# Group the frame by key1, select only d1, then average within each group.
df.groupby('key1')['d1'].mean()

key1
a    0.390406
b    0.452555
Name: d1, dtype: float64

In [31]:
# Mean of d1 and d2 for each (key1, key2) pair.
# Several columns must be selected with a list (double brackets); the bare
# 'd1','d2' form is a tuple of keys, which newer pandas rejects.
df.groupby(['key1', 'key2'])[['d1', 'd2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,d1,d2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.592667,0.724114
a,three,-0.905074,-2.547363
a,two,1.281365,-0.614541
b,one,0.017899,-0.274321
b,two,0.88721,-0.692912


In [32]:
# Mean of every non-key column (d1, d2) for each (key1, key2) pair.
df.groupby(['key1','key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,d1,d2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.592667,0.724114
a,three,-0.905074,-2.547363
a,two,1.281365,-0.614541
b,one,0.017899,-0.274321
b,two,0.88721,-0.692912


In [40]:
# Redisplay the example frame as a reference for the iloc examples.
df

Unnamed: 0,key1,key2,d1,d2
0,a,one,-0.314729,0.926007
1,a,two,1.281365,-0.614541
2,b,one,0.017899,-0.274321
3,b,two,0.88721,-0.692912
4,a,three,-0.905074,-2.547363
5,a,one,1.500063,0.52222


In [55]:
# iloc takes [row positions, column positions].
# Rows: the slice 2:3 selects only the row at position 2; the list [2] selects the same row.
# Columns: [0,1,2,3] lists the column positions to return, in that order.

df.iloc[2:3,[0,1,2,3]]

Unnamed: 0,key1,key2,d1,d2
2,b,one,0.017899,-0.274321


In [58]:
# Rows at positions 0 and 1; the column slice 1:2 keeps only the column at position 1 (key2).
df.iloc[0:2,1:2]

Unnamed: 0,key2
0,one
1,two


In [59]:
# Rows at positions 0 and 1; columns at positions 1 and 2 (key2 and d1).
df.iloc[0:2,[1,2]]

Unnamed: 0,key2,d1
0,one,-0.314729
1,two,1.281365


In [60]:
# Grouping with Dicts and Series

In [69]:
# 5x5 frame of standard-normal noise with labelled rows and columns.
# The values come from a locally seeded generator so they are identical on
# every Restart & Run All, without touching NumPy's global random state.
# NOTE: the outputs recorded in this notebook predate the seed; re-run to refresh them.
people = pd.DataFrame(np.random.RandomState(0).randn(5, 5),
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['J', 'B', 'K', 'L', 'T'])

In [70]:
# Show the freshly generated frame.
people

Unnamed: 0,a,b,c,d,e
J,-2.802772,1.261315,0.548142,-0.165449,-0.189826
B,0.440735,-1.969659,-1.33168,-0.686631,-0.742898
K,-0.927421,1.248178,0.351582,0.302856,-0.968317
L,1.9784,0.706695,0.111392,0.626279,0.417228
T,-0.333,2.493726,-0.224362,-0.589751,0.043003


In [71]:
# Introduce missing values: the row at position 2, columns at positions 1 and 2.
# This modifies `people` in place.
nan_rows = slice(2, 3)
nan_cols = [1, 2]
people.iloc[nan_rows, nan_cols] = np.nan

In [72]:
# Display the frame again; row K now has NaN in columns b and c.
people

Unnamed: 0,a,b,c,d,e
J,-2.802772,1.261315,0.548142,-0.165449,-0.189826
B,0.440735,-1.969659,-1.33168,-0.686631,-0.742898
K,-0.927421,,,0.302856,-0.968317
L,1.9784,0.706695,0.111392,0.626279,0.417228
T,-0.333,2.493726,-0.224362,-0.589751,0.043003


In [73]:
# Now, suppose I have a group correspondence for the columns and want to sum
# together the columns by group

In [74]:
# Column label -> group name. 'f' is not a column of `people`; it is
# included on purpose as an unused grouping key.
mapping = {
    'a': 'red',
    'b': 'green',
    'c': 'blue',
    'd': 'yellow',
    'e': 'orange',
    'f': 'maroon',
}

In [75]:
# Show the column -> group correspondence.
mapping

{'a': 'red',
 'b': 'green',
 'c': 'blue',
 'd': 'yellow',
 'e': 'orange',
 'f': 'maroon'}

In [78]:
# We could build an array from this dict and pass that to groupby, but the
# dict can be passed directly. The key 'f' is included to show that unused
# grouping keys are OK.
# NOTE(review): groupby(..., axis=1) is reported as deprecated in pandas 2.1+;
# the documented alternative is to operate on the transposed frame
# (e.g. people.T.groupby(mapping)) — confirm against the pandas version in use.
byColumn = people.groupby(mapping, axis=1)


In [87]:
# Sum, per row, the columns that map to the same group name.
# In the recorded output the NaN cells of row K appear as 0.0 in their groups' sums.
byColumn.sum()

Unnamed: 0,blue,green,orange,red,yellow
J,0.548142,1.261315,-0.189826,-2.802772,-0.165449
B,-1.33168,-1.969659,-0.742898,0.440735,-0.686631
K,0.0,0.0,-0.968317,-0.927421,0.302856
L,0.111392,0.706695,0.417228,1.9784,0.626279
T,-0.224362,2.493726,0.043003,-0.333,-0.589751


In [82]:
# The same correspondence as a Series: index = column label, value = group name.
map_series = pd.Series(mapping)

In [83]:
# Show the Series form of the mapping.
map_series

a       red
b     green
c      blue
d    yellow
e    orange
f    maroon
dtype: object

In [84]:
# A Series works as a grouping key just like the dict: columns are grouped
# by the value the Series holds for their label.
# NOTE(review): groupby(..., axis=1) is reported as deprecated in pandas 2.1+;
# the documented alternative is to operate on the transposed frame
# (e.g. people.T.groupby(map_series)) — confirm against the pandas version in use.
bySeries = people.groupby(map_series, axis=1)

In [86]:
# Per-row sums of the columns in each group; the recorded output matches byColumn.sum().
bySeries.sum()

Unnamed: 0,blue,green,orange,red,yellow
J,0.548142,1.261315,-0.189826,-2.802772,-0.165449
B,-1.33168,-1.969659,-0.742898,0.440735,-0.686631
K,0.0,0.0,-0.968317,-0.927421,0.302856
L,0.111392,0.706695,0.417228,1.9784,0.626279
T,-0.224362,2.493726,0.043003,-0.333,-0.589751
