## CHAPTER 9
# Data Aggregation and Group Operations
---
## GroupBy

In [1]:
%pylab inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Toy frame for the GroupBy examples: two string key columns and two columns
# of standard-normal draws.
df = pd.DataFrame(dict(
    key1=['a', 'a', 'b', 'b', 'a'],
    key2=['one', 'two', 'one', 'two', 'one'],
    data1=np.random.randn(5),
    data2=np.random.randn(5),
))
display(df)

Unnamed: 0,key1,key2,data1,data2
0,a,one,1.307764,-0.313347
1,a,two,-0.496095,1.818239
2,b,one,0.876325,0.259446
3,b,two,0.078158,0.544423
4,a,one,-0.788753,0.345352


In [3]:
# Per-key1 mean of the numeric columns. key2 holds strings, so restrict the
# aggregation explicitly: pandas >= 2.0 no longer drops non-numeric columns
# silently and raises TypeError without numeric_only=True.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.007639,0.616748
b,0.477242,0.401934


In [4]:
# Per-key2 mean of the numeric columns. key1 holds strings, so restrict the
# aggregation explicitly: pandas >= 2.0 no longer drops non-numeric columns
# silently and raises TypeError without numeric_only=True.
df.groupby('key2').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key2,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0.465112,0.09715
two,-0.208968,1.181331


In [5]:
# Three spellings of "mean of data1 per key1 group". Only the last expression
# is rendered as the cell output; the first two are evaluated and discarded.
df.groupby('key1').data1.mean()     # attribute access on the GroupBy
df.groupby('key1')['data1'].mean()  # bracket access on the GroupBy
# Aggregating the whole frame first also touches the string column key2, so
# limit it to numeric columns (pandas >= 2.0 raises TypeError otherwise).
df.groupby('key1').mean(numeric_only=True).data1

key1
a    0.007639
b    0.477242
Name: data1, dtype: float64

In [6]:
# Grouping on both key columns gives one row per (key1, key2) pair.
df.groupby(by=['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.259505,0.016002
a,two,-0.496095,1.818239
b,one,0.876325,0.259446
b,two,0.078158,0.544423


In [7]:
# Aggregate first, then pick the data1 column out of the result.
df.groupby(by=['key1', 'key2']).mean()['data1']

key1  key2
a     one     0.259505
      two    -0.496095
b     one     0.876325
      two     0.078158
Name: data1, dtype: float64

In [8]:
# Group the data1 Series by the two key Series (passed as objects, not as
# column names) and keep the result for the unstack example that follows.
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means

key1  key2
a     one     0.259505
      two    -0.496095
b     one     0.876325
      two     0.078158
Name: data1, dtype: float64

In [9]:
# Pivot the innermost index level (key2) into columns.
means.unstack(level=-1)

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.259505,-0.496095
b,0.876325,0.078158


In [10]:
# Same grouping applied to the whole frame; note this rebinds `means` from a
# Series to a DataFrame.
means = df.groupby([df['key1'], df['key2']]).mean()
means

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.259505,0.016002
a,two,-0.496095,1.818239
b,one,0.876325,0.259446
b,two,0.078158,0.544423


In [11]:
# Pivot the innermost index level (key2) into a second column level.
means.unstack(level=-1)

Unnamed: 0_level_0,data1,data1,data2,data2
key2,one,two,one,two
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,0.259505,-0.496095,0.016002,1.818239
b,0.876325,0.078158,0.259446,0.544423


In [12]:
# Pivot the outermost index level (position 0, i.e. key1) into the columns.
means.unstack(0)

Unnamed: 0_level_0,data1,data1,data2,data2
key1,a,b,a,b
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,0.259505,0.876325,0.016002,0.259446
two,-0.496095,0.078158,1.818239,0.544423


In [13]:
# Move the group keys out of the index and back into ordinary columns.
means.reset_index(drop=False)

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.259505,0.016002
1,a,two,-0.496095,1.818239
2,b,one,0.876325,0.259446
3,b,two,0.078158,0.544423


In [14]:
# Use the two key columns as a hierarchical row index (no aggregation, so
# repeated key pairs stay as separate rows).
df.set_index(keys=['key1', 'key2'])

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,1.307764,-0.313347
a,two,-0.496095,1.818239
b,one,0.876325,0.259446
b,two,0.078158,0.544423
a,one,-0.788753,0.345352


In [15]:
# Round trip: push the key columns into the index, then pull them back out.
(
    df
    .set_index(['key1', 'key2'])
    .reset_index()
)

Unnamed: 0,key1,key2,data1,data2
0,a,one,1.307764,-0.313347
1,a,two,-0.496095,1.818239
2,b,one,0.876325,0.259446
3,b,two,0.078158,0.544423
4,a,one,-0.788753,0.345352


In [16]:
# Round-trip the key columns through the index, then use reindex(columns=...)
# to place the data columns ahead of the key columns.
(
    df
    .set_index(['key1', 'key2'])
    .reset_index()
    .reindex(columns=['data1', 'data2', 'key1', 'key2'])
)

Unnamed: 0,data1,data2,key1,key2
0,1.307764,-0.313347,a,one
1,-0.496095,1.818239,a,two
2,0.876325,0.259446,b,one
3,0.078158,0.544423,b,two
4,-0.788753,0.345352,a,one


In [17]:
# Work on a copy so the frame used by the other cells is left untouched.
df2 = df.copy()
# Blank out key1 in the first row. Address the cell by label instead of by
# position (iloc[0, 0]) so the example does not depend on column order.
df2.loc[0, 'key1'] = None
display(df2)
# Non-null count per (key1, key2) group; the row whose key1 is missing does
# not appear in any group.
df2.groupby(['key1', 'key2']).count()

Unnamed: 0,key1,key2,data1,data2
0,,one,1.307764,-0.313347
1,a,two,-0.496095,1.818239
2,b,one,0.876325,0.259446
3,b,two,0.078158,0.544423
4,a,one,-0.788753,0.345352


Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,1,1
a,two,1,1
b,one,1,1
b,two,1,1


In [18]:
# Number of rows in each (key1, key2) group.
df2.groupby(by=['key1', 'key2']).size()

key1  key2
a     one     1
      two     1
b     one     1
      two     1
dtype: int64

In [19]:
# Iterating a GroupBy yields (group key, sub-frame) pairs; with two grouping
# columns each key is a tuple. Print the key, its rows, then a blank line.
for key, d in df.groupby(['key1', 'key2']):
    print(key, d, '', sep='\n')

('a', 'one')
  key1 key2     data1     data2
0    a  one  1.307764 -0.313347
4    a  one -0.788753  0.345352

('a', 'two')
  key1 key2     data1     data2
1    a  two -0.496095  1.818239

('b', 'one')
  key1 key2     data1     data2
2    b  one  0.876325  0.259446

('b', 'two')
  key1 key2     data1     data2
3    b  two  0.078158  0.544423



In [20]:
# Materialise the groups as a dict keyed by the (key1, key2) tuple, then look
# one group up directly.
pieces = {
    group_key: group_frame
    for group_key, group_frame in df.groupby(['key1', 'key2'])
}
pieces[('a', 'one')]

Unnamed: 0,key1,key2,data1,data2
0,a,one,1.307764,-0.313347
4,a,one,-0.788753,0.345352


In [21]:
# Re-display the source frame for reference before the next example.
display(df)

Unnamed: 0,key1,key2,data1,data2
0,a,one,1.307764,-0.313347
1,a,two,-0.496095,1.818239
2,b,one,0.876325,0.259446
3,b,two,0.078158,0.544423
4,a,one,-0.788753,0.345352


In [22]:
# Selecting a column on the GroupBy and iterating gives (key, Series) pairs.
list(df.groupby('key1')['data1'])

[('a', 0    1.307764
  1   -0.496095
  4   -0.788753
  Name: data1, dtype: float64), ('b', 2    0.876325
  3    0.078158
  Name: data1, dtype: float64)]

In [23]:
# 5x5 frame of standard-normal draws, rows labelled by name and columns a-e.
people = pd.DataFrame(
    data=np.random.randn(5, 5),
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
    columns=list('abcde'),
)

display(people)

Unnamed: 0,a,b,c,d,e
Joe,1.548445,-0.443356,-0.437886,-0.976216,-0.646307
Steve,0.735658,0.539993,0.605227,-0.585851,1.455782
Wes,1.697668,0.813727,0.310437,-0.898215,-0.610357
Jim,0.166031,-0.700706,2.019843,-0.587366,-0.400785
Travis,0.01883,1.248799,-1.789377,-0.239658,-0.348106


In [24]:
# Blank out columns 'b' and 'c' for Wes (the row at position 2) with a single
# label-based assignment. The chained form people.iloc[2][['b','c']] = None
# writes into an intermediate Series; pandas does not guarantee that reaches
# `people` (SettingWithCopyWarning; a no-op under Copy-on-Write).
people.loc['Wes', ['b', 'c']] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,1.548445,-0.443356,-0.437886,-0.976216,-0.646307
Steve,0.735658,0.539993,0.605227,-0.585851,1.455782
Wes,1.697668,,,-0.898215,-0.610357
Jim,0.166031,-0.700706,2.019843,-0.587366,-0.400785
Travis,0.01883,1.248799,-1.789377,-0.239658,-0.348106


In [29]:
# One colour label per row of `people`, matched by position: grouping by a
# plain list splits the rows, and count() reports the non-null cells per
# column in each colour group.
mapping = ['red'] * 2 + ['blue'] * 2 + ['red']
people.groupby(by=mapping).count()

Unnamed: 0,a,b,c,d,e
blue,2,1,1,2,2
red,3,3,3,3,3


In [30]:
# A function passed as the grouper is called on each index label; here the
# rows are bucketed by the length of the person's name.
people.groupby(by=len).size()

3    3
5    1
6    1
dtype: int64

In [26]:
# Collect the (name length, sub-frame) pairs produced by grouping the rows on
# the length of their index label.
[(name_len, rows) for name_len, rows in people.groupby(len)]

[(3,             a         b         c         d         e
  Joe  1.548445 -0.443356 -0.437886 -0.976216 -0.646307
  Wes  1.697668       NaN       NaN -0.898215 -0.610357
  Jim  0.166031 -0.700706  2.019843 -0.587366 -0.400785),
 (5,               a         b         c         d         e
  Steve  0.735658  0.539993  0.605227 -0.585851  1.455782),
 (6,               a         b         c         d         e
  Travis  0.01883  1.248799 -1.789377 -0.239658 -0.348106)]

In [27]:
# Split the rows on whether column 'a' is positive, average each group, then
# relabel the boolean group keys: True -> 1, False -> -1.
(
    people
    .groupby(people['a'] > 0)
    .mean()
    .rename(index={True: 1, False: -1})
)

Unnamed: 0_level_0,a,b,c,d,e
a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.833326,0.161183,0.099452,-0.657461,-0.109955


In [28]:
# Frame with two-level column labels: (city, tensor). Note this rebinds `df`.
df = pd.DataFrame(np.random.randn(4, 5))
df.columns = [['US', 'US', 'US', 'JP', 'JP'], [1, 3, 5, 1, 3]]
df.columns.names = ['city', 'tensor']
display(df)
# Count, per row, the non-null values under each 'tensor' column label.
# groupby(..., axis=1) is deprecated, so group the transposed frame on that
# level and transpose back to get the same rows-by-tensor layout.
df.T.groupby(level='tensor').count().T

city,US,US,US,JP,JP
tensor,1,3,5,1,3
0,1.90599,-0.1932,0.212398,-0.923484,0.189768
1,0.88918,1.192897,-0.366767,-0.46493,0.526286
2,-0.339675,-1.579391,-0.982688,-1.539809,1.55769
3,-0.602548,-0.427592,1.469044,-0.555536,-0.97792


tensor,1,3,5
0,2,2,1
1,2,2,1
2,2,2,1
3,2,2,1
