# CHAPTER 10 - DATA AGGREGATION AND GROUP OPERATIONS

In [2]:
import pandas as pd
import numpy as np 

In [3]:
# Demo frame used throughout: two label columns to group on (key1, key2)
# and two columns of standard-normal draws to aggregate (data1, data2).
# The draws are not seeded in this cell, so the numbers differ between runs.
df = pd.DataFrame(
    dict(
        key1=list('aabba'),
        key2=['one', 'two', 'one', 'two', 'one'],
        data1=np.random.randn(5),
        data2=np.random.randn(5),
    )
)

In [4]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,1.409084,-0.549369
1,a,two,-0.564068,0.914947
2,b,one,0.461127,0.419971
3,b,two,-0.142463,-1.337168
4,a,one,-0.214141,-0.392309


In [12]:
# Split the data1 column into groups labelled by the values of key1.
# The result is a SeriesGroupBy object, not a computed value; an aggregation
# such as .mean() has to be called on it to get numbers back.
grouped = df['data1'].groupby(df['key1'])

In [13]:
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f47e5123f40>

In [15]:
grouped.mean()

key1
a    0.210292
b    0.159332
Name: data1, dtype: float64

In [17]:
# Mean of data1 for every (key1, key2) combination; passing two key Series
# yields a result indexed by a two-level (key1, key2) index.
means = (
    df['data1']
    .groupby([df['key1'], df['key2']])
    .mean()
)

In [18]:
means

key1  key2
a     one     0.597472
      two    -0.564068
b     one     0.461127
      two    -0.142463
Name: data1, dtype: float64

In [19]:
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.597472,-0.564068
b,0.461127,-0.142463


In [20]:
# Per-key1 mean of the numeric columns. key2 holds strings, so it must be
# excluded explicitly: pandas >= 2.0 no longer drops non-numeric columns
# silently and raises a TypeError when asked to average them.
df.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.210292,-0.00891
b,0.159332,-0.458599


In [22]:
df.groupby(['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.597472,-0.470839
a,two,-0.564068,0.914947
b,one,0.461127,0.419971
b,two,-0.142463,-1.337168


In [23]:
df.groupby(['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

## ITERATING OVER GROUPS

In [25]:
# Iterating a GroupBy yields (group label, rows belonging to that group).
for label, rows in df.groupby('key1'):
    print(label, rows, sep='\n')

a
  key1 key2     data1     data2
0    a  one  1.409084 -0.549369
1    a  two -0.564068  0.914947
4    a  one -0.214141 -0.392309
b
  key1 key2     data1     data2
2    b  one  0.461127  0.419971
3    b  two -0.142463 -1.337168


In [27]:
# With several grouping keys the label is a tuple, one entry per key.
for key_pair, rows in df.groupby(['key1', 'key2']):
    first_key, second_key = key_pair
    print(first_key, second_key)
    print(rows)

a one
  key1 key2     data1     data2
0    a  one  1.409084 -0.549369
4    a  one -0.214141 -0.392309
a two
  key1 key2     data1     data2
1    a  two -0.564068  0.914947
b one
  key1 key2     data1     data2
2    b  one  0.461127  0.419971
b two
  key1 key2     data1     data2
3    b  two -0.142463 -1.337168


In [29]:
# Materialise the groups as a plain dict: key1 value -> sub-DataFrame.
pieces = {label: rows for label, rows in df.groupby('key1')}

In [31]:
pieces['a']

Unnamed: 0,key1,key2,data1,data2
0,a,one,1.409084,-0.549369
1,a,two,-0.564068,0.914947
4,a,one,-0.214141,-0.392309


In [32]:
type(pieces['a'])

pandas.core.frame.DataFrame

In [33]:
df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [34]:
# Group the *columns* (axis=1) by their dtype rather than grouping rows;
# this rebinds the name `grouped` used by earlier cells.
# NOTE(review): recent pandas releases appear to deprecate axis=1 in groupby —
# confirm against the installed version. Grouping the transposed frame
# (df.T.groupby(df.dtypes)) is the usual replacement, but the cell that prints
# each group would then receive transposed pieces and need adjusting too.
grouped = df.groupby(df.dtypes, axis=1)

In [38]:
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f47e448e100>

In [39]:
# Each step yields a dtype and the block of columns that share it.
for col_dtype, cols in grouped:
    print(col_dtype, cols, sep='\n')

float64
      data1     data2
0  1.409084 -0.549369
1 -0.564068  0.914947
2  0.461127  0.419971
3 -0.142463 -1.337168
4 -0.214141 -0.392309
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


## SELECTING A COLUMN OR SUBSET OF COLUMNS

In [42]:
df.groupby('key1')['data1']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f47e47a5d60>

In [43]:
df.groupby('key1')[['data1']]

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f47e47a5610>

In [44]:
df.groupby('key1')['data1'].mean()

key1
a    0.210292
b    0.159332
Name: data1, dtype: float64

In [45]:
df.groupby('key1')[['data1']].mean()

Unnamed: 0_level_0,data1
key1,Unnamed: 1_level_1
a,0.210292
b,0.159332


In [46]:
df.groupby('key1')[['data1']].head()

Unnamed: 0,data1
0,1.409084
1,-0.564068
2,0.461127
3,-0.142463
4,-0.214141


In [48]:
df['data1'].groupby(df['key1'])

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f47e4d11e80>

In [49]:
df[['data2']].groupby(df['key1'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f47e48fcbb0>

In [51]:
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-0.470839
a,two,0.914947
b,one,0.419971
b,two,-1.337168


## GROUPING WITH DICTS AND SERIES

In [53]:
# 5x5 frame of standard-normal draws: one row per person, columns a-e.
people = pd.DataFrame(
    np.random.randn(5, 5),
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
    columns=list('abcde'),
)

In [54]:
people

Unnamed: 0,a,b,c,d,e
Joe,-1.376942,0.593537,-0.163805,1.215964,1.114112
Steve,-1.68973,1.014497,3.591184,-0.190666,1.267457
Wes,0.632058,1.069136,-0.499502,0.976368,-0.255175
Jim,0.17521,-1.033968,0.235336,-0.259195,-0.154208
Travis,0.441842,2.140923,-1.207755,-1.127811,1.491529


In [57]:
# Mutates `people` in place: the slice 2:3 selects only the row at position 2,
# and [1, 2] selects the second and third columns, so exactly two cells
# become NaN.
people.iloc[2:3, [1, 2]] = np.nan # Add a few NA values

In [58]:
people

Unnamed: 0,a,b,c,d,e
Joe,-1.376942,0.593537,-0.163805,1.215964,1.114112
Steve,-1.68973,1.014497,3.591184,-0.190666,1.267457
Wes,0.632058,,,0.976368,-0.255175
Jim,0.17521,-1.033968,0.235336,-0.259195,-0.154208
Travis,0.441842,2.140923,-1.207755,-1.127811,1.491529


In [56]:
# Column label -> colour group. 'f' matches no column of `people`
# (its columns are a-e), so that entry is simply never used when grouping.
mapping = {
    'a': 'red',
    'b': 'red',
    'c': 'blue',
    'd': 'blue',
    'e': 'red',
    'f': 'orange',
}

In [71]:
# Sum the columns of `people` within each colour group given by `mapping`.
# groupby(..., axis=1) is deprecated in recent pandas, so group the rows of
# the transposed frame instead and transpose back: the result is the same
# table, with people as rows and colours as columns.
people.T.groupby(mapping).sum().T

Unnamed: 0,blue,red
Joe,1.052158,0.330707
Steve,3.400518,0.592224
Wes,0.976368,0.376883
Jim,-0.02386,-1.012966
Travis,-2.335566,4.074295


In [73]:
map_series = pd.Series(mapping)

In [74]:
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [75]:
# Count the non-NA values per person within each colour group.
# A Series works as the column -> group mapping just like the dict does.
# groupby(..., axis=1) is deprecated in recent pandas, so group the
# transposed frame and transpose the result back.
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


## GROUPING WITH FUNCTIONS

In [77]:
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,-0.569673,-0.440432,0.07153,1.933136,0.704729
5,-1.68973,1.014497,3.591184,-0.190666,1.267457
6,0.441842,2.140923,-1.207755,-1.127811,1.491529


In [78]:
# One label per row of `people`, to be combined with a function key.
key_list = ['one'] * 3 + ['two'] * 2

In [79]:
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-1.376942,0.593537,-0.163805,0.976368,-0.255175
3,two,0.17521,-1.033968,0.235336,-0.259195,-0.154208
5,one,-1.68973,1.014497,3.591184,-0.190666,1.267457
6,two,0.441842,2.140923,-1.207755,-1.127811,1.491529


## GROUPING BY INDEX LEVELS 

In [86]:
# Two-level column index: outer level 'city', inner level 'tenor'.
columns = pd.MultiIndex.from_arrays(
    [
        ['US'] * 3 + ['JP'] * 2,
        [1, 3, 5, 1, 3],
    ],
    names=['city', 'tenor'],
)

In [87]:
hier_df = pd.DataFrame(np.random.randn(4,5), columns=columns)

In [88]:
hier_df

city,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-1.998446,0.726953,0.590445,-0.073148,-0.055749
1,0.663994,-0.372361,0.113784,-0.361384,0.042564
2,-1.045035,-0.480192,1.320869,-0.692187,1.338442
3,-1.005002,1.138283,-0.454171,1.555592,-0.048439


In [89]:
# Count the non-NA values per row within each 'city' level of the columns.
# groupby(..., axis=1) is deprecated in recent pandas, so group the
# transposed frame on its 'city' index level and transpose back. The output
# keeps the original rows, with one column per city.
hier_df.T.groupby(level='city').count().T

city,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


## DATA AGGREGATION

In [90]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,1.409084,-0.549369
1,a,two,-0.564068,0.914947
2,b,one,0.461127,0.419971
3,b,two,-0.142463,-1.337168
4,a,one,-0.214141,-0.392309


In [91]:
grouped = df.groupby('key1')

In [92]:
grouped['data1'].quantile(0.9)

key1
a    1.084439
b    0.400768
Name: data1, dtype: float64

In [93]:
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum.

    ``arr`` may be any object that provides ``.max()`` and ``.min()``
    whose results support subtraction.
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

In [94]:
# Apply the custom aggregation to the numeric columns only. `grouped` still
# contains the string column key2, and peak_to_peak subtracts values, which
# fails for strings; pandas >= 2.0 raises instead of silently dropping such
# columns, so select data1/data2 explicitly.
grouped[['data1', 'data2']].agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.973152,1.464316
b,0.60359,1.75714


In [98]:
df.data1.max()

1.4090842914958823

In [99]:
df.data1.min()

-0.5640675671123234

In [103]:
grouped.describe()

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,0.210292,1.052825,-0.564068,-0.389104,-0.214141,0.597472,1.409084,3.0,-0.00891,0.803929,-0.549369,-0.470839,-0.392309,0.261319,0.914947
b,2.0,0.159332,0.426802,-0.142463,0.008435,0.159332,0.310229,0.461127,2.0,-0.458599,1.242485,-1.337168,-0.897884,-0.458599,-0.019314,0.419971
