# Data Aggregation and Group Operations

In [2]:
import pandas as pd 
import numpy as np

## GroupBy mechanics

In [6]:
# Small demo frame: two string key columns plus two random numeric columns.
# A seeded RandomState makes the random columns (and everything derived from
# them) reproducible across Restart & Run All.
# NOTE: outputs saved from an earlier, unseeded run will not match these
# values until the notebook is re-executed.
rng = np.random.RandomState(0)

df = pd.DataFrame({
    'key1': list('aabba'),
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'balance': rng.randn(5) * 10,
    'income': rng.randn(5) + 2,
})

df

Unnamed: 0,balance,income,key1,key2
0,-3.570696,0.608132,a,one
1,12.863333,3.596167,a,two
2,5.253448,1.454775,b,one
3,6.851577,2.950287,b,two
4,-3.649933,4.120578,a,one


In [7]:
# Column means of the numeric columns only. key1/key2 hold strings, and
# recent pandas versions raise a TypeError instead of silently skipping them.
df.mean(numeric_only=True)

balance    3.549546
income     2.545988
dtype: float64

In [14]:
# Per-key1 means of the numeric columns. key2 is a string column that is not
# a grouping key, so it must be excluded explicitly: recent pandas versions
# raise a TypeError rather than dropping it silently.
means = df.groupby('key1').mean(numeric_only=True)
means

Unnamed: 0_level_0,balance,income
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.880901,2.774959
b,6.052513,2.202531


In [12]:
# Mean balance per key1 value: take the column first, then group it by the
# key1 labels. The result is a Series named 'balance' indexed by key1.
mean_key1 = df['balance'].groupby(df['key1']).mean()
mean_key1

key1
a    1.880901
b    6.052513
Name: balance, dtype: float64

In [13]:
# Label-based lookup of the mean balance for group 'a'.
mean_key1.loc['a']

1.8809013841247915

In [17]:
# Show the per-key1 means again for reference before indexing into them.
means

Unnamed: 0_level_0,balance,income
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.880901,2.774959
b,6.052513,2.202531


In [16]:
# Single label-based lookup (row 'a', column 'balance') instead of two
# chained [] selections.
means.loc['a', 'balance']

1.8809013841247915

In [19]:
# Grouping by two columns gives a result indexed by (key1, key2).
df.groupby(by=['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,balance,income
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-3.610314,2.364355
a,two,12.863333,3.596167
b,one,5.253448,1.454775
b,two,6.851577,2.950287


In [22]:
# Several aggregations at once: every numeric column gets a 'mean' and a
# 'count' sub-column in the result.
by_both_keys = df.groupby(['key1', 'key2'])
by_both_keys.agg(['mean', 'count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,balance,balance,income,income
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count,mean,count
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,one,-3.610314,2,2.364355,2
a,two,12.863333,1,3.596167,1
b,one,5.253448,1,1.454775,1
b,two,6.851577,1,2.950287,1


In [28]:
# Show the full frame again before the custom-aggregation example.
df

Unnamed: 0,balance,income,key1,key2
0,-3.570696,0.608132,a,one
1,12.863333,3.596167,a,two
2,5.253448,1.454775,b,one
3,6.851577,2.950287,b,two
4,-3.649933,4.120578,a,one


We can provide arbitrary functions to the .agg() method of groupby objects:

In [30]:
def total_chars(labels):
    """Return the combined character count of all strings in ``labels``."""
    return labels.str.len().sum()


# Any callable that reduces a group's Series to a scalar can be passed to .agg().
df.groupby('key1')['key2'].agg(total_chars)

key1
a    9
b    6
Name: key2, dtype: int64

### Iterating over groups

In [34]:
# Iterating a GroupBy yields (group key, sub-frame) pairs.
for key, frame in df.groupby('key1'):
    print(key)
    print(frame)

a
     balance    income key1 key2
0  -3.570696  0.608132    a  one
1  12.863333  3.596167    a  two
4  -3.649933  4.120578    a  one
b
    balance    income key1 key2
2  5.253448  1.454775    b  one
3  6.851577  2.950287    b  two


In [39]:
# Materialize the (group key, sub-frame) pairs as a list of tuples.
[(key, frame) for key, frame in df.groupby('key1')]

[('a',      balance    income key1 key2
  0  -3.570696  0.608132    a  one
  1  12.863333  3.596167    a  two
  4  -3.649933  4.120578    a  one), ('b',     balance    income key1 key2
  2  5.253448  1.454775    b  one
  3  6.851577  2.950287    b  two)]

In [40]:
# Map each group key to its sub-frame by iterating the (key, sub-frame) pairs.
{key: frame for key, frame in df.groupby('key1')}

{'a':      balance    income key1 key2
 0  -3.570696  0.608132    a  one
 1  12.863333  3.596167    a  two
 4  -3.649933  4.120578    a  one, 'b':     balance    income key1 key2
 2  5.253448  1.454775    b  one
 3  6.851577  2.950287    b  two}

## Data aggregation

In [42]:
import requests

# Fetch the tips dataset used in the rest of this section and save it locally.
url = 'https://raw.githubusercontent.com/wesm/pydata-book/1st-edition/ch08/tips.csv'
# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of saving an error page as tips.csv.
response.raise_for_status()

# The context manager closes the file even if the write raises.
with open('tips.csv', 'wb') as out_file:
    out_file.write(response.content)

In [43]:
# Load the downloaded CSV and preview its first rows.
tips = pd.read_csv('tips.csv')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


#### Exercise

Extract the amount of tip expressed as % of the bill. Compare the average value for men and women. Are they substantially different? What values would you need to calculate a p-value, assuming a normal distribution?

In [45]:
# Tip as a percentage of the bill: scale the tip by 100, then divide by the
# bill total, row by row.
tips['tip_pct'] = tips['tip'].mul(100).div(tips['total_bill'])
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,5.944673
1,10.34,1.66,Male,No,Sun,Dinner,3,16.054159
2,21.01,3.5,Male,No,Sun,Dinner,3,16.658734
3,23.68,3.31,Male,No,Sun,Dinner,2,13.978041
4,24.59,3.61,Female,No,Sun,Dinner,4,14.680765


In [47]:
# Average tip percentage for each sex.
tips['tip_pct'].groupby(tips['sex']).mean()

sex
Female    16.649074
Male      15.765055
Name: tip_pct, dtype: float64

In [49]:
# Mean, standard deviation and sample size of the tip percentage per sex.
tip_pct_by_sex = tips.groupby('sex')['tip_pct']
tip_pct_by_sex.agg(['mean', 'std', 'count'])

Unnamed: 0_level_0,mean,std,count
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,16.649074,5.363173,87
Male,15.765055,6.477787,157


In [54]:
# Per-(key1, key2) means, kept in a variable for the reshaping cells below.
key_columns = ['key1', 'key2']
stacked = df.groupby(key_columns).mean()
stacked

Unnamed: 0_level_0,Unnamed: 1_level_0,balance,income
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-3.610314,2.364355
a,two,12.863333,3.596167
b,one,5.253448,1.454775
b,two,6.851577,2.950287


In [56]:
# Move the key1 index level into the columns, leaving key2 as the row index.
stacked.unstack(level='key1')

Unnamed: 0_level_0,balance,balance,income,income
key1,a,b,a,b
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,-3.610314,5.253448,2.364355,1.454775
two,12.863333,6.851577,3.596167,2.950287


In [59]:
# After unstack('key1') the only remaining row level is key2, so that is the
# level to unstack next; 'balance' is a column label, not an index level, and
# pandas versions that validate the level name reject it. With a single-level
# row index the result is a Series indexed by (column name, key1, key2).
stacked.unstack('key1').unstack('key2')

         key1  key2
balance  a     one     -3.610314
               two     12.863333
         b     one      5.253448
               two      6.851577
income   a     one      2.364355
               two      3.596167
         b     one      1.454775
               two      2.950287
dtype: float64

In [66]:
# Spread every other column across the values of 'sex'. The row index is
# kept, so each row has values under only one of the Female/Male sub-columns
# and NaN under the other.
tips.pivot(columns='sex')

Unnamed: 0_level_0,total_bill,total_bill,tip,tip,smoker,smoker,day,day,time,time,size,size,tip_pct,tip_pct
sex,Female,Male,Female,Male,Female,Male,Female,Male,Female,Male,Female,Male,Female,Male
0,16.99,,1.01,,No,,Sun,,Dinner,,2.0,,5.944673,
1,,10.34,,1.66,,No,,Sun,,Dinner,,3.0,,16.054159
2,,21.01,,3.50,,No,,Sun,,Dinner,,3.0,,16.658734
3,,23.68,,3.31,,No,,Sun,,Dinner,,2.0,,13.978041
4,24.59,,3.61,,No,,Sun,,Dinner,,4.0,,14.680765,
5,,25.29,,4.71,,No,,Sun,,Dinner,,4.0,,18.623962
6,,8.77,,2.00,,No,,Sun,,Dinner,,2.0,,22.805017
7,,26.88,,3.12,,No,,Sun,,Dinner,,4.0,,11.607143
8,,15.04,,1.96,,No,,Sun,,Dinner,,2.0,,13.031915
9,,14.78,,3.23,,No,,Sun,,Dinner,,2.0,,21.853857


### Column-wise and multiple function application

## Group-wise operations and transformations

### Apply: General split-apply-combine

#### Suppressing the group keys

### Quantile and bucket analysis

### Example: Filling missing values with group-specific values

## Pivot tables and Cross-tabulation