# Data Aggregation and Group Operations

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Small demo frame: two categorical key columns plus two random numeric columns.
# A dedicated, seeded RandomState replaces the unseeded global np.random calls so
# that Restart & Run All reproduces the same numbers, without touching NumPy's
# global random state. Seed 42 yields both positive and negative data1 values,
# which the boolean-key groupby further down relies on to show two groups.
demo_rng = np.random.RandomState(42)
df = pd.DataFrame({
    'key1': list('aabba'),
    'key2': ['guitar', 'cello', 'guitar', 'cello', 'guitar'],
    'data1': demo_rng.randn(5) * 20,       # standard normal draws scaled by 20
    'data2': demo_rng.randn(5) * 10 + 10   # standard normal draws scaled by 10, shifted by 10
})

## GroupBy mechanics

In [3]:
# Split df by the values of its 'key1' column; the GroupBy object is reused
# by the cells below.
grouped = df.groupby(by='key1')
grouped

<pandas.core.groupby.DataFrameGroupBy object at 0x7f032ff2a898>

In [4]:
# Per-group mean of the numeric columns. numeric_only=True is passed explicitly
# because pandas >= 2.0 no longer silently drops non-numeric columns such as
# 'key2' and raises a TypeError instead.
grouped.mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,13.609793,9.624975
b,-2.344541,7.362434


In [5]:
# count() tallies the non-missing entries of every column per group,
# so missing (NaN) values are not counted.
grouped.count()

Unnamed: 0_level_0,data1,data2,key2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,3,3,3
b,2,2,2


In [6]:
# Per-group standard deviation. The numeric columns are selected explicitly:
# recent pandas raises on the string column 'key2' instead of dropping it, and
# column selection (unlike a numeric_only keyword on std) works on old and new
# pandas versions alike.
grouped[['data1', 'data2']].std()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,27.459803,12.811135
b,2.725247,4.049644


In [7]:
# Group on both key columns at once; the means are indexed by (key1, key2).
combined_means = df.groupby(by=['key1', 'key2']).mean()
combined_means

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,cello,5.934396,14.815216
a,guitar,17.447492,7.029855
b,cello,-4.271582,10.225965
b,guitar,-0.417501,4.498903


In [8]:
# A tuple selects one row of the two-level (key1, key2) index; the row comes
# back as a Series of that group's column means.
combined_means.loc[('a', 'cello')]

data1     5.934396
data2    14.815216
Name: (a, cello), dtype: float64

### Iterating over groups

In [9]:
# The GroupBy object built from df.groupby('key1'); the next cell iterates over it.
grouped

<pandas.core.groupby.DataFrameGroupBy object at 0x7f032ff2a898>

In [10]:
# Iterating over the GroupBy yields one (key, sub-frame) pair per group.
for key, data in grouped:
    # One call prints the key, then the frame on the next line, followed by a
    # trailing space and a blank line after each group.
    print(key, data, sep='\n', end=' \n\n')

a
       data1      data2 key1    key2
0  -9.195650  -4.966863    a  guitar
1   5.934396  14.815216    a   cello
4  44.090633  19.026572    a  guitar 

b
      data1      data2 key1    key2
2 -0.417501   4.498903    b  guitar
3 -4.271582  10.225965    b   cello 



In [11]:
# Group rows by whether data1 is positive. The string columns key1/key2 are
# still part of the frame here, so numeric_only=True is required on
# pandas >= 2.0, where non-numeric columns are no longer dropped silently.
df.groupby(df['data1'] > 0).mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
data1,Unnamed: 1_level_1,Unnamed: 2_level_1
False,-4.628244,3.252669
True,25.012514,16.920894


In [12]:
import random

# A private, seeded generator keeps the sample reproducible across kernel
# restarts without reseeding the module-level `random` state.
zoo_rng = random.Random(42)

# 20 random animal sightings, each with a head count between 3 and 67 inclusive.
zoo = pd.DataFrame({
    'animal': zoo_rng.choices(['mouse', 'mantis', 'giraffes', 'gophers'], k=20),
    'number': [zoo_rng.randint(3, 67) for _ in range(20)]
})

zoo

Unnamed: 0,animal,number
0,mouse,33
1,gophers,30
2,mantis,27
3,gophers,43
4,giraffes,17
5,mantis,59
6,mantis,63
7,mouse,49
8,gophers,64
9,mantis,59


In [13]:
# Total head count per animal.
zoo.groupby(by='animal').sum()

Unnamed: 0_level_0,number
animal,Unnamed: 1_level_1
giraffes,281
gophers,268
mantis,243
mouse,144


In [14]:
# Total head count per first letter of the animal name. The grouping key is a
# derived Series, so the original 'animal' string column is still in the frame;
# newer pandas versions no longer drop such non-numeric columns automatically,
# hence 'number' is selected explicitly (as a one-column DataFrame).
zoo.groupby(zoo['animal'].str[0])[['number']].sum()

Unnamed: 0_level_0,number
animal,Unnamed: 1_level_1
g,549
m,387


### Selecting a column or subset of columns

In [15]:
# Mean of data1 within each key2 group: pick the column first, then group it
# by the key2 column.
df['data1'].groupby(df['key2']).mean()

key2
cello      0.831407
guitar    11.492494
Name: data1, dtype: float64

## Data aggregation

In [16]:
# 10th percentile of each numeric column per key1 group. The numeric columns
# are selected explicitly because quantile cannot be computed for the string
# column 'key2', and recent pandas raises rather than dropping it.
grouped[['data1', 'data2']].quantile(.1)

0.1,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-6.169641,-1.010447
b,-3.886174,5.07161


In [17]:
def ptp(series):
    """Return the peak-to-peak spread of ``series``: its maximum minus its minimum."""
    lowest, highest = series.min(), series.max()
    return highest - lowest

# Call the custom ptp reducer once per key1 group on the data1 column.
grouped['data1'].apply(ptp)

key1
a    53.286283
b     3.854081
Name: data1, dtype: float64

### Column-wise and multiple function application

In [18]:
# One-time download of the tips.csv file read by the next cell; uncomment to
# fetch it (needs wget and network access).
#!wget https://raw.githubusercontent.com/wesm/pydata-book/1st-edition/ch08/tips.csv

In [19]:
# Load the tips dataset from the working directory and preview its first rows.
tips = pd.read_csv('tips.csv')
tips.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [20]:
# Average tip for every (sex, smoker) combination: take the tip column and
# group it by the two key columns.
tips['tip'].groupby([tips['sex'], tips['smoker']]).mean()

sex     smoker
Female  No        2.773519
        Yes       2.931515
Male    No        3.113402
        Yes       3.051167
Name: tip, dtype: float64

## Group-wise operations and transformations

In [21]:
# Several reducers at once: built-ins are given by name, the custom ptp
# function is passed as a callable. Each reducer becomes one result column.
tips.groupby(['sex', 'smoker'])['tip'].aggregate(['mean', 'std', ptp])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,ptp
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,No,2.773519,1.128425,4.2
Female,Yes,2.931515,1.219916,5.5
Male,No,3.113402,1.489559,7.75
Male,Yes,3.051167,1.50012,9.0


In [22]:
# Apply the same three reducers to every numeric column. The numeric columns
# are listed explicitly because tips also holds the string columns 'day' and
# 'time', for which 'mean'/'std' fail on recent pandas now that non-numeric
# columns are no longer dropped silently.
tips.groupby(['sex', 'smoker'])[['total_bill', 'tip', 'size']].agg(['mean', 'std', ptp])

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,total_bill,total_bill,tip,tip,tip,size,size,size
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,ptp,mean,std,ptp,mean,std,ptp
sex,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Female,No,18.105185,7.286455,28.58,2.773519,1.128425,4.2,2.592593,1.073146,5
Female,Yes,17.977879,9.189751,41.23,2.931515,1.219916,5.5,2.242424,0.613917,3
Male,No,19.791237,8.726566,40.82,3.113402,1.489559,7.75,2.71134,0.989094,4
Male,Yes,22.2845,9.911845,43.56,3.051167,1.50012,9.0,2.5,0.89253,4


In [23]:
# A NumPy function can be mixed into the reducer list as well; its result
# column is labelled with the function's name (count_nonzero).
tips.groupby(['sex', 'smoker'])['tip'].agg(['mean', 'std', ptp, np.count_nonzero])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,ptp,count_nonzero
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,No,2.773519,1.128425,4.2,54.0
Female,Yes,2.931515,1.219916,5.5,33.0
Male,No,3.113402,1.489559,7.75,97.0
Male,Yes,3.051167,1.50012,9.0,60.0


In [24]:
# (label, reducer) pairs: the first element names the result column, the
# second is the function applied to each group.
function_list = [
    ('range', ptp),
    ('number', np.count_nonzero),
]
tips.groupby(['sex', 'smoker'])['tip'].aggregate(function_list)

Unnamed: 0_level_0,Unnamed: 1_level_0,range,number
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,No,4.2,54.0
Female,Yes,5.5,33.0
Male,No,7.75,97.0
Male,Yes,9.0,60.0


### Quantile and bucket analysis

In [25]:
# 250 synthetic people with a random age (17-99) and salary (10000-90000).
# A private, seeded generator makes the frame reproducible across kernel
# restarts without reseeding the module-level `random` state.
people_rng = random.Random(42)
df2 = pd.DataFrame({
    'age': [people_rng.randint(17, 99) for _ in range(250)],
    'salary': [people_rng.randint(10000, 90000) for _ in range(250)]
})

In [26]:
# Assign every age to one of four bins; each bin excludes its left edge and
# includes its right edge, e.g. (18, 35].
pd.cut(df2['age'], bins=(0, 18, 35, 65, 100))

0      (65, 100]
1      (65, 100]
2       (18, 35]
3      (65, 100]
4       (35, 65]
5       (35, 65]
6       (18, 35]
7       (35, 65]
8      (65, 100]
9       (35, 65]
10      (18, 35]
11      (35, 65]
12      (35, 65]
13      (18, 35]
14      (35, 65]
15      (35, 65]
16     (65, 100]
17       (0, 18]
18      (18, 35]
19     (65, 100]
20      (35, 65]
21      (18, 35]
22      (18, 35]
23     (65, 100]
24      (35, 65]
25     (65, 100]
26     (65, 100]
27      (18, 35]
28      (35, 65]
29     (65, 100]
         ...    
220    (65, 100]
221    (65, 100]
222    (65, 100]
223     (35, 65]
224     (35, 65]
225    (65, 100]
226    (65, 100]
227    (65, 100]
228     (35, 65]
229    (65, 100]
230    (65, 100]
231     (18, 35]
232     (35, 65]
233    (65, 100]
234     (35, 65]
235     (18, 35]
236     (35, 65]
237    (65, 100]
238    (65, 100]
239    (65, 100]
240     (18, 35]
241    (65, 100]
242    (65, 100]
243     (35, 65]
244    (65, 100]
245     (35, 65]
246     (35, 65]
247     (35, 6

In [27]:
# Bin the ages as above, then average age and salary within each bin.
age_categories = pd.cut(df2['age'], bins=(0, 18, 35, 65, 100))
df2.groupby(age_categories).mean() # grouping by a separate Series works because it shares df2's row index

Unnamed: 0_level_0,age,salary
age,Unnamed: 1_level_1,Unnamed: 2_level_1
"(0, 18]",17.4,44594.0
"(18, 35]",27.631579,53370.631579
"(35, 65]",49.6375,45203.6
"(65, 100]",82.504854,46380.135922


In [28]:
# Different reducer per column: median age and mean salary within each age bin.
# Reducers are given by name rather than as np.median / np.mean, because passing
# those NumPy callables to agg is deprecated in recent pandas (FutureWarning);
# the string names select the same built-in group reductions.
df2.groupby(age_categories).agg({
    'age': 'median',
    'salary': 'mean'
})

Unnamed: 0_level_0,age,salary
age,Unnamed: 1_level_1,Unnamed: 2_level_1
"(0, 18]",17.0,44594.0
"(18, 35]",27.0,53370.631579
"(35, 65]",48.5,45203.6
"(65, 100]",82.0,46380.135922


## Pivot tables and cross-tabulation

In [29]:
# Sum of every numeric column per (sex, smoker) pair. The value columns are
# named explicitly so the string columns 'day' and 'time' are never handed to
# the 'sum' aggregation; newer pandas versions no longer drop them automatically.
tips.pivot_table(
    values=['size', 'tip', 'total_bill'],
    index=['sex', 'smoker'],
    aggfunc='sum',
)

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip,total_bill
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,No,140,149.77,977.68
Female,Yes,74,96.74,593.27
Male,No,263,302.0,1919.75
Male,Yes,150,183.07,1337.07


In [30]:
# Sum of tip and total_bill per party size (rows), split by sex and smoker
# (columns). The value columns are named explicitly so the string columns
# 'day' and 'time' are never handed to the 'sum' aggregation; newer pandas
# versions no longer drop them automatically.
tips.pivot_table(
    values=['tip', 'total_bill'],
    index='size',
    columns=['sex', 'smoker'],
    aggfunc='sum',
)

Unnamed: 0_level_0,tip,tip,tip,tip,total_bill,total_bill,total_bill,total_bill
sex,Female,Female,Male,Male,Female,Female,Male,Male
smoker,No,Yes,No,Yes,No,Yes,No,Yes
size,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
1,2.83,1.0,,1.92,17.32,3.07,,8.58
2,78.23,68.42,145.78,110.41,500.73,388.96,880.08,796.12
3,26.27,19.23,53.53,29.91,173.67,127.99,372.58,210.31
4,28.1,8.09,80.99,35.83,199.26,73.25,522.74,263.45
5,5.14,,10.0,5.0,29.85,,61.88,58.61
6,9.2,,11.7,,56.85,,82.47,
