# Aggregation and Grouping

In [1]:
import numpy as np
import pandas as pd

class display(object):
    """Display HTML representation of multiple objects"""
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""
    def __init__(self, *args):
        self.args = args
        
    def _repr_html_(self):
        return '\n'.join(self.template.format(a, eval(a)._repr_html_())
                         for a in self.args)
    
    def __repr__(self):
        return '\n\n'.join(a + '\n' + repr(eval(a))
                           for a in self.args)

In [5]:
! pip install seaborn

In [6]:
import seaborn as sns
planets = sns.load_dataset('planets')
planets.shape

(1035, 6)

In [7]:
planets.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [8]:
planets.dropna().describe()

Unnamed: 0,number,orbital_period,mass,distance,year
count,498.0,498.0,498.0,498.0,498.0
mean,1.73494,835.778671,2.50932,52.068213,2007.37751
std,1.17572,1469.128259,3.636274,46.596041,4.167284
min,1.0,1.3283,0.0036,1.35,1989.0
25%,1.0,38.27225,0.2125,24.4975,2005.0
50%,1.0,357.0,1.245,39.94,2009.0
75%,2.0,999.6,2.8675,59.3325,2011.0
max,6.0,17337.5,25.0,354.0,2014.0


### GroupBy

In [10]:
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data': range(6)})
df

Unnamed: 0,key,data
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5


In [11]:
df.groupby('key')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000000001B16E3D0>

In [12]:
for (key, group) in df.groupby('key'):
    print(f'{key}: \n {group}')

A: 
   key  data
0   A     0
3   A     3
B: 
   key  data
1   B     1
4   B     4
C: 
   key  data
2   C     2
5   C     5


In [13]:
df.groupby('key').mean()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,1.5
B,2.5
C,3.5


In [14]:
planets.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [16]:
planets.groupby('method')['year'].max() # Mais recentes descobertas por método.

method
Astrometry                       2013
Eclipse Timing Variations        2012
Imaging                          2013
Microlensing                     2013
Orbital Brightness Modulation    2013
Pulsar Timing                    2011
Pulsation Timing Variations      2007
Radial Velocity                  2014
Transit                          2014
Transit Timing Variations        2014
Name: year, dtype: int64

### Aggregate

In [17]:
%%timeit
planets.groupby('method')['year'].aggregate([np.min,np.median,np.max])

1.27 ms ± 5.91 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [18]:
%%timeit
planets.groupby('method')['year'].aggregate([min,np.median,max])

1.27 ms ± 9.52 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


### Filtering

In [19]:
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data1': range(6),
                   'data2': rng.randint(0, 10, 6)},
                   columns = ['key', 'data1', 'data2'])
df

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


In [20]:
def filter_func(x):
    return x['data2'].std() > 4

In [22]:
for (key, group) in df.groupby('key'):
    print(f'Key = {key} \n {group}')

Key = A 
   key  data1  data2
0   A      0      5
3   A      3      3
Key = B 
   key  data1  data2
1   B      1      0
4   B      4      7
Key = C 
   key  data1  data2
2   C      2      3
5   C      5      9


In [23]:
display('df',"df.groupby('key').std()","df.groupby('key').filter(filter_func)")

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,2.12132,1.414214
B,2.12132,4.949747
C,2.12132,4.242641

Unnamed: 0,key,data1,data2
1,B,1,0
2,C,2,3
4,B,4,7
5,C,5,9


### Specifying the split key

#### List of indexes

In [26]:
# Tudo indicado que o agrupamento é realizado da seguinte forma:
# Para cada grupo indicado pelo índice da lista, linha a linha do dataframe vai sendo agrupado.
# Para o exemplo, temos os grupos 0, 1 e 2.
# Então, a linha 0 do dataframa é colocada no grupo 0, a linha 1 no grupo 1, a linha 2 no grupo 0,
# a linha 3 no grupo 1, a linha 4 no grupo 2 e a linha 5 no grupo 0.
# Os elementos da lista representam os bins, em que os elementos do dataframe devem ser colocados
# linha a linha.
L = [0, 1, 0, 1, 2, 0]
display('df','df.groupby(L).sum()')

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9

Unnamed: 0,key,data1,data2
0,ACC,7,17
1,BA,4,3
2,B,4,7


### Dictionary

In [27]:
df2 = df.set_index('key')
df2

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,5
B,1,0
C,2,3
A,3,3
B,4,7
C,5,9


In [28]:
mapping = {'A':'vowel','B':'consonant','C':'consonant'}
display('df2','df2.groupby(mapping).sum()')

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,5
B,1,0
C,2,3
A,3,3
B,4,7
C,5,9

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
consonant,12,19
vowel,3,8


### Any python function

In [29]:
display('df2','df2.groupby(str.lower).sum()')

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,5
B,1,0
C,2,3
A,3,3
B,4,7
C,5,9

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
a,3,8
b,5,7
c,7,12


In [30]:
planets.shape

(1035, 6)

In [31]:
decade = (planets['year'] // 10) * 10
planets['decade'] = decade
planets.groupby(['method','decade'])['number'].count().unstack().fillna(0)

decade,1980,1990,2000,2010
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Astrometry,0.0,0.0,0.0,2.0
Eclipse Timing Variations,0.0,0.0,3.0,6.0
Imaging,0.0,0.0,20.0,18.0
Microlensing,0.0,0.0,10.0,13.0
Orbital Brightness Modulation,0.0,0.0,0.0,3.0
Pulsar Timing,0.0,3.0,1.0,1.0
Pulsation Timing Variations,0.0,0.0,1.0,0.0
Radial Velocity,1.0,28.0,309.0,215.0
Transit,0.0,0.0,62.0,335.0
Transit Timing Variations,0.0,0.0,0.0,4.0
