# Data Aggregation and Group Operations

In [1]:
import pandas as pd 
import numpy as np

## 1) GroupBy mechanics

In [2]:
df = pd.DataFrame({'producto' : list('aabba'),
                   'vendedor' : ['Juan', 'Celia', 'Juan', 'Celia', 'Juan'],
                  'balance' : np.random.randn(5) * 10,
                   'income' : np.random.randn(5) + 2
                  })

df

Unnamed: 0,producto,vendedor,balance,income
0,a,Juan,-19.144643,3.138182
1,a,Celia,-9.255637,1.292095
2,b,Juan,10.711755,1.544569
3,b,Celia,-13.743681,1.613169
4,a,Juan,-4.990908,1.626107


In [3]:
df.mean()

producto    (9.3523273237563e-311+1.38696301599405e-310j)
vendedor                                               0j
balance                           (-7.284622853723303+0j)
income                            (1.8428243636937893+0j)
dtype: complex128

In [4]:
means = df.groupby('producto').mean()
means

Unnamed: 0_level_0,balance,income
producto,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-11.130396,2.018795
b,-1.515963,1.578869


In [5]:
type(means)

pandas.core.frame.DataFrame

In [6]:
kk = df.groupby('producto')

In [7]:
type(kk)

pandas.core.groupby.groupby.DataFrameGroupBy

In [8]:
kk.sum()

Unnamed: 0_level_0,balance,income
producto,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-33.391188,6.056384
b,-3.031926,3.157738


In [9]:
mean_producto = df.groupby('producto')['balance'].mean()
mean_producto

producto
a   -11.130396
b    -1.515963
Name: balance, dtype: float64

In [10]:
means['balance']['a']

-11.130395962198923

In [11]:
df.groupby(['producto', 'vendedor']).agg(['mean', 'count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,balance,balance,income,income
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count,mean,count
producto,vendedor,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,Celia,-9.255637,1,1.292095,1
a,Juan,-12.067775,2,2.382144,2
b,Celia,-13.743681,1,1.613169,1
b,Juan,10.711755,1,1.544569,1


In [12]:
df.groupby(['vendedor', 'producto']).agg(['mean', 'count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,balance,balance,income,income
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count,mean,count
vendedor,producto,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Celia,a,-9.255637,1,1.292095,1
Celia,b,-13.743681,1,1.613169,1
Juan,a,-12.067775,2,2.382144,2
Juan,b,10.711755,1,1.544569,1


In [13]:
def strseries(serie):
    return serie.str.len().sum()

In [14]:
df.groupby(['producto'])['vendedor'].head()

0     Juan
1    Celia
2     Juan
3    Celia
4     Juan
Name: vendedor, dtype: object

In [15]:
df.groupby(['producto'])['vendedor'].agg(strseries)

producto
a    13
b     9
Name: vendedor, dtype: int64

### 1.2) Iterating over groups

In [16]:
for key, group in df.groupby('producto'):
    print('Tipo de producto: %s' % key)
    print('Datos de producto: \n %s' % group)

Tipo de producto: a
Datos de producto: 
   producto vendedor    balance    income
0        a     Juan -19.144643  3.138182
1        a    Celia  -9.255637  1.292095
4        a     Juan  -4.990908  1.626107
Tipo de producto: b
Datos de producto: 
   producto vendedor    balance    income
2        b     Juan  10.711755  1.544569
3        b    Celia -13.743681  1.613169


In [17]:
list(df.groupby('producto'))

[('a',   producto vendedor    balance    income
  0        a     Juan -19.144643  3.138182
  1        a    Celia  -9.255637  1.292095
  4        a     Juan  -4.990908  1.626107),
 ('b',   producto vendedor    balance    income
  2        b     Juan  10.711755  1.544569
  3        b    Celia -13.743681  1.613169)]

In [18]:
cuentas = dict(list(df.groupby('producto')))
cuentas

{'a':   producto vendedor    balance    income
 0        a     Juan -19.144643  3.138182
 1        a    Celia  -9.255637  1.292095
 4        a     Juan  -4.990908  1.626107,
 'b':   producto vendedor    balance    income
 2        b     Juan  10.711755  1.544569
 3        b    Celia -13.743681  1.613169}

In [19]:
type(cuentas['a'])

pandas.core.frame.DataFrame

## 2) Data aggregation

In [20]:
import requests

url = 'https://raw.githubusercontent.com/wesm/pydata-book/1st-edition/ch08/tips.csv'
response = requests.get(url)

out_file = open('tips.csv', 'wb')
out_file.write(response.content)
out_file.close()

In [21]:
tips = pd.read_csv('tips.csv')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [22]:
tips.count()

total_bill    244
tip           244
sex           244
smoker        244
day           244
time          244
size          244
dtype: int64

#### Ejercicio

Obtener el porcentaje de propina y analizar la dependencia con las variables sexo y fumadores (*sex* y *smoker*). ¿Se puede ver alguna diferencia de comporamiento entre hombres/mujeres, y si son o no fumadores?

In [26]:
tips['%_tips'] = tips['tip'] * 100 / tips['total_bill']
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,%_tips
0,16.99,1.01,Female,No,Sun,Dinner,2,5.944673
1,10.34,1.66,Male,No,Sun,Dinner,3,16.054159
2,21.01,3.5,Male,No,Sun,Dinner,3,16.658734
3,23.68,3.31,Male,No,Sun,Dinner,2,13.978041
4,24.59,3.61,Female,No,Sun,Dinner,4,14.680765


In [27]:
tips.groupby('sex')['%_tips'].mean()

sex
Female    16.649074
Male      15.765055
Name: %_tips, dtype: float64

In [28]:
tips.groupby('smoker')['%_tips'].mean()

smoker
No     15.932846
Yes    16.319604
Name: %_tips, dtype: float64

In [29]:
tips.groupby(['sex', 'smoker'])['%_tips'].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Female,No,54.0,15.692097,3.642118,5.679667,13.970835,14.969118,18.162966,25.26725
Female,Yes,33.0,18.215035,7.159451,5.643341,15.243902,17.391304,19.821606,41.666667
Male,No,97.0,16.066872,4.184875,7.180385,13.181019,15.760441,18.621974,29.198966
Male,Yes,60.0,15.277118,9.058794,3.563814,10.184496,14.101483,19.169707,71.034483


In [32]:
def peak_to_peak(s):
    return s.max() - s.min()
def rango_normal(s):
    return 4*s.std()

In [35]:
tips.groupby(['sex', 'smoker'])['%_tips'].agg([('media', 'mean'), ('std_dev', 'std'), 'count', ('rango', peak_to_peak), ('rango_95%', rango_normal)])

Unnamed: 0_level_0,Unnamed: 1_level_0,media,std_dev,count,rango,rango_95%
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Female,No,15.692097,3.642118,54,19.587583,14.568474
Female,Yes,18.215035,7.159451,33,36.023326,28.637805
Male,No,16.066872,4.184875,97,22.018581,16.739501
Male,Yes,15.277118,9.058794,60,67.470669,36.235176


Filtrar por los que han dado mas del 40% de propina

In [37]:
tips[tips['%_tips'] > 40]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,%_tips
172,7.25,5.15,Male,Yes,Sun,Dinner,2,71.034483
178,9.6,4.0,Female,Yes,Sun,Dinner,2,41.666667


In [39]:
tips.groupby('size')['%_tips'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,4.0,21.729202,8.034186,13.793103,17.077869,20.275206,24.926539,32.57329
2,156.0,16.571919,6.684824,3.563814,13.522313,15.610418,19.503614,71.034483
3,38.0,15.215685,4.545887,5.643341,12.475755,15.932311,18.613473,23.074192
4,37.0,14.594901,4.239533,7.745933,11.774956,14.669927,16.979656,28.053517
5,5.0,14.149549,6.773266,6.565988,10.657194,12.138869,17.21943,24.166264
6,4.0,15.62292,4.215338,10.379905,13.165446,16.289124,18.746598,19.533528


In [40]:
tips.groupby('day')['%_tips'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Fri,19.0,16.991303,4.766531,10.35554,13.373871,15.562472,19.663729,26.348039
Sat,87.0,15.315172,5.129259,3.563814,12.386329,15.183246,18.827082,32.57329
Sun,76.0,16.689729,8.473889,5.944673,11.998208,16.110332,18.788908,71.034483
Thur,62.0,16.127563,3.865182,7.296137,13.820958,15.384615,19.268675,26.631158


### 2.1 Pivot & unstack

In [41]:
stacked = df.groupby(['producto', 'vendedor']).mean()

In [42]:
stacked

Unnamed: 0_level_0,Unnamed: 1_level_0,balance,income
producto,vendedor,Unnamed: 2_level_1,Unnamed: 3_level_1
a,Celia,-9.255637,1.292095
a,Juan,-12.067775,2.382144
b,Celia,-13.743681,1.613169
b,Juan,10.711755,1.544569


In [45]:
stacked.unstack('vendedor')

Unnamed: 0_level_0,balance,balance,income,income
vendedor,Celia,Juan,Celia,Juan
producto,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,-9.255637,-12.067775,1.292095,2.382144
b,-13.743681,10.711755,1.613169,1.544569


In [46]:
stacked.unstack('vendedor').columns

MultiIndex(levels=[['balance', 'income'], ['Celia', 'Juan']],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
           names=[None, 'vendedor'])

In [54]:
df[df['vendedor'] == 'Celia'].groupby('producto').mean()

Unnamed: 0_level_0,balance,income
producto,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-9.255637,1.292095
b,-13.743681,1.613169


In [55]:
df[df['vendedor'] == 'Celia'].groupby('producto').mean().unstack()

         producto
balance  a           -9.255637
         b          -13.743681
income   a            1.292095
         b            1.613169
dtype: float64

In [56]:
df.pivot(columns='producto')

Unnamed: 0_level_0,vendedor,vendedor,balance,balance,income,income
producto,a,b,a,b,a,b
0,Juan,,-19.144643,,3.138182,
1,Celia,,-9.255637,,1.292095,
2,,Juan,,10.711755,,1.544569
3,,Celia,,-13.743681,,1.613169
4,Juan,,-4.990908,,1.626107,


## Example: Filling missing values with group-specific values

In [57]:
states = ['Ohio', 'New York', 'Vermont', 'Florida',
          'Oregon', 'Nevada', 'California', 'Idaho']

In [58]:
df_us = pd.DataFrame(
    {
        'states': states,
        'market': ['East'] * 4 + ['West'] * 4,
        'data': [100,82,83,np.nan,20,30,np.nan,np.nan],
    }
)

In [59]:
df_us

Unnamed: 0,states,market,data
0,Ohio,East,100.0
1,New York,East,82.0
2,Vermont,East,83.0
3,Florida,East,
4,Oregon,West,20.0
5,Nevada,West,30.0
6,California,West,
7,Idaho,West,


In [61]:
df_us.groupby('market')['data'].apply(lambda x: x.fillna(x.mean()))

0    100.000000
1     82.000000
2     83.000000
3     88.333333
4     20.000000
5     30.000000
6     25.000000
7     25.000000
Name: data, dtype: float64

In [62]:
df_us2 = df_us.copy()

In [63]:
df_us2['data'] = df_us2.groupby('market')['data'].apply(lambda x: x.fillna(x.mean()))

In [65]:
df_us2

Unnamed: 0,states,market,data
0,Ohio,East,100.0
1,New York,East,82.0
2,Vermont,East,83.0
3,Florida,East,88.333333
4,Oregon,West,20.0
5,Nevada,West,30.0
6,California,West,25.0
7,Idaho,West,25.0


In [66]:
fill_values = {'East': 10, 'West': 200}

In [67]:
fill_func = lambda g: g.fillna(fill_values[g.name])

In [71]:
df_us.groupby('market').apply(fill_func)

Unnamed: 0,states,market,data
0,Ohio,East,100.0
1,New York,East,82.0
2,Vermont,East,83.0
3,Florida,East,10.0
4,Oregon,West,20.0
5,Nevada,West,30.0
6,California,West,200.0
7,Idaho,West,200.0
