# pandas Grouper
-  Grouper
   -  https://pandas.pydata.org/docs/reference/api/pandas.Grouper.html
   -  A Grouper allows the user to specify a groupby instruction for an object.
   -  Signature: `pandas.Grouper(*args, **kwargs)`

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pydataset import data
import seaborn as sns

In [35]:
# Load the 'mtcars' sample dataset through pydataset and preview the first rows.
mtcars = data("mtcars")
mtcars.head(5)

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [3]:
# Load the 'survival' sample dataset through pydataset and preview the first rows.
survival = data("survival")
survival.head(5)

Unnamed: 0,dose,surv
1,117.5,44.0
2,117.5,55.0
3,235.0,16.0
4,235.0,13.0
5,470.0,4.0


In [4]:
# Load the 'titanic' sample dataset through pydataset and preview the first rows.
titanic = data("titanic")
titanic.head(5)

Unnamed: 0,class,age,sex,survived
1,1st class,adults,man,yes
2,1st class,adults,man,yes
3,1st class,adults,man,yes
4,1st class,adults,man,yes
5,1st class,adults,man,yes


In [6]:
# Group on the 'sex' column via an explicit Grouper and count the rows per group.
(
    titanic
    .groupby(pd.Grouper(key='sex'))
    .size()
)

sex
man      869
women    447
dtype: int64

In [7]:
# Same grouping expressed with the plain column name: rows per 'sex' value.
(
    titanic
    .groupby(by='sex')
    .size()
)

sex
man      869
women    447
dtype: int64

In [8]:
# Time-series sample data: one observation every 7 minutes between `start`
# and `end`, with values 0, 3, 6, ...
start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00'
rng = pd.date_range(start=start, end=end, freq='7min')
ts = pd.Series(3 * np.arange(rng.size), index=rng)
ts

2000-10-01 23:30:00     0
2000-10-01 23:37:00     3
2000-10-01 23:44:00     6
2000-10-01 23:51:00     9
2000-10-01 23:58:00    12
2000-10-02 00:05:00    15
2000-10-02 00:12:00    18
2000-10-02 00:19:00    21
2000-10-02 00:26:00    24
Freq: 7T, dtype: int64

In [10]:
# Sum the series over 17-minute bins using the Grouper's default bin alignment.
(
    ts
    .groupby(pd.Grouper(freq='17min'))
    .sum()
)

2000-10-01 23:14:00     0
2000-10-01 23:31:00     9
2000-10-01 23:48:00    21
2000-10-02 00:05:00    54
2000-10-02 00:22:00    24
Freq: 17T, dtype: int64

In [11]:
# Same 17-minute bins, but anchored at the first timestamp of the series.
(
    ts
    .groupby(pd.Grouper(freq='17min', origin='start'))
    .sum()
)

2000-10-01 23:30:00     9
2000-10-01 23:47:00    21
2000-10-02 00:04:00    54
2000-10-02 00:21:00    24
Freq: 17T, dtype: int64

In [12]:
# 17-minute bins shifted by an offset of 23 hours 35 minutes.
(
    ts
    .groupby(pd.Grouper(freq='17min', offset='23h35min'))
    .sum()
)

2000-10-01 23:18:00     0
2000-10-01 23:35:00    18
2000-10-01 23:52:00    27
2000-10-02 00:09:00    39
2000-10-02 00:26:00    24
Freq: 17T, dtype: int64

In [13]:
# 17-minute bins shifted by an offset of 2 minutes.
(
    ts
    .groupby(pd.Grouper(freq='17min', offset='2min'))
    .sum()
)

2000-10-01 23:16:00     0
2000-10-01 23:33:00     9
2000-10-01 23:50:00    36
2000-10-02 00:07:00    39
2000-10-02 00:24:00    24
Freq: 17T, dtype: int64

In [16]:
# 10-minute bins shifted by an offset of one minute.
# The offset is spelled '1min' (not '1m') so it cannot be misread as a month
# alias and matches the 'min' spelling used for every other freq/offset here.
ts.groupby(pd.Grouper(freq='10min', offset='1min')).sum()

2000-10-01 23:21:00     0
2000-10-01 23:31:00     3
2000-10-01 23:41:00     6
2000-10-01 23:51:00    21
2000-10-02 00:01:00    15
2000-10-02 00:11:00    39
2000-10-02 00:21:00    24
Freq: 10T, dtype: int64

In [17]:
# Echo the `start` timestamp string that was used to build the date range.
start

'2000-10-01 23:30:00'

# Named aggregation (pd.NamedAgg)

In [18]:
# Remind ourselves of the titanic columns before aggregating.
titanic.head(5)

Unnamed: 0,class,age,sex,survived
1,1st class,adults,man,yes
2,1st class,adults,man,yes
3,1st class,adults,man,yes
4,1st class,adults,man,yes
5,1st class,adults,man,yes


In [21]:
# Named-aggregation spec: apply 'count' to the 'survived' column.
aggSurv = pd.NamedAgg('survived', 'count')

In [27]:
# Number of data points in 'survived' for each age group, labelled 'survived1'.
(
    titanic
    .groupby('age')
    .agg(survived1=aggSurv)
)

Unnamed: 0_level_0,survived1
age,Unnamed: 1_level_1
adults,1207
child,109


In [33]:
# Count the rows of the aggregated frame, i.e. one per age group.
(
    titanic
    .groupby('age')
    .agg(survived1=aggSurv)
    .count()
)

survived1    2
dtype: int64

In [28]:
# Row counts for every (age, survived) combination.
(
    titanic
    .groupby(by=['age', 'survived'])
    .size()
)

age     survived
adults  no          765
        yes         442
child   no           52
        yes          57
dtype: int64

# SeriesGroupBy
- https://pandas.pydata.org/docs/reference/api/pandas.core.groupby.SeriesGroupBy.apply.html

In [40]:
# Preview the mpg column that the group-wise examples below operate on.
mtcars['mpg'].head(5)

Mazda RX4            21.0
Mazda RX4 Wag        21.0
Datsun 710           22.8
Hornet 4 Drive       21.4
Hornet Sportabout    18.7
Name: mpg, dtype: float64

In [41]:
# Re-index the cars by their 'gear' value so later cells can group on the index.
mtcarsG = mtcars.set_index('gear')
# Show only a preview instead of dumping the entire frame into the output.
mtcarsG.head(10)

Unnamed: 0_level_0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,carb
gear,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4
4,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4
4,22.8,4,108.0,93,3.85,2.32,18.61,1,1,1
3,21.4,6,258.0,110,3.08,3.215,19.44,1,0,1
3,18.7,8,360.0,175,3.15,3.44,17.02,0,0,2
3,18.1,6,225.0,105,2.76,3.46,20.22,1,0,1
3,14.3,8,360.0,245,3.21,3.57,15.84,0,0,4
4,24.4,4,146.7,62,3.69,3.19,20.0,1,0,2
4,22.8,4,140.8,95,3.92,3.15,22.9,1,0,2
4,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4


In [43]:
# mpg values as a Series indexed by gear.
gs1 = mtcarsG['mpg']
gs1.head(5)

gear
4    21.0
4    21.0
4    22.8
3    21.4
3    18.7
Name: mpg, dtype: float64

In [44]:
# SeriesGroupBy keyed on the Series' own index (the gear values).
g1 = gs1.groupby(by=gs1.index, group_keys=False)
g1

<pandas.core.groupby.generic.SeriesGroupBy object at 0x176696690>

In [46]:
# Spread of mpg (max minus min) within each gear group.
g1.apply(lambda grp: grp.max() - grp.min())

gear
3    11.1
4    16.1
5    15.4
Name: mpg, dtype: float64

In [48]:
# Per-gear minimum and maximum mpg via named aggregation; gear stays a column.
(
    mtcars
    .groupby('gear', as_index=False)
    .agg(
        minMPG=('mpg', 'min'),
        maxMPG=('mpg', 'max'),
    )
)

Unnamed: 0,gear,minMPG,maxMPG
0,3,10.4,21.5
1,4,17.8,33.9
2,5,15.0,30.4


In [76]:
# Simple row function, written as a named def rather than a lambda bound to a
# name, so it carries a docstring and a useful name in tracebacks.
def f(x):
    """Return ``x.maxMPG - x.minMPG`` for a single row.

    Parameters
    ----------
    x : object exposing ``maxMPG`` and ``minMPG`` attributes
        One row of the per-gear summary when used with ``apply(f, axis=1)``.

    Returns
    -------
    The difference between the row's ``maxMPG`` and ``minMPG`` values.
    """
    return x.maxMPG - x.minMPG

In [77]:
# Per-gear min/max mpg, then apply `f` to each summary row (axis=1).
(
    mtcars
    .groupby('gear', as_index=False)
    .agg(
        minMPG=('mpg', 'min'),
        maxMPG=('mpg', 'max'),
    )
    .apply(f, axis=1)
)

0    11.1
1    16.1
2    15.4
dtype: float64

In [78]:
# For more complex logic, use a named function instead of a lambda.
def f(x):
    """Return the mpg spread (``maxMPG`` minus ``minMPG``) for one row.

    The row is only read, never written to: functions passed to ``apply``
    should not mutate the object they receive.

    Parameters
    ----------
    x : mapping-like with ``'maxMPG'`` and ``'minMPG'`` keys
        One row of the per-gear summary when used with ``apply(f, axis=1)``.

    Returns
    -------
    The difference ``x['maxMPG'] - x['minMPG']``.
    """
    return x['maxMPG'] - x['minMPG']

In [79]:
# Per-gear min/max mpg, then apply the named function `f` to each row (axis=1).
(
    mtcars
    .groupby('gear', as_index=False)
    .agg(
        minMPG=('mpg', 'min'),
        maxMPG=('mpg', 'max'),
    )
    .apply(f, axis=1)
)

0    11.1
1    16.1
2    15.4
dtype: float64

In [83]:
#!pip install swifter
import swifter

In [82]:
def fnc(a, b):
    """Return the arithmetic mean of ``a`` and ``b``."""
    total = a + b
    return total / 2

In [84]:
# Row-wise mean of mpg and hp, run through the swifter accessor's apply.
mtcars.swifter.apply(
    lambda row: fnc(row.mpg, row.hp),
    axis=1,
)

Mazda RX4               65.50
Mazda RX4 Wag           65.50
Datsun 710              57.90
Hornet 4 Drive          65.70
Hornet Sportabout       96.85
Valiant                 61.55
Duster 360             129.65
Merc 240D               43.20
Merc 230                58.90
Merc 280                71.10
Merc 280C               70.40
Merc 450SE              98.20
Merc 450SL              98.65
Merc 450SLC             97.60
Cadillac Fleetwood     107.70
Lincoln Continental    112.70
Chrysler Imperial      122.35
Fiat 128                49.20
Honda Civic             41.20
Toyota Corolla          49.45
Toyota Corona           59.25
Dodge Challenger        82.75
AMC Javelin             82.60
Camaro Z28             129.15
Pontiac Firebird        97.10
Fiat X1-9               46.65
Porsche 914-2           58.50
Lotus Europa            71.70
Ford Pantera L         139.90
Ferrari Dino            97.35
Maserati Bora          175.00
Volvo 142E              65.20
dtype: float64