In [1]:
import pandas as pd
import numpy as np

raw_data = {
    "key": np.tile(list("abc"), 4),
    "value": np.random.choice(range(12), 12)
}
data = pd.DataFrame(raw_data)
data

Unnamed: 0,key,value
0,a,7
1,b,2
2,c,5
3,a,8
4,b,8
5,c,1
6,a,5
7,b,5
8,c,5
9,a,1


In [2]:
grouped_value = data.groupby("key").value
grouped_value.max()

key
a    8
b    8
c    5
Name: value, dtype: int64

#### Group Transform using lambda function

In [3]:
grouped_value.transform(lambda x: x.max()) ## is similar to aggeration except it is broadcasted for all keys

0     8
1     8
2     5
3     8
4     8
5     5
6     8
7     8
8     5
9     8
10    8
11    5
Name: value, dtype: int64

#### Group transform using aggregation function name

In [4]:
grouped_value.transform("mean")

0     5.25
1     5.00
2     3.50
3     5.25
4     5.00
5     3.50
6     5.25
7     5.00
8     3.50
9     5.25
10    5.00
11    3.50
Name: value, dtype: float64

#### Group values seen by transform functions

In [5]:
grouped_value.transform(lambda x: ",".join(x.apply(str))) ## transform calls the lambda function with group values for each key

0     7,8,5,1
1     2,8,5,5
2     5,1,5,3
3     7,8,5,1
4     2,8,5,5
5     5,1,5,3
6     7,8,5,1
7     2,8,5,5
8     5,1,5,3
9     7,8,5,1
10    2,8,5,5
11    5,1,5,3
Name: value, dtype: object

In [6]:
grouped_value.transform(lambda x: x.rank()) ## gives rank of each value in the group

0     3.0
1     1.0
2     3.5
3     4.0
4     4.0
5     1.0
6     2.0
7     2.5
8     3.5
9     1.0
10    2.5
11    2.0
Name: value, dtype: float64

#### Group transform using custom function

In [7]:
def normalize(x):
    return (x-x.mean())/x.std()
normalize(data.value)

0     0.994798
1    -1.063405
2     0.171517
3     1.406439
4     1.406439
5    -1.475046
6     0.171517
7     0.171517
8     0.171517
9    -1.475046
10    0.171517
11   -0.651764
Name: value, dtype: float64

In [8]:
grouped_value.transform(normalize)

0     0.565301
1    -1.224745
2     0.783349
3     0.888330
4     1.224745
5    -1.305582
6    -0.080757
7     0.000000
8     0.783349
9    -1.372874
10    0.000000
11   -0.261116
Name: value, dtype: float64

In [9]:
grouped_value.apply(normalize)

0     0.565301
1    -1.224745
2     0.783349
3     0.888330
4     1.224745
5    -1.305582
6    -0.080757
7     0.000000
8     0.783349
9    -1.372874
10    0.000000
11   -0.261116
Name: value, dtype: float64

#### Grouped time resampling

In [24]:
N=25
minutes = pd.date_range("9:00", freq="1min", periods=N)
values = np.arange(N)
group_keys = np.random.choice(list("abcd"), N)
data_key = pd.DataFrame({"value": values, "group": group_keys}, index=minutes)
data = data_key[["value"]].copy()

In [29]:
data.resample("5min").sum() ## Resampling without group

Unnamed: 0,value
2019-04-12 09:00:00,10
2019-04-12 09:05:00,35
2019-04-12 09:10:00,60
2019-04-12 09:15:00,85
2019-04-12 09:20:00,110


In [33]:
time_grouper = pd.Grouper(freq="5min")

data_key.groupby([time_grouper,"group"]).count() ## index of data_key must be time

Unnamed: 0_level_0,Unnamed: 1_level_0,value
Unnamed: 0_level_1,group,Unnamed: 2_level_1
2019-04-12 09:00:00,a,1
2019-04-12 09:00:00,b,1
2019-04-12 09:00:00,c,1
2019-04-12 09:00:00,d,2
2019-04-12 09:05:00,a,2
2019-04-12 09:05:00,b,1
2019-04-12 09:05:00,c,1
2019-04-12 09:05:00,d,1
2019-04-12 09:10:00,a,2
2019-04-12 09:10:00,c,1


#### Chaining

In [58]:
 data_key[lambda x: x>4] \
    .assign(new_col=lambda x: x.value-x.value.mean()) \ # assign will make new copy and add given column
    .groupby("group").new_col.std()

group
a    6.000000
b    9.192388
c    6.572671
d    6.041523
Name: new_col, dtype: float64