In [1]:
import numpy as np
import pandas as pd

import seaborn as sns

In [2]:
# Single fixed seed so every random draw in the notebook is repeatable
# after a kernel restart.
seed = 42
# Legacy RandomState generator; later cells draw from it via rng.randint.
rng = np.random.RandomState(seed=seed)

In [3]:
# Toy frame: three keys, each appearing twice, with a running integer payload.
df = pd.DataFrame(
    dict(key=list('ABCABC'), data=range(6)),
    columns=['key', 'data'],
)

df

Unnamed: 0,key,data
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5


In [4]:
# Grouping alone yields a DataFrameGroupBy handle, not a result frame;
# an aggregation has to be applied to it to get values back.
df.groupby(by='key')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7faafb7916d8>

In [5]:
# Split on 'key', then total the remaining column within each group.
df.groupby(by='key').sum()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,3
B,5
C,7


In [6]:
# Per-group summary statistics (count, mean, std, min, quartiles, max).
df.groupby(by='key').describe()

Unnamed: 0_level_0,data,data,data,data,data,data,data,data
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
A,2.0,1.5,2.12132,0.0,0.75,1.5,2.25,3.0
B,2.0,2.5,2.12132,1.0,1.75,2.5,3.25,4.0
C,2.0,3.5,2.12132,2.0,2.75,3.5,4.25,5.0


In [7]:
# Rebuild the toy frame with two payload columns: a deterministic counter
# (data1) and six integers in [0, 10) drawn from the shared generator (data2).
# NOTE: the draw advances `rng`, so re-running this cell without re-seeding
# produces different data2 values.
df = pd.DataFrame(
    dict(
        key=list('ABCABC'),
        data1=range(6),
        data2=rng.randint(0, 10, 6),
    ),
    columns=['key', 'data1', 'data2'],
)

df

Unnamed: 0,key,data1,data2
0,A,0,6
1,B,1,3
2,C,2,7
3,A,3,4
4,B,4,6
5,C,5,9


In [8]:
# Several aggregations at once: every non-key column gets min, median and max.
# All three are given as string names so pandas dispatches to its own grouped
# implementations; passing np.median / builtin max as callables is
# inconsistent with 'min' and triggers a FutureWarning on recent pandas.
df.groupby('key').aggregate(['min', 'median', 'max'])

Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,min,median,max,min,median,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,0,1.5,3,4,5.0,6
B,1,2.5,4,3,4.5,6
C,2,3.5,5,7,8.0,9


In [9]:
# Column-specific aggregation: the dict maps each column to the one
# reduction that should be applied to it.
(
    df
    .groupby('key')
    .agg({'data1': 'min', 'data2': 'max'})
)

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,6
B,1,6
C,2,9


In [10]:
# Column-specific aggregation where one column gets several reductions.
# The reductions are named by string (not np.median / builtin max) so the
# spec is uniform and does not rely on pandas' deprecated callable
# substitution; the resulting column labels are unchanged.
(
    df
    .groupby('key')
    .aggregate(
        {
            'data1': 'min',
            'data2': ['min', 'median', 'max'],
        }
    )
)

Unnamed: 0_level_0,data1,data2,data2,data2
Unnamed: 0_level_1,min,min,median,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
A,0,4,5.0,6
B,1,3,4.5,6
C,2,7,8.0,9


In [11]:
def filter_func(x, threshold=3):
    """Tell whether a group's ``data2`` values are spread out enough to keep.

    Meant to be used as the predicate of ``DataFrameGroupBy.filter``, which
    calls it once per group and keeps the groups for which it is true.

    Parameters
    ----------
    x : pandas.DataFrame
        One group's rows; must contain a ``data2`` column.
    threshold : float, default 3
        Standard-deviation cut-off. The default reproduces the original
        hard-coded value.

    Returns
    -------
    bool
        True when the sample standard deviation of ``x['data2']`` is
        strictly greater than ``threshold``.
    """
    return x['data2'].std() > threshold

# Show the input and the per-group standard deviations first, so the
# outcome of the filter below can be checked by eye.
print(df, df.groupby('key').std(), sep='\n')

# Keep only the rows belonging to groups that satisfy filter_func.
print(df.groupby('key').filter(filter_func))

  key  data1  data2
0   A      0      6
1   B      1      3
2   C      2      7
3   A      3      4
4   B      4      6
5   C      5      9
       data1     data2
key                   
A    2.12132  1.414214
B    2.12132  2.121320
C    2.12132  1.414214
Empty DataFrame
Columns: [key, data1, data2]
Index: []


In [12]:
# Center each column on its group-wise mean; transform returns a frame
# with the same shape and row order as the input.
(
    df
    .groupby('key')
    .transform(lambda col: col - col.mean())
)

Unnamed: 0,data1,data2
0,-1.5,1.0
1,-1.5,-1.5
2,-1.5,-1.0
3,1.5,-1.0
4,1.5,1.5
5,1.5,1.0


In [13]:
def norm_by_data2(x):
    """Return a copy of ``x`` with ``data1`` divided by the sum of ``data2``.

    Intended for ``DataFrameGroupBy.apply``, which passes one group at a
    time. The input is left untouched: mutating the object handed over by
    ``apply`` is documented by pandas as unsupported and can yield
    unexpected results, so a new frame is built instead.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of one group; must contain ``data1`` and ``data2`` columns.

    Returns
    -------
    pandas.DataFrame
        Same columns, order and index as ``x``, with ``data1`` replaced by
        ``data1 / data2.sum()``.
    """
    return x.assign(data1=x['data1'] / x['data2'].sum())

print(df)
# group_keys=False keeps the original row index on the result; with the
# pandas 2.x default (group_keys=True) apply would prepend the group key
# as an extra index level and the rows would no longer line up with `df`.
print(df.groupby('key', group_keys=False).apply(norm_by_data2))

  key  data1  data2
0   A      0      6
1   B      1      3
2   C      2      7
3   A      3      4
4   B      4      6
5   C      5      9
  key     data1  data2
0   A  0.000000      6
1   B  0.111111      3
2   C  0.125000      7
3   A  0.300000      4
4   B  0.444444      6
5   C  0.312500      9


In [14]:
n = df.shape[0]

# One group label per row, fixed by hand so the result is reproducible
# (random labels could be drawn with rng.randint(0, 3, n) instead).
L = [0, 1, 0, 1, 2, 0]

# A list used as a grouping key must supply exactly one label per row.
assert len(L) == n

print(df)
# numeric_only=True restricts the totals to data1/data2; without it,
# pandas 2.x would also "sum" the string column `key` by concatenation.
print(df.groupby(L).sum(numeric_only=True))

  key  data1  data2
0   A      0      6
1   B      1      3
2   C      2      7
3   A      3      4
4   B      4      6
5   C      5      9
   data1  data2
0      7     22
1      4      7
2      4      6


In [15]:
# Input frame followed by the per-key totals, for side-by-side comparison.
print(df, df.groupby('key').sum(), sep='\n')

  key  data1  data2
0   A      0      6
1   B      1      3
2   C      2      7
3   A      3      4
4   B      4      6
5   C      5      9
     data1  data2
key              
A        3     10
B        5      9
C        7     16


In [16]:
# Move 'key' into the index so that a dict can translate index labels
# into group names.
df2 = df.set_index('key')

# Index label -> group name.
# NOTE(review): 'C' is labelled 'vowel' although C is a consonant; the
# recorded outputs depend on this mapping, so confirm before changing it.
mapping = dict(A='vowel', B='consonant', C='vowel')

print(df2, df2.groupby(mapping).sum(), sep='\n')

     data1  data2
key              
A        0      6
B        1      3
C        2      7
A        3      4
B        4      6
C        5      9
           data1  data2
consonant      5      9
vowel         10     26


In [17]:
# A callable key is applied to every index label; here the lower-cased
# label becomes the group name.
print(df2, df2.groupby(str.lower).mean(), sep='\n')

     data1  data2
key              
A        0      6
B        1      3
C        2      7
A        3      4
B        4      6
C        5      9
   data1  data2
a    1.5    5.0
b    2.5    4.5
c    3.5    8.0


In [18]:
# Keys of different kinds can be combined in a list: the callable and the
# dict are both evaluated on the index, giving a two-level group index.
print(df2, df2.groupby([str.lower, mapping]).mean(), sep='\n')

     data1  data2
key              
A        0      6
B        1      3
C        2      7
A        3      4
B        4      6
C        5      9
             data1  data2
a vowel        1.5    5.0
b consonant    2.5    4.5
c vowel        3.5    8.0
