In [91]:
import numpy as np
import pandas as pd

# GroupBy: Split, Apply, Combine


In [92]:
# Seed NumPy's global random generator so any random draws are repeatable
# across "Restart Kernel -> Run All".
np.random.seed(seed=0)

## Apply

Apply a function along an axis of the DataFrame.

Objects passed to the function are Series objects whose index is either

-   the DataFrame’s index (axis=0)
-   the DataFrame’s columns (axis=1)

By default (`result_type=None`), the final return type is inferred from the return type of the applied function.  
Otherwise, it depends on the `result_type` argument.


In [93]:
# Three identical rows of perfect squares (4 and 9), so np.sqrt and np.sum
# give easy-to-check results in the cells below.
data = [[4, 9] for _ in range(3)]

In [94]:
# Wrap the raw rows in a DataFrame with one column per position: "A" and "B".
df = pd.DataFrame(data=data, columns=list("AB"))

In [95]:
# Show the demo frame (columns "A" and "B") that the apply examples operate on.
print(df)

   A  B
0  4  9
1  4  9
2  4  9


In [96]:
# Apply the element-wise square root to the frame; the result has the same
# shape as the input.
df.apply(func=np.sqrt)

Unnamed: 0,A,B
0,2.0,3.0
1,2.0,3.0
2,2.0,3.0


In [97]:
# axis="index" is the named spelling of axis=0: the function receives each
# column as a Series, so the result holds one sum per column.
df.apply(np.sum, axis="index")

A    12
B    27
dtype: int64

In [98]:
# axis="columns" is the named spelling of axis=1: the function receives each
# row as a Series, so the result holds one sum per row.
df.apply(np.sum, axis="columns")

0    13
1    13
2    13
dtype: int64

In [99]:
# Calling np.sum on the frame directly gives the same row sums as the
# df.apply(np.sum, axis=1) call above.
np.sum(df, axis=1)

0    13
1    13
2    13
dtype: int64

## GroupBy

By “group by” we are referring to a process involving one or more of the following steps:

-   Splitting the data into groups based on some criteria.
-   Applying a function to each group independently.
-   Combining the results into a data structure.


In [100]:
# Group labels: the sequence A, B, C repeated twice, so each key owns two rows.
keys = ["A", "B", "C"] * 2
# A second label column, used later to group by two columns at once.
names = "N1 N1 N2 N2 N3 N4".split()

In [101]:
# Assemble the frame used by the GroupBy examples: two label columns
# ("key", "name") and one numeric column ("data") holding 10..15.
df = pd.DataFrame(dict(key=keys, name=names, data=range(10, 16)))

In [102]:
# Show the frame with its "key", "name" and "data" columns before grouping it.
print(df)

  key name  data
0   A   N1    10
1   B   N1    11
2   C   N2    12
3   A   N2    13
4   B   N3    14
5   C   N4    15


In [103]:
# Manual split-apply-combine, shown for comparison with groupby below:
# for every distinct key, select its rows and sum their "data" values.
# The keys are read from the column (in order of first appearance) rather than
# hard-coded, so the loop stays correct if the data changes.
for key in df["key"].unique():
    # .loc selects the matching rows and the "data" column in one step.
    print(df.loc[df["key"] == key, "data"].sum())

23
25
27


In [104]:
# Split the frame into one group per distinct value of the "key" column.
grouped = df.groupby(by="key")

In [105]:
# Iterating a GroupBy yields (label, sub-frame) pairs; print each pair as a
# tuple to show which rows fall into which group.
for label, group in grouped:
    print((label, group))

('A',   key name  data
0   A   N1    10
3   A   N2    13)
('B',   key name  data
1   B   N1    11
4   B   N3    14)
('C',   key name  data
2   C   N2    12
5   C   N4    15)


### Aggregate

Once the GroupBy object has been created, several methods are available to perform computations.

An obvious one is aggregation via the aggregate() or equivalently agg() method:


In [106]:
# Sum every non-key column within each "key" group (string columns are
# concatenated, numeric columns are added).
# Use the "sum" string alias: passing np.sum to a groupby aggregation is
# deprecated (FutureWarning since pandas 2.1) and its meaning will change.
print(df.groupby("key").aggregate("sum"))

     name  data
key            
A    N1N2    23
B    N1N3    25
C    N2N4    27


In [107]:
# Printing the GroupBy object itself only shows its default object repr,
# not the grouped rows.
print(df.groupby("key"))

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000023B6CA1A190>


In [108]:
# Pass a list to compute several aggregations at once: every column gets a
# "min" and a "max" sub-column in the result.
# String aliases replace np.min/np.max, which are deprecated as groupby
# aggregation arguments (FutureWarning since pandas 2.1).
print(df.groupby("key").aggregate(["min", "max"]))

    name     data    
     min max  min max
key                  
A     N1  N2   10  13
B     N1  N3   11  14
C     N2  N4   12  15


In [109]:
# Minimum of every non-key column within each "key" group.
# The "min" alias replaces np.min, which is deprecated as a groupby
# aggregation argument (FutureWarning since pandas 2.1).
print(df.groupby("key").aggregate("min"))

    name  data
key           
A     N1    10
B     N1    11
C     N2    12


In [110]:
# Group by two columns at once: every distinct ("key", "name") pair forms its
# own group. This rebinds `grouped`, which the cells below rely on.
grouped = df.groupby(by=["key", "name"])

In [111]:
# Minimum of "data" for every ("key", "name") pair; the result is indexed by
# both grouping columns.
# The "min" alias replaces np.min, which is deprecated as a groupby
# aggregation argument (FutureWarning since pandas 2.1).
grouped.aggregate("min")

Unnamed: 0_level_0,Unnamed: 1_level_0,data
key,name,Unnamed: 2_level_1
A,N1,10
A,N2,13
B,N1,11
B,N3,14
C,N2,12
C,N4,15


### Transform

The transform method returns an object that is indexed the same as the one being grouped.


In [112]:
# Square the values group by group; transform returns a result aligned with
# the original row index rather than one row per group.
grouped.transform(lambda values: values.pow(2))

Unnamed: 0,data
0,100
1,121
2,144
3,169
4,196
5,225


### Filter

The filter method returns a subset of the original object.  
Suppose we want to keep only the rows that belong to groups whose `data` sum is greater than 12 (here every group holds a single row, so the group sum is just that row's `data` value).


In [113]:
# Keep only the rows of groups whose "data" sum exceeds 12.
# The predicate must return one boolean per group. The previous form
# (x.data > 12) returned a Series, which only worked because every group here
# has exactly one row and fails with a TypeError on larger groups.
grouped.filter(lambda group: group["data"].sum() > 12)

Unnamed: 0,key,name,data
3,A,N2,13
4,B,N3,14
5,C,N4,15


In [114]:
# Same filter, but dropna=False keeps the original shape: rows of groups that
# fail the test are filled with NaN instead of being dropped.
# The predicate returns one boolean per group (the group's "data" sum compared
# with 12) rather than a Series, so it also works for multi-row groups.
grouped.filter(
    lambda group: group["data"].sum() > 12,
    dropna=False,
)

   key name  data
0  NaN  NaN   NaN
1  NaN  NaN   NaN
2  NaN  NaN   NaN
3    A   N2  13.0
4    B   N3  14.0
5    C   N4  15.0
