## [ Apply: general split-apply-combine ]

- The most general-purpose GroupBy method is apply.
- apply splits the object being manipulated into pieces, invokes the passed function on each piece, and then attempts to concatenate the pieces.


In [34]:
import numpy as np 
import pandas as pd 

# suppose we want to select the top five tip_pct values by group
# first write a function that selects the rows with the largest values in a particular column

def top(df, n=5, column="tip_pct"):
    """Return the ``n`` rows of ``df`` with the largest values in ``column``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame (or the per-group chunk handed over by ``GroupBy.apply``) to rank.
    n : int, default 5
        Number of rows to keep.
    column : str, default "tip_pct"
        Column whose values determine the ranking.

    Returns
    -------
    pandas.DataFrame
        The first ``n`` rows after sorting by ``column`` in descending order;
        the original index labels are retained.
    """
    # .iloc makes the row selection explicitly positional. A bare ``[:n]``
    # slice goes through DataFrame.__getitem__, whose handling of integer
    # slices can be label-based for some index dtypes (e.g. float indexes).
    return df.sort_values(column, ascending=False).iloc[:n]

# Load the tipping data and derive each bill's tip as a fraction of the total.
tips = pd.read_csv("examples/tips.csv").assign(
    tip_pct=lambda bills: bills["tip"] / bills["total_bill"]
)

# Try the helper on the whole frame: the six largest tip percentages.
top(tips, n=6)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
232,11.61,3.39,No,Sat,Dinner,2,0.29199
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525


In [35]:
# Grouping by smoker and calling apply with this function runs it once per group:
tips.groupby(by="smoker").apply(top)

# How the result is assembled:
#   1. the tips dataframe is split into groups based on the value of smoker;
#   2. the top function is called on each group;
#   3. the results of the calls are glued together using pd.concat, labeling the pieces with the group names.
# The result therefore has a hierarchical index.

  tips.groupby("smoker").apply(top)


Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,232,11.61,3.39,No,Sat,Dinner,2,0.29199
No,149,7.51,2.0,No,Thur,Lunch,2,0.266312
No,51,10.29,2.6,No,Sun,Dinner,2,0.252672
No,185,20.69,5.0,No,Sun,Dinner,5,0.241663
No,88,24.71,5.85,No,Thur,Lunch,2,0.236746
Yes,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345
Yes,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
Yes,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
Yes,183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
Yes,109,14.31,4.0,Yes,Sat,Dinner,2,0.279525


In [36]:
# Extra arguments or keywords that the function takes can be passed to apply after the function itself.
tips.groupby(by=["smoker", "day"]).apply(
    top,
    n=1,
    column="total_bill",
)

  tips.groupby(["smoker", "day"]).apply(top, n=1, column="total_bill")


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,Fri,94,22.75,3.25,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,No,Sun,Dinner,6,0.103799
No,Thur,142,41.19,5.0,No,Thur,Lunch,5,0.121389
Yes,Fri,95,40.17,4.73,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Yes,Sun,Dinner,3,0.077178
Yes,Thur,197,43.11,5.0,Yes,Thur,Lunch,4,0.115982


- beyond these basic usage mechanics, getting the most out of apply may require some creativity
- what occurs inside the function passed is up to you; it must either return a pandas object or a scalar value

In [37]:
# Recall what calling describe on a GroupBy object returns: summary statistics of tip_pct per smoker group.
result = tips.groupby(by="smoker")["tip_pct"].describe()
result

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,151.0,0.159328,0.03991,0.056797,0.136906,0.155625,0.185014,0.29199
Yes,93.0,0.163196,0.085119,0.035638,0.106771,0.153846,0.195059,0.710345


In [38]:
# Pivot the statistic columns into the row index: the result is a single Series keyed by (statistic, smoker).
result.unstack()

       smoker
count  No        151.000000
       Yes        93.000000
mean   No          0.159328
       Yes         0.163196
std    No          0.039910
       Yes         0.085119
min    No          0.056797
       Yes         0.035638
25%    No          0.136906
       Yes         0.106771
50%    No          0.155625
       Yes         0.153846
75%    No          0.185014
       Yes         0.195059
max    No          0.291990
       Yes         0.710345
dtype: float64

In [39]:
# inside GroupBy, when you invoke a method like describe, it is actually just a shortcut for
        # def f(group):
        #     return group.describe()
        # grouped.apply(f)

## [ Suppressing the Group Keys ]

In [40]:
# In the preceding examples, the resulting object has a hierarchical index formed from the group keys
# along with the indexes of each piece of the original object.
# Passing group_keys=False to groupby disables this:

tips.groupby(by="smoker", group_keys=False).apply(top)

  tips.groupby("smoker", group_keys=False).apply(top)


Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
232,11.61,3.39,No,Sat,Dinner,2,0.29199
149,7.51,2.0,No,Thur,Lunch,2,0.266312
51,10.29,2.6,No,Sun,Dinner,2,0.252672
185,20.69,5.0,No,Sun,Dinner,5,0.241663
88,24.71,5.85,No,Thur,Lunch,2,0.236746
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525


## [ Quantile and Bucket Analysis ]

- As you may know, pandas has some tools, in particular pandas.cut
and pandas.qcut, for slicing data up into buckets with bins of your choosing, or by sample quantiles. 
- Combining these functions with groupby makes it convenient to
perform bucket or quantile analysis on a dataset. 

In [41]:
# Consider a simple random dataset and an equal-length bucket categorization using pandas.cut

# A seeded generator makes the draws reproducible, so re-running the notebook
# rebuilds the same frame (and therefore the same buckets in the cells that use it).
rng = np.random.default_rng(12345)
frame = pd.DataFrame({"data1": rng.standard_normal(1000),
                      "data2": rng.standard_normal(1000)})
frame.head()

Unnamed: 0,data1,data2
0,0.038794,0.286698
1,0.362945,-1.249519
2,1.104102,-0.401932
3,-0.125759,0.152991
4,-1.533233,0.042157


In [42]:
# Split data1 into four equal-width bins (despite the variable name, these are not sample quantiles).
quartiles = pd.cut(frame["data1"], bins=4)
print(quartiles.value_counts())
quartiles.head(10)

data1
(-0.231, 1.287]    488
(-1.75, -0.231]    374
(1.287, 2.806]      98
(-3.274, -1.75]     40
Name: count, dtype: int64


0    (-0.231, 1.287]
1    (-0.231, 1.287]
2    (-0.231, 1.287]
3    (-0.231, 1.287]
4    (-1.75, -0.231]
5     (1.287, 2.806]
6     (1.287, 2.806]
7    (-0.231, 1.287]
8    (-1.75, -0.231]
9    (-0.231, 1.287]
Name: data1, dtype: category
Categories (4, interval[float64, right]): [(-3.274, -1.75] < (-1.75, -0.231] < (-0.231, 1.287] < (1.287, 2.806]]

In [43]:
# the categorical object returned by cut can be passed directly to groupby
# so we could compute a set of group statistics for the quartiles, like so:
def get_stats(group):
    """Summarize ``group`` with its min, max, count and mean.

    Each statistic is obtained by calling the same-named method on ``group``
    and becomes one column of the returned DataFrame, in that order.
    """
    stat_names = ("min", "max", "count", "mean")
    return pd.DataFrame({stat: getattr(group, stat)() for stat in stat_names})
# quartiles is categorical, so observed is spelled out: recent pandas emits a
# FutureWarning when it is omitted for categorical groupers because the default
# is slated to change. observed=False keeps the current behavior (every bin is
# reported, including any that happen to be empty).
grouped = frame.groupby(quartiles, observed=False)

grouped.apply(get_stats)

  grouped = frame.groupby(quartiles)


Unnamed: 0_level_0,Unnamed: 1_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"(-3.274, -1.75]",data1,-3.268083,-1.76893,40,-2.276343
"(-3.274, -1.75]",data2,-2.800089,3.238439,40,-0.043268
"(-1.75, -0.231]",data1,-1.746539,-0.239875,374,-0.812946
"(-1.75, -0.231]",data2,-3.268214,3.194451,374,0.002882
"(-0.231, 1.287]",data1,-0.230847,1.28566,488,0.417931
"(-0.231, 1.287]",data2,-3.393084,2.784019,488,0.017693
"(1.287, 2.806]",data1,1.2881,2.805536,98,1.679207
"(1.287, 2.806]",data2,-1.989063,2.923969,98,-0.062749


In [44]:
# Keep in mind the same statistics could have been computed more simply with agg.
# The layout differs: the statistics become a second column level under data1/data2 instead of rows.
grouped.agg(["min", "max", "count", "mean"])

Unnamed: 0_level_0,data1,data1,data1,data1,data2,data2,data2,data2
Unnamed: 0_level_1,min,max,count,mean,min,max,count,mean
data1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
"(-3.274, -1.75]",-3.268083,-1.76893,40,-2.276343,-2.800089,3.238439,40,-0.043268
"(-1.75, -0.231]",-1.746539,-0.239875,374,-0.812946,-3.268214,3.194451,374,0.002882
"(-0.231, 1.287]",-0.230847,1.28566,488,0.417931,-3.393084,2.784019,488,0.017693
"(1.287, 2.806]",1.2881,2.805536,98,1.679207,-1.989063,2.923969,98,-0.062749


In [47]:
# The buckets above were equal-length; to compute equal-size buckets based on sample quantiles, use pd.qcut.
# Passing 4 as the number of buckets computes sample quartiles; labels=False returns just the quartile indices instead of intervals.

quartiles_samp = pd.qcut(frame["data1"], q=4, labels=False)
print(quartiles_samp.value_counts())
quartiles_samp.head()

data1
2    250
3    250
1    250
0    250
Name: count, dtype: int64


0    2
1    2
2    3
3    1
4    0
Name: data1, dtype: int64

In [48]:
# Group on the integer quartile indices and reuse the same summary helper.
grouped = frame.groupby(by=quartiles_samp)
grouped.apply(get_stats)

Unnamed: 0_level_0,Unnamed: 1_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,data1,-3.268083,-0.659625,250,-1.284221
0,data2,-3.268214,3.238439,250,0.060014
1,data1,-0.659272,-0.038289,250,-0.341169
1,data2,-3.393084,3.038147,250,-0.06867
2,data1,-0.038103,0.648077,250,0.291132
2,data2,-3.242758,2.682023,250,0.048681
3,data1,0.650728,2.805536,250,1.227925
3,data2,-2.442275,2.923969,250,-0.032696
