## [ Apply: general split-apply-combine ]

- The most general-purpose GroupBy method is apply.
- apply splits the object being manipulated into pieces, invokes the passed function on each piece, and then attempts to concatenate the pieces


In [61]:
import numpy as np 
import pandas as pd 

# suppose we want to select the top five tip_pct values by group
# first write a function that selects the rows with the largest values in a particular column

def top(df, n=5, column="tip_pct"):
    """Return the first `n` rows of `df` after sorting by `column`, largest first.

    Parameters
    ----------
    df : DataFrame to rank.
    n : number of leading rows to keep after sorting (default 5).
    column : name of the column to sort on (default "tip_pct").
    """
    ranked = df.sort_values(by=column, ascending=False)
    return ranked[:n]

tips = pd.read_csv("examples/tips.csv")
tips["tip_pct"] = tips["tip"] / tips["total_bill"]

top(tips, n=6)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
232,11.61,3.39,No,Sat,Dinner,2,0.29199
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525


In [62]:
# now if we group by smoker, say, and call apply with this function, we get
tips.groupby("smoker").apply(top)

# first, the tips dataframe is split into groups based on the value of smoker.
# then the top function is called on each group, and the results of each function call are glued together using pd.concat, labeling the pieces with the group names
# the result therefore has a hierarchical index 

  tips.groupby("smoker").apply(top)


Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,232,11.61,3.39,No,Sat,Dinner,2,0.29199
No,149,7.51,2.0,No,Thur,Lunch,2,0.266312
No,51,10.29,2.6,No,Sun,Dinner,2,0.252672
No,185,20.69,5.0,No,Sun,Dinner,5,0.241663
No,88,24.71,5.85,No,Thur,Lunch,2,0.236746
Yes,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345
Yes,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
Yes,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
Yes,183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
Yes,109,14.31,4.0,Yes,Sat,Dinner,2,0.279525


In [63]:
# if you pass a function to apply that takes other arguments or keywords, you can pass these after the function
tips.groupby(["smoker", "day"]).apply(top, n=1, column="total_bill")

  tips.groupby(["smoker", "day"]).apply(top, n=1, column="total_bill")


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,Fri,94,22.75,3.25,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,No,Sun,Dinner,6,0.103799
No,Thur,142,41.19,5.0,No,Thur,Lunch,5,0.121389
Yes,Fri,95,40.17,4.73,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Yes,Sun,Dinner,3,0.077178
Yes,Thur,197,43.11,5.0,Yes,Thur,Lunch,4,0.115982


- beyond these basic usage mechanics, getting the most out of apply may require some creativity
- what occurs inside the function passed is up to you; it must return either a pandas object or a scalar value

In [64]:
# recall calling of describe on a GroupBy object
result = tips.groupby("smoker")["tip_pct"].describe()
result

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,151.0,0.159328,0.03991,0.056797,0.136906,0.155625,0.185014,0.29199
Yes,93.0,0.163196,0.085119,0.035638,0.106771,0.153846,0.195059,0.710345


In [65]:
result.unstack()

       smoker
count  No        151.000000
       Yes        93.000000
mean   No          0.159328
       Yes         0.163196
std    No          0.039910
       Yes         0.085119
min    No          0.056797
       Yes         0.035638
25%    No          0.136906
       Yes         0.106771
50%    No          0.155625
       Yes         0.153846
75%    No          0.185014
       Yes         0.195059
max    No          0.291990
       Yes         0.710345
dtype: float64

In [66]:
# inside GroupBy, when you invoke a method like describe, it is actually just a shortcut for
        # def f(group):
        #     return group.describe()
        # grouped.apply(f)

## [ Suppressing the Group Keys ]

In [67]:
# In the preceding examples, you see that the resulting object has a hierarchical index formed from the group keys, along with the indexes of each piece of the original object. You can disable this by passing group_keys=False to groupby:

tips.groupby("smoker", group_keys=False).apply(top)

  tips.groupby("smoker", group_keys=False).apply(top)


Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
232,11.61,3.39,No,Sat,Dinner,2,0.29199
149,7.51,2.0,No,Thur,Lunch,2,0.266312
51,10.29,2.6,No,Sun,Dinner,2,0.252672
185,20.69,5.0,No,Sun,Dinner,5,0.241663
88,24.71,5.85,No,Thur,Lunch,2,0.236746
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525


## [ Quantile and Bucket Analysis ]

- As you may know, pandas has some tools, in particular pandas.cut
and pandas.qcut, for slicing data up into buckets with bins of your choosing, or by sample quantiles. 
- Combining these functions with groupby makes it convenient to
perform bucket or quantile analysis on a dataset. 

In [68]:
# Consider a simple random dataset and an equal-length bucket categorization using pandas.cut

frame = pd.DataFrame({"data1": np.random.standard_normal(1000),
                      "data2": np.random.standard_normal(1000)})
frame.head()

Unnamed: 0,data1,data2
0,0.466817,1.691316
1,1.050536,0.162481
2,0.48837,-1.387649
3,0.513613,-0.94461
4,-0.559092,-1.870416


In [69]:
quartiles = pd.cut(frame["data1"], 4)
print(quartiles.value_counts())
quartiles.head(10)

data1
(-0.0892, 1.474]     458
(-1.652, -0.0892]    416
(1.474, 3.037]        81
(-3.222, -1.652]      45
Name: count, dtype: int64


0     (-0.0892, 1.474]
1     (-0.0892, 1.474]
2     (-0.0892, 1.474]
3     (-0.0892, 1.474]
4    (-1.652, -0.0892]
5    (-1.652, -0.0892]
6    (-1.652, -0.0892]
7     (-0.0892, 1.474]
8     (-0.0892, 1.474]
9    (-1.652, -0.0892]
Name: data1, dtype: category
Categories (4, interval[float64, right]): [(-3.222, -1.652] < (-1.652, -0.0892] < (-0.0892, 1.474] < (1.474, 3.037]]

In [70]:
# the categorical object returned by cut can be passed directly to groupby
# so we could compute a set of group statistics for the quartiles, like so:
def get_stats(group):
    """Summarize `group` with its min, max, count, and mean.

    Each statistic is obtained by calling the same-named method on `group`;
    the four results become the columns "min", "max", "count", and "mean"
    of the returned DataFrame.
    """
    stat_names = ("min", "max", "count", "mean")
    summary = {stat: getattr(group, stat)() for stat in stat_names}
    return pd.DataFrame(summary)
# `quartiles` is categorical, so `observed` is passed explicitly: relying on the
# groupby default for categorical keys is deprecated and emits a FutureWarning.
# observed=False keeps every interval category in the result.
grouped = frame.groupby(quartiles, observed=False)

grouped.apply(get_stats)

  grouped = frame.groupby(quartiles)


Unnamed: 0_level_0,Unnamed: 1_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"(-3.222, -1.652]",data1,-3.21548,-1.655377,45,-2.048852
"(-3.222, -1.652]",data2,-2.885548,2.147741,45,-0.183837
"(-1.652, -0.0892]",data1,-1.624623,-0.090682,416,-0.745429
"(-1.652, -0.0892]",data2,-3.413402,3.755253,416,0.056854
"(-0.0892, 1.474]",data1,-0.084528,1.471313,458,0.570076
"(-0.0892, 1.474]",data2,-3.045723,3.308959,458,-0.064725
"(1.474, 3.037]",data1,1.475302,3.037084,81,1.938774
"(1.474, 3.037]",data2,-2.095564,2.142299,81,0.127808


In [71]:
# keep in mind the same result could have been computed more simply with:
grouped.agg(["min", "max", "count", "mean"])

Unnamed: 0_level_0,data1,data1,data1,data1,data2,data2,data2,data2
Unnamed: 0_level_1,min,max,count,mean,min,max,count,mean
data1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
"(-3.222, -1.652]",-3.21548,-1.655377,45,-2.048852,-2.885548,2.147741,45,-0.183837
"(-1.652, -0.0892]",-1.624623,-0.090682,416,-0.745429,-3.413402,3.755253,416,0.056854
"(-0.0892, 1.474]",-0.084528,1.471313,458,0.570076,-3.045723,3.308959,458,-0.064725
"(1.474, 3.037]",1.475302,3.037084,81,1.938774,-2.095564,2.142299,81,0.127808


In [72]:
# these were equal-length buckets; to compute equal-size buckets based on sample quantiles, use pd.qcut
# pass 4 as the number of buckets to compute sample quartiles, and pass labels=False to obtain just the quartile indices instead of intervals

quartiles_samp = pd.qcut(frame["data1"], 4, labels=False)
print(quartiles_samp.value_counts())
quartiles_samp.head()

data1
2    250
3    250
1    250
0    250
Name: count, dtype: int64


0    2
1    3
2    2
3    2
4    1
Name: data1, dtype: int64

In [73]:
grouped = frame.groupby(quartiles_samp)
grouped.apply(get_stats)

Unnamed: 0_level_0,Unnamed: 1_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,data1,-3.21548,-0.70094,250,-1.267522
0,data2,-2.885548,2.766092,250,-0.052972
1,data1,-0.698023,0.001124,250,-0.34859
1,data2,-3.413402,3.755253,250,0.138326
2,data1,0.005012,0.688816,250,0.351996
2,data2,-2.592583,3.308959,250,-0.005815
3,data1,0.692266,3.037084,250,1.327471
3,data2,-3.045723,2.142299,250,-0.095192


## [ Example: Filling Missing Values with Group-Specific Values]

In [74]:
# when cleaning up missing data, in some cases you will remove data observations using dropna, but in others you may want to fill in the null (NA) values using a fixed value or some value derived from the data. 
# fillna is the right tool to use; for example here

s = pd.Series(np.random.standard_normal(6))
s[::2] = np.nan
s

0         NaN
1   -1.134430
2         NaN
3    0.756497
4         NaN
5    1.613966
dtype: float64

In [75]:
s.fillna(s.mean())

0    0.412011
1   -1.134430
2    0.412011
3    0.756497
4    0.412011
5    1.613966
dtype: float64

In [76]:
# suppose you need the fill value to vary by group.
# one way to do this is to group the data and use apply with a function that calls fillna on each data chunk

states = ["Ohio", "New York", "Vermont", "Florida",
          "Oregon", "Nevada", "California", "Idaho"]

group_key = ["East", "East", "East", "East",
             "West", "West", "West", "West"]

data = pd.Series(np.random.standard_normal(8), index=states)
data

Ohio          0.869662
New York     -1.912780
Vermont       0.139865
Florida       0.174899
Oregon        1.131668
Nevada        0.269421
California   -0.244038
Idaho         0.059524
dtype: float64

In [77]:
# set some values in the data to be missing
data[["Vermont", "Nevada", "Idaho"]] = np.nan
data

Ohio          0.869662
New York     -1.912780
Vermont            NaN
Florida       0.174899
Oregon        1.131668
Nevada             NaN
California   -0.244038
Idaho              NaN
dtype: float64

In [78]:
data.groupby(group_key).size()  # counts total elements in each group

East    4
West    4
dtype: int64

In [79]:
data.groupby(group_key).count() # counts non-NA elements in each group

East    3
West    2
dtype: int64

In [80]:
data.groupby(group_key).mean()

East   -0.289406
West    0.443815
dtype: float64

In [81]:
# we can fill the NA values using the group means, like so:
def fill_mean(group):
    """Return `group` with its missing entries replaced by the mean of `group`."""
    group_average = group.mean()
    return group.fillna(group_average)

data.groupby(group_key).apply(fill_mean)

# group mean is the average value of a column within each group, created by the .groupby() operation in pandas

# why use group mean?
    # - to summarize large datasets by category
    # - to compare groups
    # - to find patterns or anomalies in data

East  Ohio          0.869662
      New York     -1.912780
      Vermont      -0.289406
      Florida       0.174899
West  Oregon        1.131668
      Nevada        0.443815
      California   -0.244038
      Idaho         0.443815
dtype: float64

In [83]:
# in another case, you might have predefined fill values in your code that vary by group.
# since the groups have a name attribute set internally, we can use that

# Predefined replacement for missing entries, keyed by group name.
fill_values = {"East": 0.5, "West": -1}


def fill_func(group):
    """Fill the missing entries of `group` with its predefined value.

    The replacement is looked up in `fill_values` using `group.name`.
    """
    replacement = fill_values[group.name]
    return group.fillna(replacement)

data.groupby(group_key).apply(fill_func)

East  Ohio          0.869662
      New York     -1.912780
      Vermont       0.500000
      Florida       0.174899
West  Oregon        1.131668
      Nevada       -1.000000
      California   -0.244038
      Idaho        -1.000000
dtype: float64