## [ Apply: general split-apply-combine ]

- The most general-purpose GroupBy method is apply.
- apply splits the object being manipulated into pieces, invokes the passed function on each piece, and then attempts to concatenate the pieces.


In [26]:
import numpy as np 
import pandas as pd 

# suppose we want to select the top five tip_pct values by group
# first write a function that selects the rows with the largest values in a particular column

def top(df, n=5, column="tip_pct"):
    """Return the ``n`` rows of ``df`` with the largest values in ``column``.

    The rows are ordered by ``column`` from largest to smallest, and the
    leading ``n`` rows of that ordering are returned.
    """
    ranked = df.sort_values(by=column, ascending=False)
    return ranked[:n]

# Load the tips data and add each tip expressed as a fraction of its bill.
tips = pd.read_csv("examples/tips.csv").assign(
    tip_pct=lambda bills: bills["tip"] / bills["total_bill"]
)

# The six rows with the highest tip_pct across the whole dataset.
top(tips, n=6)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
232,11.61,3.39,No,Sat,Dinner,2,0.29199
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525


In [27]:
# Now if we group by smoker, say, and call apply with this function, we get:
tips.groupby("smoker").apply(top)

# First, the tips DataFrame is split into groups based on the value of smoker.
# Then the top function is called on each group, and the results of each
# function call are glued together using pd.concat, labeling the pieces with
# the group names.
# The result therefore has a hierarchical index: the group key (smoker) on the
# outer level and the original row labels on the inner level.
# NOTE(review): the source line echoed at the top of this cell's output looks
# like residue of a pandas warning — presumably the deprecation of apply
# operating on the grouping columns; confirm against the installed version.

  tips.groupby("smoker").apply(top)


Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,232,11.61,3.39,No,Sat,Dinner,2,0.29199
No,149,7.51,2.0,No,Thur,Lunch,2,0.266312
No,51,10.29,2.6,No,Sun,Dinner,2,0.252672
No,185,20.69,5.0,No,Sun,Dinner,5,0.241663
No,88,24.71,5.85,No,Thur,Lunch,2,0.236746
Yes,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345
Yes,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
Yes,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
Yes,183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
Yes,109,14.31,4.0,Yes,Sat,Dinner,2,0.279525


In [28]:
# If the function passed to apply takes other arguments or keywords, you can
# pass these after the function.
# Here: within every (smoker, day) group, keep the single row with the largest
# total_bill.
tips.groupby(["smoker", "day"]).apply(top, n=1, column="total_bill")

  tips.groupby(["smoker", "day"]).apply(top, n=1, column="total_bill")


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,Fri,94,22.75,3.25,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,No,Sun,Dinner,6,0.103799
No,Thur,142,41.19,5.0,No,Thur,Lunch,5,0.121389
Yes,Fri,95,40.17,4.73,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Yes,Sun,Dinner,3,0.077178
Yes,Thur,197,43.11,5.0,Yes,Thur,Lunch,4,0.115982


- Beyond these basic usage mechanics, getting the most out of apply may require some creativity.
- What occurs inside the function passed is up to you; it must either return a pandas object or a scalar value.

In [29]:
# Recall calling describe on a GroupBy object: it yields one row of summary
# statistics of tip_pct per smoker group.
result = tips.groupby("smoker")["tip_pct"].describe()
result

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,151.0,0.159328,0.03991,0.056797,0.136906,0.155625,0.185014,0.29199
Yes,93.0,0.163196,0.085119,0.035638,0.106771,0.153846,0.195059,0.710345


In [30]:
# Unstacking reshapes the table into a single Series indexed by
# (statistic, smoker).
result.unstack()

       smoker
count  No        151.000000
       Yes        93.000000
mean   No          0.159328
       Yes         0.163196
std    No          0.039910
       Yes         0.085119
min    No          0.056797
       Yes         0.035638
25%    No          0.136906
       Yes         0.106771
50%    No          0.155625
       Yes         0.153846
75%    No          0.185014
       Yes         0.195059
max    No          0.291990
       Yes         0.710345
dtype: float64

In [31]:
# inside GroupBy, when you invoke a method like describe, it is actually just a shortcut for
        # def f(group):
        #     return group.describe()
        # grouped.apply(f)

## [ Suppressing the Group Keys ]

In [32]:
# In the preceding examples, the resulting object has a hierarchical index
# formed from the group keys, along with the indexes of each piece of the
# original object. You can disable this by passing group_keys=False to groupby:

tips.groupby("smoker", group_keys=False).apply(top)

  tips.groupby("smoker", group_keys=False).apply(top)


Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
232,11.61,3.39,No,Sat,Dinner,2,0.29199
149,7.51,2.0,No,Thur,Lunch,2,0.266312
51,10.29,2.6,No,Sun,Dinner,2,0.252672
185,20.69,5.0,No,Sun,Dinner,5,0.241663
88,24.71,5.85,No,Thur,Lunch,2,0.236746
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525


## [ Quantile and Bucket Analysis ]

- As you may know, pandas has some tools, in particular pandas.cut
and pandas.qcut, for slicing data up into buckets with bins of your choosing, or by sample quantiles.
- Combining these functions with groupby makes it convenient to
perform bucket or quantile analysis on a dataset.

In [33]:
# Consider a simple random dataset and an equal-length bucket categorization using pandas.cut

# Seed NumPy's global generator before the first random draw so that a fresh
# "Restart & Run All" reproduces the same numbers (the value 12345 is
# arbitrary). Outputs saved from an earlier, unseeded run will differ until
# the notebook is re-executed.
np.random.seed(12345)

frame = pd.DataFrame({"data1": np.random.standard_normal(1000),
                      "data2": np.random.standard_normal(1000)})
frame.head()

Unnamed: 0,data1,data2
0,-1.738719,0.629109
1,0.293958,2.332267
2,0.850296,1.136649
3,1.674595,-1.98601
4,0.719073,-0.90936


In [34]:
# Cut data1 into 4 buckets of equal width over its observed range. Despite the
# variable name these are not true quartiles: the bucket counts printed below
# are far from equal.
quartiles = pd.cut(frame["data1"], 4)
print(quartiles.value_counts())
quartiles.head(10)

data1
(-1.26, 0.381]     554
(0.381, 2.021]     323
(-2.907, -1.26]     95
(2.021, 3.662]      28
Name: count, dtype: int64


0    (-2.907, -1.26]
1     (-1.26, 0.381]
2     (0.381, 2.021]
3     (0.381, 2.021]
4     (0.381, 2.021]
5     (0.381, 2.021]
6     (-1.26, 0.381]
7     (2.021, 3.662]
8     (-1.26, 0.381]
9     (0.381, 2.021]
Name: data1, dtype: category
Categories (4, interval[float64, right]): [(-2.907, -1.26] < (-1.26, 0.381] < (0.381, 2.021] < (2.021, 3.662]]

In [35]:
# the categorical object returned by cut can be passed directly to groupby
# so we could compute a set of group statistics for the quartiles, like so:
def get_stats(group):
    """Summarise ``group`` with its min, max, non-NA count and mean.

    Each statistic becomes one column of the returned DataFrame, in the order
    min, max, count, mean.
    """
    stat_names = ("min", "max", "count", "mean")
    return pd.DataFrame({stat: getattr(group, stat)() for stat in stat_names})
# quartiles is categorical, so pass observed=False explicitly: this keeps the
# current grouping behaviour and avoids the FutureWarning pandas emits when the
# deprecated default for categorical groupers is relied upon.
grouped = frame.groupby(quartiles, observed=False)

# One block of statistics per bucket, with one row per column of frame.
grouped.apply(get_stats)

  grouped = frame.groupby(quartiles)


Unnamed: 0_level_0,Unnamed: 1_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"(-2.907, -1.26]",data1,-2.900017,-1.262388,95,-1.761361
"(-2.907, -1.26]",data2,-3.014594,3.476083,95,0.221319
"(-1.26, 0.381]",data1,-1.257513,0.380482,554,-0.302739
"(-1.26, 0.381]",data2,-2.604739,3.576873,554,0.027645
"(0.381, 2.021]",data1,0.382477,2.001394,323,0.94887
"(0.381, 2.021]",data2,-2.968587,3.061215,323,0.029135
"(2.021, 3.662]",data1,2.027436,3.661777,28,2.369938
"(2.021, 3.662]",data2,-1.690952,1.564011,28,-0.039423


In [36]:
# Keep in mind that the same statistics could have been computed more simply
# with agg, which lays them out wide: one row per bucket and one column per
# (variable, statistic) pair.
grouped.agg(["min", "max", "count", "mean"])

Unnamed: 0_level_0,data1,data1,data1,data1,data2,data2,data2,data2
Unnamed: 0_level_1,min,max,count,mean,min,max,count,mean
data1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
"(-2.907, -1.26]",-2.900017,-1.262388,95,-1.761361,-3.014594,3.476083,95,0.221319
"(-1.26, 0.381]",-1.257513,0.380482,554,-0.302739,-2.604739,3.576873,554,0.027645
"(0.381, 2.021]",0.382477,2.001394,323,0.94887,-2.968587,3.061215,323,0.029135
"(2.021, 3.662]",2.027436,3.661777,28,2.369938,-1.690952,1.564011,28,-0.039423


In [37]:
# These were equal-length buckets; to compute equal-size buckets based on sample quantiles, use pd.qcut.
# Pass 4 as the number of buckets to compute sample quartiles, and pass labels=False to obtain just the quartile indices instead of intervals.

quartiles_samp = pd.qcut(frame["data1"], 4, labels=False)
print(quartiles_samp.value_counts())
quartiles_samp.head()

data1
0    250
2    250
3    250
1    250
Name: count, dtype: int64


0    0
1    2
2    3
3    3
4    3
Name: data1, dtype: int64

In [38]:
# Group by the integer quartile codes this time. Note that this rebinds the
# name `grouped`, which an earlier cell used for the equal-width buckets.
grouped = frame.groupby(quartiles_samp)
grouped.apply(get_stats)

Unnamed: 0_level_0,Unnamed: 1_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,data1,-2.900017,-0.551949,250,-1.216123
0,data2,-3.014594,3.476083,250,0.049951
1,data1,-0.55055,0.038096,250,-0.239051
1,data2,-2.604739,3.576873,250,0.082939
2,data1,0.039066,0.674501,250,0.328874
2,data2,-2.968587,3.097819,250,0.045352
3,data1,0.676977,3.661777,250,1.277487
3,data2,-2.21559,3.061215,250,0.000347


## [ Example: Filling Missing Values with Group-Specific Values ]

In [39]:
# when cleaning up missing data, in some cases you will remove data observations using dropna, but in others you may want to fill in the null (NA) values using a fixed value or some value derived from the data. 
# fillna is the right tool to use; for example here

# Six random draws; every other entry, starting with the first, is then
# replaced by NaN.
s = pd.Series(np.random.standard_normal(size=6))
s.iloc[::2] = np.nan
s

0         NaN
1   -0.016154
2         NaN
3   -1.440432
4         NaN
5    0.395233
dtype: float64

In [40]:
# Replace every missing entry with the mean of the non-missing values.
s.fillna(s.mean())

0   -0.353784
1   -0.016154
2   -0.353784
3   -1.440432
4   -0.353784
5    0.395233
dtype: float64

In [41]:
# Suppose the fill value needs to vary by group.
# One way to do this is to group the data and use apply with a function that
# calls fillna on each data chunk.

# Eight states: the first four are labelled East, the last four West.
states = [
    "Ohio", "New York", "Vermont", "Florida",
    "Oregon", "Nevada", "California", "Idaho",
]

group_key = ["East"] * 4 + ["West"] * 4

data = pd.Series(np.random.standard_normal(len(states)), index=states)
data

Ohio          1.902740
New York     -1.554173
Vermont       1.240960
Florida       0.370891
Oregon       -1.529171
Nevada        0.560894
California    0.696777
Idaho         0.672355
dtype: float64

In [42]:
# Blank out three of the states to simulate missing observations
# (this modifies data in place).
data.loc[["Vermont", "Nevada", "Idaho"]] = np.nan
data

Ohio          1.902740
New York     -1.554173
Vermont            NaN
Florida       0.370891
Oregon       -1.529171
Nevada             NaN
California    0.696777
Idaho              NaN
dtype: float64

In [43]:
data.groupby(group_key).size()  # counts all elements in each group, NA included

East    4
West    4
dtype: int64

In [44]:
data.groupby(group_key).count() # counts only the non-NA elements in each group

East    3
West    2
dtype: int64

In [45]:
# Group means are computed over the non-NA values only.
data.groupby(group_key).mean()

East    0.239819
West   -0.416197
dtype: float64

In [46]:
# we can fill the NA values using the group means, like so:
def fill_mean(group):
    """Return ``group`` with its NA entries replaced by the group's own mean."""
    group_average = group.mean()
    return group.fillna(group_average)

data.groupby(group_key).apply(fill_mean)

# A group mean is the average value of a column within each group, created by
# the .groupby() operation in pandas.

# Why use group means?
# - to summarize large datasets by category
# - to compare groups
# - to find patterns or anomalies in data

East  Ohio          1.902740
      New York     -1.554173
      Vermont       0.239819
      Florida       0.370891
West  Oregon       -1.529171
      Nevada       -0.416197
      California    0.696777
      Idaho        -0.416197
dtype: float64

In [47]:
# in another case, you might have predefined fill values in your code that vary by group.
# since the groups have a name attribute set internally, we can use that

fill_values = dict(East=0.5, West=-1)

def fill_func(group):
    """Fill NA entries of ``group`` with the value registered for its name."""
    replacement = fill_values[group.name]
    return group.fillna(replacement)

# Each chunk's name is its group key, which fill_func uses to look up the
# fill value.
data.groupby(group_key).apply(fill_func)

East  Ohio          1.902740
      New York     -1.554173
      Vermont       0.500000
      Florida       0.370891
West  Oregon       -1.529171
      Nevada       -1.000000
      California    0.696777
      Idaho        -1.000000
dtype: float64

## [ Example: Random Sampling and Permutation ]

In [51]:
# suppose you wanted to draw a random sample (with or without replacement) from a large dataset.
# there are a number of ways to perform the "draws"
# here we use the sample method for Series

# Build a deck of playing cards.
suits = ["H", "S", "C", "D"]    # hearts, spades, clubs, diamonds
card_val = 4 * [*range(1, 11), 10, 10, 10]
base_name = ["A", *range(2, 11), "J", "K", "Q"]

cards = []
for suit in suits:
    cards += [f"{rank}{suit}" for rank in base_name]
deck = pd.Series(card_val, index=cards)

# deck is now a Series of length 52: the index holds the card names and the
# values are the points each card is worth in games (ace 1, face cards 10).

deck.head(13)

AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
KH     10
QH     10
dtype: int64

In [52]:
# drawing a hand of 5 cards from the deck could be written as 
def draw(deck, n=5, random_state=None):
    """Draw ``n`` entries at random from ``deck``.

    Parameters
    ----------
    deck : pandas.Series
        The cards to draw from.
    n : int, default 5
        Number of cards in the hand.
    random_state : optional
        Forwarded to ``Series.sample``. The default ``None`` keeps the
        previous behaviour; pass a seed to make a draw repeatable.

    Returns
    -------
    pandas.Series
        The ``n`` sampled entries of ``deck``.
    """
    return deck.sample(n=n, random_state=random_state)
draw(deck)

# .sample() randomly selects rows (or columns) from a DataFrame or Series

7C     7
3D     3
JC    10
4H     4
9C     9
dtype: int64


#### Key Parameters

| Parameter | Description |
|----------|-------------|
| `n`      | Number of items to return. Example: `n=5` returns 5 random rows/items. |
| `frac`   | Fraction of items to return. Example: `frac=0.5` returns 50% of rows/items. |
| `replace` | If `True`, samples **with replacement** (can get duplicates). Default is `False`. |
| `weights` | Probabilities for sampling. Can pass a list or Series. |
| `random_state` | Sets the random seed for reproducibility. |


#### Why Use `.sample()`?

- Shuffle a dataset randomly (for testing or training).
- Select a random subset for analysis.
- Simulate draws from a deck or population.


In [54]:
# suppose you want two random cards from each suit
# because the suit is the last character of each card name, we can group based on this and use apply
def get_suit(card):
    """Return the suit letter, i.e. the final character of the card name."""
    suit_letter = card[-1]
    return suit_letter
# The card names are grouped by the suit letter that get_suit returns; draw
# then picks two cards within every suit.
deck.groupby(get_suit).apply(draw, n=2)

C  3C     3
   4C     4
D  JD    10
   4D     4
H  5H     5
   KH    10
S  9S     9
   4S     4
dtype: int64

In [55]:
# Alternatively, we could pass group_keys=False to drop the outer suit level
# from the index, leaving just the selected cards.
deck.groupby(get_suit, group_keys=False).apply(draw, n=2)

6C     6
5C     5
AD     1
7D     7
KH    10
3H     3
8S     8
2S     2
dtype: int64