## [ Apply: general split-apply-combine ]

- The most general-purpose GroupBy method is apply.
- apply splits the object being manipulated into pieces, invokes the passed function on each piece, and then attempts to concatenate the pieces


In [2]:
import numpy as np 
import pandas as pd 

# suppose we want to select the top five tip_pct values by group
# first write a function that selects the rows with the largest values in a particular column

def top(df, n=5, column="tip_pct"):
    """Return the rows of ``df`` with the largest values in ``column``.

    Parameters
    ----------
    df : DataFrame
        Table to rank.
    n : int, default 5
        Number of leading rows to keep after sorting.
    column : str, default "tip_pct"
        Column whose values determine the ranking.

    Returns
    -------
    DataFrame
        ``df`` sorted by ``column`` in descending order, cut to the first ``n`` rows.
    """
    ranked = df.sort_values(by=column, ascending=False)
    return ranked[:n]

tips = pd.read_csv("examples/tips.csv")
tips["tip_pct"] = tips["tip"] / tips["total_bill"]

top(tips, n=6)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
232,11.61,3.39,No,Sat,Dinner,2,0.29199
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525


In [3]:
# now if we group by smoker, say, and call apply with this function, we get
tips.groupby("smoker").apply(top)

# first, the tips dataframe is split into groups based on the value of smoker.
# then the top function is called on each group, and the results of each function call are glued together using pd.concat, labeling the pieces with the group names
# the result therefore has a hierarchical index 

  tips.groupby("smoker").apply(top)


Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,232,11.61,3.39,No,Sat,Dinner,2,0.29199
No,149,7.51,2.0,No,Thur,Lunch,2,0.266312
No,51,10.29,2.6,No,Sun,Dinner,2,0.252672
No,185,20.69,5.0,No,Sun,Dinner,5,0.241663
No,88,24.71,5.85,No,Thur,Lunch,2,0.236746
Yes,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345
Yes,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
Yes,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
Yes,183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
Yes,109,14.31,4.0,Yes,Sat,Dinner,2,0.279525


In [4]:
# if you pass a function to apply that takes other arguments or keywords, you can pass these after the function
tips.groupby(["smoker", "day"]).apply(top, n=1, column="total_bill")

  tips.groupby(["smoker", "day"]).apply(top, n=1, column="total_bill")


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,Fri,94,22.75,3.25,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,No,Sun,Dinner,6,0.103799
No,Thur,142,41.19,5.0,No,Thur,Lunch,5,0.121389
Yes,Fri,95,40.17,4.73,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Yes,Sun,Dinner,3,0.077178
Yes,Thur,197,43.11,5.0,Yes,Thur,Lunch,4,0.115982


- beyond these basic usage mechanics, getting the most out of apply may require some creativity
- what occurs inside the function passed is up to you; it must either return a pandas object or a scalar value

In [5]:
# recall calling of describe on a GroupBy object
result = tips.groupby("smoker")["tip_pct"].describe()
result

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,151.0,0.159328,0.03991,0.056797,0.136906,0.155625,0.185014,0.29199
Yes,93.0,0.163196,0.085119,0.035638,0.106771,0.153846,0.195059,0.710345


In [6]:
result.unstack()

       smoker
count  No        151.000000
       Yes        93.000000
mean   No          0.159328
       Yes         0.163196
std    No          0.039910
       Yes         0.085119
min    No          0.056797
       Yes         0.035638
25%    No          0.136906
       Yes         0.106771
50%    No          0.155625
       Yes         0.153846
75%    No          0.185014
       Yes         0.195059
max    No          0.291990
       Yes         0.710345
dtype: float64

In [7]:
# inside GroupBy, when you invoke a method like describe, it is actually just a shortcut for
        # def f(group):
        #     return group.describe()
        # grouped.apply(f)

## [ Suppressing the Group Keys ]

In [8]:
# In the preceding examples, you see that the resulting object has a hierarchical index formed from the group keys, along with the indexes of each piece of the original object. You can disable this by passing group_keys=False to groupby:

tips.groupby("smoker", group_keys=False).apply(top)

  tips.groupby("smoker", group_keys=False).apply(top)


Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
232,11.61,3.39,No,Sat,Dinner,2,0.29199
149,7.51,2.0,No,Thur,Lunch,2,0.266312
51,10.29,2.6,No,Sun,Dinner,2,0.252672
185,20.69,5.0,No,Sun,Dinner,5,0.241663
88,24.71,5.85,No,Thur,Lunch,2,0.236746
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525


## [ Quantile and Bucket Analysis ]

- as you may know, pandas has some tools, in particular pandas.cut
and pandas.qcut, for slicing data up into buckets with bins of your choosing, or by sample quantiles. 
- Combining these functions with groupby makes it convenient to
perform bucket or quantile analysis on a dataset. 

In [9]:
# Consider a simple random dataset and an equal-length bucket categorization using pandas.cut

frame = pd.DataFrame({"data1": np.random.standard_normal(1000),
                      "data2": np.random.standard_normal(1000)})
frame.head()

Unnamed: 0,data1,data2
0,0.64767,2.488123
1,0.064214,0.918916
2,-1.20761,-0.14925
3,0.292443,-1.398457
4,0.046646,-0.249401


In [10]:
quartiles = pd.cut(frame["data1"], 4)
print(quartiles.value_counts())
quartiles.head(10)

data1
(-0.0507, 1.722]     485
(-1.823, -0.0507]    438
(1.722, 3.494]        42
(-3.603, -1.823]      35
Name: count, dtype: int64


0     (-0.0507, 1.722]
1     (-0.0507, 1.722]
2    (-1.823, -0.0507]
3     (-0.0507, 1.722]
4     (-0.0507, 1.722]
5    (-1.823, -0.0507]
6    (-1.823, -0.0507]
7    (-1.823, -0.0507]
8     (-0.0507, 1.722]
9     (-0.0507, 1.722]
Name: data1, dtype: category
Categories (4, interval[float64, right]): [(-3.603, -1.823] < (-1.823, -0.0507] < (-0.0507, 1.722] < (1.722, 3.494]]

In [11]:
# the categorical object returned by cut can be passed directly to groupby
# so we could compute a set of group statistics for the quartiles, like so:
def get_stats(group):
    """Summarise each column of ``group`` with min, max, count and mean.

    Returns a DataFrame with one row per column of ``group`` and the
    statistics as its columns, in the order min, max, count, mean.
    """
    stat_names = ("min", "max", "count", "mean")
    # call the same-named reduction method on the group for every statistic
    summary = {stat: getattr(group, stat)() for stat in stat_names}
    return pd.DataFrame(summary)
grouped = frame.groupby(quartiles)

grouped.apply(get_stats)

  grouped = frame.groupby(quartiles)


Unnamed: 0_level_0,Unnamed: 1_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"(-3.603, -1.823]",data1,-3.595767,-1.851457,35,-2.145887
"(-3.603, -1.823]",data2,-2.261769,1.852846,35,-0.104617
"(-1.823, -0.0507]",data1,-1.801864,-0.051892,438,-0.724499
"(-1.823, -0.0507]",data2,-3.092459,2.925606,438,-0.07435
"(-0.0507, 1.722]",data1,-0.050638,1.706735,485,0.644914
"(-0.0507, 1.722]",data2,-3.149848,2.970795,485,0.041748
"(1.722, 3.494]",data1,1.731137,3.494272,42,2.209461
"(1.722, 3.494]",data2,-2.392325,1.806538,42,-0.252364


In [12]:
# keep in mind the same result could have been computed more simply with:
grouped.agg(["min", "max", "count", "mean"])

Unnamed: 0_level_0,data1,data1,data1,data1,data2,data2,data2,data2
Unnamed: 0_level_1,min,max,count,mean,min,max,count,mean
data1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
"(-3.603, -1.823]",-3.595767,-1.851457,35,-2.145887,-2.261769,1.852846,35,-0.104617
"(-1.823, -0.0507]",-1.801864,-0.051892,438,-0.724499,-3.092459,2.925606,438,-0.07435
"(-0.0507, 1.722]",-0.050638,1.706735,485,0.644914,-3.149848,2.970795,485,0.041748
"(1.722, 3.494]",1.731137,3.494272,42,2.209461,-2.392325,1.806538,42,-0.252364


In [13]:
# these were equal-length buckets; to compute equal-size buckets based on sample quantiles, use pd.qcut
# pass 4 as the number of buckets to compute sample quartiles, and pass labels=False to obtain just the quartile indices instead of intervals

quartiles_samp = pd.qcut(frame["data1"], 4, labels=False)
print(quartiles_samp.value_counts())
quartiles_samp.head()

data1
2    250
0    250
1    250
3    250
Name: count, dtype: int64


0    2
1    2
2    0
3    2
4    2
Name: data1, dtype: int64

In [14]:
grouped = frame.groupby(quartiles_samp)
grouped.apply(get_stats)

Unnamed: 0_level_0,Unnamed: 1_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,data1,-3.595767,-0.688346,250,-1.25725
0,data2,-3.092459,2.925606,250,-0.146397
1,data1,-0.688193,-0.011261,250,-0.315528
1,data2,-2.841397,2.841918,250,0.015664
2,data1,-0.009492,0.686931,250,0.319221
2,data2,-3.149848,2.808274,250,0.007964
3,data1,0.687142,3.494272,250,1.306133
3,data2,-2.917848,2.970795,250,0.016455


## [ Example: Filling Missing Values with Group-Specific Values]

In [15]:
# when cleaning up missing data, in some cases you will remove data observations using dropna, but in others you may want to fill in the null (NA) values using a fixed value or some value derived from the data. 
# fillna is the right tool to use; for example here

s = pd.Series(np.random.standard_normal(6))
s[::2] = np.nan
s

0         NaN
1   -1.795570
2         NaN
3    0.579004
4         NaN
5    0.112831
dtype: float64

In [16]:
s.fillna(s.mean())

0   -0.367912
1   -1.795570
2   -0.367912
3    0.579004
4   -0.367912
5    0.112831
dtype: float64

In [17]:
# suppose you need the fill value to vary by group.
# one way to do this is to group the data and use apply with a function that calls fillna on each data chunk

states = ["Ohio", "New York", "Vermont", "Florida",
          "Oregon", "Nevada", "California", "Idaho"]

group_key = ["East", "East", "East", "East",
             "West", "West", "West", "West"]

data = pd.Series(np.random.standard_normal(8), index=states)
data

Ohio         -0.552268
New York     -1.322172
Vermont       1.614061
Florida      -0.738684
Oregon       -1.071936
Nevada       -0.032148
California   -0.613296
Idaho         1.261020
dtype: float64

In [18]:
# set some values in the data to be missing
data[["Vermont", "Nevada", "Idaho"]] = np.nan
data

Ohio         -0.552268
New York     -1.322172
Vermont            NaN
Florida      -0.738684
Oregon       -1.071936
Nevada             NaN
California   -0.613296
Idaho              NaN
dtype: float64

In [19]:
data.groupby(group_key).size()  # counts total elements in each group

East    4
West    4
dtype: int64

In [20]:
data.groupby(group_key).count() # counts non-NA elements in each group

East    3
West    2
dtype: int64

In [21]:
data.groupby(group_key).mean()

East   -0.871041
West   -0.842616
dtype: float64

In [22]:
# we can fill the NA values using the group means, like so:
def fill_mean(group):
    """Fill the missing entries of ``group`` with the group's own mean."""
    group_average = group.mean()
    return group.fillna(group_average)

data.groupby(group_key).apply(fill_mean)

# group mean is the average value of a column within each group, created by the .groupby() operation in pandas

# why use group mean?
    # - to summarize large datasets by category
    # - to compare groups
    # - to find patterns or anomalies in data

East  Ohio         -0.552268
      New York     -1.322172
      Vermont      -0.871041
      Florida      -0.738684
West  Oregon       -1.071936
      Nevada       -0.842616
      California   -0.613296
      Idaho        -0.842616
dtype: float64

In [23]:
# in another case, you might have predefined fill values in your code that vary by group.
# since the groups have a name attribute set internally, we can use that

# replacement for missing entries, keyed by group label
fill_values = {"East": 0.5, "West": -1}


def fill_func(group):
    """Fill the missing entries of ``group`` with its predefined value.

    The value is looked up in ``fill_values`` under ``group.name``, the
    group label that groupby sets on each piece it passes in.
    """
    replacement = fill_values[group.name]
    return group.fillna(replacement)

data.groupby(group_key).apply(fill_func)

East  Ohio         -0.552268
      New York     -1.322172
      Vermont       0.500000
      Florida      -0.738684
West  Oregon       -1.071936
      Nevada       -1.000000
      California   -0.613296
      Idaho        -1.000000
dtype: float64

## [ Example: Random Sampling and Permutation ]

In [24]:
# suppose you wanted to draw a random sample (with or without replacement) from a large dataset.
# there are a number of ways to perform the "draws"
# here we use the sample method for Series

# build a 52-card deck: index = card name (rank + suit letter), value = card points
suits = ["H", "S", "C", "D"]    # hearts, spades, clubs, diamonds
base_name = ["A", *range(2, 11), "J", "K", "Q"]
# per suit: ace through ten count 1..10, the three face cards count 10 each
card_val = 4 * [*range(1, 11), 10, 10, 10]

cards = []
for suit in suits:
    cards += [f"{rank}{suit}" for rank in base_name]
deck = pd.Series(card_val, index=cards)

# now we have a Series of length 52 whose index contains the card names, and whose values are the ones used in card games

deck.head(13)

AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
KH     10
QH     10
dtype: int64

In [25]:
# drawing a hand of 5 cards from the deck could be written as 
def draw(deck, n=5, random_state=None):
    """Draw ``n`` random items from ``deck`` using its ``sample`` method.

    Parameters
    ----------
    deck : Series or DataFrame
        Population to draw from.
    n : int, default 5
        Number of items to draw.
    random_state : optional
        Forwarded to ``sample``. Leave as ``None`` for a fresh random draw,
        or pass a seed to make the draw reproducible across notebook runs.

    Returns
    -------
    Same type as ``deck``
        The ``n`` sampled items.
    """
    return deck.sample(n=n, random_state=random_state)
draw(deck)

# .sample() is used to randomly select rows(or columns) from a DataFrame or Series

10S    10
10C    10
9C      9
KH     10
AD      1
dtype: int64


#### Key Parameters

| Parameter | Description |
|----------|-------------|
| `n`      | Number of items to return. Example: `n=5` returns 5 random rows/items. |
| `frac`   | Fraction of items to return. Example: `frac=0.5` returns 50% of rows/items. |
| `replace` | If `True`, samples **with replacement** (can get duplicates). Default is `False`. |
| `weights` | Probabilities for sampling. Can pass a list or Series. |
| `random_state` | Sets the random seed for reproducibility. |


#### Why Use `.sample()`?

- Shuffle a dataset randomly (for testing or training).
- Select a random subset for analysis.
- Simulate draws from a deck or population.


In [26]:
# suppose you want two random cards from each suit
# because the suit is the last character of each card name, we can group based on this and use apply
def get_suit(card):
    """Return the suit letter of ``card``, i.e. the final character of its name."""
    suit = card[-1]
    return suit
deck.groupby(get_suit).apply(draw, n=2)

C  KC    10
   JC    10
D  KD    10
   5D     5
H  9H     9
   7H     7
S  QS    10
   3S     3
dtype: int64

In [27]:
# alternatively, we could pass group_keys=False to drop the outer suit index, leaving in just the selected cards
deck.groupby(get_suit, group_keys=False).apply(draw, n=2)

3C    3
7C    7
AD    1
8D    8
9H    9
8H    8
6S    6
3S    3
dtype: int64

## [ Example: Group Weighted Average and Correlation ]
under the split-apply-combine paradigm of groupby, operations between columns in a DataFrame or two Series, such as a group weighted average, are possible

In [28]:
# as example, take this dataset containing group keys, values, and some weights

df = pd.DataFrame({"category": ["a", "a", "a", "a",
                   "b", "b", "b", "b"],
                   "data": np.random.standard_normal(8),
                   "weights": np.random.uniform(size=8)})
df

Unnamed: 0,category,data,weights
0,a,-0.37608,0.878494
1,a,1.076977,0.853179
2,a,-0.445781,0.428226
3,a,1.14055,0.325894
4,b,-0.197188,0.424164
5,b,0.365192,0.087927
6,b,-2.296721,0.972606
7,b,0.184969,0.846372


In [29]:
# the weighted average by category would then be
grouped = df.groupby("category")

def get_wavg(group):
    """Return the weighted average of ``group["data"]``.

    Weighted average formula:
        sum(x_i * w_i) / sum(w_i)
    where x_i are the values in the "data" column and w_i the matching
    entries of the "weights" column.
    """
    values = group["data"]
    weights = group["weights"]
    return np.average(values, weights=weights)

grouped.apply(get_wavg)     # calling .apply(), automatically passes each group of data to the function. don't need to explicitly provide an argument to the function

  grouped.apply(get_wavg)     # calling .apply(), automatically passes each group of data to the function. don't need to explicitly provide an argument to the function


category
a    0.309468
b   -0.913221
dtype: float64

In [30]:
# another example
close_px = pd.read_csv("examples/stock_px.csv", parse_dates=True, index_col=0)

close_px.info()
# the dataframe info() method here is a convenient way to get an overview of the contents of a dataframe

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2214 entries, 2003-01-02 to 2011-10-14
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AAPL    2214 non-null   float64
 1   MSFT    2214 non-null   float64
 2   XOM     2214 non-null   float64
 3   SPX     2214 non-null   float64
dtypes: float64(4)
memory usage: 86.5 KB


In [31]:
close_px.tail(4)

Unnamed: 0,AAPL,MSFT,XOM,SPX
2011-10-11,400.29,27.0,76.27,1195.54
2011-10-12,402.19,26.96,77.16,1207.25
2011-10-13,408.43,27.18,76.37,1203.66
2011-10-14,422.0,27.27,78.11,1224.58


In [32]:
# create a function that computes the pair-wise correlation of each column with the "SPX" column
def spx_corr(group):
    """Correlate every column of ``group`` with its "SPX" column.

    Returns a Series indexed by the column names of ``group``.
    """
    benchmark = group["SPX"]
    return group.corrwith(benchmark)

# compute percent change on close_px using pct_change
rets = close_px.pct_change().dropna()

# group these percent changes by year, which can be extracted from each row label with a one-line function that returns the year attribute of each datetime label

def get_year(x):
    """Return the ``year`` attribute of ``x`` (used as a groupby key on the row labels)."""
    year = x.year
    return year

by_year = rets.groupby(get_year)
by_year.apply(spx_corr)

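# TODO: I did not fully understand this cell; revisit it. In short: spx_corr is applied to each year's returns and correlates every column with that year's SPX returns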

Unnamed: 0,AAPL,MSFT,XOM,SPX
2003,0.541124,0.745174,0.661265,1.0
2004,0.374283,0.588531,0.557742,1.0
2005,0.46754,0.562374,0.63101,1.0
2006,0.428267,0.406126,0.518514,1.0
2007,0.508118,0.65877,0.786264,1.0
2008,0.681434,0.804626,0.828303,1.0
2009,0.707103,0.654902,0.797921,1.0
2010,0.710105,0.730118,0.839057,1.0
2011,0.691931,0.800996,0.859975,1.0


In [33]:
# we could also compute intercolumn correlations.
# here we compute the annual correlation between Apple and Microsoft
def corr_aapl_msft(group):
    """Return the correlation between the "AAPL" and "MSFT" columns of ``group``."""
    apple = group["AAPL"]
    microsoft = group["MSFT"]
    return apple.corr(microsoft)

by_year.apply(corr_aapl_msft)

2003    0.480868
2004    0.259024
2005    0.300093
2006    0.161735
2007    0.417738
2008    0.611901
2009    0.432738
2010    0.571946
2011    0.581987
dtype: float64

## [ Group-Wise Linear Regression ]
we can use groupby to perform more complex group-wise statistical analysis, as long as the function returns a pandas object or scalar value.

In [34]:
# For example, I can define the following regress function (using the statsmodels econometrics library), which executes an ordinary least squares (OLS) regression on each chunk of data

import statsmodels.api as sm
def regress(data, yvar=None, xvars=None):
    """Fit an ordinary least squares regression of ``yvar`` on ``xvars``.

    A constant column named "intercept" is appended to the regressors
    before fitting.

    Parameters
    ----------
    data : DataFrame
        Chunk of data holding both the response and the regressors.
    yvar : column label
        Response column in ``data``.
    xvars : list of column labels, or a single column name
        Regressor column(s) in ``data``.

    Returns
    -------
    The ``params`` attribute of the fitted statsmodels OLS result.
    """
    if isinstance(xvars, str):
        # a bare column name would select a Series, and the intercept would then
        # be appended as an extra row rather than a column; keep X a DataFrame
        xvars = [xvars]
    Y = data[yvar]
    # assign() returns a new frame, so the intercept column is never written
    # into a selection of `data` (the SettingWithCopy pattern)
    X = data[xvars].assign(intercept=1.0)
    result = sm.OLS(Y, X).fit()
    return result.params

by_year.apply(regress, yvar="AAPL", xvars=["SPX"])

Unnamed: 0,SPX,intercept
2003,1.195406,0.00071
2004,1.363463,0.004201
2005,1.766415,0.003246
2006,1.645496,8e-05
2007,1.198761,0.003438
2008,0.968016,-0.00111
2009,0.879103,0.002954
2010,1.052608,0.001261
2011,0.806605,0.001514
