# Fast groupby operations

- hide: true
- category: [python, pandas]

In [50]:
from imports import *

%config InlineBackend.figure_format = 'retina'
%load_ext line_profiler
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
path = '/Users/fgu/tmp/mdb/data_777.parquet'
df = pd.read_parquet(path)
print(df.shape)
df.head()

(115197, 21)


Unnamed: 0,user_id,transaction_date,amount,transaction_description,merchant_name,tag,gender,transaction_id,bank,account_created,account_type,postcode,merchant_business_line,ym,account_id,year_of_birth,user_registration_date,salary_range,credit_debit,account_last_refreshed,latest_balance
6553,14777,2012-10-10,5.0,new southern railw cd 7715,southern rail,public transport,m,2994290,lloyds bank,2013-01-07,current,sw11 5,southern rail,201210,247061,1986.0,2013-01-07,,debit,2015-01-13 05:19:00,729.59
6554,14777,2012-10-10,11.1,new southern railw cd 7715,southern rail,public transport,m,2994291,lloyds bank,2013-01-07,current,sw11 5,southern rail,201210,247061,1986.0,2013-01-07,,debit,2015-01-13 05:19:00,729.59
6555,14777,2012-10-10,26.91,ticketscript*14064 cd 7715,no merchant,public transport,m,2994292,lloyds bank,2013-01-07,current,sw11 5,unknown merchant,201210,247061,1986.0,2013-01-07,,debit,2015-01-13 05:19:00,729.59
6556,14777,2012-10-10,83.2,selfserve ticket cd 2412,south west trains,public transport,m,2994293,lloyds bank,2013-01-07,current,sw11 5,south west trains,201210,247061,1986.0,2013-01-07,,debit,2015-01-13 05:19:00,729.59
6557,14777,2012-10-11,19.0,liv*livingsocial cd 7715,no merchant,dining and drinking,m,2994289,lloyds bank,2013-01-07,current,sw11 5,unknown merchant,201210,247061,1986.0,2013-01-07,,debit,2015-01-13 05:19:00,729.59


## Applications

Baseline function to filter out users with minumum number of transactions and spend in each month.

In [52]:
def min_txns_and_spend(df, min_txns=10, min_spend=300):
    """At least 5 monthly debits totalling GBP200.
    Drops first and last month for each user due to possible partial data.
    """
    cols = ['user_id', 'transaction_date', 'amount']
    spend = df.loc[df.amount > 0, cols]
    spend = spend.set_index('transaction_date')
    spend['ym'] = df.transaction_date.dt.to_period('M')

    g = spend.groupby('user_id')
    spend['first_month'] = g.ym.transform(min)
    spend['last_month'] = g.ym.transform(max)
    spend = spend[(spend.ym != spend.first_month)
                  & (spend.ym != spend.last_month)]

    mins = (spend
            .groupby('user_id')
            .resample('M')
            .amount
            .agg(['count', 'sum'])
            .groupby('user_id')
            .min())
    mask = (mins >= (min_txns, min_spend)).all(1)
    users = mins[mask].index
    return df[df.user_id.isin(users)]

%timeit min_txns_and_spend(df)

500 ms ± 10.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [53]:
%lprun -f min_txns_and_spend min_txns_and_spend(df)

Timer unit: 1e-06 s

Total time: 0.931279 s
File: <ipython-input-52-2d7d76d03c1f>
Function: min_txns_and_spend at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def min_txns_and_spend(df, min_txns=10, min_spend=300):
     2                                               """At least 5 monthly debits totalling GBP200.
     3                                               Drops first and last month for each user due to possible partial data.
     4                                               """
     5         1          2.0      2.0      0.0      cols = ['user_id', 'transaction_date', 'amount']
     6         1      15143.0  15143.0      1.6      spend = df.loc[df.amount > 0, cols]
     7         1       1364.0   1364.0      0.1      spend = spend.set_index('transaction_date')
     8         1     442903.0 442903.0     47.6      spend['ym'] = df.transaction_date.dt.to_period('M')
     9                              

In [58]:
def fast(df):
    """
    At least 5 monthly debits totalling GBP200.
    Drops first and last month for each user due to possible partial data.
    """
    data = df[['user_id', 'ym', 'amount']]

    # keep debits
    data = data[data.amount > 0]

    # remove first and last month from each user
    g = data.groupby('user_id')
    first_month = g.ym.transform(min)
    last_month = g.ym.transform(max)
    data = data[(data.ym > first_month) & (data.ym < last_month)]

    # calculate min spend and count per month per user
    g = data.groupby(['user_id', 'ym']).amount
    spend = g.sum()
    count = g.size()
    min_spend = spend.groupby('user_id').min()
    min_count = count.groupby('user_id').min()

    # keep users
    users = (min_count >= 10) & (min_spend >= 200)
    users = users[users].index

    return df[df.user_id.isin(users)]

%lprun -f fast fast(df)

Timer unit: 1e-06 s

Total time: 0.049063 s
File: <ipython-input-58-15b18e57e686>
Function: fast at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def fast(df):
     2                                               """
     3                                               At least 5 monthly debits totalling GBP200.
     4                                               Drops first and last month for each user due to possible partial data.
     5                                               """
     6         1       3707.0   3707.0      7.6      data = df[['user_id', 'ym', 'amount']]
     7                                           
     8                                               # keep debits
     9         1       4541.0   4541.0      9.3      data = data[data.amount > 0]
    10                                           
    11                                               # remove first and last month from ea

In [68]:
# look at https://medium.com/@aivinsolatorio/optimizing-pandas-groupby-50x-using-numpy-to-speedup-an-agent-based-model-a-story-of-8b0d25614915

# look at: https://github.com/mm-mansour/Fast-Pandas

def fast(df):
    """
    At least 5 monthly debits totalling GBP200.
    Drops first and last month for each user due to possible partial data.
    """
    data = df[['user_id', 'ym', 'amount']]

    # keep debits
    data = data[data.amount > 0]

    # remove first and last month from each user
    g = data.groupby('user_id')
    first_month = g.ym.transform(min)
    last_month = g.ym.transform(max)
    data = data[(data.ym > first_month) & (data.ym < last_month)]

    # calculate min spend and count per month per user
    g = data.groupby(['user_id', 'ym']).amount
    spend = g.sum()
    count = g.size()
    min_spend = spend.groupby('user_id').min()
    min_count = count.groupby('user_id').min()

    mask = (min_count >= 10) & (min_spend >= 200)

    return mask

fast(df)[:3]

user_id
14777    True
50777    True
52777    True
Name: amount, dtype: bool

## Basics

### Boolean comparisons

In [66]:
df = sns.load_dataset('iris')

In [67]:
%timeit df.sepal_width > 3

105 µs ± 2.61 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [68]:
%timeit df.sepal_width.values > 0

6.11 µs ± 181 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


Given the above, the below is rather surprising:

In [69]:
%timeit df[df.sepal_width > 3]

335 µs ± 23.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [72]:
%timeit df[df.sepal_width.values > 3]

148 µs ± 6.14 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


### Group means

From https://cmdlinetips.com/2019/05/how-to-implement-pandas-groupby-operation-with-numpy/



In [17]:
df = sns.load_dataset('iris')

In [18]:
%timeit df.groupby('species').sepal_length.mean()

481 µs ± 11.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [19]:
%%timeit

spec = df.species.values
sl = df.sepal_length.values
groups = df.species.unique()
[(group, np.mean(sl[spec == group])) for group in groups]

111 µs ± 7 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
