# Fast groupby operations

- hide: true
- category: [python, pandas]

In [130]:
from imports import *

%config InlineBackend.figure_format = 'retina'
%load_ext autoreload
%autoreload 2

# Sample identifier; it is interpolated into the parquet filename
# (`data_{SAMPLE}.parquet`) that the next cell reads.
SAMPLE = 'X77'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [133]:
# Load the transactions sample for SAMPLE from the project's temp directory.
df = pd.read_parquet(os.path.join(config.TEMPDIR, f'data_{SAMPLE}.parquet'))
# `helpers.info` is project-local; judging by the recorded output it prints
# user, row and column counts -- TODO confirm against its definition.
helpers.info(df)

Users:      1,540
Rows:   4,385,858
Cols:          23


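## Fast groupby operations

Goal: keep only the users who have income in more than two-thirds of the months in which they have any transactions. The three cells below implement the same filter in different ways, each timed with `%%time`.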

In [137]:
%%time

# Version 1: two grouped `nunique` aggregations, no per-user Python callback.

# Tag prefixes that count as income (`str.match` anchors at the start of the tag).
tags = ['earnings', 'pensions', 'benefits', 'other income']

# Slim copy keyed by user, with the calendar month of every transaction.
data = df[['user_id', 'transaction_date', 'tag']].copy().set_index('user_id')
data['ym'] = data['transaction_date'].dt.to_period('M')

# Income transactions only.
income = data.loc[data['tag'].str.match('|'.join(tags))]

# Distinct months per user: overall, and with at least one income transaction.
tot_months = data.groupby('user_id')['ym'].nunique()
inc_months = income.groupby('user_id')['ym'].nunique()

# Users without any income transaction are missing from `inc_months`; their
# ratio is NaN, which compares as False, so they are dropped.
cond = inc_months.div(tot_months).gt(2/3)
usrs = cond.loc[cond].index
result = df.loc[df['user_id'].isin(usrs)]

CPU times: user 16.3 s, sys: 1.49 s, total: 17.8 s
Wall time: 18.3 s


In [135]:
%%time

def helper(df):
    """Decide whether one user's transactions show sufficiently regular income.

    Intended as the callback for ``groupby('user_id').filter``.

    Parameters
    ----------
    df : DataFrame
        Transactions of a single user. Must contain ``user_id``,
        ``transaction_date`` (datetime-like) and ``tag`` (string) columns.

    Returns
    -------
    bool
        True when the number of distinct months containing an income
        transaction is strictly greater than two thirds of the number of
        distinct months containing any transaction.
    """
    # `str.match` anchors at the start, so these act as tag prefixes.
    income_pattern = '|'.join(['earnings', 'pensions', 'benefits', 'other income'])

    txns = df[['user_id', 'transaction_date', 'tag']].copy().set_index('user_id')
    txns['ym'] = txns['transaction_date'].dt.to_period('M')

    is_income = txns['tag'].str.match(income_pattern)

    months_active = txns['ym'].nunique()
    months_with_income = txns[is_income]['ym'].nunique()

    return (months_with_income / months_active) > (2/3)

# Version 2: call `helper` once per user on the raw frame and keep every row
# of the users for which it returns True.
result2 = df.groupby('user_id').filter(helper)

CPU times: user 8.24 s, sys: 594 ms, total: 8.84 s
Wall time: 8.89 s


In [136]:
%%time

def helper(g):
    """Decide whether one user's pre-annotated transactions show regular income.

    Parameters
    ----------
    g : DataFrame
        Rows of a single user with a ``ym`` column (month of the transaction)
        and a boolean ``inc`` column (True for income transactions).

    Returns
    -------
    bool
        True when the distinct months with an income row are strictly more
        than two thirds of the distinct months overall.
    """
    months_observed = g['ym'].nunique()
    months_with_income = g.loc[g['inc'], 'ym'].nunique()
    income_share = months_with_income / months_observed
    return income_share > (2/3)

# Version 3: compute the month and the income flag once for the whole frame,
# so the per-user callback only has to count distinct months.

# Tag prefixes that count as income (`str.match` anchors at the start of the tag).
tags = ['earnings', 'pensions', 'benefits', 'other income']

data = df[['user_id', 'transaction_date', 'tag']].copy()
data['ym'] = data['transaction_date'].dt.to_period('M')
data['inc'] = data['tag'].str.match('|'.join(tags))

# Ids of the users that pass the check, then all of their original rows.
usrs = data.groupby('user_id').filter(helper)['user_id'].unique()
result3 = df.loc[df['user_id'].isin(usrs)]
result3

CPU times: user 4.58 s, sys: 381 ms, total: 4.96 s
Wall time: 5.01 s


Unnamed: 0,user_id,transaction_date,amount,transaction_description,merchant_name,auto_tag,tag,manual_tag,gender,latest_balance,salary_range,credit_debit,account_last_refreshed,year_of_birth,transaction_id,user_registration_date,bank,up_tag,account_type,account_created,account_id,postcode,merchant_business_line
4734,6077,2012-08-01,1.00,planned o/d fee,no merchant,banking charges,banking charges,no tag,m,783.65,10k to 20k,debit,2020-05-29 07:39:00,1975.0,1072703,2012-10-21,halifax personal banking,bank charges,current,2012-10-22,20035,bb12 7,account provider
4735,6077,2012-08-01,-5.00,reward (net),no merchant,rewards/cashback,rewards/cashback,no tag,m,783.65,10k to 20k,credit,2020-05-29 07:39:00,1975.0,1072702,2012-10-21,halifax personal banking,rewards/cashback,current,2012-10-22,20035,bb12 7,account provider
4736,6077,2012-08-08,200.00,<mdbremoved> xxxxxxxxxxxxxx4580 08aug12 21:53,no merchant,transfers,transfers,no tag,m,783.65,10k to 20k,debit,2020-05-29 07:39:00,1975.0,1072701,2012-10-21,halifax personal banking,no tag,current,2012-10-22,20035,bb12 7,non merchant mbl
4737,6077,2012-08-15,1550.00,<mdbremoved> xxxxxxxxxxxxxx1075 15aug12 12:53,no merchant,transfers,transfers,no tag,m,783.65,10k to 20k,debit,2020-05-29 07:39:00,1975.0,1072699,2012-10-21,halifax personal banking,no tag,current,2012-10-22,20035,bb12 7,non merchant mbl
4738,6077,2012-08-15,-1632.37,<mdbremoved>,no merchant,salary or wages - main,salary or wages - main,no tag,m,783.65,10k to 20k,credit,2020-05-29 07:39:00,1975.0,1072700,2012-10-21,halifax personal banking,no tag,current,2012-10-22,20035,bb12 7,no merchant business line
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90316,88277,2015-11-23,-60.00,"call ref.no. 0000 , from a/c xxxxxx41 - otr",no merchant,transfers,transfers,no tag,f,0.00,,credit,2015-11-25 00:00:00,1987.0,107753283,2014-07-06,natwest bank,current account,current,2014-07-06,8950,le11 1,personal
90317,88277,2015-11-23,81.65,"bmach 21nov, charge 1.65 - atm",no merchant,cash,cash,no tag,f,0.00,,debit,2015-11-25 00:00:00,1987.0,107753281,2014-07-06,natwest bank,cash,current,2014-07-06,8950,le11 1,personal
90318,88277,2015-11-24,49.27,<mdbremoved> balance transfer,no merchant,transfers,transfers,no tag,f,0.00,,debit,2015-11-25 00:00:00,1987.0,189202573,2014-07-06,natwest bank,transfers,current,2014-07-06,8950,le11 1,personal
90319,88277,2015-11-24,-61.45,child tax credit - bac,no merchant,family benefits,benefits,no tag,f,0.00,,credit,2015-11-26 00:00:00,1987.0,107912211,2014-07-06,natwest bank,family benefits,current,2014-07-06,8947,le11 1,public sector
