# Fast groupby operations

- hide: true
- category: [python, pandas]

In [3]:
# NOTE(review): the star import hides where names come from; `pd`, `sns` and
# `np` used in later cells are presumably provided by the local `imports`
# module — confirm, and prefer explicit imports.
from imports import *

# Render inline figures at retina (high-DPI) resolution.
%config InlineBackend.figure_format = 'retina'
# Automatically reload edited modules before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
# Load the transaction sample and preview its size and first rows.
# NOTE(review): absolute, machine-specific path — the notebook will not run
# elsewhere; consider a configurable data directory.
# NOTE(review): the preview below shows user-level records (ids, postcodes);
# clear the output before sharing.
path = '/Users/fgu/tmp/data_777.parquet'
df = pd.read_parquet(path)
print(df.shape)
df.head()

(115197, 21)


Unnamed: 0,user_id,transaction_date,amount,transaction_description,merchant_name,tag,transaction_id,user_registration_date,account_last_refreshed,bank,latest_balance,year_of_birth,salary_range,gender,ym,credit_debit,account_id,postcode,account_created,account_type,merchant_business_line
6553,14777,2012-10-10,5.0,new southern railw cd 7715,southern rail,public transport,2994290,2013-01-07,2015-01-13 05:19:00,lloyds bank,729.59,1986.0,,m,2012-10,debit,247061,sw11 5,2013-01-07,current,southern rail
6554,14777,2012-10-10,11.1,new southern railw cd 7715,southern rail,public transport,2994291,2013-01-07,2015-01-13 05:19:00,lloyds bank,729.59,1986.0,,m,2012-10,debit,247061,sw11 5,2013-01-07,current,southern rail
6555,14777,2012-10-10,26.91,ticketscript*14064 cd 7715,no merchant,public transport,2994292,2013-01-07,2015-01-13 05:19:00,lloyds bank,729.59,1986.0,,m,2012-10,debit,247061,sw11 5,2013-01-07,current,unknown merchant
6556,14777,2012-10-10,83.2,selfserve ticket cd 2412,south west trains,public transport,2994293,2013-01-07,2015-01-13 05:19:00,lloyds bank,729.59,1986.0,,m,2012-10,debit,247061,sw11 5,2013-01-07,current,south west trains
6557,14777,2012-10-11,19.0,liv*livingsocial cd 7715,no merchant,dining and drinking,2994289,2013-01-07,2015-01-13 05:19:00,lloyds bank,729.59,1986.0,,m,2012-10,debit,247061,sw11 5,2013-01-07,current,unknown merchant


## Applications

In [None]:
def min_txns_and_spend(df, min_txns=10, min_spend=300):
    """Keep users whose every complete month meets minimum spend activity.

    A user qualifies if, in each calendar month strictly between their first
    and last month of spending (those two are dropped because they may hold
    partial data), they have at least ``min_txns`` spend transactions
    totalling at least ``min_spend``. A month without any spend inside that
    window counts as zero transactions and disqualifies the user.

    Parameters
    ----------
    df : pandas.DataFrame
        Transactions with columns ``user_id``, ``transaction_date``
        (datetime64) and ``amount``. Rows with ``amount > 0`` are treated
        as spend.
    min_txns : int, default 10
        Minimum number of spend transactions required in every month.
    min_spend : float, default 300
        Minimum total spend required in every month.

    Returns
    -------
    pandas.DataFrame
        All rows of ``df`` (spend or not) that belong to qualifying users.
        ``df`` itself is not modified.
    """
    cols = ['user_id', 'transaction_date', 'amount']
    spend = df.loc[df.amount > 0, cols].set_index('transaction_date')
    if spend.empty:
        return df[df.user_id.isin([])]

    # Derive the month from spend's own (date) index. Assigning
    # df.transaction_date.dt.to_period('M') here would align df's row labels
    # against the date index, match nothing, and leave the column all-NaT --
    # so the first/last month filter below would never drop anything.
    spend['ym'] = spend.index.to_period('M')

    months = spend.groupby('user_id').ym
    is_edge_month = ((spend.ym == months.transform('min'))
                     | (spend.ym == months.transform('max')))
    # Positional mask: the date index contains duplicates, so avoid any
    # label-based alignment when filtering.
    complete = spend.loc[~is_edge_month.to_numpy(), ['user_id', 'amount']]
    if complete.empty:
        # No user has a month between their first and last one.
        return df[df.user_id.isin([])]

    # An offset object instead of the 'M' alias keeps this working across
    # pandas versions (the string alias was deprecated in favour of 'ME').
    monthly = (complete.groupby('user_id')
               .resample(pd.offsets.MonthEnd()).amount.agg(['count', 'sum']))
    mins = monthly.groupby('user_id').min()
    mask = (mins['count'] >= min_txns) & (mins['sum'] >= min_spend)
    users = mins[mask].index
    return df[df.user_id.isin(users)]

# Apply the filter with its default thresholds to the loaded transactions.
min_txns_and_spend(df)

## Basics

### Boolean comparisons

In [25]:
# Load the iris dataset for the micro-benchmarks below.
# Note: this rebinds `df`, replacing the transaction data loaded earlier.
df = sns.load_dataset('iris')

In [23]:
# Baseline: elementwise comparison on the pandas Series.
%timeit df.sepal_width > 3

108 µs ± 4.92 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [24]:
%timeit df.sepal_width.values > 0

6.03 µs ± 225 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


### Group means

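Approach adapted from https://cmdlinetips.com/2019/05/how-to-implement-pandas-groupby-operation-with-numpy/: pull the columns out as NumPy arrays and compute each group's mean with a boolean mask, then compare the timing against `groupby`.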



In [17]:
# Same iris dataset as in the previous section; reloaded here, presumably so
# this section can be run on its own.
df = sns.load_dataset('iris')

In [18]:
# Baseline: mean sepal length per species via pandas groupby.
%timeit df.groupby('species').sepal_length.mean()

481 µs ± 11.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [19]:
%%timeit

# NumPy version: work on the raw value arrays and build one boolean mask
# per species instead of using groupby.
spec = df.species.values
sl = df.sepal_length.values
groups = df.species.unique()
# Yields a list of (species, mean) pairs rather than a Series.
[(group, np.mean(sl[spec == group])) for group in groups]

111 µs ± 7 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
