Notebook purpose:

- Explore whether we can analyse the effect of MDB adoption

In [1]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import s3fs
import scipy
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

# NOTE(review): absolute, user-specific path — the project imports below only
# resolve on a machine where the repo is checked out at this location.
sys.path.append("/Users/fgu/dev/projects/entropy")
import entropy.data.aggregators as ag
import entropy.data.cleaners as cl
import entropy.data.make_data as md
import entropy.data.selectors as sl
import entropy.data.validators as vl
import entropy.helpers.aws as ha
import entropy.helpers.data as hd
import entropy.helpers.helpers as hh

# Raise pandas display limits (up to 120 rows/columns, untruncated cell
# contents) so the wide transaction frames can be inspected in full.
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)
pd.set_option("max_colwidth", None)
# Render inline figures at retina resolution.
%config InlineBackend.figure_format = 'retina'
# Automatically reload edited modules before each cell is executed.
%load_ext autoreload
%autoreload 2
# Use the interactive widget backend for matplotlib figures.
%matplotlib widget

# S3 filesystem handle created with the "3di" profile.
fs = s3fs.S3FileSystem(profile="3di")

In [3]:
# Load the transaction data through the project helper.
# NOTE(review): "X77" is presumably a sample identifier — confirm against
# hd.read_txn_data.
df = hd.read_txn_data("X77")
# Print an overview of the frame (the saved output shows shape and first rows).
hd.inspect(df)

(6,627,663, 35)


Unnamed: 0,date,user_id,amount,desc,merchant,tag_group,tag,user_registration_date,account_created,account_id,account_last_refreshed,account_provider,account_type,birth_year,data_warehouse_date_created,data_warehouse_date_last_updated,id,is_debit,is_female,is_sa_flow,is_salary_pmt,is_urban,latest_balance,lsoa,merchant_business_line,msoa,postcode,region_name,salary_range,tag_auto,tag_manual,tag_spend,tag_up,updated_flag,ym
0,2012-08-01,77,12.12,tv licence mbp - d/d,tv licensing,spend,communication,2010-07-10,2012-10-26,259583,2015-09-11,natwest bank,current,1945.0,2014-07-18,2015-03-19,1212601,True,0.0,False,False,1.0,,e01002984,tv licensing,e02000609,kt3 5,london,,tv licence,no tag,"entertainment, tv, media",tv licence,u,2012-08
1,2012-08-01,77,13.81,amazon mktplace pmts amzn.com/billgbr,amazon,spend,services,2010-07-10,2012-10-26,259584,2014-03-06,natwest bank,credit card,1945.0,2014-07-18,2017-08-15,1213850,True,0.0,False,False,1.0,,e01002984,amazon,e02000609,kt3 5,london,,enjoyment,no tag,household,no tag,u,2012-08


## Data availability

What proportion of txns are pre-signup?

In [4]:
def pre_signup_share(df):
    """Return the share of rows in `df` dated before the registration date.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with comparable `date` and `user_registration_date` columns.

    Returns
    -------
    float
        Fraction of rows with `date < user_registration_date`, or NaN when
        `df` has no rows.
    """
    # The mean of a boolean Series is the share of True values. It is
    # vectorised (the builtin `sum` iterates element by element) and yields
    # NaN instead of raising ZeroDivisionError on an empty frame.
    return (df.date < df.user_registration_date).mean()


# Overall share across all transactions, shown as a percentage with one decimal.
print("{:.1%} of observations are pre-signup.".format(pre_signup_share(df)))

29.9% of observations are pre-signup.


What's the distribution of pre-signup share of txns across users?

In [9]:
# Report the lower half of the distribution in steps of ten percentage points.
pcts = [0.1, 0.2, 0.3, 0.4, 0.5]
# One pre-signup share per user, then summary statistics across users.
df.groupby("user_id").apply(pre_signup_share).describe(percentiles=pcts)

count    2679.000000
mean        0.431379
std         0.374469
min         0.000000
10%         0.017947
20%         0.042603
30%         0.087974
40%         0.193887
50%         0.335071
max         1.000000
dtype: float64

How many days long is the pre-signup window we can observe?

In [63]:
def pre_window_length(df):
    """Return the length in days of the observable pre-signup window.

    This is the number of days by which the earliest transaction in `df`
    precedes the registration date, or 0 when no transaction does.
    """
    earliest_offset = (df.date - df.user_registration_date).dt.days.min()
    # A negative offset means the earliest transaction predates registration.
    if earliest_offset < 0:
        return -earliest_offset
    return 0


# One pre-signup window length (in days) per user, summarised across users.
df.groupby("user_id").apply(pre_window_length).describe()

count    2679.000000
mean      258.262038
std       294.341061
min         0.000000
25%        67.000000
50%        86.000000
75%       349.000000
max      1106.000000
dtype: float64

## MVP

### Select sample

Create a days-since-registration variable and keep users observed for at least six months both before and after registration.

In [112]:
def select_sample(df, min_window_days=180):
    """Keep users observed long enough both before and after registration.

    Adds a `dsreg` column (days since registration; negative for
    transactions dated before the registration date) and retains all rows
    of users whose transactions reach at least `min_window_days` days
    before and at least `min_window_days` days after registration.

    Parameters
    ----------
    df : pandas.DataFrame
        Transactions with `user_id`, `date` and `user_registration_date`
        columns. The input frame is not modified.
    min_window_days : int, default 180
        Minimum length, in days, of both the pre- and post-signup window.

    Returns
    -------
    pandas.DataFrame
        Copy of the qualifying users' rows, including the `dsreg` column.

    Raises
    ------
    AssertionError
        If the selected sample fails validation, which includes the case
        where no user qualifies and the sample is empty.
    """
    df = df.copy()

    # Days since registration counter
    df["dsreg"] = (df.date - df.user_registration_date).dt.days

    # Keep users with a sufficiently long pre and post window
    g = df.groupby("user_id").dsreg
    cond = g.min().le(-min_window_days) & g.max().ge(min_window_days)
    users = cond[cond].index
    df = df[df.user_id.isin(users)]

    # Data validation (comparisons against NaN are False, so an empty sample
    # fails here as well)
    assert df.dsreg.min() <= -min_window_days, "pre-signup window too short"
    assert df.dsreg.max() >= min_window_days, "post-signup window too short"

    return df


# Analysis sample: users with at least 180 days of data on each side of registration.
dfs = select_sample(df)

In [115]:
# Inspect the selected sample, including the new `dsreg` column.
dfs

Unnamed: 0,date,user_id,amount,desc,merchant,tag_group,tag,user_registration_date,account_created,account_id,account_last_refreshed,account_provider,account_type,birth_year,data_warehouse_date_created,data_warehouse_date_last_updated,id,is_debit,is_female,is_sa_flow,is_salary_pmt,is_urban,latest_balance,lsoa,merchant_business_line,msoa,postcode,region_name,salary_range,tag_auto,tag_manual,tag_spend,tag_up,updated_flag,ym,dsreg
1412906,2013-10-18,83377,9.990000,recurrent transaction london spotify spotify premiu,spotify,spend,hobbies,2014-07-02,2014-07-02,148664,2014-10-20 11:26:00,santander,credit card,,2014-07-04,2015-03-19,25233454,True,1.0,False,False,1.0,,e01001945,spotify,e02000375,w12 7,london,,music,no tag,"entertainment, tv, media",music,u,2013-10,-257
1412907,2013-10-23,83377,0.070000,payment protection cover,,,,2014-07-02,2014-07-02,148664,2014-10-20 11:26:00,santander,credit card,,2014-07-04,2018-10-08,25233453,True,1.0,False,False,1.0,,e01001945,non merchant mbl,e02000375,w12 7,london,,,no tag,,payment protection insurance,u,2013-10,-252
1412908,2013-10-24,83377,-10.060000,payment received,,transfers,other_transfers,2014-07-02,2014-07-02,148664,2014-10-20 11:26:00,santander,credit card,,2014-07-04,2017-08-14,25233452,False,1.0,False,False,1.0,,e01001945,account provider,e02000375,w12 7,london,,credit card,no tag,,credit card payment,u,2013-10,-251
1412909,2014-04-03,83377,30.000000,"cash withdrawal at lloyds bank plc atm regent str(3094, london,30.00 gbp , on xxxxxx2014",,spend,other_spend,2014-07-02,2014-07-02,148661,2015-01-05 23:29:00,santander,current,,2014-07-04,2017-10-23,24879697,True,1.0,False,False,1.0,93.809998,e01001945,personal,e02000375,w12 7,london,,cash,no tag,cash,cash,u,2014-04,-90
1412910,2014-04-04,83377,43.570000,waitrose 462,waitrose,spend,household,2014-07-02,2014-07-02,148663,2019-02-20 04:55:00,nationwide,credit card,,2014-07-04,2017-08-13,24943215,True,1.0,False,False,1.0,,e01001945,waitrose,e02000375,w12 7,london,,"food, groceries, household",supermarket,groceries,supermarket,u,2014-04,-89
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6264017,2020-07-30,568677,11.980000,the range altrincham alttrincham,the range,spend,household,2020-02-01,2020-04-12,1665687,2020-08-16 18:41:00,american express,credit card,1982.0,2020-08-02,1900-01-01,803494009,True,1.0,False,False,1.0,-20593.890625,e01006093,the range,e02001281,wa15 6,north west,30k to 40k,home,no tag,home,home,c,2020-07,180
6264018,2020-07-30,568677,29.990000,asda filling station (o altrincham,asda,spend,motor,2020-02-01,2020-04-12,1665687,2020-08-16 18:41:00,american express,credit card,1982.0,2020-08-02,1900-01-01,803494007,True,1.0,False,False,1.0,-20593.890625,e01006093,asda fuel,e02001281,wa15 6,north west,30k to 40k,fuel,no tag,vehicle,fuel,c,2020-07,180
6264019,2020-07-30,568677,29.520000,asda filling station (o altrincham,asda,spend,motor,2020-02-01,2020-04-12,1665687,2020-08-16 18:41:00,american express,credit card,1982.0,2020-08-02,1900-01-01,803494008,True,1.0,False,False,1.0,-20593.890625,e01006093,asda fuel,e02001281,wa15 6,north west,30k to 40k,fuel,no tag,vehicle,fuel,c,2020-07,180
6264020,2020-07-30,568677,122.209999,asda altrincham altrincham,asda,spend,household,2020-02-01,2020-04-12,1665687,2020-08-16 18:41:00,american express,credit card,1982.0,2020-08-02,1900-01-01,803494006,True,1.0,False,False,1.0,-20593.890625,e01006093,asda supermarket,e02001281,wa15 6,north west,30k to 40k,"food, groceries, household",no tag,groceries,"food, groceries, household",c,2020-07,180


## Notes

Decisions:
- How long do the pre- and post-signup windows have to be? We will need the full sample regardless.

Notes:
- Might need full data to get reasonable sample size