Notebook purpose:

- Explore whether we can analyse the effect of MDB adoption.

In [1]:
import functools
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import s3fs
import scipy
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

sys.path.append("/Users/fgu/dev/projects/entropy")
import entropy.data.aggregators as ag
import entropy.data.cleaners as cl
import entropy.data.make_data as md
import entropy.data.selectors as sl
import entropy.data.validators as vl
import entropy.eval as ev
import entropy.helpers.aws as ha
import entropy.helpers.data as hd
import entropy.helpers.helpers as hh

# Raise pandas display limits: up to 120 rows/columns, and no truncation of cell text.
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)
pd.set_option("max_colwidth", None)
# Render inline figures at retina resolution.
%config InlineBackend.figure_format = 'retina'
# Reload all imported modules before each cell runs, so edits to project code are picked up.
%load_ext autoreload
%autoreload 2
%matplotlib inline

# S3 filesystem handle that uses the "3di" credentials profile.
fs = s3fs.S3FileSystem(profile="3di")

In [2]:
# Load the "X77" transaction data and show a quick summary of the resulting frame.
# NOTE(review): "X77" presumably identifies a data sample; confirm in entropy.helpers.data.
dft = hd.read_txn_data("X77")
hd.inspect(dft)

Time for read_txn_data                 : 56.31 seconds
shape: (6,627,663, 35), users: 2679


Unnamed: 0,date,user_id,amount,desc,merchant,tag_group,tag,user_registration_date,account_created,account_id,account_last_refreshed,account_provider,account_type,birth_year,data_warehouse_date_created,data_warehouse_date_last_updated,id,is_debit,is_female,is_sa_flow,is_salary_pmt,is_urban,latest_balance,lsoa,merchant_business_line,msoa,postcode,region_name,salary_range,tag_auto,tag_manual,tag_spend,tag_up,updated_flag,ym
0,2012-08-01,77,12.12,tv licence mbp - d/d,tv licensing,spend,communication,2010-07-10,2012-10-26,259583,2015-09-11,natwest bank,current,1945.0,2014-07-18,2015-03-19,1212601,True,0.0,False,False,1.0,,e01002984,tv licensing,e02000609,kt3 5,london,,tv licence,no tag,"entertainment, tv, media",tv licence,u,2012-08
1,2012-08-01,77,13.81,amazon mktplace pmts amzn.com/billgbr,amazon,spend,services,2010-07-10,2012-10-26,259584,2014-03-06,natwest bank,credit card,1945.0,2014-07-18,2017-08-15,1213850,True,0.0,False,False,1.0,,e01002984,amazon,e02000609,kt3 5,london,,enjoyment,no tag,household,no tag,u,2012-08


## Data availability

What proportion of txns are pre-signup?

In [3]:
def pre_signup_share(df):
    """Return the share of rows dated before the user's registration date.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with comparable `date` and `user_registration_date` columns.

    Returns
    -------
    float
        Fraction of rows where `date < user_registration_date`; NaN if `df`
        has no rows.
    """
    # The mean of a boolean mask is the share of True values. Unlike the
    # builtin `sum`, it is vectorised (no element-by-element Python loop) and
    # it yields NaN instead of raising ZeroDivisionError on an empty frame.
    return (df.date < df.user_registration_date).mean()


# Overall share across all transactions, formatted as a percentage.
print(
    "{:.1%} of observations are pre-signup.".format(
        pre_signup_share(dft)
    )
)

29.9% of observations are pre-signup.


What's the distribution of pre-signup share of txns across users?

In [4]:
# Percentiles up to the median, in steps of ten points.
pcts = [0.1, 0.2, 0.3, 0.4, 0.5]
(
    dft.groupby("user_id")
    .apply(pre_signup_share)
    .describe(percentiles=pcts)
)

count    2679.000000
mean        0.431379
std         0.374469
min         0.000000
10%         0.017947
20%         0.042603
30%         0.087974
40%         0.193887
50%         0.335071
max         1.000000
dtype: float64

How many days long is the pre-signup window we can observe?

In [5]:
def pre_window_length(df):
    """Return how many days before registration the earliest row in `df` is dated.

    The result is 0 when no row is dated before `user_registration_date`.
    """
    days_from_signup = (df.date - df.user_registration_date).dt.days
    earliest_offset = days_from_signup.min()
    # A negative offset means the earliest row precedes registration.
    if earliest_offset < 0:
        return abs(earliest_offset)
    return 0


# Distribution of the observable pre-signup window (in days) across users.
(
    dft.groupby("user_id")
    .apply(pre_window_length)
    .describe()
)

count    2679.000000
mean      258.262038
std       294.341061
min         0.000000
25%        67.000000
50%        86.000000
75%       349.000000
max      1106.000000
dtype: float64

This suggests we'll have to restrict the sample considerably if we want a pre-signup window of 6 or even 12 months: the median user has only 86 days of pre-signup data.

### Proportion of accounts observed throughout

What proportion of accounts do we observe throughout?

In [15]:
# Distribution of the number of distinct account-creation dates per user.
(
    dft.groupby("user_id")["account_created"]
    .nunique()
    .describe(percentiles=[0.5, 0.6, 0.7, 0.8, 0.9])
)

count    2679.000000
mean        2.039194
std         2.198588
min         1.000000
50%         1.000000
60%         1.000000
70%         2.000000
80%         3.000000
90%         4.000000
max        29.000000
Name: account_created, dtype: float64

In [18]:
# Share of rows whose account was created on the user's registration date.
dft["account_created"].eq(dft["user_registration_date"]).mean()

0.6519130498940576