In [1]:
import contextlib
import os
import re
import sys

import linearmodels as lm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import s3fs
import seaborn as sns
import tabulate

sys.path.append("/Users/fgu/dev/projects/entropy")
import entropy.data.cleaners as cl
import entropy.data.creators as cr
import entropy.data.make_data as md
import entropy.data.selectors as sl
import entropy.figures.figures as figs
import entropy.helpers.aws as ha
import entropy.helpers.data as hd
import entropy.helpers.helpers as hh
from entropy import config

sns.set_style("whitegrid")
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)
pd.set_option("max_colwidth", None)
%config InlineBackend.figure_format = 'retina'
%load_ext autoreload
%autoreload 2

fs = s3fs.S3FileSystem(profile="3di")

In [6]:
sample = "X77"
fn = os.path.join("s3://3di-data-mdb/raw", f"mdb_{sample}.parquet")
raw = ha.read_parquet(fn)
clean = md.clean_data(raw)
hd.inspect(clean)

(6,765,276, 28)


Unnamed: 0,id,date,user_id,amount,desc,merchant,tag_group,tag,user_female,user_postcode,user_registration_date,user_salary_range,user_yob,account_created,account_id,account_last_refreshed,account_provider,account_type,data_warehouse_date_created,data_warehouse_date_last_updated,debit,latest_balance,merchant_business_line,tag_auto,tag_manual,tag_up,updated_flag,ym
4328,1212603,2012-07-30,77,40.0,lloyds bank 28jul,,spend,other_spend,0.0,kt3 5,2010-07-10,,1945.0,2012-10-26,259583,2015-09-11,natwest bank,current,2014-07-18,2017-10-24,True,,personal,cash,,cash,u,201207
4330,1212608,2012-07-30,77,10.0,mdbremoved,,,,0.0,kt3 5,2010-07-10,,1945.0,2012-10-26,259583,2015-09-11,natwest bank,current,2014-07-18,2017-10-23,True,,non merchant mbl,,,,u,201207


In [None]:
def tester(df):
    g = df.groupby("user_id")
    latest_balances_available = g.latest_balance.min().notna()
    valid_last_refresh_dates = g.account_last_refreshed.min() >= g.date.min()
    cond = latest_balances_available & valid_last_refresh_dates
    users = cond[cond].index
    return df[df.user_id.isin(users)]

In [109]:
data = (
    clean.pipe(sl.min_number_of_months)
    .pipe(sl.no_missing_months)
    .pipe(tester)
    # .pipe.pipe(sl.account_balances_available)
)

In [110]:
data.user_id.nunique(), sl.no_missing_months(data).user_id.nunique()

(1734, 1734)

In [20]:
def account_balances_available(df):
    """Account balances available

    Retains only users for whom we can calculate running balances.
    This requires a non-missing `latest_balance` and a valid
    `account_last_refreshed` date. The latter is invalid if it is smaller
    than the date of the first transaction we observe for the user, which
    happens in a few cases where the date is set to a dummy date like 1 Jan 1990.
    """
    current_or_savings_account = df.account_type.isin(["current", "savings"])
    latest_balance_available = df.latest_balance.notna()
    user_first_observed = df.groupby("user_id").date.transform("min")
    valid_refresh_date = df.account_last_refreshed >= user_first_observed

    return current_or_savings_account

    return df.loc[
        current_or_savings_account & latest_balance_available & valid_refresh_date
    ]


account_balances_available(clean)

4328       True
4330       True
4331       True
4334       True
4343       True
           ... 
6947113    True
6947268    True
6947545    True
6948061    True
6948556    True
Name: account_type, Length: 6765276, dtype: bool

In [104]:
def tester(df):
    g = df.groupby("user_id")
    latest_balances_available = g.latest_balance.min().notna()
    valid_last_refresh_dates = g.account_last_refreshed.min() >= g.date.min()
    cond = latest_balances_available & valid_last_refresh_dates
    users = cond[cond].index
    return df[df.user_id.isin(users)]


tester(clean)

Unnamed: 0,id,date,user_id,amount,desc,merchant,tag_group,tag,user_female,user_postcode,user_registration_date,user_salary_range,user_yob,account_created,account_id,account_last_refreshed,account_provider,account_type,data_warehouse_date_created,data_warehouse_date_last_updated,debit,latest_balance,merchant_business_line,tag_auto,tag_manual,tag_up,updated_flag,ym
641,51154,2012-01-01,177,42.349998,windmill windlesham 1,,,,1.0,gu14 9,2010-07-11,,,2011-12-31,178553,2020-06-30 14:04:00,barclaycard,credit card,2014-07-18,2017-10-23,True,-114.349998,,,dining and drinking,dining and drinking,u,201201
642,51155,2012-01-01,177,9.990000,spotify m1nkeh london 9 99 pound sterling united kingdom,spotify,spend,hobbies,1.0,gu14 9,2010-07-11,,,2011-12-31,178553,2020-06-30 14:04:00,barclaycard,credit card,2014-07-18,2016-04-03,True,-114.349998,spotify,music,media bundle,media bundle,u,201201
596,51156,2012-01-03,177,-1055.569946,payment thank you,barclaycard,transfers,transfers,1.0,gu14 9,2010-07-11,,,2011-12-31,178553,2020-06-30 14:04:00,barclaycard,credit card,2014-07-18,2017-08-15,False,-114.349998,barclaycard,credit card,,current account,u,201201
613,50721,2012-01-03,177,1000.000000,halifax a 5633 9,halifax,spend,finance,1.0,gu14 9,2010-07-11,,,2011-12-07,178558,2016-08-17 04:13:00,smile,current,2014-07-18,2018-04-30,True,505.000000,halifax,personal loan,mortgage payment,mortgage payment,u,201201
614,50714,2012-01-03,177,50.000000,northern rock mdbremoved,northern rock,,,1.0,gu14 9,2010-07-11,,,2011-12-07,178558,2016-08-17 04:13:00,smile,current,2014-07-18,2018-04-30,True,505.000000,northern rock,,saving (general),saving (general),u,201201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6947113,802628333,2020-07-28,589277,87.559998,lloyds bank mtg,lloyds,spend,household,0.0,np19 8,2020-07-29,10k to 20k,1970.0,2020-07-29,1727846,2020-08-16 00:57:00,natwest bank,current,2020-07-30,1900-01-01,True,253.339996,lloyds,mortgage payment,,mortgage payment,c,202007
6947268,802777208,2020-07-28,589277,5.000000,exchanged to xlm,,transfers,transfers,0.0,np19 8,2020-07-29,10k to 20k,1970.0,2020-07-30,1728045,2020-08-16 10:16:00,revolut,current,2020-07-31,1900-01-01,True,7.940000,personal,transfers,,transfers,c,202007
6947545,802675407,2020-07-29,589277,27.950001,lloyds mortgages,lloyds,spend,household,0.0,np19 8,2020-07-29,10k to 20k,1970.0,2020-07-29,1727853,2020-08-16 03:31:00,lloyds bank,current,2020-07-30,1900-01-01,True,1406.550049,lloyds,mortgage payment,,mortgage payment,c,202007
6948061,802919697,2020-07-29,589277,-50.000000,mdbremoved,,,,0.0,np19 8,2020-07-29,10k to 20k,1970.0,2020-07-29,1727846,2020-08-16 00:57:00,natwest bank,current,2020-07-31,1900-01-01,False,253.339996,,,,,c,202007
