In [1]:
import contextlib
import os
import re
import sys

import linearmodels as lm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import s3fs
import seaborn as sns
import tabulate

# Make the local `entropy` project importable for the imports that follow.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not import `entropy` on another machine unless the package is installed or
# this path is adjusted.
sys.path.append("/Users/fgu/dev/projects/entropy")
import entropy.data.cleaners as cl
import entropy.data.creators as cr
import entropy.data.make_data as md
import entropy.data.selectors as sl
import entropy.figures.figures as figs
import entropy.helpers.aws as ha
import entropy.helpers.data as hd
import entropy.helpers.helpers as hh
from entropy import config

# Plot styling and pandas display limits used for interactive inspection.
sns.set_style("whitegrid")
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)
# None lifts the column-width limit so long text fields are not truncated.
pd.set_option("max_colwidth", None)
# Render inline figures in 'retina' format.
%config InlineBackend.figure_format = 'retina'
# Autoreload mode 2: re-import modules before running each cell, so edits to
# the project package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# S3 filesystem handle created with the "3di" profile.
fs = s3fs.S3FileSystem(profile="3di")

In [6]:
# Load the raw data for one sample from S3, run the project cleaning step on
# it, and inspect the cleaned frame.
sample = "X77"
# Build the S3 URI with plain string formatting: os.path.join uses the local
# OS path separator and is meant for filesystem paths, so it would produce a
# broken URI on platforms whose separator is not "/".
fn = f"s3://3di-data-mdb/raw/mdb_{sample}.parquet"
raw = ha.read_parquet(fn)
clean = md.clean_data(raw)
hd.inspect(clean)

(6,765,276, 28)


Unnamed: 0,id,date,user_id,amount,desc,merchant,tag_group,tag,user_female,user_postcode,user_registration_date,user_salary_range,user_yob,account_created,account_id,account_last_refreshed,account_provider,account_type,data_warehouse_date_created,data_warehouse_date_last_updated,debit,latest_balance,merchant_business_line,tag_auto,tag_manual,tag_up,updated_flag,ym
4328,1212603,2012-07-30,77,40.0,lloyds bank 28jul,,spend,other_spend,0.0,kt3 5,2010-07-10,,1945.0,2012-10-26,259583,2015-09-11,natwest bank,current,2014-07-18,2017-10-24,True,,personal,cash,,cash,u,201207
4330,1212608,2012-07-30,77,10.0,mdbremoved,,,,0.0,kt3 5,2010-07-10,,1945.0,2012-10-26,259583,2015-09-11,natwest bank,current,2014-07-18,2017-10-23,True,,non merchant mbl,,,,u,201207


In [None]:
def tester(df):
    g = df.groupby("user_id")
    latest_balances_available = g.latest_balance.min().notna()
    valid_last_refresh_dates = g.account_last_refreshed.min() >= g.date.min()
    cond = latest_balances_available & valid_last_refresh_dates
    users = cond[cond].index
    return df[df.user_id.isin(users)]

In [112]:
# Apply the three sample selectors to the cleaned data, innermost call first:
# min_number_of_months -> no_missing_months -> account_balances_available.
data = sl.account_balances_available(
    sl.no_missing_months(
        sl.min_number_of_months(clean)
    )
)

In [113]:
# Number of distinct users in `data`, and again after re-applying the
# no-missing-months selector; equal counts mean it removes no further users.
(
    data["user_id"].nunique(),
    sl.no_missing_months(data)["user_id"].nunique(),
)

(1734, 1734)

In [114]:
# Read the sample through the project helper. Reuse `sample` instead of
# repeating the literal so this frame and `data` always refer to the same
# sample when they are compared.
df = hd.read_sample(sample)

Time for read_sample    : 28.53 seconds


In [115]:
# Number of distinct users in `df`, and again after applying the
# no-missing-months selector; a lower second count means the selector still
# drops users from this frame.
(
    df["user_id"].nunique(),
    sl.no_missing_months(df)["user_id"].nunique(),
)

(599, 595)

In [119]:
# Show the selected data; the rendered output is truncated to the first and
# last rows.
data

Unnamed: 0,id,date,user_id,amount,desc,merchant,tag_group,tag,user_female,user_postcode,user_registration_date,user_salary_range,user_yob,account_created,account_id,account_last_refreshed,account_provider,account_type,data_warehouse_date_created,data_warehouse_date_last_updated,debit,latest_balance,merchant_business_line,tag_auto,tag_manual,tag_up,updated_flag,ym
641,51154,2012-01-01,177,42.349998,windmill windlesham 1,,,,1.0,gu14 9,2010-07-11,,,2011-12-31,178553,2020-06-30 14:04:00,barclaycard,credit card,2014-07-18,2017-10-23,True,-114.349998,,,dining and drinking,dining and drinking,u,201201
642,51155,2012-01-01,177,9.990000,spotify m1nkeh london 9 99 pound sterling united kingdom,spotify,spend,hobbies,1.0,gu14 9,2010-07-11,,,2011-12-31,178553,2020-06-30 14:04:00,barclaycard,credit card,2014-07-18,2016-04-03,True,-114.349998,spotify,music,media bundle,media bundle,u,201201
596,51156,2012-01-03,177,-1055.569946,payment thank you,barclaycard,transfers,transfers,1.0,gu14 9,2010-07-11,,,2011-12-31,178553,2020-06-30 14:04:00,barclaycard,credit card,2014-07-18,2017-08-15,False,-114.349998,barclaycard,credit card,,current account,u,201201
613,50721,2012-01-03,177,1000.000000,halifax a 5633 9,halifax,spend,finance,1.0,gu14 9,2010-07-11,,,2011-12-07,178558,2016-08-17 04:13:00,smile,current,2014-07-18,2018-04-30,True,505.000000,halifax,personal loan,mortgage payment,mortgage payment,u,201201
614,50714,2012-01-03,177,50.000000,northern rock mdbremoved,northern rock,,,1.0,gu14 9,2010-07-11,,,2011-12-07,178558,2016-08-17 04:13:00,smile,current,2014-07-18,2018-04-30,True,505.000000,northern rock,,saving (general),saving (general),u,201201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6948578,803020182,2020-07-30,589177,26.469999,the range on 29 jul clp,the range,spend,household,1.0,pl2 2,2020-07-25,20k to 30k,1989.0,2020-07-25,1726615,2020-08-16 12:12:00,barclays,current,2020-08-01,1900-01-01,True,-9.590000,the range,home,,home,c,202007
6948682,803020184,2020-07-30,589177,123.320000,sainsbury s s mkt on 29 jul bcc,sainsburys,spend,household,1.0,pl2 2,2020-07-25,20k to 30k,1989.0,2020-07-25,1726615,2020-08-16 12:12:00,barclays,current,2020-08-01,1900-01-01,True,-9.590000,sainsburys supermarket,"food, groceries, household",groceries,groceries,c,202007
6948684,803020183,2020-07-30,589177,14.690000,rss milehouse on 29 jul clp,,,,1.0,pl2 2,2020-07-25,20k to 30k,1989.0,2020-07-25,1726615,2020-08-16 12:12:00,barclays,current,2020-08-01,1900-01-01,True,-9.590000,,,fuel,fuel,c,202007
6948713,803020181,2020-07-30,589177,-369.880005,mdbremoved,paypal,spend,services,1.0,pl2 2,2020-07-25,20k to 30k,1989.0,2020-07-25,1726615,2020-08-16 12:12:00,barclays,current,2020-08-01,1900-01-01,False,-9.590000,paypal,enjoyment,transfers,transfers,c,202007
