In [1]:
import contextlib
import os
import re
import sys

import linearmodels as lm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import s3fs
import seaborn as sns
import tabulate

sys.path.append("/Users/fgu/dev/projects/entropy")
import entropy.data.cleaners as cl
import entropy.data.creators as cr
import entropy.data.make_data as md
import entropy.data.selectors as sl
import entropy.figures.figures as figs
import entropy.helpers.aws as ha
import entropy.helpers.data as hd
import entropy.helpers.helpers as hh
from entropy import config

sns.set_style("whitegrid")
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)
pd.set_option("max_colwidth", None)
%config InlineBackend.figure_format = 'retina'
%load_ext autoreload
%autoreload 2

fs = s3fs.S3FileSystem(profile="3di")

In [217]:
df = hd.read_sample("X77")
hd.inspect(df)

Time for read_sample    : 21.02 seconds
(1,745,890, 31)


Unnamed: 0,id,date,user_id,amount,desc,merchant,tag_group,tag,user_female,user_postcode,user_registration_date,user_salary_range,user_yob,account_created,account_id,account_last_refreshed,account_provider,account_type,data_warehouse_date_created,data_warehouse_date_last_updated,debit,latest_balance,merchant_business_line,tag_auto,tag_manual,tag_up,updated_flag,ym,balance,income,entropy_sptac
0,51154,2012-01-01,177,42.349998,windmill windlesham 1,,,,1.0,gu14 9,2010-07-11,,,2011-12-31,178553,2020-06-30 14:04:00,barclaycard,credit card,2014-07-18,2017-10-23,True,-114.349998,,,dining and drinking,dining and drinking,u,201201,3207.884521,38445.410156,2.840509
1,51155,2012-01-01,177,9.99,spotify m1nkeh london 9 99 pound sterling united kingdom,spotify,spend,hobbies,1.0,gu14 9,2010-07-11,,,2011-12-31,178553,2020-06-30 14:04:00,barclaycard,credit card,2014-07-18,2016-04-03,True,-114.349998,spotify,music,media bundle,media bundle,u,201201,3207.884521,38445.410156,2.840509


Gaps

In [223]:
a = df.set_index("date").groupby("user_id").resample("m").id.count().index
b = df.groupby(["user_id", month]).id.count().index

all(a == b)

True

Tag consistency

In [226]:
df[df.tag_group.eq('spend')].tag.nunique()

9

In [6]:
sample = "X77"
fn = os.path.join("s3://3di-data-mdb/raw", f"mdb_{sample}.parquet")
raw = ha.read_parquet(fn)
clean = md.clean_data(raw)
hd.inspect(clean)

(6,765,276, 28)


Unnamed: 0,id,date,user_id,amount,desc,merchant,tag_group,tag,user_female,user_postcode,user_registration_date,user_salary_range,user_yob,account_created,account_id,account_last_refreshed,account_provider,account_type,data_warehouse_date_created,data_warehouse_date_last_updated,debit,latest_balance,merchant_business_line,tag_auto,tag_manual,tag_up,updated_flag,ym
4328,1212603,2012-07-30,77,40.0,lloyds bank 28jul,,spend,other_spend,0.0,kt3 5,2010-07-10,,1945.0,2012-10-26,259583,2015-09-11,natwest bank,current,2014-07-18,2017-10-24,True,,personal,cash,,cash,u,201207
4330,1212608,2012-07-30,77,10.0,mdbremoved,,,,0.0,kt3 5,2010-07-10,,1945.0,2012-10-26,259583,2015-09-11,natwest bank,current,2014-07-18,2017-10-23,True,,non merchant mbl,,,,u,201207


In [None]:
def tester(df):
    g = df.groupby("user_id")
    latest_balances_available = g.latest_balance.min().notna()
    valid_last_refresh_dates = g.account_last_refreshed.min() >= g.date.min()
    cond = latest_balances_available & valid_last_refresh_dates
    users = cond[cond].index
    return df[df.user_id.isin(users)]

In [120]:
data = (
    clean.pipe(sl.min_number_of_months)
    .pipe(sl.no_missing_months)
    .pipe(sl.account_balances_available)
)

In [121]:
data.user_id.nunique(), sl.no_missing_months(data).user_id.nunique()

(1652, 1652)

In [173]:
df = hd.read_sample("X77")
hd.inspect(df)

Time for read_sample    : 23.74 seconds
(1,745,890, 31)


Unnamed: 0,id,date,user_id,amount,desc,merchant,tag_group,tag,user_female,user_postcode,user_registration_date,user_salary_range,user_yob,account_created,account_id,account_last_refreshed,account_provider,account_type,data_warehouse_date_created,data_warehouse_date_last_updated,debit,latest_balance,merchant_business_line,tag_auto,tag_manual,tag_up,updated_flag,ym,balance,income,entropy_sptac
0,51154,2012-01-01,177,42.349998,windmill windlesham 1,,,,1.0,gu14 9,2010-07-11,,,2011-12-31,178553,2020-06-30 14:04:00,barclaycard,credit card,2014-07-18,2017-10-23,True,-114.349998,,,dining and drinking,dining and drinking,u,201201,3207.884521,38445.410156,2.840509
1,51155,2012-01-01,177,9.99,spotify m1nkeh london 9 99 pound sterling united kingdom,spotify,spend,hobbies,1.0,gu14 9,2010-07-11,,,2011-12-31,178553,2020-06-30 14:04:00,barclaycard,credit card,2014-07-18,2016-04-03,True,-114.349998,spotify,music,media bundle,media bundle,u,201201,3207.884521,38445.410156,2.840509


In [210]:
df.user_id.nunique(), sl.no_missing_months(df).user_id.nunique()

(605, 605)

## tags

In [141]:
import entropy.data.txn_classifications as tc

Check that definitions are correct

In [146]:
tc.tag_groups.keys()

dict_keys(['income', 'spend', 'transfers'])

In [197]:
from_groups = set(tc.tag_groups["transfers"])

elements = []
for key in tc.transfers_subgroups:
    elements += tc.transfers_subgroups[key]
from_subgroups = set(elements)


assert len(elements) == len(from_subgroups)
assert from_subgroups == from_groups

In [209]:
a = set(tc.tag_groups["income"])
b = set(tc.tag_groups["spend"])
c = set(tc.tag_groups["transfers"])

b & c

set()

In [212]:
m = hd.read_sample("777")
m

Time for read_sample    : 6.81 seconds


Unnamed: 0,id,date,user_id,amount,desc,merchant,tag_group,tag,user_female,user_postcode,user_registration_date,user_salary_range,user_yob,account_created,account_id,account_last_refreshed,account_provider,account_type,data_warehouse_date_created,data_warehouse_date_last_updated,debit,latest_balance,merchant_business_line,tag_auto,tag_manual,tag_up,updated_flag,ym,balance,income,entropy_sptac
0,2981373,2012-10-09,7777,1400.000000,mdbremoved,,,,0.0,gu14 9,2012-10-27,,1983.0,2013-01-06,151112,2020-03-11 16:27:00,natwest bank,current,2014-07-18,2017-10-23,True,3110.219971,non merchant mbl,,,,u,201210,4080.421875,28011.000000,2.919691
1,2981372,2012-10-10,7777,30.000000,000054,,spend,other_spend,0.0,gu14 9,2012-10-27,,1983.0,2013-01-06,151112,2020-03-11 16:27:00,natwest bank,current,2014-07-18,2018-04-30,True,3110.219971,personal,cash,,,u,201210,4050.421875,28011.000000,2.919691
2,2981516,2012-10-16,7777,11.250000,sodhexo defence,sodexo,spend,services,0.0,gu14 9,2012-10-27,,1983.0,2013-01-06,151110,2020-03-11 16:27:00,natwest bank,current,2014-07-18,2017-11-13,True,22128.669922,sodexo,lunch or snacks,groceries,groceries,u,201210,19822.625000,28011.000000,2.919691
3,2981371,2012-10-16,7777,12.950000,25sep a c 7322 charge,,spend,finance,0.0,gu14 9,2012-10-27,,1983.0,2013-01-06,151112,2020-03-11 16:27:00,natwest bank,current,2014-07-18,2017-08-15,True,3110.219971,account provider,bank charges,,bank charges,u,201210,4037.471924,28011.000000,2.919691
4,2981515,2012-10-17,7777,9.990000,policy admin servs,phones 4 u,spend,communication,0.0,gu14 9,2012-10-27,,1983.0,2013-01-06,151110,2020-03-11 16:27:00,natwest bank,current,2014-07-18,2017-11-13,True,22128.669922,phones 4 u,mobile,home insurance,home insurance,u,201210,19812.634766,28011.000000,2.919691
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169244,768266742,2018-12-31,582777,7.990000,wh smith cheltenham,wh smith,spend,retail,1.0,gl52 6,2020-05-05,,,2020-05-05,1683911,2020-05-05 14:29:00,hsbc,current,2020-05-06,1900-01-01,True,1033.229980,wh smith,books / magazines / newspapers,,books / magazines / newspapers,c,201812,4283.314453,7085.468471,2.551491
169245,768268663,2018-12-31,582777,17.990000,hmv retail ltd cheltenham,hmv,spend,services,1.0,gl52 6,2020-05-05,,,2020-05-05,1683911,2020-05-05 14:29:00,hsbc,current,2020-05-06,1900-01-01,True,1033.229980,hmv,"entertainment, tv, media",,"entertainment, tv, media",c,201812,4283.314453,7085.468471,2.551491
169246,768261172,2018-12-31,582777,3.430000,lidl uk cheltenhamcheltenham,lidl,spend,household,1.0,gl52 6,2020-05-05,,,2020-05-05,1683911,2020-05-05 14:29:00,hsbc,current,2020-05-06,1900-01-01,True,1033.229980,lidl,"food, groceries, household",,"food, groceries, household",c,201812,4283.314453,7085.468471,2.551491
169247,768262440,2018-12-31,582777,32.299999,lidl uk cheltenhamcheltenham,lidl,spend,household,1.0,gl52 6,2020-05-05,,,2020-05-05,1683911,2020-05-05 14:29:00,hsbc,current,2020-05-06,1900-01-01,True,1033.229980,lidl,"food, groceries, household",,"food, groceries, household",c,201812,4283.314453,7085.468471,2.551491


In [213]:
m[m.tag_group.eq("spend")].tag.unique()

['other_spend', 'services', 'finance', 'communication', 'motor', 'household', 'retail', 'travel', 'hobbies']
Categories (15, object): ['benefits', 'communication', 'earnings', 'finance', ..., 'savings', 'services', 'transfers', 'travel']