Notebook purpose

- Determine appropriate model specifications

In [1]:
import os
import sys

import linearmodels as lm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import s3fs
import seaborn as sns

sys.path.append("/Users/fgu/dev/projects/entropy")
import entropy.data.cleaners as cl
import entropy.data.creators as cr
import entropy.figures.figures as figs
import entropy.helpers.aws as ha
import entropy.helpers.data as hd
import entropy.helpers.helpers as hh
from entropy import config

sns.set_style("whitegrid")
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)
pd.set_option("max_colwidth", None)
%config InlineBackend.figure_format = 'retina'
%load_ext autoreload
%autoreload 2

fs = s3fs.S3FileSystem(profile="3di")

In [2]:
dfs, df = hd.read_samples(["777", "XX7"])
hd.inspect(df)

Time for read_sample    : 4.47 seconds
Time for read_sample    : 4.85 minutes
(14,839,981, 31)


Unnamed: 0,id,date,user_id,amount,desc,merchant,tag_group,tag,user_female,user_postcode,user_registration_date,user_salary_range,user_yob,account_created,account_id,account_last_refreshed,account_provider,account_type,data_warehouse_date_created,data_warehouse_date_last_updated,debit,latest_balance,merchant_business_line,tag_auto,tag_manual,tag_up,updated_flag,ym,balance,income,entropy_sptac
0,1859849,2012-11-05,57,-700.0,faster payments receipt ref mdbremoved,,,,0.0,n8 9,2010-05-13,10k to 20k,1987.0,2012-10-23,274000,2020-08-16 07:24:00,santander,current,2014-07-18,2017-08-15,False,14829.769531,,,,current account,u,201211,8504.292969,22136.039062,2.397678
1,1859848,2012-11-08,57,-900.0,faster payments receipt from mdbremoved,,,,0.0,n8 9,2010-05-13,10k to 20k,1987.0,2012-10-23,274000,2020-08-16 07:24:00,santander,current,2014-07-18,2017-08-15,False,14829.769531,,,,current account,u,201211,9404.292969,22136.039062,2.397678


## Make analysis data

In [22]:
dfs[dfs.tag_group.eq("spend")].groupby("tag").amount.sum()

tag
benefits              0.000000
communication     97419.421875
earnings              0.000000
finance          466803.312500
hobbies           49798.539062
household        972438.000000
motor             93256.523438
other_income          0.000000
other_spend      383822.281250
pensions              0.000000
retail           130843.078125
savings               0.000000
services         584479.625000
transfers             0.000000
travel           194286.828125
Name: amount, dtype: float32

In [113]:
def total_monthly_spend(df):
    group_vars = ["user_id", "ym"]
    mask = df.tag_group.eq("spend")
    s = df[mask].groupby(group_vars).amount.sum().rename("total_monthly_spend")
    return df.merge(s, on=group_vars)


def tag_monthly_spend(df):
    df = df.copy()
    group_vars = ["user_id", "ym", "tag"]
    mask = df.tag_group.eq("spend")
    df["tag"] = df.tag.cat.rename_categories(lambda x: "tag_spend_" + x)
    s = (
        df[mask]
        .groupby(group_vars, observed=True)
        .amount.sum()
        .rename("tag_monthly_spend")
        .unstack()
    )
    return df.merge(s, on=["user_id", "ym"])


def monthly_savings(df, trim_pct=5):
    """Aggregates df into inflows, outflows, and net, by user month, trims at
    specified percentile, and scales flows by user's monthly income.
    """

    def trim_column_values(df, **kwargs):
        return df.apply(hd.trim, **kwargs)

    mask = df.account_type.eq("savings") & ~df.tag_auto.str.contains(
        "interest", na=False
    )
    df["debit"] = df.debit.replace({True: "debit", False: "credit"})
    s = (
        df[mask]
        .groupby(["user_id", "ym", "income", "debit"])
        .amount.sum()
        .abs()
        .unstack()
        .reset_index("income")
        .assign(
            sa_inflows=lambda df: df.credit / (df.income / 12) * 100,
            sa_outflows=lambda df: df.debit / (df.income / 12) * 100,
        )
        .fillna(0)
        .assign(sa_inflows_net=lambda df: df.sa_inflows - df.sa_outflows)
        .drop(columns=["income", "debit", "credit"])
        .pipe(trim_column_values, pct=trim_pct)
    )
    return df.merge(s, how="inner", on=["user_id", "ym"])


def make_analysis_data(df):
    df = df.copy()
    # add variables - temp
    df = df.pipe(total_monthly_spend).pipe(tag_monthly_spend).pipe(monthly_savings)

    tag_spend = df.columns[df.columns.str.startswith("tag_spend")]

    # collapse to user-ym
    cols = [
        "sa_inflows",
        "sa_inflows_net",
        "entropy_sptac",
        "income",
        "total_monthly_spend",
    ] + list(tag_spend)
    return cols
    return df.groupby(["user_id", "ym"]).first()[cols]


# analysis_data = make_analysis_data(dfs)
# hd.inspect(analysis_data)
make_analysis_data(dfs)

['sa_inflows',
 'sa_inflows_net',
 'entropy_sptac',
 'income',
 'total_monthly_spend',
 'tag_spend_other_spend',
 'tag_spend_services',
 'tag_spend_finance',
 'tag_spend_communication',
 'tag_spend_motor',
 'tag_spend_household',
 'tag_spend_retail',
 'tag_spend_travel',
 'tag_spend_hobbies']

In [105]:
df.columns.str.startswith("id")

array([ True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False])

In [89]:
dfs

Unnamed: 0,id,date,user_id,amount,desc,merchant,tag_group,tag,user_female,user_postcode,user_registration_date,user_salary_range,user_yob,account_created,account_id,account_last_refreshed,account_provider,account_type,data_warehouse_date_created,data_warehouse_date_last_updated,debit,latest_balance,merchant_business_line,tag_auto,tag_manual,tag_up,updated_flag,ym,balance,income,entropy_sptac
0,2981373,2012-10-09,7777,1400.000000,mdbremoved,,,,0.0,gu14 9,2012-10-27,,1983.0,2013-01-06,151112,2020-03-11 16:27:00,natwest bank,current,2014-07-18,2017-10-23,debit,3110.219971,non merchant mbl,,,,u,201210,4080.421875,28011.000000,2.919691
1,2981372,2012-10-10,7777,30.000000,000054,,spend,tag_spend_tag_spendtag_spendother_spend,0.0,gu14 9,2012-10-27,,1983.0,2013-01-06,151112,2020-03-11 16:27:00,natwest bank,current,2014-07-18,2018-04-30,debit,3110.219971,personal,cash,,,u,201210,4050.421875,28011.000000,2.919691
2,2981516,2012-10-16,7777,11.250000,sodhexo defence,sodexo,spend,tag_spend_tag_spendtag_spendservices,0.0,gu14 9,2012-10-27,,1983.0,2013-01-06,151110,2020-03-11 16:27:00,natwest bank,current,2014-07-18,2017-11-13,debit,22128.669922,sodexo,lunch or snacks,groceries,groceries,u,201210,19822.625000,28011.000000,2.919691
3,2981371,2012-10-16,7777,12.950000,25sep a c 7322 charge,,spend,tag_spend_tag_spendtag_spendfinance,0.0,gu14 9,2012-10-27,,1983.0,2013-01-06,151112,2020-03-11 16:27:00,natwest bank,current,2014-07-18,2017-08-15,debit,3110.219971,account provider,bank charges,,bank charges,u,201210,4037.471924,28011.000000,2.919691
4,2981515,2012-10-17,7777,9.990000,policy admin servs,phones 4 u,spend,tag_spend_tag_spendtag_spendcommunication,0.0,gu14 9,2012-10-27,,1983.0,2013-01-06,151110,2020-03-11 16:27:00,natwest bank,current,2014-07-18,2017-11-13,debit,22128.669922,phones 4 u,mobile,home insurance,home insurance,u,201210,19812.634766,28011.000000,2.919691
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152860,768266742,2018-12-31,582777,7.990000,wh smith cheltenham,wh smith,spend,tag_spend_tag_spendtag_spendretail,1.0,gl52 6,2020-05-05,,,2020-05-05,1683911,2020-05-05 14:29:00,hsbc,current,2020-05-06,1900-01-01,debit,1033.229980,wh smith,books / magazines / newspapers,,books / magazines / newspapers,c,201812,4283.314453,7085.468471,2.551491
152861,768268663,2018-12-31,582777,17.990000,hmv retail ltd cheltenham,hmv,spend,tag_spend_tag_spendtag_spendservices,1.0,gl52 6,2020-05-05,,,2020-05-05,1683911,2020-05-05 14:29:00,hsbc,current,2020-05-06,1900-01-01,debit,1033.229980,hmv,"entertainment, tv, media",,"entertainment, tv, media",c,201812,4283.314453,7085.468471,2.551491
152862,768261172,2018-12-31,582777,3.430000,lidl uk cheltenhamcheltenham,lidl,spend,tag_spend_tag_spendtag_spendhousehold,1.0,gl52 6,2020-05-05,,,2020-05-05,1683911,2020-05-05 14:29:00,hsbc,current,2020-05-06,1900-01-01,debit,1033.229980,lidl,"food, groceries, household",,"food, groceries, household",c,201812,4283.314453,7085.468471,2.551491
152863,768262440,2018-12-31,582777,32.299999,lidl uk cheltenhamcheltenham,lidl,spend,tag_spend_tag_spendtag_spendhousehold,1.0,gl52 6,2020-05-05,,,2020-05-05,1683911,2020-05-05 14:29:00,hsbc,current,2020-05-06,1900-01-01,debit,1033.229980,lidl,"food, groceries, household",,"food, groceries, household",c,201812,4283.314453,7085.468471,2.551491


In [99]:
tag_monthly_spend(dfs)

Unnamed: 0,id,date,user_id,amount,desc,merchant,tag_group,tag,user_female,user_postcode,user_registration_date,user_salary_range,user_yob,account_created,account_id,account_last_refreshed,account_provider,account_type,data_warehouse_date_created,data_warehouse_date_last_updated,debit,latest_balance,merchant_business_line,tag_auto,tag_manual,tag_up,updated_flag,ym,balance,income,entropy_sptac,tag_spend_other_spend,tag_spend_services,tag_spend_finance,tag_spend_communication,tag_spend_motor,tag_spend_household,tag_spend_retail,tag_spend_travel,tag_spend_hobbies
0,2981373,2012-10-09,7777,1400.000000,mdbremoved,,,,0.0,gu14 9,2012-10-27,,1983.0,2013-01-06,151112,2020-03-11 16:27:00,natwest bank,current,2014-07-18,2017-10-23,True,3110.219971,non merchant mbl,,,,u,201210,4080.421875,28011.000000,2.919691,230.0,44.750000,24.930000,155.490005,195.690002,,,,
1,2981372,2012-10-10,7777,30.000000,000054,,spend,tag_spend_other_spend,0.0,gu14 9,2012-10-27,,1983.0,2013-01-06,151112,2020-03-11 16:27:00,natwest bank,current,2014-07-18,2018-04-30,True,3110.219971,personal,cash,,,u,201210,4050.421875,28011.000000,2.919691,230.0,44.750000,24.930000,155.490005,195.690002,,,,
2,2981516,2012-10-16,7777,11.250000,sodhexo defence,sodexo,spend,tag_spend_services,0.0,gu14 9,2012-10-27,,1983.0,2013-01-06,151110,2020-03-11 16:27:00,natwest bank,current,2014-07-18,2017-11-13,True,22128.669922,sodexo,lunch or snacks,groceries,groceries,u,201210,19822.625000,28011.000000,2.919691,230.0,44.750000,24.930000,155.490005,195.690002,,,,
3,2981371,2012-10-16,7777,12.950000,25sep a c 7322 charge,,spend,tag_spend_finance,0.0,gu14 9,2012-10-27,,1983.0,2013-01-06,151112,2020-03-11 16:27:00,natwest bank,current,2014-07-18,2017-08-15,True,3110.219971,account provider,bank charges,,bank charges,u,201210,4037.471924,28011.000000,2.919691,230.0,44.750000,24.930000,155.490005,195.690002,,,,
4,2981515,2012-10-17,7777,9.990000,policy admin servs,phones 4 u,spend,tag_spend_communication,0.0,gu14 9,2012-10-27,,1983.0,2013-01-06,151110,2020-03-11 16:27:00,natwest bank,current,2014-07-18,2017-11-13,True,22128.669922,phones 4 u,mobile,home insurance,home insurance,u,201210,19812.634766,28011.000000,2.919691,230.0,44.750000,24.930000,155.490005,195.690002,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152853,768266742,2018-12-31,582777,7.990000,wh smith cheltenham,wh smith,spend,tag_spend_retail,1.0,gl52 6,2020-05-05,,,2020-05-05,1683911,2020-05-05 14:29:00,hsbc,current,2020-05-06,1900-01-01,True,1033.229980,wh smith,books / magazines / newspapers,,books / magazines / newspapers,c,201812,4283.314453,7085.468471,2.551491,56.5,302.139984,45.709999,22.990000,,292.850006,109.120003,14.8,26.99
152854,768268663,2018-12-31,582777,17.990000,hmv retail ltd cheltenham,hmv,spend,tag_spend_services,1.0,gl52 6,2020-05-05,,,2020-05-05,1683911,2020-05-05 14:29:00,hsbc,current,2020-05-06,1900-01-01,True,1033.229980,hmv,"entertainment, tv, media",,"entertainment, tv, media",c,201812,4283.314453,7085.468471,2.551491,56.5,302.139984,45.709999,22.990000,,292.850006,109.120003,14.8,26.99
152855,768261172,2018-12-31,582777,3.430000,lidl uk cheltenhamcheltenham,lidl,spend,tag_spend_household,1.0,gl52 6,2020-05-05,,,2020-05-05,1683911,2020-05-05 14:29:00,hsbc,current,2020-05-06,1900-01-01,True,1033.229980,lidl,"food, groceries, household",,"food, groceries, household",c,201812,4283.314453,7085.468471,2.551491,56.5,302.139984,45.709999,22.990000,,292.850006,109.120003,14.8,26.99
152856,768262440,2018-12-31,582777,32.299999,lidl uk cheltenhamcheltenham,lidl,spend,tag_spend_household,1.0,gl52 6,2020-05-05,,,2020-05-05,1683911,2020-05-05 14:29:00,hsbc,current,2020-05-06,1900-01-01,True,1033.229980,lidl,"food, groceries, household",,"food, groceries, household",c,201812,4283.314453,7085.468471,2.551491,56.5,302.139984,45.709999,22.990000,,292.850006,109.120003,14.8,26.99


## Main results

Full FE equation as baseline

## Lagged-dependent variable