Notebook purpose

- Determine appropriate model specifications

Background: 

- Average household savings in the UK: https://www.nimblefins.co.uk/savings-accounts/average-household-savings-uk#nogo

In [167]:
import contextlib
import os
import re
import sys

import linearmodels as lm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import s3fs
import seaborn as sns
import tabulate

sys.path.append("/Users/fgu/dev/projects/entropy")
import entropy.data.cleaners as cl
import entropy.data.creators as cr
import entropy.data.selectors as sl
import entropy.figures.figures as figs
import entropy.helpers.aws as ha
import entropy.helpers.data as hd
import entropy.helpers.helpers as hh
from entropy import config

# Plot style and generous pandas display limits for inspecting wide frames.
sns.set_style("whitegrid")
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)
pd.set_option("max_colwidth", None)
%config InlineBackend.figure_format = 'retina'
# Pick up edits to imported project modules without restarting the kernel.
%load_ext autoreload
%autoreload 2

# NOTE(review): `fs` is not referenced in the cells visible here — confirm it is still needed.
fs = s3fs.S3FileSystem(profile="3di")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load raw data

In [2]:
# Read the "777" and "XX7" samples; `df` is the frame used by all cells below.
# Slow: one of the reads took about five minutes in the recorded run.
dfs, df = hd.read_samples(["777", "XX7"])
hd.inspect(df)

Time for read_sample    : 3.29 seconds
Time for read_sample    : 5.14 minutes
(14,839,981, 31)


Unnamed: 0,id,date,user_id,amount,desc,merchant,tag_group,tag,user_female,user_postcode,user_registration_date,user_salary_range,user_yob,account_created,account_id,account_last_refreshed,account_provider,account_type,data_warehouse_date_created,data_warehouse_date_last_updated,debit,latest_balance,merchant_business_line,tag_auto,tag_manual,tag_up,updated_flag,ym,balance,income,entropy_sptac
0,1859849,2012-11-05,57,-700.0,faster payments receipt ref mdbremoved,,,,0.0,n8 9,2010-05-13,10k to 20k,1987.0,2012-10-23,274000,2020-08-16 07:24:00,santander,current,2014-07-18,2017-08-15,False,14829.769531,,,,current account,u,201211,8504.292969,22136.039062,2.397678
1,1859848,2012-11-08,57,-900.0,faster payments receipt from mdbremoved,,,,0.0,n8 9,2010-05-13,10k to 20k,1987.0,2012-10-23,274000,2020-08-16 07:24:00,santander,current,2014-07-18,2017-08-15,False,14829.769531,,,,current account,u,201211,9404.292969,22136.039062,2.397678


## Make analysis data

Build a user-by-month panel containing the dependent and independent variables.

In [55]:
# Panel index: one row per user and month of `date` (month-end labels in the cached output).
month = pd.Grouper(key="date", freq="m")
idx_cols = ["user_id", month]

# Registry of column-building functions, filled by @column_adder and consumed by main().
column_makers = []


def column_adder(func):
    """Decorator that appends func to column_makers and returns it unchanged."""
    column_makers.append(func)
    return func


@column_adder
def obs_count(df):
    """Counts non-missing `id` values per user-month and names the series `obs`."""
    id_counts = df.groupby(idx_cols)["id"].count()
    return id_counts.rename("obs")


@column_adder
def account_balances(df):
    """Returns monthly mean balances per user with one column per account type.

    The `current` and `savings` columns are renamed to `balance_ca` and
    `balance_sa`; any other account type keeps its original label.
    """
    # One balance per account and day: the first value found for that day.
    account_day_keys = ["user_id", "account_type", "account_id", "date"]
    daily_by_account = df.groupby(account_day_keys, observed=True).balance.first()

    # Add up all accounts of the same type for each user and day.
    type_day_keys = ["user_id", "account_type", "date"]
    daily_by_type = daily_by_account.groupby(type_day_keys, observed=True).sum()

    # Average the daily totals within each month.
    # NOTE(review): unlike the two groupbys above, this one does not pass
    # observed=True — confirm that is intended.
    monthly_mean = (
        daily_by_type.reset_index()
        .set_index("date")
        .groupby(["user_id", "account_type"])
        .balance.resample("m")
        .mean()
    )

    # Spread account types into columns and apply the panel's column names.
    new_names = {"current": "balance_ca", "savings": "balance_sa"}
    return monthly_mean.unstack(level="account_type").rename(columns=new_names)


@column_adder
def savings_accounts_flows(df):
    """Calculates monthly inflows, outflows, and net-inflows into user's savings accounts.

    Only savings-account transactions whose `tag_auto` does not contain
    "interest" are used. Flows are absolute sums of `amount`; the scaled
    variants express them as a percentage of `income / 12`.

    Returns a frame indexed by user and month with columns `sa_inflows`,
    `sa_outflows`, `sa_net_inflows`, `sa_scaled_inflows`, `sa_scaled_outflows`
    and `sa_scaled_net_inflows`.
    """
    is_not_interest_txn = ~df.tag_auto.str.contains("interest", na=False)
    is_savings_account = df.account_type.eq("savings")
    mask = is_not_interest_txn & is_savings_account
    group_cols = idx_cols + ["income", "debit"]

    # Filter first: boolean indexing already yields a new frame, so only the
    # savings rows are copied and relabelled instead of the full input, and
    # the caller's frame is still left untouched.
    flows = df[mask].assign(
        debit=lambda d: d.debit.replace({True: "sa_outflows", False: "sa_inflows"})
    )

    return (
        flows.groupby(group_cols)
        .amount.sum()
        .abs()
        # one column per flow direction; months with a single direction get 0
        .unstack()
        .fillna(0)
        # income is only needed as a column for scaling and is dropped again below
        .reset_index("income")
        .assign(
            sa_net_inflows=lambda x: x.sa_inflows - x.sa_outflows,
            sa_scaled_inflows=lambda x: x.sa_inflows / (x.income / 12) * 100,
            sa_scaled_outflows=lambda x: x.sa_outflows / (x.income / 12) * 100,
            sa_scaled_net_inflows=lambda x: x.sa_scaled_inflows
            - x.sa_scaled_outflows,
        )
        .drop(columns="income")
    )


@column_adder
def tag_entropy(df):
    """Returns the first `entropy_sptac` value of each user-month."""
    by_user_month = df.groupby(idx_cols)
    return by_user_month["entropy_sptac"].first()


@column_adder
def total_monthly_spend(df):
    """Sums `amount` per user-month over rows whose `tag_group` is "spend"."""
    spend_txns = df[df.tag_group.eq("spend")]
    monthly_total = spend_txns.groupby(idx_cols)["amount"].sum()
    return monthly_total.rename("total_monthly_spend")


@column_adder
def tag_monthly_spend(df):
    """Sums `amount` per user-month and tag over rows whose `tag_group` is "spend".

    Returns one column per observed tag, named `tag_spend_<tag>`, with 0 for
    user-months that have no spend in a tag.
    """
    # Filter first: boolean indexing already yields a new frame, so only the
    # spend rows are copied and relabelled instead of the full input, and the
    # caller's frame is still left untouched.
    spend = df[df.tag_group.eq("spend")].assign(
        tag=lambda d: d.tag.cat.rename_categories(lambda x: "tag_spend_" + x)
    )
    group_cols = idx_cols + ["tag"]
    return spend.groupby(group_cols, observed=True).amount.sum().unstack().fillna(0)


def main(df):
    """Runs every registered column maker on df and joins the results column-wise."""
    columns = [make_column(df) for make_column in column_makers]
    return pd.concat(columns, axis=1)


filepath = "/Users/fgu/tmp/en/analysis_data_XX7.parquet"

# Build the panel only when no cached copy exists; delete the file to force a
# rebuild after changing any column maker. The data are always read back from
# disk so `data` is the same whether or not the cache was just rebuilt.
if not os.path.exists(filepath):
    ha.write_parquet(main(df), filepath, index=True)

data = ha.read_parquet(filepath)
hd.inspect(data)

(174,655, 20)


Unnamed: 0_level_0,Unnamed: 1_level_0,obs,balance_ca,balance_sa,sa_inflows,sa_outflows,sa_net_inflows,sa_scaled_inflows,sa_scaled_outflows,sa_scaled_net_inflows,entropy_sptac,total_monthly_spend,tag_spend_household,tag_spend_other_spend,tag_spend_services,tag_spend_travel,tag_spend_hobbies,tag_spend_retail,tag_spend_finance,tag_spend_communication,tag_spend_motor
user_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
57,2012-11-30,20.0,8447.646484,,,,,,,,2.397678,1411.040039,1230.430054,90.0,64.610001,26.0,0.0,0.0,0.0,0.0,0.0
57,2012-12-31,32.0,8575.141602,,,,,,,,2.498083,1740.310059,1398.550049,121.400002,129.759995,80.099998,10.5,0.0,0.0,0.0,0.0


In [None]:
# Monthly rows of user 57 for 2013 (the string selects on the date level).
# NOTE(review): the recorded output carries a truncated warning from this lookup —
# confirm it still works on the pandas version in use.
data.loc[57, "2013"]

  return self[key]


Unnamed: 0_level_0,obs,balance_ca,balance_sa,sa_inflows,sa_outflows,sa_net_inflows,sa_scaled_inflows,sa_scaled_outflows,sa_scaled_net_inflows,entropy_sptac,total_monthly_spend,tag_spend_household,tag_spend_other_spend,tag_spend_services,tag_spend_travel,tag_spend_hobbies,tag_spend_retail,tag_spend_finance,tag_spend_communication,tag_spend_motor
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2013-01-31,38.0,8174.982422,,,,,,,,2.23037,1454.369995,1268.680054,44.799999,140.889999,0.0,0.0,0.0,0.0,0.0,0.0
2013-02-28,37.0,7484.873535,,,,,,,,2.489379,1764.51001,1287.640015,70.0,271.910004,110.699997,0.0,24.26,0.0,0.0,0.0
2013-03-31,40.0,6646.179199,,,,,,,,2.361511,1481.949951,1172.589966,100.0,195.080002,0.0,0.0,14.28,0.0,0.0,0.0
2013-04-30,41.0,6857.299805,,,,,,,,2.308423,2051.320068,1591.839966,0.0,315.630005,142.600006,0.0,0.0,1.25,0.0,0.0
2013-05-31,27.0,6670.289062,,,,,,,,2.474028,1499.0,1128.449951,0.0,357.559998,0.0,0.0,12.99,0.0,0.0,0.0
2013-06-30,,,,,,,,,,,,,,,,,,,,
2013-07-31,,,,,,,,,,,,,,,,,,,,
2013-08-31,,,,,,,,,,,,,,,,,,,,
2013-09-30,,,,,,,,,,,,,,,,,,,,
2013-10-31,,,,,,,,,,,,,,,,,,,,


In [138]:
# Rows where `obs` is missing, i.e. user-months without a transaction count.
data[data.obs.isna()]

Unnamed: 0_level_0,Unnamed: 1_level_0,obs,balance_ca,balance_sa,sa_inflows,sa_outflows,sa_net_inflows,sa_scaled_inflows,sa_scaled_outflows,sa_scaled_net_inflows,entropy_sptac,total_monthly_spend,tag_spend_household,tag_spend_other_spend,tag_spend_services,tag_spend_travel,tag_spend_hobbies,tag_spend_retail,tag_spend_finance,tag_spend_communication,tag_spend_motor
user_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
57,2013-06-30,,,,,,,,,,,,,,,,,,,,
57,2013-07-31,,,,,,,,,,,,,,,,,,,,
57,2013-08-31,,,,,,,,,,,,,,,,,,,,
57,2013-09-30,,,,,,,,,,,,,,,,,,,,
57,2013-10-31,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
525747,2019-10-31,,,,,,,,,,,,,,,,,,,,
525747,2019-11-30,,,,,,,,,,,,,,,,,,,,
525747,2019-12-31,,,,,,,,,,,,,,,,,,,,
543587,2019-08-31,,,,,,,,,,,,,,,,,,,,


In [137]:
# Number of missing values in each column.
data.isna().sum()

obs                           239
balance_ca                    420
balance_sa                 112150
sa_inflows                 117417
sa_outflows                117417
sa_net_inflows             117417
sa_scaled_inflows          117417
sa_scaled_outflows         117417
sa_scaled_net_inflows      117417
entropy_sptac                 239
total_monthly_spend           410
tag_spend_household           410
tag_spend_other_spend         410
tag_spend_services            410
tag_spend_travel              410
tag_spend_hobbies             410
tag_spend_retail              410
tag_spend_finance             410
tag_spend_communication       410
tag_spend_motor               410
dtype: int64

## Sumstats

In [10]:
def trim_columns_upper(df, pct=0, ends="upper"):
    """Applies hd.trim with the given pct to every column of df.

    NOTE(review): `ends` is accepted but neither used nor passed on to
    hd.trim — confirm whether it should be.
    """

    def trim_column(column):
        return hd.trim(column, pct=pct)

    return df.apply(trim_column)


def colname_subset(df, pattern):
    """Returns the column labels of df that match the regex pattern."""
    is_match = df.columns.str.contains(pattern)
    return df.columns[is_match]


def sumstats_table(df, cols):
    """Creates a basic summary statistics table for the given columns.

    The columns are passed through trim_columns_upper with pct=5 before being
    described; the result has one row per column and the statistics in a
    fixed order.
    """
    stat_order = ["count", "mean", "std", "min", "max", "25%", "50%", "75%"]
    trimmed = trim_columns_upper(df[cols], pct=5)
    described = trimmed.describe()
    return described.T[stat_order]


def make_sumstat_table(sumstats_table):
    """Renders the table with tabulate, using its keys as headers and the latex_booktabs format."""
    latex_options = {"headers": "keys", "tablefmt": "latex_booktabs"}
    return tabulate.tabulate(sumstats_table, **latex_options)


def write_table(table, path):
    """Prints table into the file at path, then reports the location on stdout."""
    with open(path, "w+") as file:
        print(table, file=file)
    print(f"Table written to {path}.")


# Raw string: "\w" in a plain string literal is an invalid escape sequence and
# triggers a warning on recent Python versions. The pattern matches every
# column name that contains a word character.
cols = colname_subset(data, r"\w")
sumstats = sumstats_table(data, cols)
latex_sumstats_table = make_sumstat_table(sumstats)
fp = os.path.join(config.TABDIR, "sumstats.tex")
write_table(latex_sumstats_table, fp)
sumstats

Table written to /Users/fgu/dev/projects/entropy/output/tables/sumstats.tex.


Unnamed: 0,count,mean,std,min,max,25%,50%,75%
obs,157120.0,81.384604,35.300652,25.0,174.0,53.0,77.0,105.0
balance_ca,156811.0,1125.986694,2598.335449,-3216.270752,12724.854492,-230.618042,450.785431,1648.046021
balance_sa,56253.0,2291.717529,3342.39502,-47.500488,17509.5625,176.823792,836.731689,2895.459229
sa_inflows,54406.0,516.375061,718.690674,0.0,4000.0,50.0,225.559998,650.0
sa_outflows,54376.0,486.503265,747.157837,0.0,3974.290039,0.0,152.350006,648.657532
sa_net_inflows,51551.0,37.441082,563.800232,-2005.5,2000.0,-120.0,25.0,247.190002
sa_scaled_inflows,54376.0,24.493491,33.186538,0.0,177.896092,2.40662,11.411203,31.979029
sa_scaled_outflows,54376.0,23.261654,35.139095,0.0,183.834022,0.0,7.744037,31.390109
sa_scaled_net_inflows,51514.0,1.761589,25.985012,-91.728469,90.124501,-6.038416,1.157172,11.878406
entropy_sptac,156976.0,2.583464,0.180323,2.157022,2.918182,2.454662,2.596189,2.723571


## Bivariate plots

In [None]:
def make_data(df, col_names=None, **trim_kws):
    """Reshapes df to long format for plotting variables against `entropy_sptac`.

    Parameters
    ----------
    df : DataFrame with an `entropy_sptac` column.
    col_names : list-like of column labels to keep, optional. Defaults to all
        columns except `entropy_sptac`.
    **trim_kws : accepted for backward compatibility but currently unused.

    Returns
    -------
    DataFrame that keeps the index of df (repeated once per selected column)
    and has the columns `entropy_sptac`, `variable` and `value`.
    """
    if col_names is None:
        # `entropy_sptac` is moved into the index below and can no longer be
        # selected as a column, so the default must leave it out.
        col_names = df.columns.drop("entropy_sptac")
    return (
        # keep entropy attached to every row while the other columns are melted
        df.set_index("entropy_sptac", append=True)
        .loc[:, col_names]
        .melt(ignore_index=False)
        .reset_index("entropy_sptac")
    )


def make_plot(df):
    """Draws sns.lmplot of `value` on `entropy_sptac`, one panel per `variable`."""
    panel_layout = {"col": "variable", "col_wrap": 2}
    fit_options = {"x_bins": 20, "ci": 99}
    sns.lmplot(data=df, x="entropy_sptac", y="value", **fit_options, **panel_layout)


pattern = "sa_|balance_"
outcome_vars = colname_subset(data, pattern)
d = make_data(data, col_names=outcome_vars)
# Fixed seed so the 10% plotting sample, and hence the figure, is reproducible.
s = d.sample(frac=0.1, random_state=0)
make_plot(s)

## Main results

Full FE equation as baseline

## Lagged-dependent variable