Notebook purpose

- Determine appropriate model specifications

In [2]:
import os
import sys

import linearmodels as lm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import s3fs
import seaborn as sns

sys.path.append("/Users/fgu/dev/projects/entropy")
import entropy.data.cleaners as cl
import entropy.data.creators as cr
import entropy.figures.figures as figs
import entropy.helpers.aws as ha
import entropy.helpers.data as hd
import entropy.helpers.helpers as hh
from entropy import config

sns.set_style("whitegrid")
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)
pd.set_option("max_colwidth", None)
%config InlineBackend.figure_format = 'retina'
%load_ext autoreload
%autoreload 2

fs = s3fs.S3FileSystem(profile="3di")

In [8]:
dfs, df = hd.read_samples(["777", "XX7"])
hd.inspect(df)

Time for read_sample    : 2.01 seconds
Time for read_sample    : 4.67 minutes
(14,839,981, 31)


Unnamed: 0,id,date,user_id,amount,desc,merchant,tag_group,tag,user_female,user_postcode,user_registration_date,user_salary_range,user_yob,account_created,account_id,account_last_refreshed,account_provider,account_type,data_warehouse_date_created,data_warehouse_date_last_updated,debit,latest_balance,merchant_business_line,tag_auto,tag_manual,tag_up,updated_flag,ym,balance,income,entropy_sptac
0,1859849,2012-11-05,57,-700.0,faster payments receipt ref mdbremoved,,,,0.0,n8 9,2010-05-13,10k to 20k,1987.0,2012-10-23,274000,2020-08-16 07:24:00,santander,current,2014-07-18,2017-08-15,False,14829.769531,,,,current account,u,201211,8504.292969,22136.039062,2.397678
1,1859848,2012-11-08,57,-900.0,faster payments receipt from mdbremoved,,,,0.0,n8 9,2010-05-13,10k to 20k,1987.0,2012-10-23,274000,2020-08-16 07:24:00,santander,current,2014-07-18,2017-08-15,False,14829.769531,,,,current account,u,201211,9404.292969,22136.039062,2.397678


## Make analysis data

In [9]:
def total_monthly_spend(df):
    group_vars = ["user_id", "ym"]
    mask = df.tag_group.eq("spend")
    s = df[mask].groupby(group_vars).amount.sum().rename("total_monthly_spend")
    return df.merge(s, on=group_vars)


def tag_monthly_spend(df):
    df = df.copy()
    group_vars = ["user_id", "ym", "tag"]
    mask = df.tag_group.eq("spend")
    df["tag"] = df.tag.cat.rename_categories(lambda x: "tag_spend_" + x)
    s = (
        df[mask]
        .groupby(group_vars, observed=True)
        .amount.sum()
        .fillna(0)
        .rename("tag_monthly_spend")
        .unstack()
    )
    return df.merge(s, on=["user_id", "ym"])


def monthly_savings(df, trim_pct=5):
    """Aggregates df into inflows, outflows, and net, by user month, trims at
    specified percentile, and scales flows by user's monthly income.
    """

    def trim_column_values(df, **kwargs):
        return df.apply(hd.trim, **kwargs)

    df = df.copy()
    is_not_interest_txn = ~df.tag_auto.str.contains("interest", na=False)
    is_savings_account = df.account_type.eq("savings")
    mask = is_not_interest_txn & is_savings_account
    df["debit"] = df.debit.replace({True: "sa_outflows", False: "sa_inflows"})

    s = (
        df[mask]
        .groupby(["user_id", "ym", "income", "debit"])
        .amount.sum()
        .abs()
        .unstack()
        .fillna(0)
        .reset_index("income")
        .assign(
            sa_inflows=lambda df: df.sa_inflows / (df.income / 12) * 100,
            sa_outflows=lambda df: df.sa_outflows / (df.income / 12) * 100,
            sa_net_inflows=lambda df: df.sa_inflows - df.sa_outflows,
        )
        .drop(columns="income")
        .pipe(trim_column_values, pct=trim_pct)
    )

    return df.merge(s, how="inner", on=["user_id", "ym"])


def make_analysis_data(df):
    df = df.copy()
    # add variables - temp
    df = df.pipe(total_monthly_spend).pipe(tag_monthly_spend).pipe(monthly_savings)

    tag_spend = df.columns[df.columns.str.startswith("tag_spend")]

    # collapse to user-ym
    cols = [
        "sa_inflows",
        "sa_net_inflows",
        "entropy_sptac",
        "income",
        "total_monthly_spend",
    ] + list(tag_spend)
    return df.groupby(["user_id", "ym"]).first()[cols]


analysis_data = make_analysis_data(df)
hd.inspect(analysis_data)

(57,158, 14)


Unnamed: 0_level_0,Unnamed: 1_level_0,sa_inflows,sa_net_inflows,entropy_sptac,income,total_monthly_spend,tag_spend_household,tag_spend_other_spend,tag_spend_services,tag_spend_travel,tag_spend_hobbies,tag_spend_retail,tag_spend_finance,tag_spend_communication,tag_spend_motor
user_id,ym,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
57,201805,5.825108,5.825108,2.539643,61801.429688,1538.079956,841.75,55.75,438.25,30.150002,28.98,11.24,1.53,130.429993,
57,201806,9.708513,9.708513,2.497777,61801.429688,1560.619995,485.529999,75.25,575.039978,49.620003,28.98,16.33,1.91,327.959991,


In [14]:
fp = "~/tmp/en/analysis_data_XX7.parquet"
ha.write_parquet(analysis_data, fp)

Unnamed: 0_level_0,Unnamed: 1_level_0,sa_inflows,sa_net_inflows,entropy_sptac,income,total_monthly_spend,tag_spend_household,tag_spend_other_spend,tag_spend_services,tag_spend_travel,tag_spend_hobbies,tag_spend_retail,tag_spend_finance,tag_spend_communication,tag_spend_motor
user_id,ym,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
57,201805,5.825108,5.825108,2.539643,61801.429688,1538.079956,841.750000,55.750000,438.250000,30.150002,28.98,11.240000,1.530000,130.429993,
57,201806,9.708513,9.708513,2.497777,61801.429688,1560.619995,485.529999,75.250000,575.039978,49.620003,28.98,16.330000,1.910000,327.959991,
57,201807,9.708513,9.708513,2.325304,61801.429688,3989.780029,1280.449951,622.929993,1457.319946,319.250000,48.93,182.290009,4.350000,74.260002,
57,201808,9.708513,9.708513,2.543220,61801.429688,1646.410034,919.809998,105.919998,338.190002,233.929993,28.98,12.700000,-147.599991,85.040001,69.440002
57,201809,9.708513,9.708513,2.187621,61801.429688,2607.159912,781.000000,40.740002,819.710022,300.950012,28.98,589.770020,1.520000,44.489998,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
589317,201910,21.198599,3.873917,2.507676,52192.128906,4767.540039,3268.590088,110.599991,988.799988,37.619999,10.00,-356.050018,639.559998,66.730003,1.690000
589317,201912,7.633335,7.633335,2.515537,52192.128906,5209.120117,3746.989990,307.549988,814.289978,37.619999,13.15,121.320000,44.230000,50.189999,73.779999
589317,202001,0.000000,-15.167995,2.493242,45603.652902,4899.109863,4298.029785,50.000000,379.989990,37.619999,10.00,19.490000,48.389999,50.189999,5.400000
589317,202004,0.000000,-0.037629,2.434146,45603.652902,4018.739990,3357.260010,68.949997,461.989990,19.590000,10.65,,49.389999,50.910000,


## Bivariate plots

In [None]:
sns.distplot(analysis_data, x="entropy_sptac", y="sa_net_inflows")

## Main results

Full FE equation as baseline

## Lagged-dependent variable