Notebook purpose

- Determine appropriate model specifications

In [1]:
import os
import sys

import linearmodels as lm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import s3fs
import seaborn as sns

# Make the local `entropy` project importable. The checkout location can be
# overridden with the ENTROPY_PROJECT_DIR environment variable; when it is
# unset, the original developer path is used so existing setups keep working.
sys.path.append(
    os.environ.get("ENTROPY_PROJECT_DIR", "/Users/fgu/dev/projects/entropy")
)
import entropy.data.cleaners as cl
import entropy.data.creators as cr
import entropy.figures.figures as figs
import entropy.helpers.aws as ha
import entropy.helpers.data as hd
import entropy.helpers.helpers as hh
from entropy import config

# Plot styling and pandas display limits used throughout the notebook.
sns.set_style("whitegrid")
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)
pd.set_option("max_colwidth", None)  # None: do not truncate cell contents
%config InlineBackend.figure_format = 'retina'
# Load the autoreload extension and set it to mode 2 so that edits to the
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# S3 filesystem handle created with the "3di" profile.
fs = s3fs.S3FileSystem(profile="3di")

In [None]:
# Read the samples "777" and "XX7"; the helper returns two objects, unpacked
# here as `dfs` and `df`. `dfs` is the frame used by the cells below.
dfs, df = hd.read_samples(["777", "XX7"])
hd.inspect(df)

Time for read_sample    : 4.48 seconds


## Make analysis data

In [22]:
# Total amount per tag, restricted to transactions in the "spend" tag group.
dfs.loc[dfs.tag_group == "spend"].groupby("tag")["amount"].sum()

tag
benefits              0.000000
communication     97419.421875
earnings              0.000000
finance          466803.312500
hobbies           49798.539062
household        972438.000000
motor             93256.523438
other_income          0.000000
other_spend      383822.281250
pensions              0.000000
retail           130843.078125
savings               0.000000
services         584479.625000
transfers             0.000000
travel           194286.828125
Name: amount, dtype: float32

In [5]:
def total_monthly_spend(df):
    """Attach each user-month's total spend to the rows of that user-month.

    Sums ``amount`` over rows whose ``tag_group`` equals "spend", grouped by
    ``user_id`` and ``ym``, and merges the result back onto ``df`` as the
    column ``total_monthly_spend``. The merge uses pandas' default (inner)
    join, so user-months without any spend rows are not in the result.
    """
    keys = ["user_id", "ym"]
    spend_rows = df[df.tag_group.eq("spend")]
    monthly_total = spend_rows.groupby(keys).amount.sum()
    monthly_total = monthly_total.rename("total_monthly_spend")
    return df.merge(monthly_total, on=keys)


def tag_monthly_spend(df):
    """Add one column per spend tag holding that tag's user-month total.

    Works on a copy of ``df``. The categories of ``tag`` are prefixed with
    "tag_spend_", amounts of rows in the "spend" tag group are summed by
    ``user_id``, ``ym`` and ``tag`` (observed categories only), and the result
    is pivoted to one column per tag and merged back on ``user_id`` and
    ``ym``. The returned frame keeps the prefixed ``tag`` categories.
    """
    user_month = ["user_id", "ym"]
    df = df.copy()
    is_spend = df.tag_group.eq("spend")
    # Prefix categories so that the pivoted columns are named tag_spend_<tag>.
    df["tag"] = df.tag.cat.rename_categories(lambda x: "tag_spend_" + x)
    spend_by_tag = (
        df[is_spend].groupby(user_month + ["tag"], observed=True).amount.sum()
    )
    wide = spend_by_tag.rename("tag_monthly_spend").unstack()
    return df.merge(wide, on=user_month)


def monthly_savings(df, trim_pct=5):
    """Add monthly savings-account flows as a percentage of monthly income.

    Keeps rows whose ``account_type`` is "savings" and whose ``tag_auto`` does
    not contain "interest", sums ``amount`` by user-month separately for
    debits and credits, takes the absolute value of each sum, and expresses
    it as a percentage of ``income / 12``.

    Args:
        df: Transaction-level frame with the columns ``user_id``, ``ym``,
            ``income``, ``debit``, ``amount``, ``account_type`` and
            ``tag_auto``. ``income`` is presumably annual, since it is divided
            by 12 -- TODO confirm.
        trim_pct: Passed as ``pct`` to ``hd.trim``, which is applied to each
            of the resulting flow columns.

    Returns:
        A new frame: ``df`` inner-merged on ``["user_id", "ym"]`` with the
        columns ``sa_inflows`` (credits), ``sa_outflows`` (debits) and
        ``sa_inflows_net`` (inflows minus outflows). Its ``debit`` column
        holds the labels "debit"/"credit" in place of ``True``/``False``.
        User-months without qualifying savings rows are dropped by the inner
        merge. The frame passed in by the caller is left unmodified.
    """

    def trim_column_values(flows, **kwargs):
        # Apply the project's trimming helper to every flow column.
        return flows.apply(hd.trim, **kwargs)

    mask = df.account_type.eq("savings") & ~df.tag_auto.str.contains(
        "interest", na=False
    )
    # Relabel on a new frame rather than assigning into `df`, so the caller's
    # DataFrame is not mutated as a side effect of calling this function.
    df = df.assign(debit=df.debit.replace({True: "debit", False: "credit"}))
    s = (
        df[mask]
        .groupby(["user_id", "ym", "income", "debit"])
        .amount.sum()
        .abs()
        # One column per label ("credit"/"debit"), one row per user-month.
        .unstack()
        .reset_index("income")
        .assign(
            sa_inflows=lambda flows: flows.credit / (flows.income / 12) * 100,
            sa_outflows=lambda flows: flows.debit / (flows.income / 12) * 100,
        )
        # A user-month with flows in only one direction has a zero flow in
        # the other, not a missing one.
        .fillna(0)
        .assign(sa_inflows_net=lambda flows: flows.sa_inflows - flows.sa_outflows)
        .drop(columns=["income", "debit", "credit"])
        .pipe(trim_column_values, pct=trim_pct)
    )
    return df.merge(s, how="inner", on=["user_id", "ym"])


def make_analysis_data(df):
    """Build the user-month analysis table from transaction-level data.

    Runs ``total_monthly_spend``, ``tag_monthly_spend`` and ``monthly_savings``
    on a copy of ``df``, then collapses the result to one row per
    ``(user_id, ym)`` by taking the first value of each selected column.
    ``entropy_sptac`` and ``income`` must already be present in ``df``.

    Returns a frame indexed by ``user_id`` and ``ym`` with the savings flows,
    entropy, income, total spend and all ``tag_spend*`` columns.
    """
    base_cols = [
        "sa_inflows",
        "sa_inflows_net",
        "entropy_sptac",
        "income",
        "total_monthly_spend",
    ]
    # add variables - temp
    enriched = monthly_savings(tag_monthly_spend(total_monthly_spend(df.copy())))

    is_tag_spend = enriched.columns.str.startswith("tag_spend")
    selected = base_cols + list(enriched.columns[is_tag_spend])

    # collapse to user-ym
    user_month = enriched.groupby(["user_id", "ym"]).first()
    return user_month[selected]


# Build the user-month analysis table from `dfs` and inspect the result.
analysis_data = make_analysis_data(dfs)
hd.inspect(analysis_data)

(486, 14)


Unnamed: 0_level_0,Unnamed: 1_level_0,sa_inflows,sa_inflows_net,entropy_sptac,income,total_monthly_spend,tag_spend_other_spend,tag_spend_services,tag_spend_finance,tag_spend_communication,tag_spend_motor,tag_spend_household,tag_spend_retail,tag_spend_travel,tag_spend_hobbies
user_id,ym,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
14777,201212,,,2.648409,14587.120117,1072.329956,240.0,19.299999,41.18,117.93,,376.450012,237.470001,40.0,
14777,201301,7.157727,-10.73659,2.539835,16765.099609,986.669983,80.0,90.169998,52.77,86.580002,,353.25,-26.0,349.899994,


## Main results

Full fixed-effects (FE) equation as the baseline specification.

## Lagged dependent variable