Notebook purpose

- Develop analysis code

In [1]:
import contextlib
import math
import os
import re
import sys

import linearmodels as lm
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import s3fs
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import tabulate

# NOTE(review): hard-coded absolute path to a local project checkout; the
# `entropy` imports below only resolve on a machine with this exact layout.
sys.path.append("/Users/fgu/dev/projects/entropy")
import entropy.analysis.helpers as ah
import entropy.analysis.make_analysis_data as ad
import entropy.analysis.sumstats_table as ss
import entropy.data.cleaners as cl
import entropy.data.selectors as sl
import entropy.figures.figures as ff
import entropy.figures.helpers as fh
import entropy.helpers.aws as ha
import entropy.helpers.data as hd
import entropy.helpers.helpers as hh
from entropy import config

# Plot style and generous pandas display limits for interactive inspection.
sns.set_style("whitegrid")
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)
pd.set_option("max_colwidth", None)
# Render inline figures at retina resolution.
%config InlineBackend.figure_format = 'retina'
# Reload imported modules before executing each cell so edits to project code
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# S3 filesystem handle that uses the "3di" credentials profile.
fs = s3fs.S3FileSystem(profile="3di")

# Identifier of the data sample that is passed to hd.read_analysis_data.
SAMPLE = "XX7"

In [2]:
import warnings

# "once" action: report each matching warning only the first time it occurs.
warnings.filterwarnings("once")

In [6]:
def binary_inflows(df):
    """Add a 0/1 ``has_sa_inflows`` indicator column and return the frame.

    The indicator is 1 where ``sa_inflows`` is strictly greater than zero and
    0 otherwise. The column is written onto ``df`` itself, so the caller's
    frame is modified in place.
    """
    is_positive = df["sa_inflows"] > 0
    df["has_sa_inflows"] = is_positive.astype(int)
    return df


def log_income(df):
    """Add ``log_monthly_income`` and ``log_annual_income`` columns to ``df``.

    Each new column holds the natural log of the corresponding income column,
    except that zero incomes stay 0 instead of becoming ``-inf``. Negative or
    missing incomes yield NaN. The frame is modified in place and returned.
    """
    for col in ["monthly_income", "annual_income"]:
        income = df[col]
        # Take the log of strictly positive values only. Evaluating np.log on
        # the whole column emits RuntimeWarnings for zeros and negatives even
        # though those entries end up as 0 or NaN regardless.
        logged = np.log(income.where(income > 0))
        # Zero income maps to 0; every other row keeps its log (NaN when the
        # income is negative or missing).
        df[f"log_{col}"] = logged.where(income != 0, 0)
    return df


def time_as_month(df):
    """Swap the ``date`` index level for a ``month`` index level.

    ``date`` is moved out of the index into an ordinary column, its calendar
    month number is stored as ``month``, and ``month`` is appended to the
    remaining index levels. A new frame is returned; ``df`` is not modified.
    """
    # NOTE(review): dt.month is month-of-year (1-12), so observations from
    # different years share the same index value - confirm this is intended.
    flat = df.reset_index("date")
    flat["month"] = flat["date"].dt.month
    return flat.set_index("month", append=True)


def standardise_entropy(df):
    """Add ``entropy_sptac_std``, the z-score of ``entropy_sptac``.

    The score is the column minus its mean, divided by its standard deviation
    (pandas' default sample standard deviation). The new column is written
    onto ``df`` in place and the same frame is returned.
    """
    entropy = df["entropy_sptac"]
    centre, spread = entropy.mean(), entropy.std()
    df["entropy_sptac_std"] = (entropy - centre) / spread
    return df


# Load the analysis data for SAMPLE and apply the transformations defined
# above in order: inflow indicator, log incomes, month index level, and
# standardised entropy.
df = (
    hd.read_analysis_data(SAMPLE)
    .pipe(binary_inflows)
    .pipe(log_income)
    .pipe(time_as_month)
    .pipe(standardise_entropy)
)

# Project helper used to eyeball the resulting frame.
hd.inspect(df)

Time for read_analysis_data            : 1.79 seconds
(84,096, 33)


  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0_level_0,Unnamed: 1_level_0,date,active_accounts,txns_count,txns_value,txn_count_sa,txn_count_ca,monthly_spend,log_monthly_spend,prop_spend_services,prop_spend_other_spend,prop_spend_household,prop_spend_travel,prop_spend_communication,prop_spend_finance,prop_spend_motor,prop_spend_retail,prop_spend_hobbies,monthly_income,annual_income,entropy_sptac,region,age,female,sa_inflows,sa_outflows,sa_net_inflows,sa_scaled_inflows,sa_scaled_outflows,sa_scaled_net_inflows,has_sa_inflows,log_monthly_income,log_annual_income,entropy_sptac_std
user_id,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
37,2,2012-02-29,"[287407, 287405, 287406, 287404]",22,9495.810547,3.0,8.0,528.23999,6.269551,0.224898,0.088596,0.591852,0.0,0.094654,0.0,0.0,0.0,0.0,1687.530029,18768.632812,2.899397,Scotland,27.0,0.0,3340.0,1740.0,1600.0,2.135478,1.112494,1.022983,1,7.431021,9.839942,1.459064
37,3,2012-03-31,"[287405, 287407, 287406, 287404]",16,2147.699951,2.0,9.0,429.880005,6.063506,0.028799,0.093049,0.761841,0.0,0.116312,0.0,0.0,0.0,0.0,1085.790039,18768.632812,3.039149,Scotland,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,6.990063,9.839942,2.070292




### Binary outcomes

In [41]:
data = df

# Outcome: the binary has_sa_inflows indicator.
y = data["has_sa_inflows"]
# All spend-share columns except the last one.
# NOTE(review): presumably one share is dropped because the shares sum to one
# and the full set would be perfectly collinear - confirm.
prop_spends = list(data.filter(like="prop_spend").columns)[:-1]
other_controls = ["log_monthly_spend", "log_annual_income", "log_monthly_income"]
var_of_interest = ["entropy_sptac_std"]
exog = data[var_of_interest + other_controls + prop_spends]

# Models 0-4: entropy alone, then entropy plus one group of controls at a
# time, always with both entity and time effects.
control_sets = [
    [],
    ["log_monthly_spend"],
    ["log_annual_income"],
    ["log_monthly_income"],
    prop_spends,
]
# Models 5-8: the full regressor set under every combination of
# (entity_effects, time_effects).
effect_combos = [(True, True), (True, False), (False, True), (False, False)]

models = [
    ah.fe_reg(
        y,
        data[var_of_interest + controls],
        entity_effects=True,
        time_effects=True,
    )
    for controls in control_sets
]
models += [
    ah.fe_reg(y, exog, entity_effects=entity, time_effects=time)
    for entity, time in effect_combos
]

# Side-by-side table of all specifications, with significance stars.
results = lm.panel.compare(models, stars=True)
results

  vals = concat(


0,1,2,3,4,5,6,7,8,9
,Model 0,Model 1,Model 2,Model 3,Model 4,Model 5,Model 6,Model 7,Model 8
Dep. Variable,has_sa_inflows,has_sa_inflows,has_sa_inflows,has_sa_inflows,has_sa_inflows,has_sa_inflows,has_sa_inflows,has_sa_inflows,has_sa_inflows
Estimator,PanelOLS,PanelOLS,PanelOLS,PanelOLS,PanelOLS,PanelOLS,PanelOLS,PanelOLS,PanelOLS
No. Observations,84096,84096,84096,84096,84096,84096,84096,84096,84096
Cov. Est.,Clustered,Clustered,Clustered,Clustered,Clustered,Clustered,Clustered,Clustered,Clustered
R-squared,0.0011,0.0081,0.0056,0.0037,0.0025,0.0127,0.0128,0.0036,0.5219
R-Squared (Within),0.0011,0.0081,0.0056,0.0037,0.0025,0.0128,0.0128,0.0050,0.0048
R-Squared (Between),-0.0010,0.7090,-0.0501,0.2702,-0.4506,-0.1107,-0.1079,0.1359,0.7180
R-Squared (Overall),-0.0002,0.5172,-0.1462,0.2054,-0.3412,-0.1986,-0.1962,0.1045,0.5219
F-statistic,92.729,332.31,230.08,152.15,22.203,87.378,87.524,25.158,7648.0


In [46]:
# Re-display the model comparison table built in the regression cell above.
results

0,1,2,3,4,5,6,7,8,9
,Model 0,Model 1,Model 2,Model 3,Model 4,Model 5,Model 6,Model 7,Model 8
Dep. Variable,has_sa_inflows,has_sa_inflows,has_sa_inflows,has_sa_inflows,has_sa_inflows,has_sa_inflows,has_sa_inflows,has_sa_inflows,has_sa_inflows
Estimator,PanelOLS,PanelOLS,PanelOLS,PanelOLS,PanelOLS,PanelOLS,PanelOLS,PanelOLS,PanelOLS
No. Observations,84096,84096,84096,84096,84096,84096,84096,84096,84096
Cov. Est.,Clustered,Clustered,Clustered,Clustered,Clustered,Clustered,Clustered,Clustered,Clustered
R-squared,0.0011,0.0081,0.0056,0.0037,0.0025,0.0127,0.0128,0.0036,0.5219
R-Squared (Within),0.0011,0.0081,0.0056,0.0037,0.0025,0.0128,0.0128,0.0050,0.0048
R-Squared (Between),-0.0010,0.7090,-0.0501,0.2702,-0.4506,-0.1107,-0.1079,0.1359,0.7180
R-Squared (Overall),-0.0002,0.5172,-0.1462,0.2054,-0.3412,-0.1986,-0.1962,0.1045,0.5219
F-statistic,92.729,332.31,230.08,152.15,22.203,87.378,87.524,25.158,7648.0


## Variance decomposition

Decompose the variance of entropy into between-individual and within-individual components.

In [34]:
def var_decomposition(df, var, by="user_id"):
    """Split the variance of ``var`` into between- and within-group parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing column ``var``; ``by`` must name a column or an
        index level of ``df``.
    var : str
        Name of the column whose variance is decomposed.
    by : str, default "user_id"
        Grouping key that defines the groups (individuals by default).

    Returns
    -------
    tuple
        ``(total, between + within, between, within)`` where ``total`` is the
        variance of ``var`` over all rows, ``between`` is the variance of the
        group means, and ``within`` is the mean of the per-group variances.

    The sum is returned next to ``total`` so the two can be compared. They
    need not coincide, because group means enter unweighted and pandas'
    ``var`` uses the sample (ddof=1) estimator.
    """
    # Group once and reuse the grouped column for both components.
    grouped = df.groupby(by)[var]
    between = grouped.mean().var()
    within = grouped.var().mean()
    total = df[var].var()
    return total, between + within, between, within


# Output tuple order: (total, between + within, between, within).
var_decomposition(df, "entropy_sptac")

(0.052276307616909094,
 0.05347219866315492,
 0.027731256685267578,
 0.025740941977887344)