Notebook purpose

- Explore candidate outcome variables, starting with each user's last observed monthly balance regressed on monthly spend and income.

In [95]:
import os
import sys

import linearmodels as lm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import s3fs

sys.path.append('/Users/fgu/dev/projects/entropy')
from entropy import config

import entropy.helpers.aws as ha
import entropy.helpers.helpers as hh 
import entropy.helpers.data as hd
import entropy.data.cleaners as cl
import entropy.data.creators as cr
import entropy.figures.figures as figs

sns.set_style('whitegrid')
pd.set_option('display.max_rows', 120)
pd.set_option('display.max_columns', 120)
pd.set_option('max_colwidth', None)
%config InlineBackend.figure_format = 'retina'
%load_ext autoreload
%autoreload 2

fs = s3fs.S3FileSystem(profile='3di')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load data

In [4]:
def inspect(df, nrows=2):
    """Print the shape of ``df`` and display its leading rows.

    Parameters
    ----------
    df : object with a two-element ``shape`` and a ``head`` method
        (used with pandas DataFrames in this notebook).
    nrows : int, default 2
        Number of leading rows passed to ``df.head``.

    The shape is printed as ``(rows, cols)`` with a thousands separator on
    the row count; the preview goes through the notebook's ``display``.
    """
    shape_label = '({:,}, {})'.format(*df.shape)
    print(shape_label)
    preview = df.head(nrows)
    display(preview)
    
@hh.timer
def read_data(sample):
    """Read the entropy parquet file for ``sample`` from the project S3 bucket.

    ``sample`` is interpolated into the object key
    ``entropy_{sample}.parquet``; whatever ``ha.read_parquet`` returns for
    that path is returned unchanged. The function is wrapped in
    ``hh.timer`` (the recorded cell output shows elapsed-time lines).
    """
    parquet_uri = f's3://3di-project-entropy/entropy_{sample}.parquet'
    frame = ha.read_parquet(parquet_uri)
    return frame

# Load two samples: '777' into `dfs` and 'XX7' into `df`. In the recorded run
# the '777' read took about 2 seconds and the 'XX7' read about 3 minutes.
dfs = read_data('777')
df = read_data('XX7')
# Preview the 'XX7' sample: shape plus the first rows.
inspect(df)

Time for read_data      : 1.91 seconds
Time for read_data      : 2.93 minutes
(6,791,894, 32)


Unnamed: 0,id,date,user_id,amount,desc,merchant,tag_group,tag,user_female,user_postcode,user_registration_date,user_salary_range,user_yob,account_created,account_id,account_last_refreshed,account_provider,account_type,data_warehouse_date_created,data_warehouse_date_last_updated,debit,latest_balance,merchant_business_line,tag_auto,tag_manual,tag_up,updated_flag,ym,savings,balance,income,entropy_sptac
0,19510816,2014-03-07,107,9.2,paypal sandrpowell 9001,paypal,spend,services,0.0,gl3 4,2010-05-10,40k to 50k,1984.0,2014-05-11,238820,2020-08-16 12:13:00,barclaycard,credit card,2014-07-18,2017-09-12,True,,paypal,enjoyment,home electronics,home electronics,u,201403,0.0,,27530.926758,3.019948
1,19510817,2014-03-08,107,6.94,paypal smartbitsso 9001,paypal,spend,services,0.0,gl3 4,2010-05-10,40k to 50k,1984.0,2014-05-11,238820,2020-08-16 12:13:00,barclaycard,credit card,2014-07-18,2017-09-12,True,,paypal,enjoyment,home electronics,home electronics,u,201403,0.0,,27530.926758,3.019948


## Balance and spend groups

In [58]:
def make_reg_data(df):
    """Build a user-month regression frame from transaction-level data.

    Steps:
    - outcome: the last ``balance`` value per (user_id, ym), passed through
      ``hd.trim`` (defined elsewhere in the project);
    - regressors: ``amount`` summed per (user_id, ym, tag_group) and pivoted
      so each tag group becomes a column, with the 'transfers' column dropped
      and the 'income' column multiplied by -1;
    - the two are joined column-wise and rows with any missing value dropped.

    Returns a DataFrame indexed by (user_id, ym).
    """
    month_keys = ['user_id', 'ym']

    # NOTE(review): `.last()` depends on row order within each user-month;
    # presumably rows arrive sorted by date — confirm upstream.
    balance = df.groupby(month_keys).balance.last()
    balance = hd.trim(balance)

    flows = (
        df.groupby(month_keys + ['tag_group'], observed=True)
        .amount.sum()
        .unstack()
        .drop(columns='transfers')
    )
    # Sign flip: presumably income is stored as negative amounts (the recorded
    # preview shows positive income after this step) — TODO confirm.
    flows['income'] = flows.income * -1

    return pd.concat([flows, balance], axis=1).dropna()

# Build the user-month regression frame from `df` and preview it
# (the recorded output shows columns spend, income and balance).
data = make_reg_data(df)
inspect(data)

(65,960, 3)


Unnamed: 0_level_0,Unnamed: 1_level_0,spend,income,balance
user_id,ym,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
107,201405,1346.430054,2180.290039,-583.911743
107,201406,1821.560059,2664.600098,-1439.041748


In [101]:
# Regress balance on spend and income under three specifications:
#   pooled_res - no entity/time effects, default covariance;
#   fe_res     - entity and time effects, default covariance;
#   fe_rescl   - same effects, covariance clustered by entity and time.
# The formulas are defined once so the two fixed-effects fits are guaranteed
# to share a specification and differ only in the covariance estimator.
base_formula = 'balance ~ 1 + spend + income'
fe_formula = base_formula + ' + EntityEffects + TimeEffects'

pooled_res = lm.PanelOLS.from_formula(base_formula, data).fit()
fe_res = lm.PanelOLS.from_formula(fe_formula, data).fit()
fe_rescl = lm.PanelOLS.from_formula(fe_formula, data).fit(
    cov_type="clustered", cluster_entity=True, cluster_time=True
)

In [102]:
# Side-by-side summary of the three fits; the dict keys become column labels.
print(
    lm.panel.compare(
        {
            "Pooled": pooled_res,
            "FE": fe_res,
            "FE clustered": fe_rescl,
        }
    )
)

                          Model Comparison                         
                                Pooled             FE  FE clustered
-------------------------------------------------------------------
Dep. Variable                  balance        balance       balance
Estimator                     PanelOLS       PanelOLS      PanelOLS
No. Observations                 65960          65960         65960
Cov. Est.                   Unadjusted     Unadjusted     Clustered
R-squared                       0.0211         0.0109        0.0109
R-Squared (Within)              0.0084         0.0113        0.0113
R-Squared (Between)             0.0336         0.0239        0.0239
R-Squared (Overall)             0.0211         0.0180        0.0180
F-statistic                     711.11         350.97        350.97
P-value (F-stat)                0.0000         0.0000        0.0000
Intercept                       961.82         1275.7        1275.7
                              (34.609)       (48