# Notebook purpose

- Explore possible outcome variables, starting with end-of-month account balance regressed on monthly transfers, spend, and income.

In [94]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import s3fs

sys.path.append('/Users/fgu/dev/projects/entropy')
from entropy import config

import entropy.helpers.aws as ha
import entropy.helpers.helpers as hh 
import entropy.helpers.data as hd
import entropy.data.cleaners as cl
import entropy.data.creators as cr
import entropy.figures.figures as figs

sns.set_style('whitegrid')
pd.set_option('display.max_rows', 120)
pd.set_option('display.max_columns', 120)
pd.set_option('max_colwidth', None)
%config InlineBackend.figure_format = 'retina'
%load_ext autoreload
%autoreload 2

fs = s3fs.S3FileSystem(profile='3di')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load data

In [2]:
def inspect(df, nrows=2):
    """Print the shape of `df` as `(rows, cols)` and display its first rows.

    Parameters
    ----------
    df : object exposing a two-element `.shape` and a `.head()` method
        (a pandas DataFrame in this notebook).
    nrows : int, default 2
        Number of leading rows passed to `df.head`.
    """
    # Thousands separator on the row count keeps multi-million-row shapes readable.
    shape_text = '({:,}, {})'.format(*df.shape)
    print(shape_text)
    preview = df.head(nrows)
    display(preview)
    
@hh.timer
def reader(fp):
    """Read the parquet file at `fp` with `ha.read_parquet` and return the result.

    Wrapped in `hh.timer`, which reports how long the read took.
    """
    frame = ha.read_parquet(fp)
    return frame

# Load both sample files in the original order ('777' first, then 'XX7').
# SAMPLE and fp are deliberately left pointing at the last sample read.
# NOTE(review): '777' appears to be a small slice for quick iteration and
# 'XX7' the larger working sample -- confirm the sampling scheme.
frames = {}
for SAMPLE in ('777', 'XX7'):
    fp = f's3://3di-project-entropy/entropy_{SAMPLE}.parquet'
    frames[SAMPLE] = reader(fp)

dfs = frames['777']
df = frames['XX7']
inspect(df)

Time for reader         : 1.98 seconds
Time for reader         : 3.15 minutes
(6,791,894, 32)


Unnamed: 0,id,date,user_id,amount,desc,merchant,tag_group,tag,user_female,user_postcode,user_registration_date,user_salary_range,user_yob,account_created,account_id,account_last_refreshed,account_provider,account_type,data_warehouse_date_created,data_warehouse_date_last_updated,debit,latest_balance,merchant_business_line,tag_auto,tag_manual,tag_up,updated_flag,ym,savings,balance,income,entropy_tag
0,19510816,2014-03-07,107,9.2,paypal sandrpowell 9001,paypal,spend,services,0.0,gl3 4,2010-05-10,40k to 50k,1984.0,2014-05-11,238820,2020-08-16 12:13:00,barclaycard,credit card,2014-07-18,2017-09-12,True,,paypal,enjoyment,home electronics,home electronics,u,201403,0.0,,27530.926758,3.019948
1,19510817,2014-03-08,107,6.94,paypal smartbitsso 9001,paypal,spend,services,0.0,gl3 4,2010-05-10,40k to 50k,1984.0,2014-05-11,238820,2020-08-16 12:13:00,barclaycard,credit card,2014-07-18,2017-09-12,True,,paypal,enjoyment,home electronics,home electronics,u,201403,0.0,,27530.926758,3.019948


## Balance and spend groups

In [91]:
# One row per user-month: last recorded balance in the month (outcome) next to
# the summed transaction amounts per tag group (regressors).
user_month = ['user_id', 'ym']

y = dfs.groupby(user_month)['balance'].last()
X = (
    dfs.groupby(user_month + ['tag_group'], observed=True)['amount']
    .sum()
    .unstack()  # pivot tag_group into columns
)
data = pd.concat([X, y], axis=1).reset_index()
data.head(2)

Unnamed: 0,user_id,ym,transfers,spend,income,balance
0,777,201201,450.0,785.359985,-1901.329956,-979.716614
1,777,201202,450.0,440.25,-1901.329956,-377.386719


In [92]:
# OLS of the user-month balance on the per-tag-group amount totals.
balance_formula = 'balance ~ transfers + spend + income'
mod = smf.ols(formula=balance_formula, data=data)
res = mod.fit()
summary_table = res.summary()
print(summary_table)

                            OLS Regression Results                            
Dep. Variable:                balance   R-squared:                       0.045
Model:                            OLS   Adj. R-squared:                  0.039
Method:                 Least Squares   F-statistic:                     8.349
Date:                Sun, 28 Nov 2021   Prob (F-statistic):           1.96e-05
Time:                        15:52:17   Log-Likelihood:                -5331.0
No. Observations:                 538   AIC:                         1.067e+04
Df Residuals:                     534   BIC:                         1.069e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   2449.5681    532.919      4.597      0.0