Notebook purpose:

- MVP version of eval for codebase testing

In [1]:
import functools
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import s3fs
import scipy
import seaborn as sns

# NOTE(review): hard-coded absolute local path — the project imports below only
# resolve on this machine; consider installing the project (e.g. `pip install -e .`)
# or reading the location from configuration.
sys.path.append("/Users/fgu/dev/projects/entropy")
import entropy.data.aggregators as ag
import entropy.data.cleaners as cl
import entropy.data.make_data as md
import entropy.data.selectors as sl
import entropy.data.validators as vl
import entropy.helpers.aws as ha
import entropy.helpers.data as hd
import entropy.helpers.helpers as hh

import entropy.eval as ev

# Display settings: show up to 120 rows/columns and never truncate cell contents.
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)
pd.set_option("max_colwidth", None)
# Render inline figures at retina resolution.
%config InlineBackend.figure_format = 'retina'
# Reload edited modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2
%matplotlib inline

# S3 filesystem handle that authenticates with the "3di" profile.
fs = s3fs.S3FileSystem(profile="3di")

In [2]:
# Read the transaction-level data ("X77" presumably identifies the sample —
# confirm in entropy.helpers.data) and print a quick overview of it.
dft = hd.read_txn_data("X77")
hd.inspect(dft)

Time for read_txn_data                 : 52.85 seconds
shape: (6,627,663, 35), users: 2679


Unnamed: 0,date,user_id,amount,desc,merchant,tag_group,tag,user_registration_date,account_created,account_id,account_last_refreshed,account_provider,account_type,birth_year,data_warehouse_date_created,data_warehouse_date_last_updated,id,is_debit,is_female,is_sa_flow,is_salary_pmt,is_urban,latest_balance,lsoa,merchant_business_line,msoa,postcode,region_name,salary_range,tag_auto,tag_manual,tag_spend,tag_up,updated_flag,ym
0,2012-08-01,77,12.12,tv licence mbp - d/d,tv licensing,spend,communication,2010-07-10,2012-10-26,259583,2015-09-11,natwest bank,current,1945.0,2014-07-18,2015-03-19,1212601,True,0.0,False,False,1.0,,e01002984,tv licensing,e02000609,kt3 5,london,,tv licence,no tag,"entertainment, tv, media",tv licence,u,2012-08
1,2012-08-01,77,13.81,amazon mktplace pmts amzn.com/billgbr,amazon,spend,services,2010-07-10,2012-10-26,259584,2014-03-06,natwest bank,credit card,1945.0,2014-07-18,2017-08-15,1213850,True,0.0,False,False,1.0,,e01002984,amazon,e02000609,kt3 5,london,,enjoyment,no tag,household,no tag,u,2012-08


## MVP

### Create analysis data

Preprocessing issues:

**Removing outliers**:

- When / how to remove?

    - Ideally do this only once for all variables rather than separately for each variable

- Remove outlier observations or individuals with outlier observations?

    - The former leads to gaps in a user's data
    
    
Data requirements

- Define set of criteria to capture "we see full financial life of user"

- Window length (treatment): we "see full financial life" for both a pre- and a post-signup window of n days each
    
- Window length (control): we "see full financial life" for a pre-signup window of 2n days


In [55]:
# Registry of aggregator functions, filled in definition order by @aggregator.
aggregators = []


def aggregator(func):
    """Register ``func`` as an aggregator and hand it back unchanged.

    Intended for use as a decorator: the function is appended to the
    module-level ``aggregators`` list and remains callable under its own name.
    """
    aggregators.append(func)
    return func


@aggregator
def days_since_reg(df):
    """Mean number of days since registration per user-month.

    For each row the difference between ``date`` and
    ``user_registration_date`` is taken in whole days (negative before
    registration); the values are then averaged within each
    (``user_id``, ``ym``) group.

    The input frame is left untouched: the day difference is computed as a
    standalone series instead of being written to ``df`` as a new column.

    Returns a Series named ``dsreg`` indexed by ``user_id`` and ``ym``.
    """
    dsreg = (df.date - df.user_registration_date).dt.days.rename("dsreg")
    return dsreg.groupby([df.user_id, df.ym]).mean()


@aggregator
def year_salary(df):
    """Calendar-year salary total per user, reported at user-month level.

    Absolute amounts of rows flagged ``is_salary_pmt`` are summed per user
    and calendar year of ``date``; that yearly total is broadcast back to
    every row and then averaged within each (``user_id``, ``ym``) group.

    Returns a Series named ``year_salary``.
    """
    salary = df.amount.where(df.is_salary_pmt, np.nan).abs()
    # Broadcast each user's yearly salary sum to all of their rows in that year.
    yearly_total = salary.groupby([df.user_id, df.date.dt.year]).transform("sum")
    monthly = yearly_total.groupby([df.user_id, df.ym]).mean()
    return monthly.rename("year_salary")


@aggregator
def sa_inflows(df):
    """Absolute monthly total of savings-account inflows per user.

    A row counts as an inflow when ``is_sa_flow`` is set and ``is_debit`` is
    not. Amounts of all other rows are masked out before summing within each
    (``user_id``, ``ym``) group.

    Returns a Series named ``sa_inflows``.
    """
    not_debit = ~df.is_debit
    inflow_mask = df.is_sa_flow & not_debit
    inflow_amounts = df.amount.where(inflow_mask, np.nan)
    monthly_total = inflow_amounts.groupby([df.user_id, df.ym]).sum()
    return monthly_total.abs().rename("sa_inflows")


def aggregate(df):
    """Run every registered aggregator on ``df`` and join the results column-wise."""
    results = [agg(df) for agg in aggregators]
    return pd.concat(results, axis=1)

In [34]:
# Registry of variable-creating functions, filled in definition order by @creator.
creators = []


def creator(func):
    """Register ``func`` as a creator and hand it back unchanged.

    Intended for use as a decorator: the function is appended to the
    module-level ``creators`` list and remains callable under its own name.
    """
    creators.append(func)
    return func


@creator
def savings(df):
    """Savings account inflows as a proportion of average monthly income.

    Average monthly income is ``year_salary / 12``. Ratios that come out as
    positive infinity (non-zero inflows with zero salary) are replaced by NaN.

    Returns a new frame with an added ``savings`` column; the input frame is
    not modified.
    """
    ratio = df.sa_inflows.div(df.year_salary / 12).replace(np.inf, np.nan)
    # assign() builds a new frame rather than writing into the caller's object.
    return df.assign(savings=ratio)


def create(df):
    """Pass ``df`` through every registered creator, in registration order."""
    for make_variable in creators:
        df = make_variable(df)
    return df

In [98]:
# Registry of sample-selection steps, filled in definition order by @selector.
selectors = []


def selector(func):
    """Register ``func`` as a selector and hand it back unchanged.

    Intended for use as a decorator: the function is appended to the
    module-level ``selectors`` list and remains callable under its own name.
    """
    selectors.append(func)
    return func


def user_count(func):
    """Decorate a selector so it reports how many users remain after it ran.

    The wrapped function is called with the original arguments; the number of
    distinct values in the ``user_id`` index level of its result is printed
    next to the function's name, and the result is returned unchanged.

    ``functools.wraps`` keeps the wrapped function's ``__name__`` and
    docstring, so decorated selectors stay identifiable instead of all being
    called ``wrapper``.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        df = func(*args, **kwargs)
        unique_users = df.index.get_level_values("user_id").nunique()
        print(f"{func.__name__:<30}: {unique_users:>4}")
        return df

    return wrapper


@selector
@user_count
def first_counter(df):
    """Pass ``df`` through untouched.

    Exists so that the user-count decorator reports the number of users
    before any selection step has been applied.
    """
    return df


@selector
@user_count
def pre_post_window_length(df, n=180):
    """Keep users observed at least n days before and n days after registration.

    A user qualifies when their smallest ``dsreg`` is at most ``-n`` and their
    largest ``dsreg`` is at least ``n``. All rows of qualifying users are kept.
    """
    dsreg_by_user = df.groupby("user_id").dsreg
    has_pre_window = dsreg_by_user.min().le(-n)
    has_post_window = dsreg_by_user.max().ge(n)
    eligible = has_pre_window & has_post_window
    return df.loc[eligible[eligible].index]


@selector
@user_count
def min_yearly_income(df, min_salary=10_000):
    """Keep users whose lowest ``year_salary`` is at least ``min_salary``.

    The minimum is taken over all of a user's rows, so a single row below the
    threshold removes the user entirely.
    """
    lowest_salary = df.groupby("user_id").year_salary.min()
    earns_enough = lowest_salary.ge(min_salary)
    return df.loc[earns_enough[earns_enough].index]


@selector
@user_count
def winsorise_outliers(df, pct=5, cols=None):
    """Replace outliers in selected cols with specified percentile value.

    Parameters
    ----------
    df : DataFrame
        Analysis data; a copy is modified and returned, the input is untouched.
    pct : int or float, default 5
        Passed through to ``hd.winsorise`` together with ``how="upper"``.
        (Exact percentile semantics are defined by that helper — see
        entropy.helpers.data.)
    cols : list of str, optional
        Columns to winsorise. Defaults to ``month_salary``, ``year_salary``,
        ``sa_inflows`` and ``savings``.

    Only columns that actually exist in ``df`` are processed. The default list
    contains ``month_salary``, which is not produced by every version of the
    aggregation step, and indexing with a missing column would raise KeyError.
    """
    if cols is None:
        cols = ["month_salary", "year_salary", "sa_inflows", "savings"]
    present = [col for col in cols if col in df.columns]
    df = df.copy()
    if present:
        df[present] = df[present].apply(hd.winsorise, pct=pct, how="upper")
    return df


@selector
@user_count
def validators(df):
    """Sanity-check the selected sample and return it unchanged.

    Asserts that ``dsreg`` reaches at least 365 days before and at least
    365 days after registration somewhere in the data.
    """
    earliest, latest = df.dsreg.min(), df.dsreg.max()
    assert earliest <= -365
    assert latest >= 365
    return df


def select(df):
    """Pass ``df`` through every registered selector, in registration order."""
    for apply_step in selectors:
        df = apply_step(df)
    return df

In [99]:
def make_data(df):
    """Build the analysis data: aggregate, then create variables, then select the sample."""
    aggregated = aggregate(df)
    with_variables = create(aggregated)
    return select(with_variables)


# Build the analysis data from the raw transactions (each selection step prints
# its remaining user count) and show summary statistics as the cell output.
df = make_data(dft)
df.describe()

first_counter                 : 2679
pre_post_window_length        :  600
min_yearly_income             :   53
winsorise_outliers            :   53
validators                    :   53


Unnamed: 0,dsreg,month_salary,year_salary,sa_inflows,savings
count,1670.0,1670.0,1670.0,1670.0,1670.0
mean,16.695775,3539.140137,39478.835938,340.7164,0.148449
std,363.028709,2281.534668,27499.796875,791.32019,0.36487
min,-1071.666667,0.0,10177.860352,0.0,0.0
25%,-226.061106,2009.727539,20574.900391,0.0,0.0
50%,12.164504,3001.76001,30986.279297,0.0,0.0
75%,258.249083,4593.514893,49822.109375,250.0,0.093582
max,1095.076923,9275.209961,117350.210938,3181.391113,1.522262
