In [1]:
import sys

import numpy as np
import pandas as pd
import s3fs
import seaborn as sns

# Put the local project checkout on the import path so the `entropy.*`
# imports below resolve.
# NOTE(review): this absolute path is machine-specific; the notebook will not
# import `entropy` on another machine unless the package is installed or the
# path is adjusted.
sys.path.append("/Users/fgu/dev/projects/entropy")
import entropy.analysis.make_analysis_data as ad
import entropy.data.cleaners as cl
import entropy.data.creators as cr
import entropy.data.make_data as md
import entropy.data.selectors as sl
import entropy.data.validators as vl
import entropy.helpers.aws as ha
import entropy.helpers.data as hd
import entropy.helpers.helpers as hh

# Display up to 120 rows/columns and do not truncate cell contents.
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)
pd.set_option("max_colwidth", None)
# Autoreload mode 2: re-import modules before each cell runs, so edits to the
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# S3 filesystem handle created with the "3di" profile.
fs = s3fs.S3FileSystem(profile="3di")

In [2]:
def user_period_data(df, user_id, period):
    """Return the rows of ``df`` for the given user(s) within ``period``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``user_id`` and ``date`` columns.
    user_id : scalar or list-like
        Label(s) looked up with ``.loc`` on an index built from ``user_id``.
    period : label, slice, or list-like
        Selector passed to ``.loc`` on an index built from ``date``. If
        ``date`` is datetime-like, a partial date string such as
        ``"2012-02"`` selects every row in that month.

    Returns
    -------
    pandas.DataFrame
        Matching rows with a fresh ``0..n-1`` index. The ``user_id`` and
        ``date`` columns are retained.

    Raises
    ------
    KeyError
        If ``user_id`` (or an exact ``period`` label) is not present.

    Notes
    -----
    A ``period`` that resolves to exactly one row by exact label match is
    still returned by pandas as a Series; that behaviour is unchanged here.
    """
    # Wrap a scalar id in a list so `.loc` always yields a DataFrame. A bare
    # scalar that matches exactly one row returns a Series, which has no
    # `set_index`, so the chain used to crash for single-transaction users.
    user_keys = [user_id] if pd.api.types.is_scalar(user_id) else user_id
    user_rows = df.set_index("user_id", drop=False).loc[user_keys]
    return (
        user_rows.set_index("date", drop=False)
        .loc[period]
        .reset_index(drop=True)
    )

## Read data

In [113]:
# Load the "XX7" sample and inspect it.
# The timing log below reports ~3.4 minutes for this read, so avoid
# re-running this cell unnecessarily.
df = hd.read_sample("XX7")
hd.inspect(df)

Time for read_sample    : 3.41 minutes
(8,666,730, 29)


Unnamed: 0,id,date,user_id,amount,desc,merchant,tag_group,tag,user_registration_date,account_created,account_id,account_last_refreshed,account_provider,account_type,debit,female,latest_balance,merchant_business_line,postcode,tag_auto,tag_manual,tag_up,ym,yob,balance,income,age,entropy_sptac,region
0,394187,2012-02-01,37,3.14,steampowered.com,steam,spend,services,2010-07-01,2016-10-30,287407,2013-12-15 11:09:00,royal bank of scotland (rbs),credit card,True,0.0,-125.849998,steam,ab33 8,games and gaming,,,201202,1985.0,-28.130081,18768.632812,27.0,2.899397,Scotland
1,394188,2012-02-01,37,-227.350006,direct debit payment - th,,transfers,,2010-07-01,2016-10-30,287407,2013-12-15 11:09:00,royal bank of scotland (rbs),credit card,False,0.0,-125.849998,account provider,ab33 8,credit card,,credit card payment,201202,1985.0,-28.130081,18768.632812,27.0,2.899397,Scotland


In [197]:
# Flag savings-account transactions, count them per user, and list the users
# whose count is zero.
is_sa = df["account_type"] == "savings"
k = is_sa.groupby(df["user_id"]).sum().eq(0)
k.loc[k]

user_id
25687     True
29527     True
475947    True
Name: account_type, dtype: bool

## Pipeline

In [99]:
# Run the cleaning stage of the pipeline.
# NOTE(review): `dfr` is not assigned anywhere in the visible notebook --
# presumably raw data loaded in a since-removed cell; confirm, otherwise this
# cell fails on a fresh kernel.
clean = md.clean_data(dfr)

Time for rename_cols                   : 0.03 seconds
Time for clean_headers                 : 0.00 seconds
Time for drop_unneeded_vars            : 0.03 seconds
Time for add_year_month_variable       : 0.05 seconds
Time for drop_first_and_last_month     : 0.06 seconds
Time for lowercase_categories          : 0.96 seconds
Time for drop_missing_txn_desc         : 0.05 seconds
Time for gender_to_female              : 0.03 seconds
Time for credit_debit_to_debit         : 0.01 seconds
Time for sign_amount                   : 0.00 seconds
Time for missings_to_nan               : 0.01 seconds
Time for zero_balances_to_missing      : 0.00 seconds
Time for add_tag                       : 0.33 seconds
Time for tag_corrections               : 0.56 seconds
Time for add_tag_group                 : 0.12 seconds
Time for drop_type1_dups               : 0.30 seconds
Time for order_and_sort                : 0.15 seconds
Time for clean_data                    : 2.75 seconds


In [102]:
# Variable-creation stage, applied to the cleaned data (the timing log below
# lists the individual steps that ran).
created = md.create_vars(clean)

Time for balances                      : 2.83 seconds
Time for income                        : 0.33 seconds
Time for age                           : 0.06 seconds
Time for entropy_spend_tag_counts      : 0.30 seconds
Time for region_name                   : 0.70 seconds
Time for create_vars                   : 4.24 seconds


In [111]:
# Sample-selection stage, applied to the frame with created variables.
selected = md.select_sample(created)

Time for select_sample                 : 0.42 seconds


In [112]:
# Validation stage, applied to the selected sample.
validated = md.validate_data(selected)

All validation checks passed.
Time for validate_data                 : 0.10 seconds


dft = hd.read_sample("777")  # load the "777" sample into `dft`
hd.inspect(dft)  # NOTE(review): this cell shows no execution count or output

## Dev

## Test analysis data

In [128]:
# Display the cleaned frame (pandas elides the middle rows in the output).
clean

Unnamed: 0,id,date,user_id,amount,desc,merchant,tag_group,tag,user_registration_date,account_created,account_id,account_last_refreshed,account_provider,account_type,debit,female,latest_balance,merchant_business_line,postcode,tag_auto,tag_manual,tag_up,ym,yob
9,688300,2012-02-01,777,400.000000,<mdbremoved> - s/o,,transfers,other_transfers,2011-07-20,2011-07-20,262916,2020-07-21 20:32:00,natwest bank,current,True,0.0,364.220001,non merchant mbl,wa1 4,transfers,other account,other account,201202,1969.0
10,688299,2012-02-01,777,3.030000,aviva pa - d/d,aviva,spend,finance,2011-07-20,2011-07-20,262916,2020-07-21 20:32:00,natwest bank,current,True,0.0,364.220001,aviva,wa1 4,health insurance,life insurance,life insurance,201202,1969.0
11,688301,2012-02-03,777,8.750000,chart ins log tran - d/d,,,,2011-07-20,2011-07-20,262916,2020-07-21 20:32:00,natwest bank,current,True,0.0,364.220001,,wa1 4,,memberships,memberships,201202,1969.0
12,688303,2012-02-03,777,0.990000,"9572 02feb12 , apple itunes store, gbp , london gb - pos",apple,spend,services,2011-07-20,2011-07-20,262916,2020-07-21 20:32:00,natwest bank,current,True,0.0,364.220001,apple,wa1 4,"entertainment, tv, media",,mobile app,201202,1969.0
13,688302,2012-02-03,777,20.000000,"9572 02feb12 , national lottery , inte , watford gb - pos",camelot,spend,services,2011-07-20,2011-07-20,262916,2020-07-21 20:32:00,natwest bank,current,True,0.0,364.220001,camelot,wa1 4,gambling,,gambling,201202,1969.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
681444,802522114,2020-07-28,587777,-95.000000,<mdbremoved> bgc,,spend,retail,2020-06-19,2020-06-19,1711781,2020-08-16 18:36:00,barclays,current,False,0.0,5699.700195,,w7 3,refunded purchase,,refunded purchase,202007,1973.0
681450,802887369,2020-07-28,587777,5.400000,tfl travel charge tfl.gov.uk/cp,tfl,spend,travel,2020-06-19,2020-06-19,1711784,2020-08-16 20:35:00,american express,credit card,True,0.0,-1844.410034,tfl,w7 3,public transport,,public transport,202007,1973.0
681452,802887368,2020-07-28,587777,10.650000,marks & spencer plc st london,marks and spencer,spend,household,2020-06-19,2020-06-19,1711784,2020-08-16 20:35:00,american express,credit card,True,0.0,-1844.410034,marks & spencer dept store,w7 3,"food, groceries, household",,"food, groceries, household",202007,1973.0
681573,803052582,2020-07-31,587777,3.320000,sanef sa stlxxxxx404 ddr,,,,2020-06-19,2020-06-19,1711781,2020-08-16 18:36:00,barclays,current,True,0.0,5699.700195,,w7 3,,,,202007,1973.0


In [36]:
# Load the analysis dataset and inspect it.
# NOTE(review): this rebinds `df`, which the "Read data" section assigned to
# the XX7 sample; cells run afterwards see the analysis data instead.
df = hd.read_analysis_data()
hd.inspect(df)

Time for read_analysis_data: 0.78 seconds
(20,035, 32)


Unnamed: 0_level_0,Unnamed: 1_level_0,txn_count_sa,txn_count_ca,balance_ca,balance_sa,sa_inflows,sa_outflows,sa_net_inflows,sa_scaled_inflows,sa_scaled_outflows,sa_scaled_net_inflows,log_monthly_spend,tag_spend_services,tag_spend_household,tag_spend_travel,tag_spend_motor,tag_spend_other_transfers,tag_spend_finance,tag_spend_other_spend,tag_spend_retail,tag_spend_communication,tag_spend_earnings,tag_spend_benefits,tag_spend_savings,tag_spend_hobbies,tag_spend_other_income,tag_spend_pensions,income,log_income,female,age,region,entropy_sptac
user_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
107,2014-04-30,0.0,0.0,-1140.444336,700.0,0.0,0.0,0.0,0.0,0.0,0.0,6.078765,0.386698,0.435039,0.020161,0.158102,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27530.926758,10.223065,0.0,30.0,South West,2.282854
107,2014-05-31,0.0,93.0,-1140.444336,700.0,0.0,0.0,0.0,0.0,0.0,0.0,7.205212,0.103035,0.474202,0.075385,0.165883,0.0,0.043537,0.059416,0.031372,0.047169,0.0,0.0,0.0,0.0,0.0,0.0,27530.926758,10.223065,0.0,30.0,South West,2.37344


In [37]:
# Count missing values per column of the analysis data.
df.isna().sum()

txn_count_sa                   0
txn_count_ca                   0
balance_ca                     0
balance_sa                     0
sa_inflows                     0
sa_outflows                    0
sa_net_inflows                 0
sa_scaled_inflows              0
sa_scaled_outflows             0
sa_scaled_net_inflows          0
log_monthly_spend              0
tag_spend_services             0
tag_spend_household            0
tag_spend_travel               0
tag_spend_motor                0
tag_spend_other_transfers      0
tag_spend_finance              0
tag_spend_other_spend          0
tag_spend_retail               0
tag_spend_communication        0
tag_spend_earnings             0
tag_spend_benefits             0
tag_spend_savings              0
tag_spend_hobbies              0
tag_spend_other_income         0
tag_spend_pensions             0
income                         0
log_income                     0
female                         0
age                            0
region    