Notebook purpose

- Document problems in MBD raw data

In [4]:
import os
import sys

import numpy as np
import pandas as pd
import seaborn as sns

sys.path.append('/Users/fgu/dev/projects/entropy')
import entropy.helpers.aws as aws
import entropy.data.cleaners as cl

# Plot style and pandas display limits for wide/long transaction tables.
sns.set_style('whitegrid')
pd.set_option('display.max_rows', 120)
pd.set_option('display.max_columns', 120)
pd.set_option('max_colwidth', None)  # None: do not truncate long cell contents
# Render inline figures at retina resolution.
%config InlineBackend.figure_format = 'retina'
# Reload edited project modules automatically, so changes to the imported
# entropy helpers are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# List the objects currently stored in the project's S3 bucket.
m = aws.S3BucketManager('3di-project-entropy')
m.list()

['3di-project-entropy/entropy_000.parquet',
 '3di-project-entropy/entropy_777.parquet',
 '3di-project-entropy/entropy_X77.parquet']

## Debits as income

Classifying income following hacioglu2020distributional results in a substantial number of debits classified as income. They all belong to one of three auto tags, which we'll investigate below.

In [5]:
# Sample identifier; selects which entropy_<SAMPLE>.parquet file is read.
SAMPLE = 'X77'
fp = f'~/tmp/entropy_{SAMPLE}.parquet'
# Read from a local copy under ~/tmp via the project's parquet reader.
df = aws.read_parquet(fp)

In [6]:
# Transactions in the income tag group with a positive amount, i.e. debits.
mask = (df['tag_group'] == 'income') & (df['amount'] > 0)
d = df.loc[mask]
d['amount'].describe()

count     1582.000000
mean       311.669220
std       1604.520386
min          0.100000
25%         10.410000
50%         25.000000
75%        125.180000
max      29000.000000
Name: amount, dtype: float64

In [8]:
# Five most frequent auto tags among the debits classified as income.
d['tag_auto'].value_counts().head(5)

pension or investments    1379
interest income            177
unsecured loan funds        26
pet - everyday or food       0
payday loan                  0
Name: tag_auto, dtype: int64

## Pension or investments

### Problem

This tag contains both pension income (when the amount is negative) and -- mostly -- pension contributions (when the amount is positive).

In [40]:
# Credits (negative amounts) auto-tagged as pension or investments.
mask = (df['tag_auto'] == 'pension or investments') & (df['amount'] < 0)
d = df.loc[mask]
print(d.shape)
d.head(2)

(12, 31)


Unnamed: 0,id,date,user_id,amount,desc,merchant,tag_group,tag,user_female,user_postcode,user_registration_date,user_salary_range,user_yob,account_created,account_id,account_last_refreshed,account_provider,account_type,data_warehouse_date_created,data_warehouse_date_last_updated,debit,latest_balance,merchant_business_line,tag_auto,tag_manual,tag_up,updated_flag,ym,balance,income,savings
115099,164193570,2016-10-11,65677,-111.309998,john lewis pension,john lewis,income,pensions,False,ng9 6,2014-06-02,,1959.0,2016-10-09,612941,2020-03-11 11:12:00,halifax personal banking,current,2016-10-12,2018-04-02,False,20667.589844,john lewis other,pension or investments,,work pension,u,201610,15557.87207,57253.828125,False
115146,169774214,2016-11-10,65677,-111.510002,john lewis pension,john lewis,income,pensions,False,ng9 6,2014-06-02,,1959.0,2016-10-09,612941,2020-03-11 11:12:00,halifax personal banking,current,2016-11-11,2018-04-02,False,20667.589844,john lewis other,pension or investments,,refunded purchase,u,201611,15236.822266,57253.828125,False


In [41]:
# Debits (positive amounts) auto-tagged as pension or investments.
mask = (df['tag_auto'] == 'pension or investments') & (df['amount'] > 0)
d = df.loc[mask]
print(d.shape)
d.head(2)

(1379, 31)


Unnamed: 0,id,date,user_id,amount,desc,merchant,tag_group,tag,user_female,user_postcode,user_registration_date,user_salary_range,user_yob,account_created,account_id,account_last_refreshed,account_provider,account_type,data_warehouse_date_created,data_warehouse_date_last_updated,debit,latest_balance,merchant_business_line,tag_auto,tag_manual,tag_up,updated_flag,ym,balance,income,savings
15248,3407594,2013-01-28,3277,500.0,www.hl.co.uk <mdbremoved>,hargreaves lansdown,income,pensions,True,sw19 1,2012-09-26,,1987.0,2012-09-26,253764,2015-03-16,first direct,current,2014-07-18,2017-12-04,True,,hargreaves lansdown,pension or investments,,,u,201301,,32259.191406,False
15358,4112473,2013-03-07,3277,50.0,hlam regular savin,hargreaves lansdown,income,pensions,True,sw19 1,2012-09-26,,1987.0,2012-09-26,253764,2015-03-16,first direct,current,2014-07-18,2020-03-17,True,,hargreaves lansdown,pension or investments,,investment - other,u,201303,,32259.191406,False


### Solution

Classify debits and credits separately

## Interest income

### Problem

Most txns labelled as interest income clearly are interest income: they are credits with descriptions such as 'interest added'.

In [12]:
# Credits (negative amounts) auto-tagged as interest income.
mask = (df['tag_auto'] == 'interest income') & (df['amount'] < 0)
d = df.loc[mask]
print(d.shape)
d.head(2)

(5122, 31)


Unnamed: 0,id,date,user_id,amount,desc,merchant,tag_group,tag,user_female,user_postcode,user_registration_date,user_salary_range,user_yob,account_created,account_id,account_last_refreshed,account_provider,account_type,data_warehouse_date_created,data_warehouse_date_last_updated,debit,latest_balance,merchant_business_line,tag_auto,tag_manual,tag_up,updated_flag,ym,balance,income,savings
1732,43934679,2014-09-29,777,-1.4,interest added - gross interest £1.75 tax deducted £0.35,,income,other_income,False,wa1 4,2011-07-20,20k to 30k,1969.0,2014-11-19,262912,2016-07-24 15:29:00,nationwide,current,2014-11-19,2017-08-13,False,43.450001,account provider,interest income,,interest income,u,201409,842.109985,26204.169922,False
3044,98389865,2015-09-29,777,-1.34,interest added - gross interest £1.68 tax deducted £0.34,,income,other_income,False,wa1 4,2011-07-20,20k to 30k,1969.0,2014-11-19,262912,2016-07-24 15:29:00,nationwide,current,2015-10-02,2017-08-12,False,43.450001,account provider,interest income,,interest income,u,201509,43.450012,27638.970703,False


Yet a small minority, those that are debits, are actually overdraft fees

In [14]:
# Debits (positive amounts) auto-tagged as interest income.
mask = (df['tag_auto'] == 'interest income') & (df['amount'] > 0)
d = df.loc[mask]
print(d.shape)
d.head(2)

(177, 31)


Unnamed: 0,id,date,user_id,amount,desc,merchant,tag_group,tag,user_female,user_postcode,user_registration_date,user_salary_range,user_yob,account_created,account_id,account_last_refreshed,account_provider,account_type,data_warehouse_date_created,data_warehouse_date_last_updated,debit,latest_balance,merchant_business_line,tag_auto,tag_manual,tag_up,updated_flag,ym,balance,income,savings
71235,806079392,2017-07-15,35177,1.99,overdraft interestto 23jun2017,,income,other_income,False,xxxx 0,2014-02-14,20k to 30k,1990.0,2017-05-25,724235,2020-08-14 20:59:00,hsbc,current,2020-08-12,1900-01-01,True,844.299988,account provider,interest income,,interest income,c,201707,1269.640869,26090.759766,False
71341,806079376,2017-08-14,35177,7.84,overdraft interestto 23jul2017,,income,other_income,False,xxxx 0,2014-02-14,20k to 30k,1990.0,2017-05-25,724235,2020-08-14 20:59:00,hsbc,current,2020-08-12,1900-01-01,True,844.299988,account provider,interest income,,interest income,c,201708,805.120911,26090.759766,False


In [15]:
# Tabulate the first 18 characters of each description to group the debits.
d['desc'].str.slice(stop=18).value_counts(dropna=False)

overdraft interest    177
Name: desc, dtype: int64

### Solution

Classify debits and credits separately

## Unsecured loan funds

Should loan funds be classified as income to start with?

### Problem

How to classify loans (also payday loans in general?)

In [17]:
# Credits (negative amounts) auto-tagged as unsecured loan funds.
mask = (df['tag_auto'] == 'unsecured loan funds') & (df['amount'] < 0)
d = df.loc[mask]
print(d.shape)
d.head(2)

(413, 31)


Unnamed: 0,id,date,user_id,amount,desc,merchant,tag_group,tag,user_female,user_postcode,user_registration_date,user_salary_range,user_yob,account_created,account_id,account_last_refreshed,account_provider,account_type,data_warehouse_date_created,data_warehouse_date_last_updated,debit,latest_balance,merchant_business_line,tag_auto,tag_manual,tag_up,updated_flag,ym,balance,income,savings
19575,2120049,2012-11-28,4277,-10000.0,<mdbremoved>,,income,other_income,False,kt15 2,2012-10-07,,1959.0,2012-10-07,110411,2014-02-17 11:02:00,first direct,current,2014-07-18,2015-03-19,False,-1038.719971,,unsecured loan funds,,,u,201211,8315.09082,74063.592188,False
25170,20858977,2014-04-15,14577,-7500.0,<mdbremoved>,,income,other_income,False,pr2 6,2013-01-06,,1984.0,2014-05-26,104213,2015-01-04 00:00:00,santander,current,2014-07-18,2015-03-19,False,,,unsecured loan funds,,unsecured loan funds,u,201404,,37589.246652,False


In [16]:
# Debits (positive amounts) auto-tagged as unsecured loan funds.
mask = (df['tag_auto'] == 'unsecured loan funds') & (df['amount'] > 0)
d = df.loc[mask]
print(d.shape)
d.head(2)

(26, 31)


Unnamed: 0,id,date,user_id,amount,desc,merchant,tag_group,tag,user_female,user_postcode,user_registration_date,user_salary_range,user_yob,account_created,account_id,account_last_refreshed,account_provider,account_type,data_warehouse_date_created,data_warehouse_date_last_updated,debit,latest_balance,merchant_business_line,tag_auto,tag_manual,tag_up,updated_flag,ym,balance,income,savings
971778,603826282,2019-02-25,513377,186.020004,"direct debit payment to aa loans ref xxxxxxxxxxxx0661, mandate no 0033",aa finance,income,other_income,,g61 4,2019-05-24,20k to 30k,1977.0,2019-09-03,1414589,2020-08-16 11:32:00,santander,current,2019-09-04,1900-01-01,True,261.730011,aa finance,unsecured loan funds,,unsecured loan funds,c,201902,-2556.750977,25555.388672,False
971936,604129805,2019-03-25,513377,186.020004,"direct debit payment to aa loans ref xxxxxxxxxxxx0661, mandate no 0033",aa finance,income,other_income,,g61 4,2019-05-24,20k to 30k,1977.0,2019-09-03,1414589,2020-08-16 11:32:00,santander,current,2019-09-04,1900-01-01,True,261.730011,aa finance,unsecured loan funds,,unsecured loan funds,c,201903,-2566.330811,25555.388672,False


## Untagged transfers

In [None]:
# Reload sample X77 directly with pandas from the same local path that the
# "Debits as income" section read through aws.read_parquet.
df = pd.read_parquet('~/tmp/entropy_X77.parquet')

### Problem

There are txns that are clearly transfers but aren't tagged as such ('transfer' is only the most obvious string; the same is true for others like 'xfer' and 'trf').

In [29]:
# Transactions whose description mentions 'transfer' but that carry no tag.
untagged_tfrs = df.loc[
    df['desc'].str.contains('transfer') & df['tag'].isna()
]
untagged_tfrs.head()

Unnamed: 0,id,date,user_id,amount,desc,merchant,tag_group,tag,user_female,user_postcode,user_registration_date,user_salary_range,user_yob,account_created,account_id,account_last_refreshed,account_provider,account_type,data_warehouse_date_created,data_warehouse_date_last_updated,debit,latest_balance,merchant_business_line,tag_auto,tag_manual,tag_up,ym,balance,income
1625,43934685,2014-08-31,777,-400.0,transfer from <mdbremoved>,,,,False,wa1 4,2011-07-20,20k to 30k,1969.0,2014-11-19,262914,2016-07-24 15:29:00,nationwide,current,2014-11-19,2017-08-13,False,1637.530029,,,,,201408,2565.380371,158.399998
1626,43934684,2014-08-31,777,-300.0,transfer from <mdbremoved>,,,,False,wa1 4,2011-07-20,20k to 30k,1969.0,2014-11-19,262914,2016-07-24 15:29:00,nationwide,current,2014-11-19,2017-08-13,False,1637.530029,,,,,201408,2565.380371,158.399998
1738,43934696,2014-09-30,777,-400.0,transfer from <mdbremoved>,,,,False,wa1 4,2011-07-20,20k to 30k,1969.0,2014-11-19,262914,2016-07-24 15:29:00,nationwide,current,2014-11-19,2017-08-13,False,1637.530029,,,,,201409,3166.390381,158.399998
1739,43934695,2014-09-30,777,-300.0,transfer from <mdbremoved>,,,,False,wa1 4,2011-07-20,20k to 30k,1969.0,2014-11-19,262914,2016-07-24 15:29:00,nationwide,current,2014-11-19,2017-08-13,False,1637.530029,,,,,201409,3166.390381,158.399998
1845,43934706,2014-11-02,777,-400.0,transfer from <mdbremoved>,,,,False,wa1 4,2011-07-20,20k to 30k,1969.0,2014-11-19,262914,2016-07-24 15:29:00,nationwide,current,2014-11-19,2017-08-13,False,1637.530029,,,,,201411,3631.200439,158.399998


### Questions for MDB

- There are txns for which the above strings are part of the description and that are tagged as transfers, indicating that it's not simply that the strings are missing from the classification regex. Are there reasons the above aren't classified, or is this a bug?

### Solution

## Auto purpose tag inconsistency

The user precedence tag should equal the manual tag if the manual tag is not missing, and the auto purpose tag otherwise. There are many cases where this does not hold.

### Case 1: incorrectly empty user precedence tag

In [3]:
# Switch to the raw MDB extract on S3. This rebinds `df`; the raw file uses the
# original column names (e.g. 'User Precedence Tag Name'), not the short names
# used in the sections above.
df = aws.s3read_parquet('s3://3di-data-mdb/raw/mdb_777.parquet')
df.head(1)

Unnamed: 0,Transaction Reference,User Reference,User Registration Date,Year of Birth,Salary Range,Postcode,LSOA,MSOA,Derived Gender,Transaction Date,Account Reference,Provider Group Name,Account Type,Latest Recorded Balance,Transaction Description,Credit Debit,Amount,User Precedence Tag Name,Manual Tag Name,Auto Purpose Tag Name,Merchant Name,Merchant Business Line,Account Created Date,Account Last Refreshed,Data Warehouse Date Created,Data Warehouse Date Last Updated,Transaction Updated Flag
0,688293,777,2011-07-20,1969.0,20K to 30K,WA1 4,E01012553,E02002603,M,2012-01-25,262916,NatWest Bank,Current,364.220001,"9572 24jan12 , tcs bowdon , bowdon gb - pos",Debit,25.030001,No Tag,No Tag,No Tag,No Merchant,Unknown Merchant,2011-07-20,2020-07-21 20:32:00,2014-07-18,2017-10-24,U


In [73]:
tag_names = ['User Precedence Tag Name', 'Manual Tag Name', 'Auto Purpose Tag Name']
tags = df[tag_names]

# Case 1: the user precedence tag is empty although the manual tag or the
# auto purpose tag carries a value.
mask = (
    tags['User Precedence Tag Name'].eq('No Tag')
    & (
        tags['Auto Purpose Tag Name'].ne('No Tag')
        | tags['Manual Tag Name'].ne('No Tag')
    )
)
errors = tags.loc[mask]
errors.head(3)

Unnamed: 0,User Precedence Tag Name,Manual Tag Name,Auto Purpose Tag Name
33,No Tag,No Tag,Cash
36,No Tag,No Tag,Interest charges
37,No Tag,No Tag,Lunch or Snacks


In [74]:
# Share of rows affected by the tag inconsistency selected in `errors`.
# The .1% format spec already appends a percent sign, so the message must not
# repeat the word "percent" after it.
print(f'Tags are incorrect in {len(errors) / len(df):.1%} of observations.')

Tags are incorrect in 8.9% percent of observations.


### Case 2: incorrectly empty manual and auto purpose tag

In [76]:
# Case 2: a user precedence tag exists although both the manual and the
# auto purpose tag are empty.
mask = (
    tags['User Precedence Tag Name'].ne('No Tag')
    & tags['Auto Purpose Tag Name'].eq('No Tag')
    & tags['Manual Tag Name'].eq('No Tag')
)
errors = tags.loc[mask]
errors.head(2)

Unnamed: 0,User Precedence Tag Name,Manual Tag Name,Auto Purpose Tag Name
507,Financial - other,No Tag,No Tag
590,Water,No Tag,No Tag


In [77]:
# Share of rows affected by the tag inconsistency selected in `errors`.
# The .1% format spec already appends a percent sign, so the message must not
# repeat the word "percent" after it.
print(f'Tags are incorrect in {len(errors) / len(df):.1%} of observations.')

Tags are incorrect in 0.4% percent of observations.


### Correction

In [6]:
def correct_tag_up(df):
    """Fill missing user precedence tags from the manual and auto tags.

    By definition, tag_up should be the manual tag where one exists and the
    auto tag otherwise. The raw data breaks this in two ways:

    - tag_up is missing although tag_manual or tag_auto is present; these
      rows are filled (manual tag first, auto tag as fallback).
    - tag_up is present although both other tags are missing; these rows
      are left untouched.

    Only missing tag_up values are replaced; existing ones are never
    overwritten. The `tag_up` column is reassigned on the passed-in frame,
    which is also returned.
    """
    fallback = df['tag_manual'].fillna(df['tag_auto'])
    tag_up_missing = df['tag_up'].isna()
    df['tag_up'] = df['tag_up'].mask(tag_up_missing, fallback)
    return df