In [65]:
import os
import sys

import numpy as np
import pandas as pd
import seaborn as sns

sys.path.append('/Users/fgu/dev/projects/entropy')
import entropy.helpers.aws as aws
import entropy.data.cleaners as cl
import entropy.data.helpers as dh

sns.set_style('whitegrid')
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 120)
pd.set_option('max_colwidth', None)
%config InlineBackend.figure_format = 'retina'
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Notebook purpose

- Understand nature of duplicate transactions and explore solutions

Duplicate types:

1. `['user_id', 'date', 'amount', 'account_id', 'desc']` are identical.
    - Reasons for false positives (FP): user makes two identical transactions on the same day (or on subsequent days for txns that appear with a delay).
    - FP examples: TFL, coffee shops, betting shops.


2. `['user_id', 'date', 'amount', 'account_id']` are identical, `desc` is different, but one `desc` is a truncated version of the other.
    - FB reasons: same as above possible, but less likely. Seems more likely that they are actual duplicates created by an updating process (e.g. newer version of a txn with less redacted `desc` gets added without old version being removed, and `updated_flag` is incorrect.


3. `['user_id', 'date', 'amount', 'account_id']` are identical, `desc` is different.
    - FP reasons: above and user makes otherwise identical txns with different merchants.


4. All of the above iterations but without imposing identical account id. This is relevant if there are (many) duplicated accounts, in which case a different account number is no guarantee for a different account.

In [2]:
m = aws.S3BucketManager('3di-project-entropy')
m.list()

['3di-project-entropy/entropy_000.parquet',
 '3di-project-entropy/entropy_777.parquet',
 '3di-project-entropy/entropy_X77.parquet']

In [3]:
df = aws.s3read_parquet('s3://3di-project-entropy/entropy_X77.parquet')

In [53]:
def dup_txn_sample(df, col_subset, n=100):
    """Draws sample of size n of duplicate txns as defined by col_subset."""
    dups = df[df.duplicated(subset=col_subset, keep=False)].copy()
    dups['group'] = dups.groupby(col_subset).ngroup()
    unique_groups = np.unique(dups.group)
    rng = np.random.default_rng()
    sample = rng.choice(unique_groups, size=n)
    return dups[dups.group.isin(sample)]

## Processing

Delete type 1 duplicates

In [50]:
dup1_subset = ['user_id', 'date', 'amount', 'account_id', 'desc']
df = df.drop_duplicates(subset=dup1_subset)

Delete perfect substrings

In [223]:
dd

Unnamed: 0,score_difflib,score_fuzz,id,date,user_id,amount,desc,merchant,tag_group,tag,user_female,user_postcode,user_registration_date,user_salary_range,user_yob,account_created,account_id,account_last_refreshed,account_provider,account_type,data_warehouse_date_created,data_warehouse_date_last_updated,debit,latest_balance,merchant_business_line,tag_auto,tag_manual,tag_up,updated_flag,ym,balance,income,dup2,group
535728,,,421314426,2018-06-29,361277,20.0,cash natwest jun29 - watford junc@xx:xx - atm,,spend,other_spend,False,e11 3,2016-10-23,20k to 30k,1984.0,2017-03-15,685641,2020-07-01 12:00:00,hsbc,current,2018-07-01,2019-05-06,True,108.830002,personal,cash,,cash,u,201806,5904.541016,28308.679688,False,10208
535732,0.888889,92.0,646246462,2018-06-29,361277,20.0,cash natwest jun29watford junc@xx:xx,,spend,other_spend,False,e11 3,2016-10-23,20k to 30k,1984.0,2017-03-15,685641,2020-07-01 12:00:00,hsbc,current,2019-11-18,2019-12-18,True,108.830002,personal,cash,,cash,u,201806,5904.541016,28308.679688,True,10208


In [208]:
import difflib
import functools
import collections

from fuzzywuzzy import fuzz

DescAndId = collections.namedtuple('DescAndID', ['desc', 'id'])
longest_first = functools.partial(sorted, key=lambda x: len(x.desc), reverse=True)

def similarity_score(group):
    """Return similarity score between longest string in group and all others."""
    cols = list(group.columns)
    group['score_difflib'] = np.nan
    group['score_fuzz'] = np.nan
    items = [DescAndId(*item) for item in zip(group.desc, group.id)]
    longest, *others = longest_first(items)
    for o in others:
        group.loc[group.id == o.id, 'score_difflib'] = difflib.SequenceMatcher(None, longest.desc, o.desc).ratio()
        group.loc[group.id == o.id, 'score_fuzz'] = fuzz.partial_ratio(longest.desc, o.desc)
    return group[['score_difflib', 'score_fuzz'] + cols]

In [302]:
dd = dup_txn_sample(df, dup_subset, n=1).groupby('group').apply(similarity_score)
dd

Unnamed: 0,score_difflib,score_fuzz,id,date,user_id,amount,desc,merchant,tag_group,tag,user_female,user_postcode,user_registration_date,user_salary_range,user_yob,account_created,account_id,account_last_refreshed,account_provider,account_type,data_warehouse_date_created,data_warehouse_date_last_updated,debit,latest_balance,merchant_business_line,tag_auto,tag_manual,tag_up,updated_flag,ym,balance,income,dup2,group
71778,0.590909,45.0,295333039,2017-12-18,35177,20.0,cargo - london - ))),cargo home shop,spend,household,False,xxxx 0,2014-02-14,20k to 30k,1990.0,2017-05-25,724235,2020-08-14 20:59:00,hsbc,current,2017-12-25,2019-09-02,True,844.299988,cargo home shop,home,dining and drinking,dining and drinking,u,201712,396.441101,19975.649414,False,1015
71800,,,806079270,2017-12-18,35177,20.0,cargo london,cargo home shop,spend,household,False,xxxx 0,2014-02-14,20k to 30k,1990.0,2017-05-25,724235,2020-08-14 20:59:00,hsbc,current,2020-08-12,1900-01-01,True,844.299988,cargo home shop,home,dining and drinking,dining and drinking,c,201712,396.441101,19975.649414,True,1015


- lowest same: .82 most, 72
- highest different: .71

Cases for decision:
- One mdbremoved, the other unremoved has very low score -> treat as dup or not? (could use 100 fuzz score to exclude)
- daily od fee _date_ only differs in date and has very high score but is different - exclude manually or handle automatically. some other descs that contain differing dates. could match dates and ensure they differ
- remove `-`, `)` and `(`

In [305]:
dd.desc.str.strip

<bound method StringMethods.normalize of <pandas.core.strings.accessor.StringMethods object at 0x17c3c1490>>

## Type 3 dups

In [51]:
dup_subset = ['user_id', 'date', 'amount', 'account_id']
dup_var = 'dup2'
df[dup_var] = df.duplicated(subset=dup_subset)

In [101]:
from fuzzywuzzy import fuzz
from functools import partial

longest_first = partial(sorted, key=lambda x: len(x), reverse=True)

for idx, data in dd.groupby('group'):
    longest, *others = longest_first(data.desc.values)
    print(longest)
    for other in others:
        print('   {}'.format(other))
        print('   {}'.format(fuzz.partial_ratio(longest, other)), end='\n\n')


lnk sk store, 44/4 cd 8050 12jul15
   sby tamworth cd 8050 13jul15
   68

lnk sk store, 44/4 cd 8050 13dec15
   lnk star news coto cd 9447 12dec15
   59

the boathouse bras cd 4720 deb
   tesco stores 6711 cd 4720 deb
   62

<mdbremoved>
   <mdbremoved>
   100

   <mdbremoved>
   100

<mdbremoved> xxxxxx xxxx5560
   <mdbremoved>
   100

bank credit <mdbremoved>
   bank credit <mdbremoved>
   100

xxxxxx xxxx0290 internet transfer
   xxxxxx xxxx8658 internet transfer
   88

32 red cd 7512 deb
   32 red cd 7512 deb
   100

non-stg purch fee cd 6710 deb
   non-stg purch fee cd 6710 deb
   100

   non-stg purch fee cd 6710 deb
   100

   non-stg purch fee cd 6710 deb
   100

card payment to iz *canopy market,2.00 gbp, rate 1.00/gbp on 10-07-2020
   card payment to iz *crosstown,2.00 gbp, rate 1.00/gbp on 10-07-2020
   85



## Type 1 dups

In [29]:
dup_subset = ['user_id', 'date', 'amount', 'account_id', 'desc']
dup_var = 'dup1'

df[dup_var] = df.duplicated(subset=dup_subset)

### Prevalence and value

How prevalent are duplicates?

In [19]:
n_df = len(df)
n_dups = len(df[df[dup_var]])
n_users_dups = df[df[dup_var]].user_id.nunique()
n_users_df = df.user_id.nunique()
txt = 'About {:.1%} of transactions across {:.0%} of users are potential dups.'
print(txt.format(n_dups / n_df, n_users_dups / n_users_df))

About 1.7% of transactions across 97% of users are potential dups.


Gross value of duplicated txns

In [39]:
def distr(x):
    pcts = [.01, .05, .1, .25, .50, .75, .90, .95, .99]
    return x.describe(percentiles=pcts).round(2)

gross_value = df[df[dup_var]].set_index('user_id').amount.abs().groupby('user_id').sum()
distr(gross_value)

count       415.00
mean       4459.53
std       14957.93
min           1.00
1%            4.54
5%           20.46
10%          61.97
25%         237.31
50%         830.10
75%        2647.96
90%        8980.22
95%       16434.12
99%       59013.14
max      183754.34
Name: amount, dtype: float64

Most frequent txns description

In [41]:
df[df[dup_var]].desc.value_counts(dropna=False)[:10]

<mdbremoved>                       1962
<mdbremoved>                        516
<mdbremoved> ft                     359
b365 moto                           263
paypal payment                      195
tfl travel charge tfl.gov.uk/cp     167
www.skybet.com cd 9317              165
<mdbremoved> so                     157
betfair.-purchase                   146
<mdbremoved> - s/o                  143
Name: desc, dtype: int64

In [43]:
df[df[dup_var]].tag_auto.value_counts(dropna=False)[:10]

NaN                         6104
transfers                   3260
gambling                    2273
enjoyment                   1617
public transport            1132
lunch or snacks             1019
bank charges                 862
entertainment, tv, media     556
cash                         520
dining or going out          507
Name: tag_auto, dtype: int64

In [26]:
(df[df[dup_var]].tag_auto.value_counts(dropna=False) / df.tag_auto.value_counts(dropna=False)).sort_values(ascending=False)[:20]

investment - other               0.227723
gambling                         0.162822
mobile app                       0.147576
isa                              0.090024
tradesmen fees                   0.062500
flights                          0.050548
parking                          0.046441
payment protection insurance     0.044776
paypal account                   0.044444
bills                            0.044291
home appliance insurance         0.039422
games and gaming                 0.037543
pension or investments           0.035945
supermarket                      0.035669
road charges                     0.030667
pet insurance                    0.029184
bank charges                     0.028319
public transport                 0.027676
refunded purchase                0.025626
child - everyday or childcare    0.024873
Name: tag_auto, dtype: float64

### Inspect dups

In [27]:
with pd.option_context('display.max_rows', 1):
    display(dup_txn_sample(df, dup_subset))

Unnamed: 0,id,date,user_id,amount,desc,merchant,tag_group,tag,user_female,user_postcode,user_registration_date,user_salary_range,user_yob,account_created,account_id,account_last_refreshed,account_provider,account_type,data_warehouse_date_created,data_warehouse_date_last_updated,debit,latest_balance,merchant_business_line,tag_auto,tag_manual,tag_up,updated_flag,ym,balance,income,dup1,group
35915,141212619,2016-06-01,18377,1.5,waitrose cd 8050,waitrose,spend,household,False,b79 7,2013-03-03,,1977.0,2013-03-03,258653,2017-01-19 06:13:00,lloyds bank,current,2016-06-03,2017-08-11,True,,waitrose,"food, groceries, household",,groceries,u,201606,,3301.319824,False,208


## Case studies

In [None]:
dh.user_date_data(df, 35177, '1 Jan 2020')

In [None]:
dh.user_date_data(df, 362977, '1 Jan 2020')

In [None]:
dh.user_date_data(df, 467877, '1 Jan 2020')