Notebook purpose

- Understand nature of duplicate transactions, explore solutions, document decisions about what duplicates to drop

In [1]:
import os
import sys

import numpy as np
import pandas as pd
import seaborn as sns

# NOTE(review): machine-specific absolute path -- the notebook only runs on a
# machine where the `entropy` project lives at this location.
sys.path.append('/Users/fgu/dev/projects/entropy')
import entropy.helpers.aws as aws
import entropy.data.cleaners as cl

sns.set_style('whitegrid')
pd.set_option('display.max_rows', 120)
pd.set_option('display.max_columns', 120)
pd.set_option('max_colwidth', None)
%config InlineBackend.figure_format = 'retina'
%load_ext autoreload
%autoreload 2

### Solution

Types of duplicates

Type 1 duplicates:  `['user_id', 'date', 'amount', 'account_id', 'desc']` are identical

Type 2 duplicates: `['user_id', 'date', 'amount', 'account_id']` are identical and one `desc` is a "loose subset" of the other (i.e. each word in one desc appears somewhere in the other, possibly out of order, though each substring of the other desc can be matched only once).

Approach taken:

- Clean description string to remove extraneous characters that obfuscate type 1 duplicates. 
- Remove type 1 duplicates
- Identify and remove type 2 duplicates

## Solution

In [2]:
# Load the data to de-duplicate from a local parquet file via the project's aws helper.
df = aws.read_parquet('~/tmp/entropy_777.parquet')

In [10]:
import functools

def _potential_dup2_dups(df):
    cols=['user_id', 'account_id', 'date', 'amount']
    dups = df[df.duplicated(subset=cols, keep=False)].copy()
    dups['group'] = dups.groupby(cols).ngroup()
    return dups

def _identify_dup2(df):
    
    def helper(group):
    
        group['dup'] = False

        DescAndId = collections.namedtuple('DescAndID', ['desc', 'id'])
        shortest_first = functools.partial(sorted, key=lambda x: len(x.desc))

        items = [DescAndId(*item) for item in zip(group.desc, group.id)]
        shortest, *others = shortest_first(items)

        others_are_equal = len(set(others)) == 1
        others_ids = [o.id for o in others]

        if not others_are_equal:
            answer = False
        else:
            remainder = others[0].desc
            for w in shortest.desc.split():            
                if w in remainder:
                    remainder = remainder.replace(w, '', 1)
                else:
                    answer = False
                    break
                answer = True

            if not answer:
                remainder = shortest.desc
                for w in others[0].desc.split():
                    if w in remainder:                    
                        remainder = remainder.replace(w, '', 1)
                    else:
                        answer = False
                        break
                    answer = True

        group.loc[group.id.isin(others_ids), 'dup'] = answer
        return group
    
    return df.groupby('group').apply(helper)



def drop_dup2_old(df):
    """Drops Type 2 duplicates using the legacy implementation.

    Finds candidate txns with `_potential_dup2_dups`, flags duplicates with
    `_identify_dup2`, and returns a copy of `df` without the flagged rows.
    """
    working = df.copy()
    flagged = _identify_dup2(_potential_dup2_dups(working))
    labels_to_drop = flagged[flagged.dup].index
    return working.drop(labels_to_drop)


In [3]:
import collections

def clean_desc(df):
    """Removes extraneous characters that hinder duplicates detection.

    Returns a copy of `df` whose `desc` column has been cleaned by, in order:

    - removing a trailing suffix made of a hyphen, one whitespace, an
      optional single word character plus whitespace, and 2-3 further
      characters (e.g. '- vis', '- p/p', '- e gbp');
    - replacing all punctuation characters (including backslashes);
    - replacing runs of two or more 'x' characters, which are used to mask
      card or account numbers;
    - splitting digit suffixes -- but not prefixes, as these are usually
      dates -- from words (e.g. 'no14' becomes 'no 14', '14jan' remains
      unchanged);
    - collapsing repeated whitespace and stripping leading/trailing
      whitespace.

    Every removed span is replaced with a single space before whitespace is
    collapsed, so neighbouring words are never glued together.
    """
    import re
    import string

    # `string.punctuation` contains `\`, `]`, `^` and `-`, which are special
    # inside a character class. Unescaped, the `\]` pair is read as an
    # escaped bracket and backslashes are silently left in the description.
    punctuation = fr'[{re.escape(string.punctuation)}]+'

    df = df.copy()
    kwargs = dict(repl=' ', regex=True)
    df['desc'] = (df.desc.str.replace(r'-\s(\w\s)?.{2,3}$', **kwargs)
                  .str.replace(punctuation, **kwargs)
                  .str.replace(r'[x]{2,}', **kwargs)
                  .str.replace(r'(?<=[a-zA-Z])(?=\d)', **kwargs)
                  .str.replace(r'\s{2,}', **kwargs)
                  .str.strip())
    return df


def drop_type1_dups(df):
    """Drops Type 1 duplicates.

    Two txns are Type 1 duplicates when their user id, account id, date,
    amount and description all coincide. The first occurrence of each such
    txn is kept; later occurrences are removed.
    """
    identity_cols = ['user_id', 'account_id', 'date', 'amount', 'desc']
    is_repeat = df.duplicated(subset=identity_cols)
    return df[~is_repeat]

In [30]:
def _get_potential_type2_dups(df):
    """Returns txns with identical user and account ids, dates, and amounts."""
    cols=['user_id', 'account_id', 'date', 'amount']
    dups = df[df.duplicated(subset=cols, keep=False)].copy()
    dups['group'] = dups.groupby(cols).ngroup()
    return dups

def _identify_type2_dups(df):
    """Returns index of Type2 duplicates."""
    
    def are_identical(items):
        return len(set(items)) == 1

    def each_word_in_string(wordlist, string):
        """Tests whether each word from wordlist appears in string.
        Allows each substring in string to be matched only once.
        """
        unmatched = string
        for w in wordlist:
            if w not in unmatched:
                return False
            unmatched = unmatched.replace(w, '', 1)
        return True
    
    def identifier(g):
        descriptions = [DescId(*i) for i in zip(g.desc, g.id)]
        shortest, *others = sorted(descriptions, key=lambda x: len(x.desc))
        
        if not are_identical(others):
            return g
        
        # for each o in others: check below, and mark o.id as dup if answer is true
        # try to refactor function so I can use in filter, which saves masking step in final return statement.

        wordlist, string = shortest.desc.split(), others[0].desc
        answer = each_word_in_string(wordlist, string)
        if not answer:
            wordlist, string = others[0].desc.split(), shortest.desc
            answer = each_word_in_string(wordlist, string)
        
        others_ids = [o.id for o in others]

        g.loc[g.id.isin(others_ids), 'dup'] = answer
        return g
    
    DescId = collections.namedtuple('DescId', ('desc', 'id'))
    df['dup'] = False
    
    df = df.groupby('group').apply(identifier)
    return df[df.dup].index

def drop_type2_dups(df):
    """Drops Type 2 duplicates.

    A Type 2 duplicate is one of two txns with identical user ids, account
    ids, dates, and amounts, as well as similar txn descriptions, where
    "similar" means that each word in the description of one txn appears in
    the description of the other.
    """
    candidates = _get_potential_type2_dups(df)
    return df.drop(_identify_type2_dups(candidates))

def counter(df):
    """Prints the shape of df and returns df unchanged, for use in pipe chains."""
    shape = df.shape
    print(shape)
    return df

In [31]:
# NOTE: this cell depends on `clean` and `k0`, which are defined in the two
# cells that follow it in the notebook; run those first.
k1 = clean.pipe(drop_type2_dups)
# Regression check: the refactored implementation must keep exactly the same
# rows as the legacy one. Previously the comparison result was computed but
# discarded, because it was not the last expression of the cell.
assert k0.index.equals(k1.index)
print(k1.shape)
k1.head(3)

(121093, 31)


Unnamed: 0,id,date,user_id,amount,desc,merchant,tag_group,tag,user_female,user_postcode,user_registration_date,user_salary_range,user_yob,account_created,account_id,account_last_refreshed,account_provider,account_type,data_warehouse_date_created,data_warehouse_date_last_updated,debit,latest_balance,merchant_business_line,tag_auto,tag_manual,tag_up,updated_flag,ym,balance,income,savings
0,688261,2012-01-03,777,400.0,mdbremoved,,transfers,tsransfer,False,wa1 4,2011-07-20,20k to 30k,1969.0,2011-07-20,262916,2020-07-21 20:32:00,natwest bank,current,2014-07-18,2017-11-13,True,364.220001,non merchant mbl,transfers,other account,other account,u,201201,-1451.075562,24319.220881,False
1,688264,2012-01-03,777,10.27,9572 30dec 11 mcdonalds restaurant winwick road gb,mcdonalds,spend,services,False,wa1 4,2011-07-20,20k to 30k,1969.0,2011-07-20,262916,2020-07-21 20:32:00,natwest bank,current,2014-07-18,2015-03-19,True,364.220001,mcdonalds,dining and drinking,,dining and drinking,u,201201,-1451.075562,24319.220881,False
2,688263,2012-01-03,777,6.68,9572 31dec 11 tesco stores 3345 warrington gb,tesco,spend,household,False,wa1 4,2011-07-20,20k to 30k,1969.0,2011-07-20,262916,2020-07-21 20:32:00,natwest bank,current,2014-07-18,2017-08-15,True,364.220001,tesco supermarket,"food, groceries, household",,supermarket,u,201201,-1451.075562,24319.220881,False


In [5]:
# Clean descriptions first, then drop Type 1 duplicates (the approach listed under "Approach taken").
clean = clean_desc(df).pipe(drop_type1_dups)

In [11]:
# Baseline: output of the legacy Type 2 implementation, compared against
# `drop_type2_dups` (k1) elsewhere in the notebook.
k0 = clean.pipe(drop_dup2_old)
print(k0.shape)
k0.head(3)

(121093, 31)


Unnamed: 0,id,date,user_id,amount,desc,merchant,tag_group,tag,user_female,user_postcode,user_registration_date,user_salary_range,user_yob,account_created,account_id,account_last_refreshed,account_provider,account_type,data_warehouse_date_created,data_warehouse_date_last_updated,debit,latest_balance,merchant_business_line,tag_auto,tag_manual,tag_up,updated_flag,ym,balance,income,savings
0,688261,2012-01-03,777,400.0,mdbremoved,,transfers,tsransfer,False,wa1 4,2011-07-20,20k to 30k,1969.0,2011-07-20,262916,2020-07-21 20:32:00,natwest bank,current,2014-07-18,2017-11-13,True,364.220001,non merchant mbl,transfers,other account,other account,u,201201,-1451.075562,24319.220881,False
1,688264,2012-01-03,777,10.27,9572 30dec 11 mcdonalds restaurant winwick road gb,mcdonalds,spend,services,False,wa1 4,2011-07-20,20k to 30k,1969.0,2011-07-20,262916,2020-07-21 20:32:00,natwest bank,current,2014-07-18,2015-03-19,True,364.220001,mcdonalds,dining and drinking,,dining and drinking,u,201201,-1451.075562,24319.220881,False
2,688263,2012-01-03,777,6.68,9572 31dec 11 tesco stores 3345 warrington gb,tesco,spend,household,False,wa1 4,2011-07-20,20k to 30k,1969.0,2011-07-20,262916,2020-07-21 20:32:00,natwest bank,current,2014-07-18,2017-08-15,True,364.220001,tesco supermarket,"food, groceries, household",,supermarket,u,201201,-1451.075562,24319.220881,False


In [151]:
# Compare a description before and after cleaning.

# Select one row by position and turn it back into a one-row frame, since
# `clean_desc` reads the `desc` column of a DataFrame.
# NOTE(review): 785793 is a hard-coded row position and only valid for the
# frame that was loaded when this cell was run.
k = df.iloc[785793].to_frame().T
display(k.desc.values[0])
clean_desc(k).desc.values[0]

'midgleys cd 5714 deb'

'midgleys cd 5714 deb'

Features:
- In groups larger than two, if others are related and shortest isn't, then we're unable to identify others as dups. (e.g.: `df.iloc[[1267567, 1267576, 1267577]]`)
- If shorter is mdbremoved only, then conservatively classify as non-dup
- A number can be matched to the wrong counterpart in the other description (e.g. rows [559240, 559242]) -> each element of the other description should only be matched once
- Groups with daily od charges, shortest without date not identified as dup

Limitations:
- If group contains two groups of duplicates, they are not identified




## Improvements

### Clean

Original

In [20]:
# NOTE: rebinds `df` to a different file (read from S3) than the one loaded in
# the Solution section; all cells below operate on this frame.
df = aws.read_parquet('s3://3di-project-entropy/entropy_X77.parquet')

In [21]:
def distr(x):
    """Describes x with a fine-grained set of percentiles, rounded to 2 decimals."""
    percentiles = [.01, .05, .1, .25, .50, .75, .90, .95, .99]
    summary = x.describe(percentiles=percentiles)
    return summary.round(2)

def duplicates_sample(df, col_subset, n=100, seed=2312):
    """Draws a random sample of groups of duplicate txns.

    Duplicates are txns that share the values of all columns in
    `col_subset` with at least one other txn; each distinct combination of
    values forms one group, labelled in the added `group` column.

    Parameters
    ----------
    df : DataFrame of txns.
    col_subset : list of column names that define a duplicate.
    n : number of groups to draw. If fewer than n groups exist, all of them
        are returned.
    seed : seed for the random generator; None gives a different sample on
        each call.

    Returns
    -------
    DataFrame with all txns that belong to the sampled groups. It is empty
    (but still has a `group` column) if `df` contains no duplicates.
    """
    dups = df[df.duplicated(subset=col_subset, keep=False)].copy()
    if dups.empty:
        # Nothing to sample from; drawing from an empty population raises.
        dups['group'] = np.array([], dtype=np.int64)
        return dups

    dups['group'] = dups.groupby(col_subset).ngroup()
    unique_groups = np.unique(dups.group)
    rng = np.random.default_rng(seed=seed)
    # Sample without replacement so that the requested number of distinct
    # groups is returned; with replacement, repeated draws of the same group
    # silently shrink the sample.
    size = min(n, len(unique_groups))
    sample = rng.choice(unique_groups, size=size, replace=False)
    return dups[dups.group.isin(sample)]

## Case studies

Below are three case studies of duplicates.

In [None]:
# NOTE(review): `dh` is not imported in any cell shown in this notebook --
# presumably a project helper module; confirm and add the import.
dh.user_date_data(df, 35177, '1 Jan 2020')

In [None]:
# NOTE(review): `dh` is not imported in any cell shown in this notebook --
# presumably a project helper module; confirm and add the import.
dh.user_date_data(df, 362977, '1 Jan 2020')

In [None]:
# NOTE(review): `dh` is not imported in any cell shown in this notebook --
# presumably a project helper module; confirm and add the import.
dh.user_date_data(df, 467877, '1 Jan 2020')

## Type 1 duplicates

### Definition
- `['user_id', 'date', 'amount', 'account_id', 'desc']` are identical.
 
- This includes transactions where desc for both is `<mdbremoved>`, where we assume that they mask the same transaction description.

- Reasons for false positives (FP): user makes two identical transactions on the same day (or on subsequent days for txns that appear with a delay). Plausible cases are coffee and betting shop txns. However, inspection suggests that the vast majority of cases are genuine duplicates, as they are txns that are unlikely to result from multiple purchases on the same day.

In [22]:
# Columns that must all match for two txns to count as Type 1 duplicates.
col_subset = ['user_id', 'date', 'amount', 'account_id', 'desc']
# Name of the flag column; the analysis cells below read it via `dup_var`.
dup_var = 'dup1'

# `duplicated` with its default `keep='first'` flags every repeat but not the
# first occurrence. Note that this adds a column to `df` in place.
df[dup_var] = df.duplicated(subset=col_subset)

### Prevalence and value

How prevalent are duplicates?

In [23]:
# Share of flagged txns among all txns, and share of users with at least one
# flagged txn. Reads the flag column named by the current value of `dup_var`.
n_df = len(df)
n_dups = len(df[df[dup_var]])
n_users_dups = df[df[dup_var]].user_id.nunique()
n_users_df = df.user_id.nunique()
txt = 'About {:.1%} of transactions across {:.0%} of users are potential dups.'
print(txt.format(n_dups / n_df, n_users_dups / n_users_df))

About 1.7% of transactions across 97% of users are potential dups.


Gross value of duplicated txns

In [24]:
# Per-user sum of absolute amounts over flagged txns, then its distribution
# across users.
gross_value = df[df[dup_var]].set_index('user_id').amount.abs().groupby('user_id').sum()
distr(gross_value)

count       415.00
mean       4459.53
std       14957.93
min           1.00
1%            4.54
5%           20.46
10%          61.97
25%         237.31
50%         830.10
75%        2647.96
90%        8980.22
95%       16434.12
99%       59013.14
max      183754.34
Name: amount, dtype: float64

Most frequent txns description

In [25]:
# Ten most frequent descriptions among flagged txns (missing values counted too).
df[df[dup_var]].desc.value_counts(dropna=False)[:10]

<mdbremoved>                       1962
<mdbremoved>                        516
<mdbremoved> ft                     359
b365 moto                           263
paypal payment                      195
tfl travel charge tfl.gov.uk/cp     167
www.skybet.com cd 9317              165
<mdbremoved> so                     157
betfair.-purchase                   146
<mdbremoved> - s/o                  143
Name: desc, dtype: int64

Most frequent auto tag

In [26]:
# Ten most frequent auto tags among flagged txns (missing values counted too).
df[df[dup_var]].tag_auto.value_counts(dropna=False)[:10]

NaN                         6104
transfers                   3260
gambling                    2273
enjoyment                   1617
public transport            1132
lunch or snacks             1019
bank charges                 862
entertainment, tv, media     556
cash                         520
dining or going out          507
Name: tag_auto, dtype: int64

Proportion of txns per auto tag that are duplicated

In [27]:
# Txn counts per auto tag: overall, and among flagged txns only.
txns_per_tag_overall = df.tag_auto.value_counts(dropna=False)
txns_per_tag_duplicated = df[df[dup_var]].tag_auto.value_counts(dropna=False) 
# The division aligns both Series on the tag label; tags without any flagged
# txn come out as NaN.
p_dup_per_tag = (txns_per_tag_duplicated / txns_per_tag_overall)
p_dup_per_tag.sort_values(ascending=False)[:10]

investment - other              0.227723
gambling                        0.162822
mobile app                      0.147576
isa                             0.090024
tradesmen fees                  0.062500
flights                         0.050548
parking                         0.046441
payment protection insurance    0.044776
paypal account                  0.044444
bills                           0.044291
Name: tag_auto, dtype: float64

### Inspect dups

In [28]:
# Descriptions of randomly drawn duplicate groups; seed=None means the sample
# differs on every run.
duplicates_sample(df, col_subset, n=2, seed=None).desc

275216       <mdbremoved>
275217       <mdbremoved>
726133    <mdbremoved> so
726134    <mdbremoved> so
Name: desc, dtype: category
Categories (2359486, object): [' <mdbremoved> ', ' <mdbremoved>  & co llp - d/d', ' <mdbremoved>  & co store ltd', ' <mdbremoved>  & company cd 6426', ..., 'ób ísafirdi', 'öl & vin magasinet stockholm', 'ùappleseed books and p guildford', 'úri cipö - kaptafa']

## Type 2 dups

### Definition

- `['user_id', 'date', 'amount', 'account_id']` are identical, and one `desc` is a (loose) subset of the other.

Remove type 1 dups

In [29]:
# `col_subset` still holds the Type 1 columns at this point. This rebinds
# `df`; the `dup1` flag column added earlier stays in the frame.
df = df.drop_duplicates(subset=col_subset)

In [30]:
# Potential Type 2 duplicates match on these columns only; `desc` is no
# longer part of the key. This rebinds `col_subset` and `dup_var`.
col_subset = ['user_id', 'date', 'amount', 'account_id']
dup_var = 'dup2'

# Flags every repeat of a key except its first occurrence (default keep='first').
df[dup_var] = df.duplicated(subset=col_subset)

### Prevalence and value

How prevalent are duplicates?

In [31]:
# Same prevalence computation as for Type 1, now reading the `dup2` flag via
# `dup_var`.
# NOTE(review): copy of the Type 1 cell -- a helper taking `dup_var` as a
# parameter would avoid the duplication.
n_df = len(df)
n_dups = len(df[df[dup_var]])
n_users_dups = df[df[dup_var]].user_id.nunique()
n_users_df = df.user_id.nunique()
txt = 'About {:.1%} of transactions across {:.0%} of users are potential dups.'
print(txt.format(n_dups / n_df, n_users_dups / n_users_df))

About 1.9% of transactions across 99% of users are potential dups.


Gross value of duplicated txns

In [32]:
# Per-user sum of absolute amounts over txns flagged as potential Type 2
# duplicates, then its distribution across users.
gross_value = df[df[dup_var]].set_index('user_id').amount.abs().groupby('user_id').sum()
distr(gross_value)

count       424.00
mean       2497.45
std        8311.57
min           3.00
1%           11.08
5%           48.28
10%         104.04
25%         298.47
50%         880.35
75%        2097.92
90%        4584.54
95%        6842.71
99%       25811.31
max      106598.39
Name: amount, dtype: float64

Most frequent txns description

In [33]:
# Count descriptions by their first 12 characters only, so that descriptions
# sharing a prefix are tallied together.
df[df[dup_var]].desc.str[:12].value_counts(dropna=False)[:10]

<mdbremoved>    3523
daily od fee    1894
int'l xxxxxx     941
card payment     463
tfl travel c     336
direct debit     319
call ref.no.     308
tfl.gov.uk/c     288
contactless      281
tesco stores     275
Name: desc, dtype: int64

Most frequent auto tag