## Corruption demo

In [1]:
import pandas as pd
import numpy as np
import random
import re
import string
from splink.datasets import splink_datasets
from splink.duckdb.linker import DuckDBLinker
from splink.duckdb.blocking_rule_library import block_on
import splink.duckdb.comparison_template_library as ctl
import splink.duckdb.comparison_library as cl

### Define functions

The first three functions are used to alter spelling by one character. 

The first is a general replacement function; the other two call it to replace either a letter or a digit. Which one you use depends on the variable you wish to corrupt (e.g. `corrupt_string_char` for a name, or `corrupt_string_num` for a DOB).

In [2]:
def replace_character(candidate_filter, replacement_possibilities, string_to_change):
    """Return ``string_to_change`` with one matched character swapped for another.

    Parameters
    ----------
    candidate_filter : str
        Regular expression selecting the characters that may be replaced.
    replacement_possibilities : iterable of str
        Pool of replacement characters; the character being replaced is
        excluded so the result differs from the input.
    string_to_change : str
        The string to corrupt.

    Returns
    -------
    str
        The corrupted string. The input is returned unchanged when nothing in
        it matches ``candidate_filter`` or when no alternative replacement
        exists.
    """
    chars = re.findall(candidate_filter, string_to_change)
    if not chars:
        # Nothing eligible to corrupt (e.g. no digits in the string).
        return string_to_change

    old_char = random.choice(chars)
    possible_replacements = [ch for ch in replacement_possibilities if ch != old_char]
    if not possible_replacements:
        return string_to_change

    new_char = random.choice(possible_replacements)
    # Literal replacement of the first occurrence: using re.sub here would
    # interpret old_char as a pattern (so '.' or '+' would misbehave) and
    # new_char as a replacement template.
    return string_to_change.replace(old_char, new_char, 1)

def corrupt_string_char(string_to_change):
    """Corrupt a string by delegating to ``replace_character`` for lowercase ASCII letters.

    Candidates are the characters matching ``[a-z]``; replacements are drawn
    from ``string.ascii_lowercase``.
    """
    letter_pattern = r'[a-z]'
    alphabet = string.ascii_lowercase
    return replace_character(letter_pattern, alphabet, string_to_change)

def corrupt_string_num(string_to_change):
    """Corrupt a string by delegating to ``replace_character`` for digits.

    Candidates are the characters matching ``\\d``; replacements are drawn
    from the ten digit characters '0'..'9'.
    """
    digit_pattern = r'\d'
    digits = list(map(str, range(10)))
    return replace_character(digit_pattern, digits, string_to_change)

This function corrupts an address using a list of addresses and a specified corruption type. The corruption type dictates how much of the address changes: a full change swaps in an entirely different postcode, while a partial change keeps the first half of the postcode and replaces the second half.

In [3]:
def corrupt_address(address_list, corruption_type='full_change'):
    """Build a function that corrupts a space-separated address (postcode).

    Parameters
    ----------
    address_list : iterable of str
        Pool of addresses to draw replacements from.
    corruption_type : {'full_change', 'partial_change'}
        ``'full_change'`` swaps in a different address from the pool.
        ``'partial_change'`` keeps the part before the first space and takes
        the part after it from another address in the pool.

    Returns
    -------
    callable
        ``corrupt(old_address) -> str``.

    Raises
    ------
    ValueError
        If ``corruption_type`` is not one of the supported values.
    """
    supported_types = ('full_change', 'partial_change')
    if corruption_type not in supported_types:
        raise ValueError(
            f"corruption_type must be one of {supported_types}, got {corruption_type!r}"
        )

    def corrupt(old_address):
        optional_address = [address for address in address_list if address != old_address]

        if corruption_type == 'full_change':
            return random.choice(optional_address)

        # partial_change: keep the first half, borrow a *different* second half.
        old_parts = old_address.split(' ')
        start = old_parts[0]
        old_end = old_parts[1] if len(old_parts) > 1 else None
        # Skip donors with no second half, and donors whose second half equals
        # the current one (they would leave the address unchanged).
        candidate_ends = [
            parts[1]
            for parts in (address.split(' ') for address in optional_address)
            if len(parts) > 1 and parts[1] != old_end
        ]
        return start + ' ' + random.choice(candidate_ends)

    return corrupt

This function corrupts a name, and can be used for either a first name or a surname. It can either pick an entirely new name, or return a double-barrelled version of the current name.

In [4]:
def corrupt_name(name_list, corruption_type='new_name'):
    """Build a function that corrupts a name (first name or surname).

    Parameters
    ----------
    name_list : iterable of str
        Pool of names to draw replacements from.
    corruption_type : {'new_name', 'double_barrel'}
        ``'new_name'`` replaces the name with a different one from the pool.
        ``'double_barrel'`` appends ``'-'`` and a different name from the pool.

    Returns
    -------
    callable
        ``corrupt(old_name) -> str``.

    Raises
    ------
    ValueError
        If ``corruption_type`` is not one of the supported values.
    """
    supported_types = ('new_name', 'double_barrel')
    if corruption_type not in supported_types:
        # Fail when the corruptor is built, rather than with an
        # UnboundLocalError on the first row it is applied to.
        raise ValueError(
            f"corruption_type must be one of {supported_types}, got {corruption_type!r}"
        )

    def corrupt(old_name):
        optional_names = [name for name in name_list if name != old_name]
        replacement = random.choice(optional_names)

        if corruption_type == 'new_name':
            return replacement
        return old_name + '-' + replacement

    return corrupt

In [5]:
def corrupt_dob(dob_list, corruption_type='full_change'):
    """Build a function that corrupts a date of birth.

    Parameters
    ----------
    dob_list : iterable
        Pool of dates of birth to draw replacements from.
    corruption_type : {'full_change'}
        Only ``'full_change'`` is implemented: the date is replaced with a
        different one from the pool. (A ``'day_change'`` mode was sketched in
        an earlier draft but never written.)

    Returns
    -------
    callable
        ``corrupt(old_dob)`` returning the replacement date of birth.

    Raises
    ------
    ValueError
        If ``corruption_type`` is not a supported value.
    """
    supported_types = ('full_change',)
    if corruption_type not in supported_types:
        # Fail when the corruptor is built, rather than with an
        # UnboundLocalError on the first row it is applied to.
        raise ValueError(
            f"corruption_type must be one of {supported_types}, got {corruption_type!r}"
        )

    def corrupt(old_dob):
        optional_dobs = [dob for dob in dob_list if dob != old_dob]
        return random.choice(optional_dobs)

    return corrupt

In [6]:
def corrupt_occupation(occupation_list):
    """Build a function that swaps an occupation for a different one from ``occupation_list``."""
    def corrupt(old_occupation):
        # Collect every pool entry that differs from the current occupation.
        alternatives = []
        for candidate in occupation_list:
            if candidate != old_occupation:
                alternatives.append(candidate)
        return random.choice(alternatives)

    return corrupt

In [7]:
def corrupt_row(row, field_corruptions):
    """Return a copy of ``row`` with each field's corruptions applied in order.

    ``field_corruptions`` maps a field name to a sequence of callables; each
    callable receives the field's current value and returns the new one.
    The input ``row`` is not modified.
    """
    result = row.copy()

    # Flatten the mapping into (field, function) steps, preserving order.
    steps = (
        (name, apply_corruption)
        for name, functions in field_corruptions.items()
        for apply_corruption in functions
    )
    for name, apply_corruption in steps:
        result[name] = apply_corruption(result[name])

    return result

In [8]:
def corrupt_df(df, field_corruptions):
    """Apply ``corrupt_row`` with ``field_corruptions`` to every row of ``df``."""
    def _corrupt_one(row):
        return corrupt_row(row, field_corruptions)

    return df.apply(_corrupt_one, axis=1)

In [33]:
def add_match_labels(df, match_status):
    """Return a copy of ``df`` with a truth label and three thresholded predictions.

    Parameters
    ----------
    df : pandas.DataFrame
        Pairwise predictions. Must contain ``match_probability``; when
        ``match_status == "matches"`` it must also contain ``unique_id_l``
        and ``unique_id_r``.
    match_status : {"matches", "non-matches"}
        ``"matches"``: a pair is a true match when both ids share the same
        prefix (everything before the final ``'-'``).
        ``"non-matches"``: every pair is labelled as a non-match.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``true_match``, ``predict_high_match``,
        ``predict_med_match`` and ``predict_low_match`` appended, in that order.

    Raises
    ------
    ValueError
        If ``match_status`` is not one of the supported values.
    """
    if match_status not in ("matches", "non-matches"):
        # Previously an unknown status silently produced no 'true_match' column.
        raise ValueError(
            f"match_status must be 'matches' or 'non-matches', got {match_status!r}"
        )

    df_new = df.copy()

    if match_status == "matches":
        # Split on the LAST hyphen so a hyphen inside the prefix itself does
        # not truncate it; kept as local Series rather than temporary columns.
        id_l_prefix = df_new['unique_id_l'].str.rsplit('-', n=1).str[0]
        id_r_prefix = df_new['unique_id_r'].str.rsplit('-', n=1).str[0]
        df_new['true_match'] = (id_l_prefix == id_r_prefix).astype(int)
    else:
        df_new['true_match'] = 0

    thresholds = {
        'predict_high_match': 0.999,
        'predict_med_match': 0.99,
        'predict_low_match': 0.95,
    }
    for column, threshold in thresholds.items():
        # "not below threshold" mirrors the original `0 if x < t else 1` rule.
        df_new[column] = (~(df_new['match_probability'] < threshold)).astype(int)

    return df_new

In [34]:
def predict_scores(df, model, match_status):
    """Score ``df`` with a ``DuckDBLinker`` and attach match labels.

    ``model`` is passed straight through as the second argument to
    ``DuckDBLinker`` (presumably a settings dict or a path to a saved model;
    confirm against the splink version in use). ``match_status`` is forwarded
    to ``add_match_labels``.
    """
    predictions = DuckDBLinker(df, model).predict().as_pandas_dataframe()
    return add_match_labels(predictions, match_status)

### Create the base data

In [11]:
# Load the data
df = splink_datasets.historical_50k

# Select one example per individual
# NOTE(review): GroupBy.first() takes the first non-null value of each column
# within a cluster, so a row here may combine values from several source
# records - confirm this is intended for the "perfect" data.
person_df = df.groupby('cluster').first().reset_index()

# Drop any rows with NA values (we do not want these in our perfect data)
person_df = person_df.dropna()

# Filter out rows where the full name contains a digit, full stop, hyphen or comma
pattern = r'[.\d\-,]'
person_df = person_df[~person_df['full_name'].str.contains(pattern)]

# Take a random sample of 10 people (fixed random_state so the sample is reproducible)
base_df = person_df.sample(10, random_state=7)
base_df

Unnamed: 0,cluster,unique_id,full_name,first_and_surname,first_name,surname,dob,birth_place,postcode_fake,gender,occupation
1803,Q2551919,Q2551919-1,graham wallas,graham wallas,graham,wallas,1858-05-31,sunderland,sr2 7ta,male,sociologist
3918,Q6253670,Q6253670-1,john quick,john quick,john,quick,1852-04-14,st ives,tr26 1jy,male,journalist
436,Q15971359,Q15971359-1,walter lindsay,walter lindsay,walter,lindsay,1855-05-15,west berkshire,rg14 5rh,male,military leader
2316,Q35610,Q35610-1,arthur conan doyle,arthur doyle,arthur,doyle,1859-05-22,edinburgh,eh3 5hw,male,screenwriter
1863,Q26732214,Q26732214-1,frederic barnes,frederic barnes,frederic,barnes,1856-11-16,telford and wrekin,tf2 9sx,male,politician
3164,Q5401176,Q5401176-1,esther elizabeth velkiers,esther velkiers,esther,velkiers,1640-01-01,geneva,bb1 1sg,female,composer
4968,Q86260965,Q86260965-1,william albert rouch,william rouch,william,rouch,1862-01-01,barnes,sw15 1qs,male,photographer
1630,Q21557354,Q21557354-1,lucy mary silcox,lucy silcox,lucy,silcox,1862-07-11,warminster,ba12 0ay,female,head teacher
2498,Q43136267,Q43136267-1,frank saltfleet,frank saltfleet,frank,saltfleet,1860-01-01,telford and wrekin,tf3 2ng,male,artist
1509,Q21458727,Q21458727-1,arthur hopkins,arthur hopkins,arthur,hopkins,1848-12-30,london,wc2a 3jx,male,painter


In [12]:
# Duplicate the DataFrame
dupe_base_df = base_df.copy()
# Change unique_ids: rewrite only a trailing '-1' suffix to '-2'. The pattern is
# anchored and regex mode is explicit, so a '-1' elsewhere in an id (or an id
# ending in e.g. '-10') is left untouched regardless of the pandas version.
dupe_base_df['unique_id'] = dupe_base_df['unique_id'].str.replace(r'-1$', '-2', regex=True)

In [13]:
# Replacement pools for the corruption functions. Converted to plain Python
# lists so the names are accurate and the per-row filtering inside each
# corruptor iterates a list rather than a pandas Series. Duplicates are kept,
# so common values remain proportionally more likely to be drawn.
list_of_surnames = person_df['surname'].tolist()
list_of_addresses = person_df['postcode_fake'].tolist()
list_of_first_names = person_df['first_name'].tolist()
list_of_occupations = person_df['occupation'].tolist()
list_of_dobs = person_df['dob'].tolist()

In [26]:
# Create corruption dictionaries: each maps a column name to the list of
# corruption functions applied to that column.

# --- Single-field corruptions ---
# Full surname change
surname_corruption = dict(surname=[corrupt_name(list_of_surnames)])
# Partial address change
address_corruption = dict(postcode_fake=[corrupt_address(list_of_addresses, 'partial_change')])
# First name corruption
first_name_corruption = dict(first_name=[corrupt_name(list_of_first_names)])
# DOB corruption
dob_corruption = dict(dob=[corrupt_dob(list_of_dobs)])
# Occupation corruption
occupation_corruption = dict(occupation=[corrupt_occupation(list_of_occupations)])

# --- Combined corruptions (merges of the single-field dictionaries) ---
# Full surname change and partial address change
surname_and_address_corruption = {
    **surname_corruption,
    **address_corruption,
}
# First name and DOB corruption
first_name_and_dob_corruption = {
    **first_name_corruption,
    **dob_corruption,
}
# First name and occupation corruption
first_name_and_occupation_corruption = {
    **first_name_corruption,
    **occupation_corruption,
}
# First name, DOB and occupation corruption
first_name_and_dob_and_occupation_corruption = {
    **first_name_corruption,
    **dob_corruption,
    **occupation_corruption,
}

In [13]:
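# NOTE: disabled draft kept for reference. Several names below (e.g.
# first_name_sp_corruption, dob_sp_corruption) are not defined anywhere in this
# notebook, and surname_and_first_name_sp_corruption is listed twice.
# corruptions = [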
#     surname_corruption,
#     address_corruption,
#     first_name_sp_corruption,
#     dob_sp_corruption,
#     surname_and_address_corruption,
#     surname_and_first_name_sp_corruption,
#     address_and_dob_sp_corruption,
#     surname_and_first_name_sp_corruption
# ]

# corrupted_dfs = []
# for corruption in corruptions:
#     corrupted_dfs.append(corrupt_df(dupe_base_df, corruption))

In [28]:
# Seed Python's `random` module so the corruptions below are reproducible:
# the corruptors draw from `random.choice`, and without a seed every run (and
# every re-run of this cell) produces different corrupted records.
random.seed(7)

# Each test set is the 10 base records followed by their (possibly corrupted) duplicates.
df_perfect = pd.concat([base_df, dupe_base_df], ignore_index=True)

df_surname_corrupt = pd.concat([base_df, corrupt_df(dupe_base_df, surname_corruption)], ignore_index=True)
df_address_corrupt = pd.concat([base_df, corrupt_df(dupe_base_df, address_corruption)], ignore_index=True)

df_surname_and_address_corrupt = pd.concat([base_df, corrupt_df(dupe_base_df, surname_and_address_corruption)], ignore_index=True)

df_first_name_corrupt = pd.concat([base_df, corrupt_df(dupe_base_df, first_name_corruption)], ignore_index=True)
df_dob_corrupt = pd.concat([base_df, corrupt_df(dupe_base_df, dob_corruption)], ignore_index=True)
df_occupation_corrupt = pd.concat([base_df, corrupt_df(dupe_base_df, occupation_corruption)], ignore_index=True)

df_first_name_and_dob_corrupt = pd.concat([base_df, corrupt_df(dupe_base_df, first_name_and_dob_corruption)], ignore_index=True)
df_first_name_and_occupation_corrupt = pd.concat([base_df, corrupt_df(dupe_base_df, first_name_and_occupation_corruption)], ignore_index=True)
df_first_name_and_dob_and_occupation_corrupt = pd.concat([base_df, corrupt_df(dupe_base_df, first_name_and_dob_and_occupation_corruption)], ignore_index=True)

### Predict on the test sets

In [43]:
# Score every test set with the same saved model.
df_perfect_predict = predict_scores(df_perfect, "model_h50k.json", "matches")

# Surname / address corruptions: pairs are labelled via the "matches" rule.
(
    df_surname_corrupt_predict,
    df_address_corrupt_predict,
    df_surname_and_address_corrupt_predict,
) = [
    predict_scores(test_set, "model_h50k.json", "matches")
    for test_set in (
        df_surname_corrupt,
        df_address_corrupt,
        df_surname_and_address_corrupt,
    )
]

# First name / DOB / occupation corruptions: every pair is labelled a non-match.
(
    df_first_name_corrupt_predict,
    df_dob_corrupt_predict,
    df_occupation_corrupt_predict,
    df_first_name_and_dob_corrupt_predict,
    df_first_name_and_occupation_corrupt_predict,
    df_first_name_and_dob_and_occupation_corrupt_predict,
) = [
    predict_scores(test_set, "model_h50k.json", "non-matches")
    for test_set in (
        df_first_name_corrupt,
        df_dob_corrupt,
        df_occupation_corrupt,
        df_first_name_and_dob_corrupt,
        df_first_name_and_occupation_corrupt,
        df_first_name_and_dob_and_occupation_corrupt,
    )
]

In [38]:
# Show the last four columns (truth label + three thresholded predictions)
# for the perfect data and the surname/address corruptions.
for section_title, scored_df in [
    ('Perfect data:', df_perfect_predict),
    ('Corrupted surname:', df_surname_corrupt_predict),
    ('Corrupted address:', df_address_corrupt_predict),
    ('Corrupted surname and address:', df_surname_and_address_corrupt_predict),
]:
    print(section_title)
    display(scored_df.iloc[:, -4:])

Perfect data:


Unnamed: 0,true_match,predict_high_match,predict_med_match,predict_low_match
0,1,1,1,1
1,1,1,1,1
2,1,1,1,1
3,1,1,1,1
4,1,1,1,1
5,1,1,1,1
6,1,1,1,1
7,1,1,1,1
8,1,1,1,1
9,1,1,1,1


Corrupted surname:


Unnamed: 0,true_match,predict_high_match,predict_med_match,predict_low_match
0,1,1,1,1
1,1,1,1,1
2,1,1,1,1
3,1,1,1,1
4,1,1,1,1
5,1,1,1,1
6,1,1,1,1
7,1,1,1,1
8,1,1,1,1
9,1,1,1,1


Corrupted address:


Unnamed: 0,true_match,predict_high_match,predict_med_match,predict_low_match
0,1,1,1,1
1,1,1,1,1
2,1,1,1,1
3,1,1,1,1
4,1,1,1,1
5,1,1,1,1
6,1,1,1,1
7,1,1,1,1
8,1,1,1,1
9,1,1,1,1


Corrupted surname and address:


Unnamed: 0,true_match,predict_high_match,predict_med_match,predict_low_match
0,1,0,0,1
1,1,0,0,1
2,1,0,0,1
3,1,0,0,1
4,1,0,0,1
5,1,0,0,1
6,1,0,0,1
7,1,0,0,1
8,1,0,0,1
9,1,0,0,1


In [40]:
# Show the last four columns (truth label + three thresholded predictions)
# for the single-field first name / DOB / occupation corruptions.
for section_title, scored_df in [
    ('Corrupted first name:', df_first_name_corrupt_predict),
    ('Corrupted dob:', df_dob_corrupt_predict),
    ('Corrupted occupation:', df_occupation_corrupt_predict),
]:
    print(section_title)
    display(scored_df.iloc[:, -4:])

Corrupted first name:


Unnamed: 0,true_match,predict_high_match,predict_med_match,predict_low_match
0,0,1,1,1
1,0,1,1,1
2,0,1,1,1
3,0,1,1,1
4,0,1,1,1
5,0,1,1,1
6,0,1,1,1
7,0,1,1,1
8,0,1,1,1
9,0,1,1,1


Corrupted dob:


Unnamed: 0,true_match,predict_high_match,predict_med_match,predict_low_match
0,0,0,0,1
1,0,0,0,1
2,0,0,0,1
3,0,0,0,1
4,0,0,0,1
5,0,0,0,1
6,0,0,0,1
7,0,0,1,1
8,0,0,1,1
9,0,0,1,1


Corrupted occupation:


Unnamed: 0,true_match,predict_high_match,predict_med_match,predict_low_match
0,0,1,1,1
1,0,1,1,1
2,0,1,1,1
3,0,1,1,1
4,0,1,1,1
5,0,1,1,1
6,0,1,1,1
7,0,1,1,1
8,0,1,1,1
9,0,1,1,1


In [44]:
# Show the last four columns (truth label + three thresholded predictions)
# for the combined first name / DOB / occupation corruptions.
for section_title, scored_df in [
    ('Corrupted first name and dob:', df_first_name_and_dob_corrupt_predict),
    ('Corrupted first name and occupation:', df_first_name_and_occupation_corrupt_predict),
    ('Corrupted first name and dob and occupation:', df_first_name_and_dob_and_occupation_corrupt_predict),
]:
    print(section_title)
    display(scored_df.iloc[:, -4:])

Corrupted first name and dob:


Unnamed: 0,true_match,predict_high_match,predict_med_match,predict_low_match


Corrupted first name and occupation:


Unnamed: 0,true_match,predict_high_match,predict_med_match,predict_low_match
0,0,0,1,1
1,0,0,1,1
2,0,0,1,1
3,0,0,1,1
4,0,0,1,1
5,0,0,1,1
6,0,0,1,1
7,0,0,1,1
8,0,0,1,1
9,0,0,1,1


Corrupted first name and dob and occupation:


Unnamed: 0,true_match,predict_high_match,predict_med_match,predict_low_match
