In [1]:
import pandas as pd
import numpy as np
import random
import re
import string
from splink.datasets import splink_datasets
from splink.duckdb.linker import DuckDBLinker
from splink.duckdb.blocking_rule_library import block_on
import splink.duckdb.comparison_template_library as ctl
import splink.duckdb.comparison_library as cl

In [2]:
def replace_character(candidate_filter, replacement_possibilities, string_to_change):
    """Swap one randomly chosen character of ``string_to_change`` for a different one.

    Args:
        candidate_filter: Regular expression whose matches (via ``re.findall``)
            are the characters eligible to be replaced.
        replacement_possibilities: Iterable of characters the chosen character
            may be replaced with; the chosen character itself is excluded.
        string_to_change: The string to corrupt.

    Returns:
        A copy of ``string_to_change`` in which the first occurrence of the
        chosen character has been replaced. The input is returned unchanged
        when nothing matches ``candidate_filter`` or when no replacement
        different from the chosen character is available.
    """
    candidates = re.findall(candidate_filter, string_to_change)
    if not candidates:
        # Nothing eligible to corrupt (e.g. asking for a digit in a string without digits).
        return string_to_change

    old_char = random.choice(candidates)
    possible_replacements = [ch for ch in replacement_possibilities if ch != old_char]
    if not possible_replacements:
        return string_to_change

    new_char = random.choice(possible_replacements)
    # Literal replacement on purpose: re.sub would interpret old_char as a
    # pattern (so "." would match any character) and new_char as a template.
    return string_to_change.replace(old_char, new_char, 1)

def corrupt_string_char(string_to_change):
    """Replace one lowercase ASCII letter of ``string_to_change`` with a different one."""
    return replace_character(
        candidate_filter=r'[a-z]',
        replacement_possibilities=string.ascii_lowercase,
        string_to_change=string_to_change,
    )

def corrupt_string_num(string_to_change):
    """Replace one decimal digit of ``string_to_change`` with a different digit."""
    digit_choices = list(map(str, range(10)))
    return replace_character(
        candidate_filter=r'\d',
        replacement_possibilities=digit_choices,
        string_to_change=string_to_change,
    )

In [3]:
def corrupt_address(address_list, corruption_type='full_change'):
    """Build a function that corrupts an address using other known addresses.

    Args:
        address_list: Iterable of address strings to draw replacements from.
        corruption_type: ``'full_change'`` swaps the whole address for a
            different one from ``address_list``. ``'partial_change'`` keeps the
            part before the first space and appends the second space-separated
            token of a different address.

    Returns:
        A one-argument function mapping an address to its corrupted version.

    Raises:
        ValueError: If ``corruption_type`` is not a supported value. Checked
            here rather than inside the returned function, where an unknown
            value previously surfaced as an ``UnboundLocalError``.
        IndexError: Raised by ``random.choice`` inside the returned function
            when ``address_list`` offers no usable alternative.
    """
    supported_types = ('full_change', 'partial_change')
    if corruption_type not in supported_types:
        raise ValueError(
            f"corruption_type must be one of {supported_types}, got {corruption_type!r}"
        )

    def corrupt(old_address):
        optional_address = [address for address in address_list if address != old_address]

        if corruption_type == 'full_change':
            return random.choice(optional_address)

        old_parts = old_address.split(' ')
        # Only offer endings that differ from the current one; otherwise the
        # "corrupted" address could come out identical to the original.
        # Candidates without a second token are skipped instead of crashing.
        other_endings = [
            parts[1]
            for parts in (address.split(' ') for address in optional_address)
            if len(parts) > 1 and parts[1:2] != old_parts[1:2]
        ]
        return old_parts[0] + ' ' + random.choice(other_endings)

    return corrupt

In [4]:
def corrupt_name(name_list, corruption_type='new_name'):
    """Build a function that corrupts a name using other known names.

    Args:
        name_list: Iterable of name strings to draw replacements from.
        corruption_type: ``'new_name'`` swaps the name for a different one from
            ``name_list``. ``'double_barrel'`` appends ``'-'`` and a different
            name to the original.

    Returns:
        A one-argument function mapping a name to its corrupted version.

    Raises:
        ValueError: If ``corruption_type`` is not a supported value. Checked
            here rather than inside the returned function, where an unknown
            value previously surfaced as an ``UnboundLocalError``.
        IndexError: Raised by ``random.choice`` inside the returned function
            when ``name_list`` contains no name other than the one passed in.
    """
    supported_types = ('new_name', 'double_barrel')
    if corruption_type not in supported_types:
        raise ValueError(
            f"corruption_type must be one of {supported_types}, got {corruption_type!r}"
        )

    def corrupt(old_name):
        optional_names = [name for name in name_list if name != old_name]
        other_name = random.choice(optional_names)

        if corruption_type == 'double_barrel':
            return old_name + '-' + other_name
        return other_name

    return corrupt

In [5]:
def corrupt_row(row, field_corruptions):
    """Return a copy of ``row`` with each field's corruptions applied in order.

    ``field_corruptions`` maps a field name to a sequence of one-argument
    callables. Each callable receives the field's current value and its result
    is stored back, so later callables see the effect of earlier ones. All
    changes are made on the object returned by ``row.copy()``.
    """
    mutated = row.copy()

    # Flatten the mapping into (field, callable) steps, keeping the original order.
    steps = (
        (field, apply_corruption)
        for field, chain in field_corruptions.items()
        for apply_corruption in chain
    )
    for field, apply_corruption in steps:
        mutated[field] = apply_corruption(mutated[field])

    return mutated

In [6]:
def corrupt_df(df, field_corruptions):
    """Run ``corrupt_row`` over every row of ``df`` and return what ``apply`` produces."""
    def corrupt_one(row):
        return corrupt_row(row, field_corruptions)

    return df.apply(corrupt_one, axis=1)

In [7]:
def add_match_labels(df, thresholds=None):
    """Add a ground-truth label and thresholded prediction labels to pairwise scores.

    Args:
        df: DataFrame with string columns ``unique_id_l`` and ``unique_id_r``
            and a numeric ``match_probability`` column.
        thresholds: Optional mapping ``{column_name: cutoff}``. For each entry
            a 0/1 column is added that is 0 where ``match_probability`` is
            below the cutoff and 1 otherwise. Defaults to
            ``predict_high_match`` (0.999), ``predict_med_match`` (0.99) and
            ``predict_low_match`` (0.95), added in that order.

    Returns:
        A copy of ``df`` with ``true_match`` (1 when both ids share the text
        before their first ``'-'``) followed by the prediction columns. The
        input frame is not modified.
    """
    if thresholds is None:
        thresholds = {
            'predict_high_match': 0.999,
            'predict_med_match': 0.99,
            'predict_low_match': 0.95,
        }

    df_new = df.copy()

    # Compare id prefixes as standalone Series so no helper columns are ever
    # written to (and then dropped from) the frame.
    id_l_prefix = df_new['unique_id_l'].str.split('-').str[0]
    id_r_prefix = df_new['unique_id_r'].str.split('-').str[0]
    df_new['true_match'] = (id_l_prefix == id_r_prefix).astype(int)

    probabilities = df_new['match_probability']
    for column, cutoff in thresholds.items():
        # "not below the cutoff" rather than ">= cutoff": this keeps the
        # labelling of the earlier element-wise rule, where a missing
        # probability is not below the cutoff and is therefore labelled 1.
        df_new[column] = (~(probabilities < cutoff)).astype(int)

    return df_new

In [8]:
def predict_scores(df, model):
    """Score ``df`` with a ``DuckDBLinker`` built from ``model`` and label the result.

    The linker's predictions are converted to a pandas DataFrame and passed
    through ``add_match_labels`` before being returned.
    """
    predictions = DuckDBLinker(df, model).predict().as_pandas_dataframe()
    return add_match_labels(predictions)

### Create the base data

In [9]:
# Load the historical_50k example dataset from splink
df = splink_datasets.historical_50k

# Characters that disqualify a name: full stop, digit, hyphen or comma
pattern = r'[.\d\-,]'

# One row per cluster, without missing values (we do not want NAs in our
# perfect data) and without names containing any of the characters above.
# NOTE(review): GroupBy.first() takes the first non-null value of each column,
# so a row may combine values from several records of a cluster - confirm
# that this is intended.
person_df = (
    df.groupby('cluster')
    .first()
    .reset_index()
    .dropna()
    .loc[lambda people: ~people['full_name'].str.contains(pattern)]
)

# Take a reproducible random sample of 10 people
base_df = person_df.sample(10, random_state=7)
base_df


Unnamed: 0,cluster,unique_id,full_name,first_and_surname,first_name,surname,dob,birth_place,postcode_fake,gender,occupation
1803,Q2551919,Q2551919-1,graham wallas,graham wallas,graham,wallas,1858-05-31,sunderland,sr2 7ta,male,sociologist
3918,Q6253670,Q6253670-1,john quick,john quick,john,quick,1852-04-14,st ives,tr26 1jy,male,journalist
436,Q15971359,Q15971359-1,walter lindsay,walter lindsay,walter,lindsay,1855-05-15,west berkshire,rg14 5rh,male,military leader
2316,Q35610,Q35610-1,arthur conan doyle,arthur doyle,arthur,doyle,1859-05-22,edinburgh,eh3 5hw,male,screenwriter
1863,Q26732214,Q26732214-1,frederic barnes,frederic barnes,frederic,barnes,1856-11-16,telford and wrekin,tf2 9sx,male,politician
3164,Q5401176,Q5401176-1,esther elizabeth velkiers,esther velkiers,esther,velkiers,1640-01-01,geneva,bb1 1sg,female,composer
4968,Q86260965,Q86260965-1,william albert rouch,william rouch,william,rouch,1862-01-01,barnes,sw15 1qs,male,photographer
1630,Q21557354,Q21557354-1,lucy mary silcox,lucy silcox,lucy,silcox,1862-07-11,warminster,ba12 0ay,female,head teacher
2498,Q43136267,Q43136267-1,frank saltfleet,frank saltfleet,frank,saltfleet,1860-01-01,telford and wrekin,tf3 2ng,male,artist
1509,Q21458727,Q21458727-1,arthur hopkins,arthur hopkins,arthur,hopkins,1848-12-30,london,wc2a 3jx,male,painter


In [10]:
# Duplicate the base records so every person appears a second time
dupe_base_df = base_df.copy()

# Give each duplicate its own id by turning a trailing '-1' into '-2'.
# The pattern is anchored so only the suffix is rewritten (an id ending in
# '-10' is left alone), and regex mode is requested explicitly because the
# default of Series.str.replace differs between pandas versions.
dupe_base_df['unique_id'] = dupe_base_df['unique_id'].str.replace(r'-1$', '-2', regex=True)

# Every record needs a distinct id; fail loudly if a base id did not end in '-1'.
assert not dupe_base_df['unique_id'].isin(base_df['unique_id']).any(), \
    "duplicate records must not reuse a base unique_id"

In [11]:
# Pools of replacement values taken from every cleaned person record.
# Despite the "list_of_" prefix these are pandas Series and are not de-duplicated.
list_of_names, list_of_addresses = person_df['surname'], person_df['postcode_fake']

In [12]:
# Single-field corruption specs: field name -> list of corruption callables
surname_corruption = dict(surname=[corrupt_name(list_of_names)])
address_corruption = dict(postcode_fake=[corrupt_address(list_of_addresses, 'partial_change')])
first_name_sp_corruption = dict(first_name=[corrupt_string_char])
dob_sp_corruption = dict(dob=[corrupt_string_num])

# Two-field specs, merged from the single-field ones above
surname_and_address_corruption = dict(surname_corruption, **address_corruption)
surname_and_first_name_sp_corruption = dict(surname_corruption, **first_name_sp_corruption)
address_and_dob_sp_corruption = dict(address_corruption, **dob_sp_corruption)

In [13]:
# NOTE: unused loop-based alternative to the explicit per-scenario frames
# built in the next cell; kept for reference only.
# corruptions = [
#     surname_corruption,
#     address_corruption,
#     first_name_sp_corruption,
#     dob_sp_corruption,
#     surname_and_address_corruption,
#     surname_and_first_name_sp_corruption,
#     address_and_dob_sp_corruption,
# ]

# corrupted_dfs = []
# for corruption in corruptions:
#     corrupted_dfs.append(corrupt_df(dupe_base_df, corruption))

In [14]:
# Seed Python's RNG so the randomly chosen corruptions are the same on every run
random.seed(7)


def _with_corrupted_duplicates(field_corruptions):
    """Return the base records followed by their duplicates corrupted per ``field_corruptions``."""
    return pd.concat([base_df, corrupt_df(dupe_base_df, field_corruptions)], ignore_index=True)


# Baseline: exact duplicates, nothing corrupted
df_perfect = pd.concat([base_df, dupe_base_df], ignore_index=True)

# One corrupted field per duplicate
df_surname_corrupt = _with_corrupted_duplicates(surname_corruption)
df_address_corrupt = _with_corrupted_duplicates(address_corruption)
df_first_name_sp_corrupt = _with_corrupted_duplicates(first_name_sp_corruption)
df_dob_sp_corrupt = _with_corrupted_duplicates(dob_sp_corruption)

# Two corrupted fields per duplicate
df_surname_and_address_corrupt = _with_corrupted_duplicates(surname_and_address_corruption)
df_surname_and_first_name_sp_corrupt = _with_corrupted_duplicates(surname_and_first_name_sp_corruption)
df_address_and_dob_sp_corrupt = _with_corrupted_duplicates(address_and_dob_sp_corruption)

In [42]:
# Inspect the uncorrupted test set: the base records followed by their duplicates
df_perfect

Unnamed: 0,cluster,unique_id,full_name,first_and_surname,first_name,surname,dob,birth_place,postcode_fake,gender,occupation
0,Q2551919,Q2551919-1,graham wallas,graham wallas,graham,wallas,1858-05-31,sunderland,sr2 7ta,male,sociologist
1,Q6253670,Q6253670-1,john quick,john quick,john,quick,1852-04-14,st ives,tr26 1jy,male,journalist
2,Q15971359,Q15971359-1,walter lindsay,walter lindsay,walter,lindsay,1855-05-15,west berkshire,rg14 5rh,male,military leader
3,Q35610,Q35610-1,arthur conan doyle,arthur doyle,arthur,doyle,1859-05-22,edinburgh,eh3 5hw,male,screenwriter
4,Q26732214,Q26732214-1,frederic barnes,frederic barnes,frederic,barnes,1856-11-16,telford and wrekin,tf2 9sx,male,politician
5,Q5401176,Q5401176-1,esther elizabeth velkiers,esther velkiers,esther,velkiers,1640-01-01,geneva,bb1 1sg,female,composer
6,Q86260965,Q86260965-1,william albert rouch,william rouch,william,rouch,1862-01-01,barnes,sw15 1qs,male,photographer
7,Q21557354,Q21557354-1,lucy mary silcox,lucy silcox,lucy,silcox,1862-07-11,warminster,ba12 0ay,female,head teacher
8,Q43136267,Q43136267-1,frank saltfleet,frank saltfleet,frank,saltfleet,1860-01-01,telford and wrekin,tf3 2ng,male,artist
9,Q21458727,Q21458727-1,arthur hopkins,arthur hopkins,arthur,hopkins,1848-12-30,london,wc2a 3jx,male,painter


In [43]:
# Inspect the test set whose duplicates had both surname and postcode_fake corrupted
df_surname_and_address_corrupt

Unnamed: 0,cluster,unique_id,full_name,first_and_surname,first_name,surname,dob,birth_place,postcode_fake,gender,occupation
0,Q2551919,Q2551919-1,graham wallas,graham wallas,graham,wallas,1858-05-31,sunderland,sr2 7ta,male,sociologist
1,Q6253670,Q6253670-1,john quick,john quick,john,quick,1852-04-14,st ives,tr26 1jy,male,journalist
2,Q15971359,Q15971359-1,walter lindsay,walter lindsay,walter,lindsay,1855-05-15,west berkshire,rg14 5rh,male,military leader
3,Q35610,Q35610-1,arthur conan doyle,arthur doyle,arthur,doyle,1859-05-22,edinburgh,eh3 5hw,male,screenwriter
4,Q26732214,Q26732214-1,frederic barnes,frederic barnes,frederic,barnes,1856-11-16,telford and wrekin,tf2 9sx,male,politician
5,Q5401176,Q5401176-1,esther elizabeth velkiers,esther velkiers,esther,velkiers,1640-01-01,geneva,bb1 1sg,female,composer
6,Q86260965,Q86260965-1,william albert rouch,william rouch,william,rouch,1862-01-01,barnes,sw15 1qs,male,photographer
7,Q21557354,Q21557354-1,lucy mary silcox,lucy silcox,lucy,silcox,1862-07-11,warminster,ba12 0ay,female,head teacher
8,Q43136267,Q43136267-1,frank saltfleet,frank saltfleet,frank,saltfleet,1860-01-01,telford and wrekin,tf3 2ng,male,artist
9,Q21458727,Q21458727-1,arthur hopkins,arthur hopkins,arthur,hopkins,1848-12-30,london,wc2a 3jx,male,painter


### Predict on the test sets

In [37]:
# Every test set is scored against the same model file
MODEL_PATH = "model.json"

# Baseline and single-field corruptions
df_perfect_predict = predict_scores(df_perfect, MODEL_PATH)
df_surname_corrupt_predict = predict_scores(df_surname_corrupt, MODEL_PATH)
df_address_corrupt_predict = predict_scores(df_address_corrupt, MODEL_PATH)
df_first_name_sp_corrupt_predict = predict_scores(df_first_name_sp_corrupt, MODEL_PATH)
df_dob_sp_corrupt_predict = predict_scores(df_dob_sp_corrupt, MODEL_PATH)

# Two-field corruptions
df_surname_and_address_corrupt_predict = predict_scores(df_surname_and_address_corrupt, MODEL_PATH)
df_surname_and_first_name_sp_corrupt_predict = predict_scores(df_surname_and_first_name_sp_corrupt, MODEL_PATH)
df_address_and_dob_sp_corrupt_predict = predict_scores(df_address_and_dob_sp_corrupt, MODEL_PATH)

In [44]:
# Show the last four columns (true label plus the three prediction labels)
# for the baseline and each single-field corruption.
for heading, scored in [
    ('Perfect data:', df_perfect_predict),
    ('Corrupted surname:', df_surname_corrupt_predict),
    ('Corrupted address:', df_address_corrupt_predict),
    ('Corrupted spelling (first name):', df_first_name_sp_corrupt_predict),
    ('Corrupted spelling (DOB):', df_dob_sp_corrupt_predict),
]:
    print(heading)
    display(scored.iloc[:, -4:])

Perfect data:


Unnamed: 0,true_match,predict_high_match,predict_med_match,predict_low_match
0,1,1,1,1
1,1,1,1,1
2,1,1,1,1
3,1,1,1,1
4,1,1,1,1
5,1,1,1,1
6,1,1,1,1
7,1,1,1,1
8,1,1,1,1
9,1,1,1,1


Corrupted surname:


Unnamed: 0,true_match,predict_high_match,predict_med_match,predict_low_match
0,1,1,1,1
1,1,1,1,1
2,1,1,1,1
3,1,1,1,1
4,1,1,1,1
5,1,1,1,1
6,1,1,1,1
7,1,1,1,1
8,1,1,1,1
9,1,1,1,1


Corrupted address:


Unnamed: 0,true_match,predict_high_match,predict_med_match,predict_low_match
0,1,1,1,1
1,1,1,1,1
2,1,1,1,1
3,1,1,1,1
4,1,1,1,1
5,1,1,1,1
6,1,1,1,1
7,1,1,1,1
8,1,1,1,1
9,1,1,1,1


Corrupted spelling (first name):


Unnamed: 0,true_match,predict_high_match,predict_med_match,predict_low_match
0,1,1,1,1
1,1,1,1,1
2,1,1,1,1
3,1,1,1,1
4,1,1,1,1
5,1,1,1,1
6,1,1,1,1
7,1,1,1,1
8,1,1,1,1
9,1,1,1,1


Corrupted spelling (DOB):


Unnamed: 0,true_match,predict_high_match,predict_med_match,predict_low_match
0,1,0,1,1
1,1,1,1,1
2,1,1,1,1
3,1,1,1,1
4,1,1,1,1
5,1,1,1,1
6,1,1,1,1
7,1,1,1,1
8,1,1,1,1
9,1,1,1,1


In [45]:
# Show the last four columns (true label plus the three prediction labels)
# for each two-field corruption.
for heading, scored in [
    ('Corrupted surname and address:', df_surname_and_address_corrupt_predict),
    ('Corrupted surname and spelling (first name):', df_surname_and_first_name_sp_corrupt_predict),
    ('Corrupted address and spelling (DOB):', df_address_and_dob_sp_corrupt_predict),
]:
    print(heading)
    display(scored.iloc[:, -4:])

Corrupted surname and address:


Unnamed: 0,true_match,predict_high_match,predict_med_match,predict_low_match
0,1,0,0,1
1,1,0,0,1
2,1,0,0,1
3,1,0,0,1
4,1,0,0,1
5,1,0,0,1
6,1,0,0,1
7,1,0,0,1
8,1,0,1,1
9,1,0,1,1


Corrupted surname and spelling (first name):


Unnamed: 0,true_match,predict_high_match,predict_med_match,predict_low_match


Corrupted address and spelling (DOB):


Unnamed: 0,true_match,predict_high_match,predict_med_match,predict_low_match
0,1,0,0,0
1,1,0,0,1
2,1,0,1,1
3,1,0,1,1
4,1,0,1,1
5,1,0,1,1
6,1,0,1,1
7,1,0,1,1
8,1,1,1,1
9,1,1,1,1
