# Testing Privacy Preserving Record Linkage using Anonlink

Note: none of the examples here contain real PII. Where names and addresses below aren't obviously fake, they were crafted to create realistic scenarios or picked from the most common names in the US.

In [1]:
import io
import glob
from pathlib import Path

import pandas as pd

import anonlink
from clkhash.clk import generate_clk_from_csv
from clkhash import schema

In [2]:
# don't display more than 4 decimal places on scores
pd.options.display.float_format = "{:,.4f}".format

# Use the fully-qualified option key: set_option treats its argument as a
# pattern and refuses patterns that match more than one option, which the
# bare 'precision' does on pandas releases that also register
# 'styler.format.precision'.
pd.set_option('display.precision', 4)

try:
    # Styled (Styler) output has its own precision option on newer pandas
    pd.set_option('styler.format.precision', 4)
except KeyError:
    # pandas' OptionError derives from KeyError; releases without this
    # option only have display.precision, which is already set above
    pass

# define function to render the results in a nice way

def render(a, b, scores, highlight_delta=False, threshold=None):
    # delta_cols = [col for col in dfa.columns if dfa.iloc[0][col] != dfb.iloc[0][col]]

    summary_df = pd.DataFrame()

    # columns that are ignored in the schema but required,
    # so we should ignore them in the results as well
    ignore_cols = ['record_id', 'parent_given_name', 'parent_family_name']

    # shorten the column names for space reasons
    col_name_map = { 
        'phone_number': 'phone', 
        'household_street_address': 'address',
        'household_zip': 'zip'
    }

    delta_cols = []
    for col in a.columns:
        if col in ignore_cols:
            continue

        new_col_name = col_name_map[col] if col in col_name_map else col

        if a.iloc[0][col] == b.iloc[0][col]:
            # the two values are identical, so only add it once
            summary_df[new_col_name] = a[col]
        else:
            # add a column for each side
            summary_df[f"{new_col_name}_L"] = a[col]
            summary_df[f"{new_col_name}_R"] = b[col]
            delta_cols.extend([f"{new_col_name}_L", f"{new_col_name}_R"])

    # then add the schema results
    for schema_name, score in scores.items():
        summary_df[schema_name] = [score]
        
    style = summary_df.style
    
    # add a border between the data values and the scores
    # (ie, to the left of the first score)
    style = style.applymap(lambda x: 'border-left: 2px solid black;', subset=[next(iter(scores))])
    
    # add a yellow highlight on delta columns
    if highlight_delta:
        style = style.applymap(lambda x: 'background-color: yellow;', subset=delta_cols)

    # add a green highlight on scores > threshold
    if threshold:
        style = style.applymap(lambda x: 'background-color: #77FF99;' if x > threshold else '', subset=list(results.keys()))

    return style


In [3]:
# Load every JSON schema found under schemas/, keyed by the file name
# without its extension
linkage_schemas = {}
for schema_path in map(Path, glob.glob('schemas/*.json')):
    with open(schema_path, 'r') as schema_fh:
        linkage_schemas[schema_path.stem] = schema.from_json_file(schema_fh)

# bare expression so the notebook displays what was loaded
linkage_schemas

{'name-sex-dob-phone': <Schema (v3): 11 fields>,
 'name-sex-dob-zip': <Schema (v3): 11 fields>,
 'name-sex-dob-phone-zip': <Schema (v3): 11 fields>,
 'name-sex-dob-parents': <Schema (v3): 11 fields>,
 'name-sex-dob-addr': <Schema (v3): 11 fields>,
 'name-sex-dob-addr-zip': <Schema (v3): 11 fields>}

In [4]:
# Blank defaults for the columns every record is given before comparison;
# values supplied by a record override these
required_fields = dict.fromkeys(
    (
        'record_id',
        'given_name',
        'family_name',
        'DOB',
        'sex',
        'phone_number',
        'household_street_address',
        'household_zip',
    ),
    '',
)

def compare(a, b):
    """Score two records against every loaded schema and render the result.

    Each record dict is layered over ``required_fields``, turned into a
    one-row DataFrame and serialised to CSV. For every entry in
    ``linkage_schemas`` both CSVs are hashed into CLKs and compared with
    the Dice coefficient.

    Parameters
    ----------
    a, b : dict
        Field name -> value for the left and right record.

    Returns
    -------
    The styled summary produced by ``render`` with delta highlighting on
    and no score threshold.
    """
    frame_a = pd.DataFrame([{**required_fields, **a}])
    frame_b = pd.DataFrame([{**required_fields, **b}])

    # Serialise once; a fresh in-memory stream is opened per schema below,
    # so every CLK run reads the CSV from its beginning
    csv_text_a = frame_a.to_csv(index=False)
    csv_text_b = frame_b.to_csv(index=False)

    secret = "password1234"

    def score_for(linkage_schema):
        clks = [
            generate_clk_from_csv(
                io.StringIO(csv_text), secret, linkage_schema, progress_bar=False
            )
            for csv_text in (csv_text_a, csv_text_b)
        ]
        candidate_pairs = anonlink.candidate_generation.find_candidate_pairs(
            clks,
            anonlink.similarities.dice_coefficient,
            0,  # threshold is 0 so that a candidate pair is always returned
        )
        # first entry of the first result element
        # (presumably the similarity of the only pair - confirm against anonlink docs)
        return candidate_pairs[0][0]

    results = {
        schema_name: score_for(linkage_schema)
        for schema_name, linkage_schema in linkage_schemas.items()
    }

    return render(frame_a, frame_b, results, highlight_delta=True, threshold=None)

# Examples

In [5]:
# Two reusable sample records: one fully populated, one with the
# phone number and parent email left out
JOHN_DOE = dict(
    given_name='John',
    family_name='Doe',
    DOB='2021-01-24',
    sex='M',
    phone_number='1234567890',
    household_street_address='123 Fake St',
    household_zip='01234',
    parent_email='mom@example.com',
)

MIN_RECORD = dict(
    given_name='Minuet',
    family_name='Riker',
    DOB='2006-06-06',
    sex='F',
    household_street_address='123 Fake St',
    household_zip='01234',
)

In [6]:
# Baseline: both sides are the same record, so every score should be 1.0
a = dict(JOHN_DOE)
b = dict(JOHN_DOE)

compare(a, b)

Unnamed: 0,given_name,family_name,DOB,sex,phone,address,zip,parent_email,name-sex-dob-phone,name-sex-dob-zip,name-sex-dob-phone-zip,name-sex-dob-parents,name-sex-dob-addr,name-sex-dob-addr-zip
0,John,Doe,2021-01-24,M,1234567890,123 Fake St,1234,mom@example.com,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Change a single field: given name Jon vs John
a = {**JOHN_DOE, 'given_name': 'Jon'}
b = {**JOHN_DOE, 'given_name': 'John'}

compare(a, b)

Unnamed: 0,given_name_L,given_name_R,family_name,DOB,sex,phone,address,zip,parent_email,name-sex-dob-phone,name-sex-dob-zip,name-sex-dob-phone-zip,name-sex-dob-parents,name-sex-dob-addr,name-sex-dob-addr-zip
0,Jon,John,Doe,2021-01-24,M,1234567890,123 Fake St,1234,mom@example.com,0.9289,0.9267,0.9444,0.9286,0.9278,0.9429


In [8]:
# Same single-letter difference in a longer name: Jonathan vs Johnathan
a = {**JOHN_DOE, 'given_name': 'Jonathan'}
b = {**JOHN_DOE, 'given_name': 'Johnathan'}

compare(a, b)

Unnamed: 0,given_name_L,given_name_R,family_name,DOB,sex,phone,address,zip,parent_email,name-sex-dob-phone,name-sex-dob-zip,name-sex-dob-phone-zip,name-sex-dob-parents,name-sex-dob-addr,name-sex-dob-addr-zip
0,Jonathan,Johnathan,Doe,2021-01-24,M,1234567890,123 Fake St,1234,mom@example.com,0.9661,0.9644,0.9723,0.9658,0.9644,0.971


In [9]:
# A 1-char difference only at the very end of the name
a = {**JOHN_DOE, 'given_name': 'Jonathan'}
b = {**JOHN_DOE, 'given_name': 'Jonathann'}

compare(a, b)

Unnamed: 0,given_name_L,given_name_R,family_name,DOB,sex,phone,address,zip,parent_email,name-sex-dob-phone,name-sex-dob-zip,name-sex-dob-phone-zip,name-sex-dob-parents,name-sex-dob-addr,name-sex-dob-addr-zip
0,Jonathan,Jonathann,Doe,2021-01-24,M,1234567890,123 Fake St,1234,mom@example.com,0.9822,0.9833,0.9863,0.9829,0.9835,0.9873


In [10]:
# Does adding more characters to a match affect the score?

# Same trailing 1-char difference in the given name as before, but both
# sides now share a much longer family name
shared_family_name = 'Richardson-Hernandez'
a = {**JOHN_DOE, 'given_name': 'Jonathan', 'family_name': shared_family_name}
b = {**JOHN_DOE, 'given_name': 'Jonathann', 'family_name': shared_family_name}

compare(a, b)


Unnamed: 0,given_name_L,given_name_R,family_name,DOB,sex,phone,address,zip,parent_email,name-sex-dob-phone,name-sex-dob-zip,name-sex-dob-phone-zip,name-sex-dob-parents,name-sex-dob-addr,name-sex-dob-addr-zip
0,Jonathan,Jonathann,Richardson-Hernandez,2021-01-24,M,1234567890,123 Fake St,1234,mom@example.com,0.9833,0.9844,0.9872,0.984,0.9846,0.9883


In [11]:
# Repeat some of the checks above on a record that leaves fields blank

a = dict(MIN_RECORD)
b = dict(MIN_RECORD)
compare(a, b)

Unnamed: 0,given_name,family_name,DOB,sex,phone,address,zip,parent_email,name-sex-dob-phone,name-sex-dob-zip,name-sex-dob-phone-zip,name-sex-dob-parents,name-sex-dob-addr,name-sex-dob-addr-zip
0,Minuet,Riker,2006-06-06,F,,123 Fake St,1234,,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
# Sparse record with a different given name on each side
a = {**MIN_RECORD, 'given_name': 'Minette'}
b = {**MIN_RECORD, 'given_name': 'Minuet'}

compare(a, b)

Unnamed: 0,given_name_L,given_name_R,family_name,DOB,sex,phone,address,zip,parent_email,name-sex-dob-phone,name-sex-dob-zip,name-sex-dob-phone-zip,name-sex-dob-parents,name-sex-dob-addr,name-sex-dob-addr-zip
0,Minette,Minuet,Riker,2006-06-06,F,,123 Fake St,1234,,0.8719,0.9058,0.9058,0.8719,0.9045,0.9252


In [13]:
# Siblings: same family name and household, different given name and DOB
a = dict(
    given_name='Michael',
    family_name='Smith',
    DOB='2003-05-28',
    sex='M',
    household_street_address='123 Fake St',
    household_zip='01234',
)

b = dict(
    given_name='Daniel',
    family_name='Smith',
    DOB='2001-02-06',
    sex='M',
    household_street_address='123 Fake St',
    household_zip='01234',
)

compare(a, b)

Unnamed: 0,given_name_L,given_name_R,family_name,DOB_L,DOB_R,sex,phone,address,zip,parent_email,name-sex-dob-phone,name-sex-dob-zip,name-sex-dob-phone-zip,name-sex-dob-parents,name-sex-dob-addr,name-sex-dob-addr-zip
0,Michael,Daniel,Smith,2003-05-28,2001-02-06,M,,123 Fake St,1234,,0.69,0.7689,0.7689,0.69,0.7717,0.8198


In [14]:
# twins? v1 - same DOB, sex, family name and household; only the given name differs
a = dict(
    given_name='Tyler',
    family_name='Smith',
    DOB='2008-05-28',
    sex='M',
    household_street_address='123 Fake St',
    household_zip='01234',
)

b = dict(
    given_name='Mitchell',
    family_name='Smith',
    DOB='2008-05-28',
    sex='M',
    household_street_address='123 Fake St',
    household_zip='01234',
)

compare(a, b)

Unnamed: 0,given_name_L,given_name_R,family_name,DOB,sex,phone,address,zip,parent_email,name-sex-dob-phone,name-sex-dob-zip,name-sex-dob-phone-zip,name-sex-dob-parents,name-sex-dob-addr,name-sex-dob-addr-zip
0,Tyler,Mitchell,Smith,2008-05-28,M,,123 Fake St,1234,,0.7672,0.8239,0.8239,0.7672,0.8213,0.8572


In [15]:
# twins? v2 - same DOB, family name and household; given name and sex differ

a = dict(
    given_name='Thomas',
    family_name='Smith',
    DOB='2008-05-28',
    sex='M',
    household_street_address='123 Fake St',
    household_zip='01234',
)

b = dict(
    given_name='Elizabeth',
    family_name='Smith',
    DOB='2008-05-28',
    sex='F',
    household_street_address='123 Fake St',
    household_zip='01234',
)

compare(a, b)

Unnamed: 0,given_name_L,given_name_R,family_name,DOB,sex_L,sex_R,phone,address,zip,parent_email,name-sex-dob-phone,name-sex-dob-zip,name-sex-dob-phone-zip,name-sex-dob-parents,name-sex-dob-addr,name-sex-dob-addr-zip
0,Thomas,Elizabeth,Smith,2008-05-28,M,F,,123 Fake St,1234,,0.6238,0.7177,0.7177,0.6238,0.7188,0.7781


In [16]:
# Quick tests to see the impact of the sex field
# 1 - everything is the same except sex
quick_base = dict(
    given_name='Cameron',
    family_name='Smith',
    DOB='2006-06-06',
    sex='M',
    household_street_address='123 Fake St',
    household_zip='01234',
)

a = {**quick_base, 'sex': 'M'}
b = {**quick_base, 'sex': 'F'}

compare(a, b)


Unnamed: 0,given_name,family_name,DOB,sex_L,sex_R,phone,address,zip,parent_email,name-sex-dob-phone,name-sex-dob-zip,name-sex-dob-phone-zip,name-sex-dob-parents,name-sex-dob-addr,name-sex-dob-addr-zip
0,Cameron,Smith,2006-06-06,M,F,,123 Fake St,1234,,0.8233,0.8693,0.8693,0.8233,0.8733,0.9001


In [17]:
# 2 - everything is different except sex
a = dict(
    given_name='Alexander',
    family_name='Allen',
    DOB='1999-12-12',
    sex='M',
    household_street_address='123 Fake St',
    household_zip='01234',
)
b = dict(
    given_name='Zach',
    family_name='Young',
    DOB='2006-06-06',
    sex='M',
    household_street_address='6 Jones way',
    household_zip='98879',
)

compare(a, b)

Unnamed: 0,given_name_L,given_name_R,family_name_L,family_name_R,DOB_L,DOB_R,sex,phone,address_L,address_R,zip_L,zip_R,parent_email,name-sex-dob-phone,name-sex-dob-zip,name-sex-dob-phone-zip,name-sex-dob-parents,name-sex-dob-addr,name-sex-dob-addr-zip
0,Alexander,Zach,Allen,Young,1999-12-12,2006-06-06,M,,123 Fake St,6 Jones way,1234,98879,,0.3515,0.3541,0.3541,0.3515,0.351,0.3692


# Zip Code Investigation

In [18]:
# Impact of zip code.
# Step 1: with everything else equal, how much does a different zip cost?

# note the field name has to be "household_zip"
a = {**MIN_RECORD, 'household_zip': '01234'}
b = {**MIN_RECORD, 'household_zip': '01235'}

compare(a, b)


Unnamed: 0,given_name,family_name,DOB,sex,phone,address,zip_L,zip_R,parent_email,name-sex-dob-phone,name-sex-dob-zip,name-sex-dob-phone-zip,name-sex-dob-parents,name-sex-dob-addr,name-sex-dob-addr-zip
0,Minuet,Riker,2006-06-06,F,,123 Fake St,1234,1235,,1.0,0.9349,0.9349,1.0,1.0,0.9517


In [19]:
# Same check with a zip that differs in every digit
a = {**MIN_RECORD, 'household_zip': '01234'}
b = {**MIN_RECORD, 'household_zip': '98765'}

compare(a, b)

Unnamed: 0,given_name,family_name,DOB,sex,phone,address,zip_L,zip_R,parent_email,name-sex-dob-phone,name-sex-dob-zip,name-sex-dob-phone-zip,name-sex-dob-parents,name-sex-dob-addr,name-sex-dob-addr-zip
0,Minuet,Riker,2006-06-06,F,,123 Fake St,1234,98765,,1.0,0.8053,0.8053,1.0,1.0,0.8504


In [20]:
# Step 2: how much does a matching zip add when the other fields differ?

a = dict(
    given_name='John',
    family_name='Smith',
    DOB='2006-06-06',
    sex='M',
    household_street_address='123 Fake St',
    household_zip='01234',
)

b = dict(
    given_name='Lisa',
    family_name='Jones',
    DOB='2007-07-07',
    sex='F',
    household_street_address='987 Long Rd',
    household_zip='01234',
)

compare(a, b)

Unnamed: 0,given_name_L,given_name_R,family_name_L,family_name_R,DOB_L,DOB_R,sex_L,sex_R,phone,address_L,address_R,zip,parent_email,name-sex-dob-phone,name-sex-dob-zip,name-sex-dob-phone-zip,name-sex-dob-parents,name-sex-dob-addr,name-sex-dob-addr-zip
0,John,Lisa,Smith,Jones,2006-06-06,2007-07-07,M,F,,123 Fake St,987 Long Rd,1234,,0.3104,0.4943,0.4943,0.3104,0.3473,0.4918


In [21]:
# Same pair of records, but the zips are only close (01234 vs 01256)
a = dict(
    given_name='John',
    family_name='Smith',
    DOB='2006-06-06',
    sex='M',
    household_street_address='123 Fake St',
    household_zip='01234',
)

b = dict(
    given_name='Lisa',
    family_name='Jones',
    DOB='2007-07-07',
    sex='F',
    household_street_address='987 Long Rd',
    household_zip='01256',
)

compare(a, b)

Unnamed: 0,given_name_L,given_name_R,family_name_L,family_name_R,DOB_L,DOB_R,sex_L,sex_R,phone,address_L,address_R,zip_L,zip_R,parent_email,name-sex-dob-phone,name-sex-dob-zip,name-sex-dob-phone-zip,name-sex-dob-parents,name-sex-dob-addr,name-sex-dob-addr-zip
0,John,Lisa,Smith,Jones,2006-06-06,2007-07-07,M,F,,123 Fake St,987 Long Rd,1234,1256,,0.3104,0.4073,0.4073,0.3104,0.3473,0.4267


In [22]:
# And once more with completely different zips
a = dict(
    given_name='John',
    family_name='Smith',
    DOB='2006-06-06',
    sex='M',
    household_street_address='123 Fake St',
    household_zip='01234',
)

b = dict(
    given_name='Lisa',
    family_name='Jones',
    DOB='2007-07-07',
    sex='F',
    household_street_address='987 Long Rd',
    household_zip='98765',
)

compare(a, b)

Unnamed: 0,given_name_L,given_name_R,family_name_L,family_name_R,DOB_L,DOB_R,sex_L,sex_R,phone,address_L,address_R,zip_L,zip_R,parent_email,name-sex-dob-phone,name-sex-dob-zip,name-sex-dob-phone-zip,name-sex-dob-parents,name-sex-dob-addr,name-sex-dob-addr-zip
0,John,Lisa,Smith,Jones,2006-06-06,2007-07-07,M,F,,123 Fake St,987 Long Rd,1234,98765,,0.3104,0.3217,0.3217,0.3104,0.3473,0.3672


In [23]:
# The scores above are higher than expected given how different the two
# records look. How far apart can two records be and still score > 0.8?

a = dict(
    given_name='Alice',
    family_name='Smith',
    DOB='2006-06-06',
    sex='F',
    household_street_address='123 Fair St',
    household_zip='01234',
)

b = dict(
    given_name='Alison',
    family_name='Stinson',
    DOB='2009-06-01',
    sex='F',
    household_street_address='199 Main St',
    household_zip='01234',
)

compare(a, b)

Unnamed: 0,given_name_L,given_name_R,family_name_L,family_name_R,DOB_L,DOB_R,sex,phone,address_L,address_R,zip,parent_email,name-sex-dob-phone,name-sex-dob-zip,name-sex-dob-phone-zip,name-sex-dob-parents,name-sex-dob-addr,name-sex-dob-addr-zip
0,Alice,Alison,Smith,Stinson,2006-06-06,2009-06-01,F,,123 Fair St,199 Main St,1234,,0.5987,0.7015,0.7015,0.5987,0.6005,0.6851


In [24]:
# Matching the family name finally pushes some scores past 0.8.
# A little contrived, but overall still a realistic example.
a = dict(
    given_name='Alice',
    family_name='Smith',
    DOB='2006-06-06',
    sex='F',
    household_street_address='123 Fair St',
    household_zip='01234',
)

b = dict(
    given_name='Alison',
    family_name='Smith',
    DOB='2009-06-01',
    sex='F',
    household_street_address='199 Main St',
    household_zip='01234',
)

compare(a, b)

Unnamed: 0,given_name_L,given_name_R,family_name,DOB_L,DOB_R,sex,phone,address_L,address_R,zip,parent_email,name-sex-dob-phone,name-sex-dob-zip,name-sex-dob-phone-zip,name-sex-dob-parents,name-sex-dob-addr,name-sex-dob-addr-zip
0,Alice,Alison,Smith,2006-06-06,2009-06-01,F,,123 Fair St,199 Main St,1234,,0.8051,0.8553,0.8553,0.8051,0.7514,0.8043
