In [1]:
import pandas as pd

import toolkit as tk
import geovpylib as gv

from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.duckdb.duckdb_comparison_library import (
    exact_match,
    levenshtein_at_thresholds,
)

# BHP actors: persons record linkage

### Get data

In [2]:
# Load the BHP persons export, discard the two columns that are not used for
# linkage, and expose the positional row index as `unique_id` (the record
# identifier that later shows up as `unique_id_l` / `unique_id_r` in the
# linkage results).
persons = (
    pd.read_csv('../../data/persons-bhp.csv', sep=';')
    .drop(columns=['dataset', 'name'])
    .reset_index()
    .rename(columns={'index': 'unique_id'})
)

# Project helper, called without capturing a return value.
# NOTE(review): presumably casts the listed columns in place — confirm in `toolkit`.
tk.set_types(
    persons,
    {'first_name':'string', 'last_name':'string', 'gender':'string', 'birth_year':'int', 'death_year':'int'},
)
tk.infos(persons)

Shape:  (62528, 7)


Unnamed: 0,unique_id,pk,first_name,last_name,gender,birth_year,death_year
0,0,44895,antoine,sainte-marie perrin,Male,1870.0,1930
1,1,47015,,,Male,1506.0,1545
2,2,47190,alberto,duimio,Male,1510.0,1564
3,3,47190,albertus,divini,Male,1510.0,1564
4,4,47578,angelo,zampa,Male,,1575


### Settings & Verifications

In [3]:
# The whole table is linked; `persons_sample` is only an alias of `persons`.
persons_sample = persons

# Prediction blocking rule: a pair is compared only when the last names are
# within an edit distance of 3 and the birth years within an edit distance of 1.
brs = [
    'levenshtein(l.last_name, r.last_name) <= 3 and levenshtein(l.birth_year, r.birth_year) <= 1'
]

# Names are compared with a Levenshtein threshold of 2; the remaining
# attributes are compared by exact match.
settings = dict(
    link_type="dedupe_only",
    blocking_rules_to_generate_predictions=brs,
    comparisons=[
        *(levenshtein_at_thresholds(column, 2) for column in ("first_name", "last_name")),
        *(exact_match(column) for column in ("gender", "birth_year", "death_year")),
    ],
)

linker = DuckDBLinker(persons_sample, settings)

# Last expression of the cell: renders the cumulative-comparisons chart for
# the blocking rules configured above.
linker.cumulative_num_comparisons_from_blocking_rules_chart()

### Training

In [4]:
# Blocking rules used for the EM training passes. The training log shows that
# a comparison used in a pass's blocking rule cannot be estimated in that pass,
# so the three rules block on different column combinations and every
# comparison is left free in at least one of them.
brs = [
    'levenshtein(l.last_name, r.last_name) <= 3 and l.birth_year = r.birth_year',
    'levenshtein(l.first_name, r.first_name) <= 3 and levenshtein(l.last_name, r.last_name) <= 3 and l.gender = r.gender',
    'l.first_name = r.first_name and l.gender = r.gender and l.birth_year = r.birth_year and l.death_year = r.death_year',
]

# Step 1: u probabilities, estimated from random sampling.
linker.estimate_u_using_random_sampling(target_rows=1e6)

# Step 2: m probabilities, one expectation-maximisation session per rule.
for br in brs:
    linker.estimate_parameters_using_expectation_maximisation(br)

----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - first_name (no m values are trained).
    - last_name (no m values are trained).
    - gender (no m values are trained).
    - birth_year (no m values are trained).
    - death_year (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
levenshtein(l.last_name, r.last_name) <= 3 and l.birth_year = r.birth_year

Parameter estimates will be made for the following comparison(s):
    - first_name
    - gender
    - death_year

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - last_name
    - birth_year

Iteration 1: Largest change in params was -0.88 in the m_probability of first_name, level `Exact match`
Iteration 2: Largest change in params was -0.086 in the m_probability of

In [5]:
# Render splink's match-weights chart for the linker trained above, to inspect
# the result of the training cell.
linker.match_weights_chart()

In [6]:
# Render splink's chart of the m and u parameters currently held by the linker.
linker.m_u_parameters_chart()

In [45]:
# Score every pair admitted by the prediction blocking rule. No probability
# cut-off is applied at this stage (a previous variant passed
# `threshold_match_probability=0.9` to `predict`); filtering happens later.
results = linker.predict()

# Materialise as a DataFrame, most probable matches first.
results_df = (
    results
    .as_pandas_dataframe()
    .sort_values(by='match_probability', ascending=False)
)
print('Result number:', len(results_df))

Result number: 1004748


In [46]:
# Pairs whose match probability is not strictly above this value are discarded
# before export (value set manually).
SCORE_THRESHOLD = 0.4

# Columns kept in the human-readable export, in output order.
READABLE_COLUMNS = [
    'match_probability', 'pk_l', 'pk_r',
    'first_name_l', 'first_name_r',
    'last_name_l', 'last_name_r',
    'gender_l', 'gender_r',
    'birth_year_l', 'birth_year_r',
    'death_year_l', 'death_year_r',
]

# Translate splink's record ids back to BHP primary keys for both sides of each
# pair. The result is stored under a new name so that `results_df` is left
# untouched and this cell can be re-run: previously the merged frame overwrote
# `results_df` and dropped the `unique_id_*` columns, so a second execution
# failed with a KeyError.
pk_lookup = persons[['unique_id', 'pk']]
results_with_pk = (
    results_df
    .merge(
        pk_lookup.rename(columns={'unique_id': 'unique_id_l', 'pk': 'pk_l'}),
        on='unique_id_l', how='left',
        validate='many_to_one',  # each unique_id must map to exactly one pk
    )
    .merge(
        pk_lookup.rename(columns={'unique_id': 'unique_id_r', 'pk': 'pk_r'}),
        on='unique_id_r', how='left',
        validate='many_to_one',
    )
    .drop(columns=['unique_id_l', 'unique_id_r'])
)

# Only `match_probability` needs renaming: the former 'unique_id_l' /
# 'unique_id_r' rename entries referred to columns that are never part of this
# selection and were silent no-ops.
readable = (
    results_with_pk[READABLE_COLUMNS]
    .rename(columns={'match_probability': 'score'})
    .copy()
)

# Project helper, called without capturing a return value.
# NOTE(review): presumably casts the year columns in place — confirm in `toolkit`.
tk.set_types(readable, {
    'birth_year_l': 'int',
    'birth_year_r': 'int',
    'death_year_l': 'int',
    'death_year_r': 'int',
})

# Cleaning: several records can share one pk, so keep a single row per
# (pk_l, pk_r) pair — the first one, i.e. the best-scored if `results_df` is
# still sorted by descending probability — and drop pairs linking a pk to itself.
# NOTE(review): (A, B) and (B, A) are treated as different pairs here; confirm
# whether mirrored pairs can occur and should be collapsed as well.
readable = (
    readable
    .drop_duplicates(subset=['pk_l', 'pk_r'])
    .loc[lambda pairs: pairs['pk_l'] != pairs['pk_r']]
    .reset_index(drop=True)
)
print(readable.shape)  # number of distinct pairs before thresholding

readable = readable[readable['score'] > SCORE_THRESHOLD]

# quoting=2 is csv.QUOTE_NONNUMERIC: every non-numeric field is quoted.
readable.to_csv('../../data/bhp_entity_recognition.csv', sep=";", index=False, quoting=2)
tk.infos(readable)

(951141, 13)
Shape:  (3174, 13)


Unnamed: 0,score,pk_l,pk_r,first_name_l,first_name_r,last_name_l,last_name_r,gender_l,gender_r,birth_year_l,birth_year_r,death_year_l,death_year_r
0,0.999036,60733,62293,h.,h.,bazinet,ravinet,Male,Male,1825,1825,1901,1901
1,0.999036,38630,38643,polycarp,polycarp,heylandt,heiland,Male,Male,1651,1651,1702,1702
2,0.999036,34434,34402,henri,henri,perrot,perrenot,Male,Male,1857,1857,1925,1925
3,0.999036,11580,12944,pierre,pierre,dinoux,dimoux,Male,Male,1750,1750,1825,1825
4,0.999036,43475,46113,elisabeth,elisabeth,baillon,baillou,Female,Female,1613,1613,1677,1677
