In [46]:
import pandas as pd

import toolkit as tk

from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.duckdb.duckdb_comparison_library import (
    exact_match,
    levenshtein_at_thresholds,
    
)

# Persons Record linkage

### Get data

In [47]:
persons = pd.read_csv('../../data/persons-bhp-geov.csv', sep=';')
tk.set_types(persons, {'name':'string', 'gender':'string', 'birth_year':'int', 'death_year':'int'})
persons['unique_id'] = persons['dataset'] + persons['unique_id'].astype(str)
tk.infos(persons)

Shape:  (210172, 7)


Unnamed: 0,unique_id,id,name,gender,birth_year,death_year,dataset
0,bhp0,44895,antoine sainte-marie perrin,Male,1870.0,1930,bhp
1,bhp1,47015,,Male,1506.0,1545,bhp
2,bhp2,47190,alberto duimio,Male,1510.0,1564,bhp
3,bhp3,47190,albertus divini,Male,1510.0,1564,bhp
4,bhp4,47578,angelo zampa,Male,,1575,bhp


### Settings & Verifications

In [48]:
persons_sample = persons#.sample(150000, random_state=42)

br1 = 'l.name = r.name and l.birth_year = r.birth_year'
br2 = 'l.name = r.name and l.death_year = r.death_year'
br3 = 'l.birth_year = r.birth_year and l.death_year = r.death_year and l.gender = r.gender'

settings = {
    "link_type": "dedupe_only",
    "blocking_rules_to_generate_predictions": [br1, br2, br3],
    "comparisons": [
        levenshtein_at_thresholds("name", 1),
        exact_match("gender"),
        exact_match("birth_year"),
        exact_match("death_year"),
    ],
}

# linker = DuckDBLinker(persons_sample, settings)
linker = DuckDBLinker(persons_sample, settings)

linker.cumulative_num_comparisons_from_blocking_rules_chart()

### Training

In [53]:
blocking_rule_for_training = br1 #"l.name = r.name and l.birth_year = r.birth_year"
linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training)

blocking_rule_for_training = br2 #"l.name = r.name and l.death_year = r.death_year"
linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training)

blocking_rule_for_training = br3 #"l.birth_year = r.birth_year"
linker.estimate_parameters_using_expectation_maximisation(blocking_rule_for_training)

linker.estimate_u_using_random_sampling(target_rows=1e6)


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.name = r.name and l.birth_year = r.birth_year

Parameter estimates will be made for the following comparison(s):
    - gender
    - death_year

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - name
    - birth_year

Iteration 1: Largest change in params was 0.176 in the m_probability of death_year, level `Exact match`
Iteration 2: Largest change in params was 0.00251 in probability_two_random_records_match
Iteration 3: Largest change in params was 0.00412 in probability_two_random_records_match
Iteration 4: Largest change in params was 0.00669 in probability_two_random_records_match
Iteration 5: Largest change in params was 0.0107 in probability_two_random_records_match
Iteration 6: Largest change in params was 0.017 in probability_two_random_records_match
Iteration 7: Largest change in params was 0.0263 in probabil

In [54]:
linker.match_weights_chart()

In [55]:
linker.m_u_parameters_chart()

In [56]:
linker.predict().as_pandas_dataframe().sort_values('match_probability', ascending=False).head(50)#[0:1000].sample(10)

Unnamed: 0,match_weight,match_probability,unique_id_l,unique_id_r,name_l,name_r,gamma_name,gender_l,gender_r,gamma_gender,birth_year_l,birth_year_r,gamma_birth_year,death_year_l,death_year_r,gamma_death_year,match_key
245947,7.717797,0.995272,bhp36032,bhp36034,polycarp wilhelm bucholtz,polycarp wilhelm buchholtz,1,Male,Male,1,1649.0,1649.0,1,1663.0,1663.0,1,2
244788,7.717797,0.995272,bhp36203,bhp36235,polycarp heiland,polycarp heilandt,1,Male,Male,1,1651.0,1651.0,1,1702.0,1702.0,1,2
518475,7.717797,0.995272,bhp56048,bhp56049,jean hermann widerhold,jean hermann widerholdt,1,Male,Male,1,1635.0,1635.0,1,1683.0,1683.0,1,2
289497,7.717797,0.995272,bhp39931,bhp39932,maria burckhart,maria burckhard,1,Female,Female,1,1631.0,1631.0,1,1650.0,1650.0,1,2
289501,7.717797,0.995272,bhp39937,bhp39938,maria starck,maria starcke,1,Female,Female,1,1582.0,1582.0,1,1623.0,1623.0,1,2
344196,7.717797,0.995272,bhp51564,bhp51565,giovanni benedetto zuanelli,giovanni benedetto zuannelli,1,Male,Male,1,1669.0,1669.0,1,1738.0,1738.0,1,2
344195,7.717797,0.995272,bhp51559,bhp51560,francesco stancari,francesco stancaro,1,Male,Male,1,1501.0,1501.0,1,1574.0,1574.0,1,2
518487,7.717797,0.995272,bhp56087,bhp56088,etienne thouvenot,etienne touvenot,1,Male,Male,1,1624.0,1624.0,1,1695.0,1695.0,1,2
344194,7.717797,0.995272,bhp51555,bhp51556,louise gaulard,louise goulard,1,Female,Female,1,1560.0,1560.0,1,1596.0,1596.0,1,2
344186,7.717797,0.995272,bhp51494,bhp51496,bonifazio carretto,bonifazio caretto,1,Male,Male,1,1260.0,1260.0,1,1306.0,1306.0,1,2
