# Recognize already existing persons in Geovistory from the BHP

In [1]:
import csv

import pandas as pd
import splink.duckdb.duckdb_comparison_library as cl
from splink.charts import waterfall_chart
from splink.duckdb.duckdb_linker import DuckDBLinker

import toolkit as tk

## Load data

Here we load Geovistory's data on one side and the BHP's data on the other; the goal is to identify which persons from the BHP already exist in Geovistory.

In [2]:
# Read the Geovistory persons export (semicolon-separated CSV) and let the
# project toolkit set the column types.
persons_geov = pd.read_csv("../../data/persons-geov.csv", sep=";")
tk.set_types(
    persons_geov,
    {
        "name": "string",
        "gender": "string",
        "birth_year": "int",
        "death_year": "int",
    },
)
# Turn the row position into an explicit `index` column; the Splink settings
# of this notebook use `index` as the unique id of each record.
persons_geov = persons_geov.reset_index()

print("Geovistory's persons:")
tk.infos(persons_geov)

Geovistory's persons:
Shape:  (147644, 7)


Unnamed: 0,index,pk,name,gender,birth_year,death_year,dataset
0,0,312220,bernoulli geb. baer elisabeth,,1796,,geov
1,1,312221,weiss ursula,,1800,,geov
2,2,312222,fenner hermann robert,,1859,,geov
3,3,312223,middleton sophia,,1813,,geov
4,4,312224,weil anna,,1836,,geov


In [4]:
# Read the BHP persons export (semicolon-separated CSV) and let the project
# toolkit set the column types.
persons_bhp = pd.read_csv("../../data/persons-bhp.csv", sep=";")
tk.set_types(
    persons_bhp,
    {
        "name": "string",
        "gender": "string",
        "birth_year": "int",
        "death_year": "int",
    },
)
# Turn the row position into an explicit `index` column; the Splink settings
# of this notebook use `index` as the unique id of each record.
persons_bhp = persons_bhp.reset_index()

print("BHP's persons:")
# `first_name` / `last_name` are hidden from the preview only; they stay in
# `persons_bhp`.
preview_bhp = persons_bhp.drop(columns=["first_name", "last_name"])
tk.infos(preview_bhp)

BHP's persons:
Shape:  (62528, 7)


Unnamed: 0,index,pk,name,gender,birth_year,death_year,dataset
0,0,44895,antoine sainte-marie perrin,Male,1870.0,1930,bhp
1,1,47015,,Male,1506.0,1545,bhp
2,2,47190,alberto duimio,Male,1510.0,1564,bhp
3,3,47190,albertus divini,Male,1510.0,1564,bhp
4,4,47578,angelo zampa,Male,,1575,bhp


## Settings

### Prediction generation blocking rules

Here are the blocking rules used to generate the pairwise comparisons; in other words, any pair of records that does not satisfy at least one of these rules will not be considered in the final predictions.

This avoids having to perform every possible comparison, which would be far too heavy (it would take forever) for a computer. In our case that would be the length of the BHP data times the length of the Geovistory data times the number of columns, i.e. 62k × 147k × 4 ≈ 37 billion comparisons.


In [4]:
# SQL predicates applied to each candidate pair (`l` = left record,
# `r` = right record). A pair is only scored if it satisfies at least one of
# them: same birth (or death) year AND names at most 3 edits apart.
blocking_rules_predictions = [
    "l.birth_year = r.birth_year and levenshtein(l.name, r.name) <= 3",
    "l.death_year = r.death_year and levenshtein(l.name, r.name) <= 3",
]

# One line per rule, for the notebook output.
print("\n".join(f"Blocking rule: <{rule}>" for rule in blocking_rules_predictions))

Blocking rule: <l.birth_year = r.birth_year and levenshtein(l.name, r.name) <= 3>
Blocking rule: <l.death_year = r.death_year and levenshtein(l.name, r.name) <= 3>


### Final comparisons

We also set multiple comparison rules, which describe how the comparison will be executed on 2 records.

In [5]:
comparisons = [
    # Names: exact match, else Levenshtein distance <= 3 (tolerates another
    # spelling, e.g. a phonetic variant), else anything else.
    # NOTE: an additional, stricter level
    # `cl.levenshtein_at_thresholds("name", 1)` was considered so that a
    # single typo ranks closer to a match than a different spelling; it is
    # currently disabled.
    cl.levenshtein_at_thresholds("name", 3),
    # Gender comes from a controlled vocabulary, so exact equality is enough.
    cl.exact_match("gender"),
    # Years are assumed to be free of typos, hence exact matching only.
    cl.exact_match("birth_year"),
    cl.exact_match("death_year"),
]

# Show the comparison levels Splink generated for each column.
for comparison in comparisons:
    print(comparison)

<Comparison Exact match vs. levenshtein at threshold 3 vs. anything else with 3 levels at 0x7f9115025b20>
<Comparison Exact match vs. anything else with 2 levels at 0x7f9115025cd0>
<Comparison Exact match vs. anything else with 2 levels at 0x7f9115025fd0>
<Comparison Exact match vs. anything else with 2 levels at 0x7f9115025f10>


In [6]:
settings = {
    # Link records between the two input dataframes (one of them may already
    # contain some of the persons of the other).
    "link_type": "link_only",
    # Name of the column that uniquely identifies a row in each dataframe.
    "unique_id_column_name": "index",
    # Which record pairs get scored at prediction time.
    "blocking_rules_to_generate_predictions": blocking_rules_predictions,
    # How two records of a pair are compared, column by column.
    "comparisons": comparisons,
    # Both `retain_*` flags are kept on so that waterfall charts can be drawn.
    "retain_matching_columns": True,
    "retain_intermediate_calculation_columns": True,
}

## The Model

### Creating the model

The next chart displays how many final comparisons the model will have to predict. This total is an upper bound on the length of the final prediction table.

In [7]:
# Geovistory first, BHP second; the aliases below give the tables custom
# names in the comparison table.
input_tables = [persons_geov, persons_bhp]
linker = DuckDBLinker(input_tables, settings, input_table_aliases=["geov", "bhp"])

# Kept as the last expression of the cell so that the notebook renders the chart.
linker.cumulative_num_comparisons_from_blocking_rules_chart()

### Training blocking rule

In order to train our model, we need to estimate the $m$ and $u$ parameters of the Fellegi-Sunter model, associated with an EM (expectation-maximisation) algorithm (more on that [here](https://www.robinlinacre.com/maths_of_fellegi_sunter/) and [here](https://www.robinlinacre.com/em_intuition/)).

The $u$ parameter will be estimated using random sampling. This is valid (as explained [here](https://moj-analytical-services.github.io/splink/linker.html#splink.linker.Linker.estimate_u_using_random_sampling)) because there is a very low probability for 2 randomly picked records to be the same person. We just have to make sure that the sample taken is large enough to correctly train this parameter.

For the $m$ parameter, to get a powerful model, we cannot make such an assumption; we need to train the model (statistically) on the data. As before, since we cannot use the full data, we need to filter the comparisons so that training is manageable in a reasonable time. The next cell shows the rules we chose.

How should those rules be interpreted? For every column (present in the comparison rules above) that does not appear in a given training blocking rule, the $m$ parameter is estimated on the pairwise comparisons that satisfy that rule. In other words, if the `gender` column does not appear in the rule, the $m$ parameter for the `gender` column will be trained (calculated) on all the pairwise comparisons remaining after being filtered by the rule.

This also implies that every column MUST be absent from at least one rule; otherwise its $m$ parameter can never be trained.

The training blocking rules:

In [8]:
# Blocking rules used for EM training, one training session per rule. Each
# compared column is left out of at least one rule (name is absent from the
# last one, birth_year from the second, death_year from the first, gender
# from all of them).
blocking_rules_training = [
    "l.birth_year = r.birth_year and levenshtein(l.name, r.name) <= 3",
    "l.death_year = r.death_year and levenshtein(l.name, r.name) <= 3",
    "l.birth_year = r.birth_year and l.death_year = r.death_year",
]

# One line per rule, for the notebook output.
print("\n".join(f"Blocking rule: <{rule}>" for rule in blocking_rules_training))

Blocking rule: <l.birth_year = r.birth_year and levenshtein(l.name, r.name) <= 3>
Blocking rule: <l.death_year = r.death_year and levenshtein(l.name, r.name) <= 3>
Blocking rule: <l.birth_year = r.birth_year and l.death_year = r.death_year>


### Training phase

In [9]:
# Step 1: estimate the u probabilities from randomly sampled record pairs.
# `z` only receives the return values; it is not read again in this part of
# the notebook.
z = linker.estimate_u_using_random_sampling(target_rows=2e7)

# Step 2: run one EM training session per training blocking rule to estimate
# the m probabilities.
for training_rule in blocking_rules_training:
    z = linker.estimate_parameters_using_expectation_maximisation(training_rule)

----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - name (no m values are trained).
    - gender (no m values are trained).
    - birth_year (no m values are trained).
    - death_year (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.birth_year = r.birth_year and levenshtein(l.name, r.name) <= 3

Parameter estimates will be made for the following comparison(s):
    - gender
    - death_year

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - name
    - birth_year

Iteration 1: Largest change in params was -0.2 in the m_probability of gender, level `Exact match`
Iteration 2: Largest change in params was 1.77e-06 in the m_probability of gender, level `All other comparisons`

EM converged after 2 iterations

Your model 

## Predictions

### What has been learned?

Now that our model is trained, let's first look at what the model learned about our data:

In [10]:
# Draw the match-weights chart of the linker trained in the cells above
# (last expression of the cell, so the notebook renders it).
linker.match_weights_chart()

Here we can observe how a response level (on the left) influences the matching probability.

In particular, we observe that the gender has a very low influence on the result, in contrast to the name.

### Persons identified

The next table is an extract of the 50 most probable matches.
Each line represents a pairwise comparison. We can see the probability (computed by the trained model), with the corresponding columns of both datasets placed side by side to be more human readable. A copy is available as a CSV table in the `data` folder.

In [11]:
# Score every candidate pair kept by the prediction blocking rules.
# To keep only confident matches instead, call
# `linker.predict(threshold_match_probability=0.9)`.
results = linker.predict()
results_df = results.as_pandas_dataframe().sort_values(by='match_probability', ascending=False)
print('Result number:', len(results_df))

# Splink output column -> human-readable column. This single mapping drives
# both the column selection and the renaming, so the two cannot drift apart.
# NOTE(review): the `_l` side is labelled BHP and the `_r` side Geovistory;
# this relies on how Splink assigns left/right for the two input tables;
# confirm it still holds if the input tables or their aliases change.
column_labels = {
    'match_probability': 'proba',
    'index_l': 'index_bhp',
    'index_r': 'index_geov',
    'name_l': 'bhp_name',
    'name_r': 'geov_name',
    'gender_l': 'bhp_gender',
    'gender_r': 'geov_gender',
    'birth_year_l': 'bhp_birth_year',
    'birth_year_r': 'geov_birth_year',
    'death_year_l': 'bhp_death_year',
    'death_year_r': 'geov_death_year',
}
# Selecting with a list returns a new frame, and `rename` returns another one,
# so `results_df` itself is left untouched.
readable = results_df[list(column_labels)].rename(columns=column_labels)

# Year columns are handed to the toolkit to be typed as 'int'.
year_columns = ['bhp_birth_year', 'geov_birth_year', 'bhp_death_year', 'geov_death_year']
tk.set_types(readable, dict.fromkeys(year_columns, 'int'))

# Format the probabilities for display; the ordering was fixed by the sort above.
readable['proba'] = readable['proba'].map(tk.percent)

Result number: 1127


In [14]:
# Attach the primary keys (`pk`) of the BHP and Geovistory persons to each
# pair, using the `index` column that served as unique id for Splink.
bhp_keys = persons_bhp[['index', 'pk']].rename(columns={'index': 'index_bhp', 'pk': 'pk_bhp'})
geov_keys = persons_geov[['index', 'pk']].rename(columns={'index': 'index_geov', 'pk': 'pk_geov'})
readable = readable.merge(bhp_keys, on='index_bhp', how='left')
readable = readable.merge(geov_keys, on='index_geov', how='left')

# Keep the primary keys instead of the technical indexes in the final table.
readable = readable[['proba', 'pk_bhp', 'pk_geov', 'bhp_name', 'geov_name', 'bhp_gender', 'geov_gender', 'bhp_birth_year', 'geov_birth_year', 'bhp_death_year', 'geov_death_year']]

# Several rows can share the same (pk_bhp, pk_geov) pair, since one pk can
# appear on several input rows. Rows are still ordered by descending
# probability, so keeping the first occurrence keeps the most probable one.
readable = readable.drop_duplicates(subset=['pk_bhp', 'pk_geov'])

# QUOTE_NONNUMERIC (the constant behind the former magic number 2) quotes
# every non-numeric field in the exported CSV.
readable.to_csv('../../data/bhp_geov_entity_recognition.csv', sep=";", index=False, quoting=csv.QUOTE_NONNUMERIC)
tk.infos(readable)

Shape:  (1079, 11)


Unnamed: 0,proba,pk_bhp,pk_geov,bhp_name,geov_name,bhp_gender,geov_gender,bhp_birth_year,geov_birth_year,bhp_death_year,geov_death_year
0,100.00%,59665,1645075,gabriele condulmer,gabriele condulmer,Male,Male,1383,1383,1447,1447
1,100.00%,47734,149826,marcel abraham,marcel abraham,Male,Male,1898,1898,1955,1955
2,100.00%,200,869757,johann caspar lavater,johann caspar lavater,,,1741,1741,1801,1801
3,100.00%,51658,899617,johann heinrich tieftrunk,johann heinrich tieftrunk,Male,,1759,1759,1837,1837
4,100.00%,51666,869527,ulrich hegner,ulrich hegner,Male,,1759,1759,1840,1840


### Details

The next chart gives us details about a particular pairwise comparison. The example can be changed with the bottom slider. More information is available when hovering over the different elements.

This chart helps us understand why the model answered the provided response.

For convenience, only the first 1000 comparisons are available through this chart.

In [15]:
# `results_df` was sorted by descending match probability when it was built,
# so the first 1000 rows are the 1000 most probable pairs.
top_pairs = results_df.head(1000)
records_to_plot = top_pairs.to_dict(orient="records")

# Last expression of the cell, so the notebook renders the chart.
linker.waterfall_chart(records_to_plot, filter_nulls=False)