In [35]:
# Notebook parameters.
# Relative path of the CSV file holding the BHP persons.
bhp_persons_path = "../../data/persons-bhp.csv"
# NOTE(review): not referenced by any cell visible in this notebook - confirm it is still needed.
trigram_score_threshold = 0.9

In [36]:
import math
import pandas as pd
import duckdb
import geovpylib.utils as u
import geovpylib.analysis as a

# NOTE(review): `eta` is not referenced by any other cell visible here - presumably a
# timing/ETA helper from geovpylib; confirm it is needed before removing it.
eta = u.Eta()

# BHP actors: person record linkage

## Get data

In [37]:
# Load the BHP persons table from the configured path.
persons = u.read_df(bhp_persons_path)

# Ask the geovpylib helper to treat the year and certainty columns as integers
# (its return value is unused, so it presumably updates `persons` in place - TODO confirm).
a.set_types(
    persons,
    dict(
        birth_year='int',
        death_year='int',
        certainty_birth='int',
        certainty_death='int',
    ),
)

# Print a quick overview of the loaded table.
u.infos(persons)

Shape:  (62527, 10)


Unnamed: 0,pk,name,first_name,last_name,gender,birth_year,certainty_birth,death_year,certainty_death,dataset
0,44895.0,antoine sainte-marie perrin,antoine,sainte-marie perrin,Male,1870.0,3,1930,3,bhp
1,47015.0,,,,Male,1506.0,1,1545,1,bhp
2,47190.0,alberto duimio,alberto,duimio,Male,1510.0,1,1564,1,bhp
3,47190.0,albertus divini,albertus,divini,Male,1510.0,1,1564,1,bhp
4,47578.0,angelo zampa,angelo,zampa,Male,,0,1575,0,bhp


## Find duplicates

In [38]:
def find_dupplicated(df, additional_columns, criterion):
    """Append newly found candidate duplicate pairs to ``df``.

    The ``persons`` table is self-joined on ``p1.pk != p2.pk and {criterion}``
    (DuckDB is expected to resolve ``persons`` to the DataFrame of that name
    loaded earlier in the notebook). The resulting pairs are appended after the
    rows of ``df`` and only the first occurrence of every unordered
    ``(pk_l, pk_r)`` pair is kept. Consequently:

    * pairs already present in ``df`` keep their existing extra columns;
    * the mirrored row ``(b, a)`` of a pair ``(a, b)`` is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Pairs collected so far; pass an empty DataFrame on the first call.
    additional_columns : str
        SQL select-list fragment added after the pair columns,
        e.g. ``"1 as level, 0 as levenshtein"``.
    criterion : str
        SQL boolean expression over the aliases ``p1`` and ``p2``.
        Both fragments are interpolated verbatim into the query, so they must
        come from trusted notebook code only.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by the new pairs, de-duplicated, with a fresh
        0..n-1 index. ``df`` itself is not modified.
    """

    new_pairs = duckdb.query(f"""
        select 
            p1.pk as pk_l,
            p2.pk as pk_r,
            p1.name as name_l,
            p2.name as name_r,
            p1.gender as gender_l,
            p2.gender as gender_r,
            p1.birth_year as birth_year_l,
            p2.birth_year as birth_year_r,
            p1.death_year as death_year_l,
            p2.death_year as death_year_r,
            {additional_columns}
        from persons p1 inner join persons p2 on p1.pk != p2.pk and {criterion}
    """).to_df()

    # Existing pairs go first so that they win over re-discovered ones below.
    found = pd.concat([df, new_pairs])

    a.set_types(found, {'pk_l': 'int', 'pk_r': 'int'})

    # Order-independent pair key (smallest pk, largest pk), computed column-wise
    # instead of building a string per row with iterrows().
    pair_ids = found[['pk_l', 'pk_r']]
    pair_key = pd.DataFrame({
        'low': pair_ids.min(axis=1).to_numpy(),
        'high': pair_ids.max(axis=1).to_numpy(),
    })

    # Positional mask: the concatenated index may contain repeated labels,
    # so avoid any label-based alignment when filtering.
    is_repeat = pair_key.duplicated(keep='first').to_numpy()

    return found.loc[~is_repeat].reset_index(drop=True)


### 1/ Gender, birth year and death year match exactly

#### Name Levenshtein distance = 0

In [39]:
# Level 1, distance 0: same name, gender, birth year and death year.
# No pair has been collected yet, hence the empty starting frame.
matches = find_dupplicated(
    df=pd.DataFrame(),
    additional_columns="1 as level, 0 as levenshtein",
    criterion="p1.name = p2.name and p1.gender = p2.gender and p1.birth_year = p2.birth_year and p1.death_year = p2.death_year",
)

# Overview of the pairs labelled by this step.
u.infos(matches.loc[matches['level'].eq(1) & matches['levenshtein'].eq(0)])

Shape:  (27, 12)


Unnamed: 0,pk_l,pk_r,name_l,name_r,gender_l,gender_r,birth_year_l,birth_year_r,death_year_l,death_year_r,level,levenshtein
0,15155,15156,jean juvenel,jean juvenel,Male,Male,1643,1643,1675,1675,1,0
1,20092,20093,? delort,? delort,Male,Male,1641,1641,1641,1641,1,0
2,2886,49782,pietro duodo,pietro duodo,Male,Male,1554,1554,1610,1610,1,0
3,13313,22099,jean-jacques dobler,jean-jacques dobler,Male,Male,1749,1749,1838,1838,1,0
4,51744,51736,francesco roncalli parolino,francesco roncalli parolino,Male,Male,1692,1692,1763,1763,1,0


#### Name Levenshtein distance = 1

In [40]:
# Level 1, name distance up to 1. Pairs that were already collected are found
# again by the "<=" test but keep their earlier labels, so only new pairs get
# the (level 1, levenshtein 1) label.
matches = find_dupplicated(
    df=matches,
    additional_columns="1 as level, 1 as levenshtein",
    criterion="p1.gender = p2.gender and p1.birth_year = p2.birth_year and p1.death_year = p2.death_year and levenshtein(p1.name, p2.name) <= 1",
)

# Overview of the pairs labelled by this step.
u.infos(matches.loc[matches['level'].eq(1) & matches['levenshtein'].eq(1)])

Shape:  (24, 12)


Unnamed: 0,pk_l,pk_r,name_l,name_r,gender_l,gender_r,birth_year_l,birth_year_r,death_year_l,death_year_r,level,levenshtein
27,47825,50830,louis nicolas neuville de villeroy,louis nicolas neufville de villeroy,Male,Male,1663,1663,1734,1734,1,1
28,1452,57598,jean-paul la roque,jean-paul la roque,Male,Male,1630,1630,1691,1691,1,1
29,50150,820,guglielmo grataroli,guglielmo gratarolo,Male,Male,1516,1516,1568,1568,1,1
30,55242,55277,felix prudho,felix prudlo,Male,Male,1794,1794,1837,1837,1,1
31,56121,38579,konrad victor schneider,conrad victor schneider,Male,Male,1614,1614,1680,1680,1,1


#### Name Levenshtein distance = 2

In [41]:
# Level 1, name distance up to 2. Already collected pairs keep their earlier
# labels; only new pairs get the (level 1, levenshtein 2) label.
matches = find_dupplicated(
    df=matches,
    additional_columns="1 as level, 2 as levenshtein",
    criterion="p1.gender = p2.gender and p1.birth_year = p2.birth_year and p1.death_year = p2.death_year and levenshtein(p1.name, p2.name) <= 2",
)

# Overview of the pairs labelled by this step.
u.infos(matches.loc[matches['level'].eq(1) & matches['levenshtein'].eq(2)])

Shape:  (24, 12)


Unnamed: 0,pk_l,pk_r,name_l,name_r,gender_l,gender_r,birth_year_l,birth_year_r,death_year_l,death_year_r,level,levenshtein
51,51896,60,ismael bouilliau,ismal boulliau,Male,Male,1605,1605,1694,1694,1,2
52,243,47426,michele mazarino,michele mazzarini,Male,Male,1605,1605,1648,1648,1,2
53,54053,54073,adrien marie le jendre,adrien marie legendre,Male,Male,1752,1752,1833,1833,1,2
54,56974,47395,jos fernandez del toro,jos fernandez del toro,Male,Male,1663,1663,1733,1733,1,2
55,56975,47396,juan antonio llorente,juan antonio llorente,Male,Male,1750,1750,1823,1823,1,2


#### Name Levenshtein distance = 3

In [42]:
# Level 1, name distance up to 3. Already collected pairs keep their earlier
# labels; only new pairs get the (level 1, levenshtein 3) label.
matches = find_dupplicated(
    df=matches,
    additional_columns="1 as level, 3 as levenshtein",
    criterion="p1.gender = p2.gender and p1.birth_year = p2.birth_year and p1.death_year = p2.death_year and levenshtein(p1.name, p2.name) <= 3",
)

# Overview of the pairs labelled by this step.
u.infos(matches.loc[matches['level'].eq(1) & matches['levenshtein'].eq(3)])

Shape:  (70, 12)


Unnamed: 0,pk_l,pk_r,name_l,name_r,gender_l,gender_r,birth_year_l,birth_year_r,death_year_l,death_year_r,level,levenshtein
75,16776,17231,? socin,? kocha,Male,Male,1718,1718,1718,1718,1,3
76,17735,17775,pierre moine,pierre marie,Male,Male,1632,1632,1632,1632,1,3
77,19984,19987,charles merlan,charles martin,Male,Male,1635,1635,1636,1636,1,3
78,20023,20044,? pelas,? meilan,Male,Male,1638,1638,1638,1638,1,3
79,19985,19989,? blachon,? perachon,Male,Male,1636,1636,1636,1636,1,3


### 2/ Gender and birth year match exactly

#### Name Levenshtein distance = 0

In [43]:
# Level 2, distance 0: same name, gender and birth year (death year ignored).
# A Levenshtein distance of 0 means the names are identical, so plain equality
# is used, as in the level-1 exact-match step; this selects the same pairs
# without computing an edit distance for every candidate pair.
# Pairs already collected at level 1 keep their earlier labels.
matches = find_dupplicated(
    matches, 
    "2 as level, 0 as levenshtein", 
    "p1.gender = p2.gender and p1.birth_year = p2.birth_year and p1.name = p2.name"
)

# Overview of the pairs labelled by this step.
u.infos(matches[(matches['level'] == 2) & (matches['levenshtein'] == 0)])

Shape:  (111, 12)


Unnamed: 0,pk_l,pk_r,name_l,name_r,gender_l,gender_r,birth_year_l,birth_year_r,death_year_l,death_year_r,level,levenshtein
145,14325,14326,susanne crommelin,susanne crommelin,Female,Female,1676,1676,,1676.0,2,0
146,14695,14700,jean rodolphe frey,jean rodolphe frey,Male,Male,1662,1662,,,2,0
147,14023,14024,jacques brutel,jacques brutel,Male,Male,1666,1666,,,2,0
148,14056,14057,isaac burel,isaac burel,Male,Male,1671,1671,1677.0,,2,0
149,14021,14022,gdon brutel,gdon brutel,Male,Male,1665,1665,1668.0,,2,0


#### Name Levenshtein distance = 1

In [44]:
# Level 2 (gender and birth year equal), name distance up to 1.
# Already collected pairs keep their earlier labels; only new pairs get the
# (level 2, levenshtein 1) label.
matches = find_dupplicated(
    df=matches,
    additional_columns="2 as level, 1 as levenshtein",
    criterion="p1.gender = p2.gender and p1.birth_year = p2.birth_year and levenshtein(p1.name, p2.name) <= 1",
)

# Overview of the pairs labelled by this step.
u.infos(matches.loc[matches['level'].eq(2) & matches['levenshtein'].eq(1)])

Shape:  (80, 12)


Unnamed: 0,pk_l,pk_r,name_l,name_r,gender_l,gender_r,birth_year_l,birth_year_r,death_year_l,death_year_r,level,levenshtein
256,13850,14052,marie bernaud,marie beraud,Female,Female,1646,1646,,,2,1
257,13828,15045,susanne berard,susanne beraud,Female,Female,1671,1671,,,2,1
258,13760,13852,franoise barnaud,franoise bernaud,Female,Female,1680,1680,,1681.0,2,1
259,8601,6477,joseph jaillet,joseph paillet,Male,Male,1884,1884,,,2,1
260,15417,15420,pierre maille,pierre maillet,Male,Male,1680,1680,,,2,1


#### Name Levenshtein distance = 2

In [45]:
# Level 2 (gender and birth year equal), name distance up to 2.
# Already collected pairs keep their earlier labels; only new pairs get the
# (level 2, levenshtein 2) label.
matches = find_dupplicated(
    df=matches,
    additional_columns="2 as level, 2 as levenshtein",
    criterion="p1.gender = p2.gender and p1.birth_year = p2.birth_year and levenshtein(p1.name, p2.name) <= 2",
)

# Overview of the pairs labelled by this step.
u.infos(matches.loc[matches['level'].eq(2) & matches['levenshtein'].eq(2)])

Shape:  (244, 12)


Unnamed: 0,pk_l,pk_r,name_l,name_r,gender_l,gender_r,birth_year_l,birth_year_r,death_year_l,death_year_r,level,levenshtein
336,13978,57934,salomon breuiller,salomon brevillet,Male,Male,1647,1647,,,2,2
337,25200,24595,camille garassu,camille garasson,Female,Female,1839,1839,,,2,2
338,25202,24754,adle thom,adle thomet,Female,Female,1820,1820,,,2,2
339,13766,49828,marie baron,marie bathon,Female,Female,1645,1645,,,2,2
340,15046,30848,marie hoxe,marie roe,Female,Female,1634,1634,1672.0,1681.0,2,2


### 3/ Birth year matches exactly

#### Name Levenshtein distance = 0

In [46]:
# Level 3, distance 0: same name and birth year (gender and death year ignored).
# A Levenshtein distance of 0 means the names are identical, so plain equality
# is used, as in the level-1 exact-match step; this selects the same pairs
# without computing an edit distance for every candidate pair.
# Pairs already collected at earlier levels keep their earlier labels.
matches = find_dupplicated(
    matches, 
    "3 as level, 0 as levenshtein", 
    "p1.birth_year = p2.birth_year and p1.name = p2.name"
)

# Overview of the pairs labelled by this step.
u.infos(matches[(matches['level'] == 3) & (matches['levenshtein'] == 0)])

Shape:  (25, 12)


Unnamed: 0,pk_l,pk_r,name_l,name_r,gender_l,gender_r,birth_year_l,birth_year_r,death_year_l,death_year_r,level,levenshtein
580,14731,14732,? gainon,? gainon,Male,Female,1676,1676,1676.0,,3,0
581,16729,16730,? rimmacher,? rimmacher,Male,Female,1707,1707,,1707.0,3,0
582,16760,16761,? croom,? croom,Male,Female,1715,1715,,1715.0,3,0
583,15992,15993,? renaud,? renaud,Female,Male,1661,1661,,1678.0,3,0
584,29003,29002,paul fleurieu,paul fleurieu,Female,Male,1860,1860,,,3,0


#### Name Levenshtein distance = 1

In [47]:
# Level 3 (birth year equal), name distance up to 1.
# Already collected pairs keep their earlier labels; only new pairs get the
# (level 3, levenshtein 1) label.
matches = find_dupplicated(
    df=matches,
    additional_columns="3 as level, 1 as levenshtein",
    criterion="p1.birth_year = p2.birth_year and levenshtein(p1.name, p2.name) <= 1",
)

# Overview of the pairs labelled by this step.
u.infos(matches.loc[matches['level'].eq(3) & matches['levenshtein'].eq(1)])

Shape:  (8, 12)


Unnamed: 0,pk_l,pk_r,name_l,name_r,gender_l,gender_r,birth_year_l,birth_year_r,death_year_l,death_year_r,level,levenshtein
605,18539,18540,louis moze,louise moze,Male,Female,1655,1655,,,3,1
606,137,43258,pierre dumoulin,pierre du moulin,,Male,1568,1568,1658.0,1658.0,3,1
607,1262,45691,louis la rochefoucaud,louis la rochefoucauld,,Male,1615,1615,1654.0,1654.0,3,1
608,25232,25233,emilie clerc,emile clerc,Female,Male,1837,1837,,,3,1
609,51302,1162,franois citoys,franois citois,Male,,1572,1572,1652.0,1652.0,3,1


#### Name Levenshtein distance = 2

In [48]:
# Level 3 (birth year equal), name distance up to 2.
# Already collected pairs keep their earlier labels; only new pairs get the
# (level 3, levenshtein 2) label.
matches = find_dupplicated(
    df=matches,
    additional_columns="3 as level, 2 as levenshtein",
    criterion="p1.birth_year = p2.birth_year and levenshtein(p1.name, p2.name) <= 2",
)

# Overview of the pairs labelled by this step.
u.infos(matches.loc[matches['level'].eq(3) & matches['levenshtein'].eq(2)])

Shape:  (40, 12)


Unnamed: 0,pk_l,pk_r,name_l,name_r,gender_l,gender_r,birth_year_l,birth_year_r,death_year_l,death_year_r,level,levenshtein
613,14014,14054,? brutel,? burel,Male,Female,1675,1675,1675.0,1675.0,3,2
614,17075,17117,louise larsailler,louise lansuiller,Female,Male,1667,1667,,,3,2
615,17199,17201,anne josserand,andr josserand,Female,Male,1642,1642,,1670.0,3,2
616,16631,16668,anne mallein,andr mallein,Female,Male,1684,1684,,1684.0,3,2
617,16637,16654,jean dufour,jeanne dufour,Male,Female,1684,1684,,,3,2


### 4/ Death year matches exactly

#### Name Levenshtein distance = 0

In [49]:
# Level 4, distance 0: same name and death year (gender and birth year ignored).
# A Levenshtein distance of 0 means the names are identical, so plain equality
# is used, as in the level-1 exact-match step; this selects the same pairs
# without computing an edit distance for every candidate pair.
# Pairs already collected at earlier levels keep their earlier labels.
matches = find_dupplicated(
    matches, 
    "4 as level, 0 as levenshtein", 
    "p1.death_year = p2.death_year and p1.name = p2.name"
)

# Overview of the pairs labelled by this step.
u.infos(matches[(matches['level'] == 4) & (matches['levenshtein'] == 0)])

Shape:  (39, 12)


Unnamed: 0,pk_l,pk_r,name_l,name_r,gender_l,gender_r,birth_year_l,birth_year_r,death_year_l,death_year_r,level,levenshtein
653,14868,43626,jacques got,jacques got,Male,Male,1672.0,1618.0,1679.0,1679.0,4,0
654,14458,18517,paul delor,paul delor,Male,Male,1602.0,1655.0,1658.0,1658.0,4,0
655,14218,14219,no clot,no clot,Male,Male,1634.0,1668.0,1670.0,1670.0,4,0
656,43774,43771,marcus antonius,marcus antonius,Male,Male,192.0,158.0,238.0,238.0,4,0
657,16707,21835,guillaume metzger,guillaume metzger,Male,Male,1700.0,,1700.0,1700.0,4,0


#### Name Levenshtein distance = 1

In [50]:
# Level 4 (death year equal), name distance up to 1.
# Already collected pairs keep their earlier labels; only new pairs get the
# (level 4, levenshtein 1) label.
matches = find_dupplicated(
    df=matches,
    additional_columns="4 as level, 1 as levenshtein",
    criterion="p1.death_year = p2.death_year and levenshtein(p1.name, p2.name) <= 1",
)

# Overview of the pairs labelled by this step.
u.infos(matches.loc[matches['level'].eq(4) & matches['levenshtein'].eq(1)])

Shape:  (34, 12)


Unnamed: 0,pk_l,pk_r,name_l,name_r,gender_l,gender_r,birth_year_l,birth_year_r,death_year_l,death_year_r,level,levenshtein
692,16748,21840,jean albert finguerlin,jean albert fingerlin,Male,Male,1706.0,,1712.0,1712.0,4,1
693,17998,18555,thodore keller,theodore keller,Male,Male,1612.0,1656.0,1657.0,1657.0,4,1
694,19229,1273,jacques aubert,jacques jubert,Male,,1573.0,,1655.0,1655.0,4,1
695,60448,60554,louis gabriel oescher,louis gabriel oeschger,Male,Male,1820.0,1819.0,1887.0,1887.0,4,1
696,52272,52450,johann wolf,johann wolff,Male,Male,1537.0,1577.0,1616.0,1616.0,4,1


## Save

In [53]:
# Write every candidate pair collected above (all levels and distances) to the
# output path below, using the geovpylib save helper.
u.save_df(matches, '../../data/bhp_entity_recognition.csv')