In [1]:
# Notebook parameters.
bhp_persons_path = "../../data/persons-bhp.csv"  # input file loaded into `persons`
trigram_score_threshold = 0.9  # NOTE(review): not referenced by the cells visible here

In [2]:
import math
import pandas as pd
import duckdb
import geovpylib.utils as u
import geovpylib.analysis as a

# Helper object from geovpylib.utils; presumably a progress / remaining-time
# tracker — TODO confirm. It is not used by the cells visible in this notebook.
eta = u.Eta()

# BHP actors: persons record linkage

## Get data

In [3]:
# Read the BHP persons file and declare the year / certainty columns as 'int'.
persons = u.read_df(bhp_persons_path)
int_columns = ['birth_year', 'death_year', 'certainty_birth', 'certainty_death']
a.set_types(persons, {column: 'int' for column in int_columns})
u.infos(persons)

Shape:  (62523, 10)


Unnamed: 0,pk,name,first_name,last_name,gender,birth_year,certainty_birth,death_year,certainty_death,dataset
0,44895.0,antoine sainte-marie perrin,antoine,sainte-marie perrin,Male,,0,,0,bhp
1,47015.0,,,,Male,1506.0,1,,1,bhp
2,47190.0,alberto duimio,alberto,duimio,Male,1510.0,1,1564.0,1,bhp
3,47190.0,albertus divini,albertus,divini,Male,1510.0,1,1564.0,1,bhp
4,47578.0,angelo zampa,angelo,zampa,Male,,0,1575.0,0,bhp


## Find duplicates

In [4]:
def find_dupplicated(df, additional_columns, criterion, where=False):
    """Find candidate duplicate pairs of persons and append them to `df`.

    The `persons` table is self-joined through DuckDB, keeping every pair of
    rows whose `pk` differ and that satisfy `criterion`. The function relies on
    DuckDB resolving the name `persons` to the DataFrame defined in the
    notebook; it is not passed as an argument.

    The new pairs are appended after the rows of `df`. A couple can show up
    several times: as (a, b) and (b, a), or again because it was already found
    by a previous call. Only the first occurrence is kept, so pairs already in
    `df` keep their original row (and their original extra columns).

    Parameters
    ----------
    df : pandas.DataFrame
        Pairs found so far; may be an empty DataFrame.
    additional_columns : str
        SQL select-list fragment appended after the fixed columns
        (for instance ``"0 as level"``).
    criterion : str
        SQL boolean expression over the aliases ``p1`` and ``p2``; it is
        AND-ed with ``p1.pk != p2.pk`` in the join condition.
    where : str or False, optional
        Additional SQL filter placed in a ``where`` clause. Ignored when falsy.

    Returns
    -------
    pandas.DataFrame
        `df` followed by the newly found pairs, one row per unordered
        (pk_l, pk_r) couple, with a fresh 0..n-1 index.

    Notes
    -----
    The SQL fragments are interpolated verbatim into the query: only pass
    trusted, hand-written strings.
    """

    sql = f"""
        select 
            p1.pk as pk_l,
            p2.pk as pk_r,
            p1.name as name_l,
            p2.name as name_r,
            p1.gender as gender_l,
            p2.gender as gender_r,
            p1.birth_year as birth_year_l,
            p2.birth_year as birth_year_r,
            p1.death_year as death_year_l,
            p2.death_year as death_year_r,
            {additional_columns}
        from persons p1 inner join persons p2 on p1.pk != p2.pk and {criterion}
    """
    if where:
        sql += f" where {where}"

    new_pairs = duckdb.query(sql).to_df()

    # Existing rows come first so that they win the de-duplication below.
    # The row index is rebuilt right away: it is discarded at the end anyway,
    # and a unique index keeps the boolean mask below unambiguous.
    found = pd.concat([df, new_pairs], ignore_index=True)
    a.set_types(found, {'pk_l': 'int', 'pk_r': 'int'})

    # A couple is identified by its unordered pair of keys: (a, b) and (b, a)
    # are the same couple. Computed column-wise instead of row by row.
    pk_columns = found[['pk_l', 'pk_r']]
    couples = pd.DataFrame({
        'low': pk_columns.min(axis=1),
        'high': pk_columns.max(axis=1),
    })
    already_seen = couples.duplicated(keep='first')

    return found[~already_seen].reset_index(drop=True)

# Accumulator for the candidate pairs: each matching level below passes it to
# find_dupplicated() and rebinds it to the returned, enlarged DataFrame.
matches = pd.DataFrame()

### 0/ Everything matches exactly

In [5]:
# Level 0: name, gender, birth year and death year must all be equal.
level = 0
level_criterion = "p1.name = p2.name and p1.gender = p2.gender and p1.birth_year = p2.birth_year and p1.death_year = p2.death_year"

matches = find_dupplicated(
    df=matches,
    additional_columns=f"{level} as level",
    criterion=level_criterion,
)

# Show only the pairs tagged with this level.
u.infos(matches.loc[matches['level'] == level])

Shape:  (22, 11)


Unnamed: 0,pk_l,pk_r,name_l,name_r,gender_l,gender_r,birth_year_l,birth_year_r,death_year_l,death_year_r,level
0,15155,15156,jean juvenel,jean juvenel,Male,Male,1643,1643,1675,1675,0
1,20092,20093,? delort,? delort,Male,Male,1641,1641,1641,1641,0
2,2886,49782,pietro duodo,pietro duodo,Male,Male,1554,1554,1610,1610,0
3,13313,22099,jean-jacques dobler,jean-jacques dobler,Male,Male,1749,1749,1838,1838,0
4,51744,51736,francesco roncalli parolino,francesco roncalli parolino,Male,Male,1692,1692,1763,1763,0


### 1/ Everything matches exactly except the name (Levenshtein distance = 1)

In [6]:
# Level 1: same gender, birth year and death year; names at a Levenshtein
# distance of at most 1.
level = 1
level_criterion = "p1.gender = p2.gender and p1.birth_year = p2.birth_year and p1.death_year = p2.death_year and levenshtein(p1.name, p2.name) <= 1"

matches = find_dupplicated(
    df=matches,
    additional_columns=f"{level} as level",
    criterion=level_criterion,
)

# Show only the pairs tagged with this level.
u.infos(matches.loc[matches['level'] == level])

Shape:  (15, 11)


Unnamed: 0,pk_l,pk_r,name_l,name_r,gender_l,gender_r,birth_year_l,birth_year_r,death_year_l,death_year_r,level
22,47825,50830,louis nicolas neuville de villeroy,louis nicolas neufville de villeroy,Male,Male,1663,1663,1734,1734,1
23,50150,820,guglielmo grataroli,guglielmo gratarolo,Male,Male,1516,1516,1568,1568,1
24,55242,55277,felix prudho,felix prudlo,Male,Male,1794,1794,1837,1837,1
25,56121,38579,konrad victor schneider,conrad victor schneider,Male,Male,1614,1614,1680,1680,1
26,55266,55250,william whewell,william wheewell,Male,Male,1794,1794,1866,1866,1


### 2/ Everything matches exactly except the name (Levenshtein distance = 2)

In [7]:
# Level 2: same gender, birth year and death year; names at a Levenshtein
# distance of at most 2.
level = 2
level_criterion = "p1.gender = p2.gender and p1.birth_year = p2.birth_year and p1.death_year = p2.death_year and levenshtein(p1.name, p2.name) <= 2"

matches = find_dupplicated(
    df=matches,
    additional_columns=f"{level} as level",
    criterion=level_criterion,
)

# Show only the pairs tagged with this level.
u.infos(matches.loc[matches['level'] == level])

Shape:  (7, 11)


Unnamed: 0,pk_l,pk_r,name_l,name_r,gender_l,gender_r,birth_year_l,birth_year_r,death_year_l,death_year_r,level
37,51896,60,ismael bouilliau,ismal boulliau,Male,Male,1605,1605,1694,1694,2
38,56974,47395,jos fernandez del toro,jos fernandez del toro,Male,Male,1663,1663,1733,1733,2
39,56975,47396,juan antonio llorente,juan antonio llorente,Male,Male,1750,1750,1823,1823,2
40,54053,54073,adrien marie le jendre,adrien marie legendre,Male,Male,1752,1752,1833,1833,2
41,243,47426,michele mazarino,michele mazzarini,Male,Male,1605,1605,1648,1648,2


### 3/ Everything matches exactly except the name (Levenshtein distance = 3). Names starting with "?" are excluded

In [8]:
# Level 3: same gender, birth year and death year; names at a Levenshtein
# distance of at most 3. Pairs where either name starts with "?" are filtered out.
level = 3
level_criterion = "p1.gender = p2.gender and p1.birth_year = p2.birth_year and p1.death_year = p2.death_year and levenshtein(p1.name, p2.name) <= 3"
level_filter = "p1.name not like '?%' and p2.name not like '?%'"

matches = find_dupplicated(
    df=matches,
    additional_columns=f"{level} as level",
    criterion=level_criterion,
    where=level_filter,
)

# Show only the pairs tagged with this level.
u.infos(matches.loc[matches['level'] == level])

Shape:  (10, 11)


Unnamed: 0,pk_l,pk_r,name_l,name_r,gender_l,gender_r,birth_year_l,birth_year_r,death_year_l,death_year_r,level
44,17735,17775,pierre moine,pierre marie,Male,Male,1632,1632,1632,1632,3
45,19984,19987,charles merlan,charles martin,Male,Male,1635,1635,1636,1636,3
46,1514,56122,domingo soto,domingo de soto,Male,Male,1494,1494,1560,1560,3
47,52684,444,hermann boorhave,herman boerhaave,Male,Male,1668,1668,1738,1738,3
48,21816,20961,johann andras de scheidlin,johann andras scheidlin,Male,Male,1734,1734,1756,1756,3


### 4/ Birth dates and names match exactly

In [9]:
# Level 4: same birth year and identical names (distance <= 0); gender and
# death year are not compared.
level = 4
level_criterion = "p1.birth_year = p2.birth_year and levenshtein(p1.name, p2.name) <= 0"

matches = find_dupplicated(
    df=matches,
    additional_columns=f"{level} as level",
    criterion=level_criterion,
)

# Show only the pairs tagged with this level.
u.infos(matches.loc[matches['level'] == level])

Shape:  (93, 11)


Unnamed: 0,pk_l,pk_r,name_l,name_r,gender_l,gender_r,birth_year_l,birth_year_r,death_year_l,death_year_r,level
54,15062,18728,jean jacques huguedobler,jean jacques huguedobler,Male,Male,1660,1660,,,4
55,14695,14700,jean rodolphe frey,jean rodolphe frey,Male,Male,1662,1662,,,4
56,14021,14022,gdon brutel,gdon brutel,Male,Male,1665,1665,1668.0,,4
57,14023,14024,jacques brutel,jacques brutel,Male,Male,1666,1666,,,4
58,14915,17320,jean grevoullet,jean grevoullet,Male,Male,1633,1633,1681.0,,4


### 5/ Birth dates match exactly, names: levenshtein = 1

In [10]:
# Level 5: same birth year; names at a Levenshtein distance of at most 1.
level = 5
level_criterion = "p1.birth_year = p2.birth_year and levenshtein(p1.name, p2.name) <= 1"

matches = find_dupplicated(
    df=matches,
    additional_columns=f"{level} as level",
    criterion=level_criterion,
)

# Show only the pairs tagged with this level.
u.infos(matches.loc[matches['level'] == level])

Shape:  (31, 11)


Unnamed: 0,pk_l,pk_r,name_l,name_r,gender_l,gender_r,birth_year_l,birth_year_r,death_year_l,death_year_r,level
147,13828,15045,susanne berard,susanne beraud,Female,Female,1671,1671,,,5
148,13850,14052,marie bernaud,marie beraud,Female,Female,1646,1646,,,5
149,15417,15420,pierre maille,pierre maillet,Male,Male,1680,1680,,,5
150,8601,6477,joseph jaillet,joseph paillet,Male,Male,1884,1884,,,5
151,13760,13852,franoise barnaud,franoise bernaud,Female,Female,1680,1680,,1681.0,5


### 6/ Birth dates match within +/- 1 year, names match exactly

In [11]:
# Level 6: birth years at most 1 year apart and identical names (distance <= 0).
level = 6
level_criterion = "p1.birth_year - 1 <= p2.birth_year and p2.birth_year <= p1.birth_year + 1 and levenshtein(p1.name, p2.name) <= 0"

matches = find_dupplicated(
    df=matches,
    additional_columns=f"{level} as level",
    criterion=level_criterion,
)

# Show only the pairs tagged with this level.
u.infos(matches.loc[matches['level'] == level])

Shape:  (64, 11)


Unnamed: 0,pk_l,pk_r,name_l,name_r,gender_l,gender_r,birth_year_l,birth_year_r,death_year_l,death_year_r,level
178,38975,5518,paul gauthier,paul gauthier,Male,Male,1894,1893,1937.0,,6
179,10679,11026,jean martin,jean martin,Male,Male,1884,1883,,,6
180,9957,9413,louis gindre,louis gindre,Male,Male,1868,1867,1949.0,1902.0,6
181,4899,56126,alphonse dupasquier,alphonse dupasquier,Male,Male,1794,1793,,1848.0,6
182,4545,22495,paul flix brolemann,paul flix brolemann,Male,Male,1784,1783,1858.0,,6


### 7/ Birth dates match within +/- 1 year, names: levenshtein = 1

In [12]:
# Level 7: birth years at most 1 year apart; names at a Levenshtein distance
# of at most 1.
level = 7
level_criterion = "p1.birth_year - 1 <= p2.birth_year and p2.birth_year <= p1.birth_year + 1 and levenshtein(p1.name, p2.name) <= 1"

matches = find_dupplicated(
    df=matches,
    additional_columns=f"{level} as level",
    criterion=level_criterion,
)

# Show only the pairs tagged with this level.
u.infos(matches.loc[matches['level'] == level])

Shape:  (28, 11)


Unnamed: 0,pk_l,pk_r,name_l,name_r,gender_l,gender_r,birth_year_l,birth_year_r,death_year_l,death_year_r,level
242,9438,5485,marcel rodier,marcel rozier,Male,Male,1915,1914,,,7
243,57937,7438,maurice dlage,maurice delage,Male,Male,1907,1906,1931.0,,7
244,6768,32343,marcel mounier,marcel meunier,Male,Male,1894,1893,,1971.0,7
245,10676,8885,henry martin,henri martin,Male,Male,1893,1892,,,7
246,45660,45654,germaine hangest,germain hangest,Female,Male,1884,1883,1986.0,1963.0,7


### 8/ Birth dates match at +/- 2 years, names: levenshtein = 0

In [13]:
# Level 8: birth years at most 2 years apart and identical names (distance <= 0).
level = 8
level_criterion = "p1.birth_year - 2 <= p2.birth_year and p2.birth_year <= p1.birth_year + 2 and levenshtein(p1.name, p2.name) <= 0"

matches = find_dupplicated(
    df=matches,
    additional_columns=f"{level} as level",
    criterion=level_criterion,
)

# Show only the pairs tagged with this level.
u.infos(matches.loc[matches['level'] == level])

Shape:  (91, 11)


Unnamed: 0,pk_l,pk_r,name_l,name_r,gender_l,gender_r,birth_year_l,birth_year_r,death_year_l,death_year_r,level
270,7527,21939,henri faure,henri faure,Male,Male,1905,1903,,,8
271,21939,8369,henri faure,henri faure,Male,Male,1903,1901,,,8
272,6924,6972,georges blanc,georges blanc,Male,Male,1902,1900,,,8
273,8102,9448,yvonne cottin,yvonne cottin,Female,Female,1899,1897,,,8
274,7421,9095,pierre reynaud,pierre reynaud,Male,Male,1879,1877,,,8


### 9/ Birth dates match at +/- 2 years, names: levenshtein = 1

In [14]:
# Level 9: birth years at most 2 years apart; names at a Levenshtein distance
# of at most 1.
level = 9
level_criterion = "p1.birth_year - 2 <= p2.birth_year and p2.birth_year <= p1.birth_year + 2 and levenshtein(p1.name, p2.name) <= 1"

matches = find_dupplicated(
    df=matches,
    additional_columns=f"{level} as level",
    criterion=level_criterion,
)

# Show only the pairs tagged with this level.
u.infos(matches.loc[matches['level'] == level])

Shape:  (29, 11)


Unnamed: 0,pk_l,pk_r,name_l,name_r,gender_l,gender_r,birth_year_l,birth_year_r,death_year_l,death_year_r,level
361,10026,32065,philippe cotte,philippe coste,Male,Male,1906,1904,,1974.0,9
362,9250,10639,paul delay,paul delaye,Male,Male,1885,1883,,,9
363,21932,62540,ren demay,ren debay,Male,Male,1880,1878,1963.0,,9
364,9593,6250,charles gauthier,charles gautier,Male,Male,1876,1874,,,9
365,61835,38761,paul auban,paul alban,Male,Male,1869,1867,1945.0,1936.0,9


## Save

In [15]:
# Summarise every candidate pair found across the levels, then save the table.
output_path = '../../data/record-linkage_bhp-tradi.csv'
u.infos(matches)
u.save_df(matches, output_path)

Shape:  (390, 11)


Unnamed: 0,pk_l,pk_r,name_l,name_r,gender_l,gender_r,birth_year_l,birth_year_r,death_year_l,death_year_r,level
0,15155,15156,jean juvenel,jean juvenel,Male,Male,1643,1643,1675.0,1675.0,0
1,20092,20093,? delort,? delort,Male,Male,1641,1641,1641.0,1641.0,0
2,2886,49782,pietro duodo,pietro duodo,Male,Male,1554,1554,1610.0,1610.0,0
3,13313,22099,jean-jacques dobler,jean-jacques dobler,Male,Male,1749,1749,1838.0,1838.0,0
4,51744,51736,francesco roncalli parolino,francesco roncalli parolino,Male,Male,1692,1692,1763.0,1763.0,0
