In [1]:
%load_ext autoreload
%autoreload 2

env = 'production'
pk_project = 373987
execute = False

import os
import pandas as pd
import numpy as np
import duckdb
import plotly.express as px

import geovpylib.analysis as a
import geovpylib.database as db
import geovpylib.graphs as graphs
import geovpylib.pks as pks
import geovpylib.recordlinkage as rl
import geovpylib.sparql as sparql
import geovpylib.utils as u

eta = u.Eta()
db.connect_geovistory(env, pk_project, execute, skip_protection=True)

Requests will not be executed
>> Connecting to PRODUCTION Database ... Connected!


# BHP actors / Geovistory persons record linkage

Now that the record linkage inside the BHP has been done, and interpretated, we can proceed to record linkage against Geovistory data.

## Fetch BHP actors

Work is already done here, we just need to fetch data.

In [2]:
bhp_actors = u.read_df('../../../data/prepared/bhp_actors_cleaned.csv')
bhp_actors.columns = ['pk_bhp', 'name_bhp', 'definition_bhp', 'definition_lang_bhp', 'gender_bhp', 'birth_year_bhp', 'death_year_bhp']

a.infos(bhp_actors)

Shape:  (69837, 7) - extract:


Unnamed: 0,pk_bhp,name_bhp,definition_bhp,definition_lang_bhp,gender_bhp,birth_year_bhp,death_year_bhp
0,44895,antoine sainte-marie perrin,,,Male,,
1,47015,,,,Male,1506.0,
2,47190,alberto duimio,,,Male,1510.0,1564.0
3,47190,albertus divini,,,Male,1510.0,1564.0
4,47578,angelo zampa,,,Male,,1575.0


## Fetch Geovistory persons

In [3]:
# Here we want to fetch data from production, there is no need to ask staging
if env != 'production': db.connect_geovistory('production', pk_project, execute, skip_protection=True)

gv_persons = graphs.get_all_persons()
gv_persons.columns = ['pk_gv', 'name_gv', 'gender_gv', 'birthdate_gv', 'deathdate_gv']

# And we make sure it is again to the right environment
if env != 'production': db.connect_geovistory(env, pk_project, execute, skip_protection=True)

Fetching all Geovistory persons ... 154073 information fetched.


## URIs analysis

### Fetch BHP URIs

In [4]:
# BHP
db.connect_external(os.environ.get('YELLOW_BHP'))
uris_bhp = db.query('select * from bhp.documentation')
uris_bhp = uris_bhp[pd.notna(uris_bhp['fk_documented_object'])]
uris_bhp = uris_bhp[pd.notna(uris_bhp['fk_documenting_entity'])]
uris_bhp = uris_bhp[pd.notna(uris_bhp['identifier'])]
uris_bhp = uris_bhp[uris_bhp['fk_documented_object'].str.contains('Actr')]
uris_bhp = uris_bhp[uris_bhp['fk_documenting_entity'].str.contains('DiOb')]
uris_bhp['fk_documented_object'] = uris_bhp['fk_documented_object'].str.replace('Actr', '')
uris_bhp = uris_bhp[['fk_documented_object', 'fk_documenting_entity', 'identifier']]
uris_bhp['fk_documenting_entity'] = uris_bhp['fk_documenting_entity'].str.replace('DiOb', '')
u.parse_df(uris_bhp)
resource_address_concat = u.parse_df(db.query('select * from bhp.resource_address_concatenation')[['fk_digital_object', 'fk_resource_address']])
resource_address = u.parse_df(db.query('select * from bhp.resource_address')[['pk_resource_address', 'uri']])
uris_bhp = uris_bhp.merge(resource_address_concat, left_on='fk_documenting_entity', right_on='fk_digital_object', how='left').drop(columns=['fk_documenting_entity', 'fk_digital_object'])
uris_bhp = uris_bhp.merge(resource_address, left_on='fk_resource_address', right_on='pk_resource_address', how='left')
uris_bhp['uri'] = uris_bhp['uri'] + uris_bhp['identifier']
uris_bhp = uris_bhp[['fk_documented_object', 'uri']]
uris_bhp.dropna(subset=['uri'], inplace=True)
uris_bhp.rename(columns={'fk_documented_object': 'pk_bhp', 'uri':'uri_bhp'}, inplace=True)

db.connect_geovistory(env, pk_project, execute, skip_protection=True)

>> Connecting to PGSQL Database ... Connected!
Requests will not be executed
>> Connecting to PRODUCTION Database ... Connected!


### URIs provenance analysis - BHP

In [5]:
uris_bhp['namespace_bhp'] = uris_bhp['uri_bhp'].str.replace('http://', '', regex=False).str.replace('https://', '', regex=False).str.replace('www.', '', regex=False)
uris_bhp['namespace_bhp'] = [s[0:s.index('/')] for s in uris_bhp['namespace_bhp']]

# fig = px.pie(uris_bhp, 'namespace_bhp', title=f'Distribution des namespaces des URIs des acteurs de la BHP (total {len(uris_bhp)})')
# fig.update_traces(textinfo='percent+label')
# fig.update(layout_showlegend=False)
# fig.show()

print('Namespace distribution')
total = len(uris_bhp)
counts = uris_bhp.groupby('namespace_bhp').count().sort_values('pk_bhp', ascending=False)['pk_bhp']
for (idx, value) in counts.iteritems():
    print(f"{u.percent(value/total)} - {str(value).rjust(4)} : {idx} ")

Namespace distribution
 82.93% - 5131 : d-nb.info 
 15.06% -  932 : idref.fr 
  1.02% -   63 : data.bnf.fr 
  0.32% -   20 : viaf.org 
  0.27% -   17 : wikidata.org 
  0.24% -   15 : dbpedia.org 
  0.13% -    8 : thesaurus.cerl.org 
  0.02% -    1 : it.dbpedia.org 


### Add URIs to main BHP dataframe

In [6]:
bhp_actors = bhp_actors.merge(uris_bhp, on='pk_bhp', how='left')

In [7]:
print('BHP Actors total number:', len(bhp_actors['pk_bhp'].unique()))
print('BHP Actors with at list one URI:', len(bhp_actors[pd.notna(bhp_actors['uri_bhp'])]['pk_bhp'].unique()))

BHP Actors total number: 59035
BHP Actors with at list one URI: 6079


### Fetch Geovistory URIs

In [8]:
# Here we want to fetch data from production, there is no need to ask staging
if env != 'production': db.connect_geovistory('production', pk_project, execute, skip_protection=True)

uris_gv = db.query(f"""
    select
        r2.pk_entity as pk_gv,
        a.string as uri_gv
    from information.resource r1
    inner join information.statement s1 on s1.fk_subject_info = r1.pk_entity and s1.fk_property = {pks.properties.appe_hasValue_string}
    inner join information.appellation a on a.pk_entity = s1.fk_object_info
    inner join information.statement s2 on s2.fk_object_info = r1.pk_entity and s2.fk_property = {pks.properties.entity_sameAsURI_URI}
    inner join information.resource r2 on r2.pk_entity = s2.fk_subject_info and r2.fk_class = {pks.classes.person}
    where r1.fk_class = {pks.classes.uri}
""")

# And we make sure it is again to the right environment
if env != 'production': db.connect_geovistory(env, pk_project, execute, skip_protection=True)

### URIs provenance analysis - Geovistory

In [9]:
uris_gv['namespace_gv'] = uris_gv['uri_gv'].str.replace('http://', '', regex=False).str.replace('https://', '', regex=False).str.replace('www.', '', regex=False)
uris_gv['namespace_gv'] = [s[0:s.index('/')] if '/' in s else s for s in uris_gv['namespace_gv']]

# fig = px.pie(uris_gv, 'namespace', title=f'Distribution des namespaces des URIs des personnes de Geovistory (total {len(uris_gv)})')
# fig.update_traces(textinfo='percent+label')
# fig.update(layout_showlegend=False)
# fig.show()

# uris_gv.drop(columns=['namespace'], inplace=True)

total = len(uris_gv)
counts = uris_gv.groupby('namespace_gv').count().sort_values('pk_gv', ascending=False)['pk_gv']
for (idx, value) in counts.iteritems():
    print(f"{u.percent(value/total)} - {str(value).rjust(4)} : {idx} ")

 27.06% - 3794 : data.bnf.fr 
 26.39% - 3701 : wikidata.org 
 24.39% - 3420 : d-nb.info 
 22.10% - 3099 : dbpedia.org 
  0.04% -    6 : idref.fr 
  0.01% -    2 : viaf.org 
  0.01% -    1 : digihum.de 


### Add URIs to main GV dataframe

In [10]:
gv_persons = gv_persons.merge(uris_gv, on='pk_gv', how='left')

In [11]:
print('Geovistory Persons total number:', len(gv_persons['pk_gv'].unique()))
print('Geovistory Persons with at list a URI:', len(gv_persons[pd.notna(gv_persons['uri_gv'])]['pk_gv'].unique()))

Geovistory Persons total number: 121246
Geovistory Persons with at list a URI: 5268


## Record linkage against Geovistory data

### First identify the ones that have the same URIs

In [12]:
uri_matched = rl.find_persons_by_uri(bhp_actors['uri_bhp'].dropna().unique())
uri_matched = uri_matched.merge(bhp_actors, left_on='uri', right_on='uri_bhp', how='left')[['pk_entity', 'pk_bhp', 'name_bhp']].drop_duplicates()
uri_matched.columns = ['pk_gv', 'pk_bhp', 'name']

print(f'{len(uri_matched)} exact matchings found!')

961 exact matchings found!


In [13]:
a.infos(uri_matched, random=True)

Shape:  (961, 3) - extract:


Unnamed: 0,pk_gv,pk_bhp,name
1779,787090,52279,johannes walters viringus
1598,787127,53962,franois verdeil
1571,786590,53360,guillaume rondelet
1532,786391,52426,johannes isaacus pontanus
1555,786517,396,georg joachim rheticus


In [14]:
u.save_df(uri_matched, '../../../data/record-linkage-bhp-actors-geov-person-uris.csv')

For those entities, we would not need to recreate them: only to add information, and a external URI to SYMOGIH.

So for the record linkage we can simply remove them.

In [42]:
print('Size before:', len(bhp_actors))
bhp_actors = bhp_actors[~bhp_actors['pk_bhp'].isin(uri_matched['pk_bhp'].tolist())]
print('Size after:', len(bhp_actors))

Size before: 70009
Size after: 68884


### Second: Prepare data to work with library

In [43]:
bhp_actors.rename(columns={'birth_year_bhp': 'birthdate', 'death_year_bhp':'deathdate', 'name_bhp': 'name', 'gender_bhp': 'gender'}, inplace=True)
bhp_actors['birthdate'] = [u.parse_date(str(year) + '-01-01') for year in bhp_actors['birthdate']]
bhp_actors['deathdate'] = [u.parse_date(str(year) + '-01-01') for year in bhp_actors['deathdate']]

a.infos(bhp_actors)

Shape:  (68884, 9) - extract:


Unnamed: 0,pk_bhp,name,definition_bhp,definition_lang_bhp,gender,birthdate,deathdate,uri_bhp,namespace_bhp
0,44895,antoine sainte-marie perrin,,,Male,,,,
1,47015,,,,Male,"(1506, 1, 1)",,,
2,47190,alberto duimio,,,Male,"(1510, 1, 1)","(1564, 1, 1)",,
3,47190,albertus divini,,,Male,"(1510, 1, 1)","(1564, 1, 1)",,
4,47578,angelo zampa,,,Male,,"(1575, 1, 1)",,


In [44]:
matchings = rl.find_persons(bhp_actors, 'pk_bhp', jobs=13, dist_days=365)

Checking data integrity... Done
Find all persons in Geovistory... 154064 found.
Finding similar persons is done - Elapsed: [00h10'03]                   


In [45]:
a.infos(matchings, random=True)

Shape:  (616, 10) - extract:


Unnamed: 0,pk_bhp,pk_entity,new_name,gv_name,new_gender,gv_gender,new_birthdate,gv_birthdate,new_deathdate,gv_deathdate
244,52296,787075,johan van heurne,johan van heurne,Male,,"(1543, 1, 1)","(1543, 2, 4)","(1601, 1, 1)",
419,6452,784831,louis fabre,louis fabre,Male,,,"(1710, 1, 1)",,
553,27452,787301,bartolomeo zamberti,bartolomeo zamberti,Male,,"(1473, 1, 1)","(1473, 1, 1)","(1543, 1, 1)",
240,25155,786530,jean ficher,jean richer,Male,,,"(1630, 1, 1)",,
122,48018,786243,paolo paruta,paolo paruta,Male,,"(1540, 1, 1)","(1540, 5, 14)","(1598, 1, 1)",


### Third: join information to help record linkage

In [46]:
matchings.rename(columns={
    'pk_entity': 'pk_gv', 
    'new_name':'bhp_name', 
    'new_gender':'bhp_gender', 
    'new_birthdate':'bhp_birthdate', 
    'new_deathdate':'bhp_deathdate'
}, inplace=True)

matchings['bhp_birthyear'] = [date[0] if pd.notna(date) else pd.NA for date in matchings['bhp_birthdate']]
matchings['gv_birthyear'] = [date[0] if pd.notna(date) else pd.NA for date in matchings['gv_birthdate']]
matchings['bhp_deathyear'] = [date[0] if pd.notna(date) else pd.NA for date in matchings['bhp_deathdate']]
matchings['gv_deathyear'] = [date[0] if pd.notna(date) else pd.NA for date in matchings['gv_deathdate']]

matchings.drop(columns=['bhp_birthdate', 'gv_birthdate', 'bhp_deathdate', 'gv_deathdate'], inplace=True)

#### Definition information

In [47]:
# BHP
actor_text_property = u.read_df('../../../data/bhp/actor_text_property.csv')[['fk_actor', 'text']]
actor_text_property.rename(columns={'text': 'bhp_definition'}, inplace=True)

# Geovistory
pks_entity = "(" + ",".join(matchings['pk_gv'].astype(str).tolist()) + ")"
gv_def = db.query(f"""
    select 
        r.pk_entity,
        a.string as gv_definition
    from information.resource r
    left join information.statement s1 on s1.fk_subject_info = r.pk_entity and s1.fk_property = 1762
    inner join information.statement s2 on s2.fk_subject_info = s1.fk_object_info and s2.fk_property = 1864
    inner join information.appellation a on a.pk_entity = s2.fk_object_info
    where r.pk_entity in {pks_entity}
""")

matchings = matchings.merge(actor_text_property, left_on='pk_bhp', right_on='fk_actor', how='left').drop(columns=['fk_actor'])
matchings = matchings.merge(gv_def, left_on='pk_gv', right_on='pk_entity', how='left').drop(columns=['pk_entity'])

#### Make the dataframe unique on tuple (`pk_bhp`, `pk_gv`) so that it is easier to read.

In [49]:
matchings['key'] = matchings['pk_bhp'] + matchings['pk_gv']

matchings.fillna('', inplace=True)
for col in matchings.columns:
    matchings[col] = matchings[col].astype(str)

matchings = matchings.groupby('key').agg(
    pk_bhp=pd.NamedAgg(column='pk_bhp', aggfunc='first'),
    pk_gv=pd.NamedAgg(column='pk_gv', aggfunc='first'),
    bhp_name=pd.NamedAgg(column='bhp_name', aggfunc=lambda x: ' - '.join(str(e) for e in np.unique(x))),
    gv_name=pd.NamedAgg(column='gv_name', aggfunc=lambda x: ' - '.join(str(e) for e in np.unique(x))),
    bhp_gender=pd.NamedAgg(column='bhp_gender', aggfunc=lambda x: ' - '.join(str(e) for e in np.unique(x))),
    gv_gender=pd.NamedAgg(column='gv_gender', aggfunc=lambda x: ' - '.join(str(e) for e in np.unique(x))),
    bhp_birthyear=pd.NamedAgg(column='bhp_birthyear', aggfunc=lambda x: ' - '.join(str(e) for e in np.unique(x))),
    gv_birthyear=pd.NamedAgg(column='gv_birthyear', aggfunc=lambda x: ' - '.join(str(e) for e in np.unique(x))),
    bhp_deathyear=pd.NamedAgg(column='bhp_deathyear', aggfunc=lambda x: ' - '.join(str(e) for e in np.unique(x))),
    gv_deathyear=pd.NamedAgg(column='gv_deathyear', aggfunc=lambda x: ' - '.join(str(e) for e in np.unique(x))),
    bhp_definition=pd.NamedAgg(column='bhp_definition', aggfunc=lambda x: ' - '.join(str(e) for e in np.unique(x))),
    gv_definition=pd.NamedAgg(column='gv_definition', aggfunc=lambda x: ' - '.join(str(e) for e in np.unique(x))),
)

matchings.reset_index(drop=True, inplace=True)

a.infos(matchings, random=True)

Shape:  (591, 12) - extract:


Unnamed: 0,pk_bhp,pk_gv,bhp_name,gv_name,bhp_gender,gv_gender,bhp_birthyear,gv_birthyear,bhp_deathyear,gv_deathyear,bhp_definition,gv_definition
207,26499,784068,jean bouillet,jean bouillet,Male,,,1690.0,,,,Médecin. Docteur de l'université de Montpellie...
307,34756,91662,joseph muller,joseph mulder,Male,,,,,,pre de Marie,
522,55820,27254,nicolas fatio de duillier,nicolas fatio de duillier,Male,,1664.0,1664.0,1753.0,,Astronom - Mathmaticien suisse.,
347,38202,784282,john case,john case,Male,,1539.0,1539.0,1600.0,,"*Woodstock (Oxfordshire) v. 1539-1546, Oxford ...",Philosophe
299,34435,86356,jean bonnet,jan bonnet,Male,,1866.0,,1925.0,,"Diplm des Arts et Mtiers (Aix, promotion 1881)...",


In [66]:
matchings['pk_bhp'] = matchings['pk_bhp'].astype(int)
matchings['pk_gv'] = matchings['pk_gv'].astype(int)
matchings['doublon'] = ['oui' if (row['bhp_birthyear'] == row['gv_birthyear'] and row['bhp_birthyear'] != '') else '' for _, row in matchings.iterrows()]
matchings.sort_values(by=['doublon', 'pk_bhp'], inplace=True)

a.infos(matchings)

Shape:  (591, 13) - extract:


Unnamed: 0,pk_bhp,pk_gv,bhp_name,gv_name,bhp_gender,gv_gender,bhp_birthyear,gv_birthyear,bhp_deathyear,gv_deathyear,bhp_definition,gv_definition,doublon
6,113,784435,christophe colomb,christophe colomb,Male,,1451,1450,1506,,"Navigateur gnois, dcouvreur du Nouveau Monde","Navigateur, découvreur de l'Amérique. Au servi...",
75,190,785393,jeremiah horrocks,jeremiah horrocks,Male,,1618,1617,1641,,,Astronome,
250,296,786292,antonio persio,antonio persio,Male,,1542,1543,1612,,,Prêtre. Philosophe,
354,385,25503,huldrych zwingli,huldrych zwingli,Male,,1484,- 1484,1531,1531.0,"Cur de Glaris, d'Einsiedeln et, partir de 151...",,
465,501,783829,sebastiano bartoli,sebastiano bartoli,Male,,1629,1630,1676,,"/ Medico e scienziato, fu professore di anatom...",Médecin et professeur de médecine à Naples,


## Save record linkage

In [67]:
u.save_df(matchings.sort_values('pk_bhp'), '../../../data/record-linkage-bhp-actors-geov-persons.csv')