In [61]:
# Run configuration; these three values are handed to db.connect() below
# (and again later in the notebook when the Geovistory connection is reopened).
env = 'staging'
# NOTE(review): presumably -1 means "no specific project" — confirm against geovpylib.
pk_project = -1
# With execute = False the connection reports "Requests will not be executed".
execute = False

import os
import pandas as pd

import geovpylib.utils as u
import geovpylib.database as db
import geovpylib.find as find
import geovpylib.pks as pks

# Open the Geovistory database connection for the configured environment.
db.connect(env, pk_project, execute)

Requests will not be executed
=== Setting STAGING environment ===
>> Connecting to PGSQL Database ... Connected!


# BHP actors / Geovistory persons record linkage

Now that the record linkage inside the BHP has been done and interpreted, we can proceed to the record linkage against Geovistory data.

## Fetch BHP actors

The work is already done here; we just need to fetch the data.

In [62]:
# Read the already-cleaned BHP actors and print a short overview of the frame.
bhp_actors_path = '../../../data/prepared/bhp_actors_cleaned.csv'
bhp_actors = u.read_df(bhp_actors_path)
u.infos(bhp_actors)

Shape:  (69837, 7) - extract:


Unnamed: 0,pk,name,definition,definition_lang,gender,birth_year,death_year
0,44895,antoine sainte-marie perrin,,,Male,,
1,47015,,,,Male,1506.0,
2,47190,alberto duimio,,,Male,1510.0,1564.0
3,47190,albertus divini,,,Male,1510.0,1564.0
4,47578,angelo zampa,,,Male,,1575.0


## Record linkage against Geovistory data

### Formatting to work with the library

In [63]:
# Rename the key and year columns to the names used in the rest of the notebook.
column_mapping = {'pk': 'pk_bhp', 'birth_year': 'birthdate', 'death_year': 'deathdate'}
bhp_actors.rename(columns=column_mapping, inplace=True)

# Only a year is available, so each value is turned into a date string anchored
# on January 1st before being handed to u.parse_date.
for date_column in ('birthdate', 'deathdate'):
    bhp_actors[date_column] = [
        u.parse_date(str(year) + '-01-01') for year in bhp_actors[date_column]
    ]

bhp_actors

Unnamed: 0,pk_bhp,name,definition,definition_lang,gender,birthdate,deathdate
0,44895,antoine sainte-marie perrin,,,Male,,
1,47015,,,,Male,"(1506, 1, 1)",
2,47190,alberto duimio,,,Male,"(1510, 1, 1)","(1564, 1, 1)"
3,47190,albertus divini,,,Male,"(1510, 1, 1)","(1564, 1, 1)"
4,47578,angelo zampa,,,Male,,"(1575, 1, 1)"
...,...,...,...,...,...,...,...
69832,60554,louis gabriel escher,Industriel mtallurgiste n le 27 novembre 1819 ...,fra,Male,"(1819, 1, 1)","(1887, 1, 1)"
69833,60554,louis gabriel oescher,Industriel mtallurgiste. Associ Louis Charles...,fra,Male,"(1819, 1, 1)","(1887, 1, 1)"
69834,60554,louis gabriel oescher,Industriel mtallurgiste n le 27 novembre 1819 ...,fra,Male,"(1819, 1, 1)","(1887, 1, 1)"
69835,2291,toms maluenda,"*Jtiva (Valence) 1566, 7.V.1628. Clbre dominic...",fra,Male,"(1565, 1, 1)","(1628, 1, 1)"


In [64]:
# Search Geovistory for persons similar to the BHP actors, keyed by 'pk_bhp'.
# NOTE(review): jobs presumably sets the number of parallel workers and dist_days
# the tolerated gap between dates — confirm against geovpylib.find.
matchings = find.find_persons(
    bhp_actors,
    'pk_bhp',
    jobs=13,
    dist_days=365,
)

Checking data integrity... Done
Find all persons in Geovistory... 153963 found.
Finding similar persons is done - Elapsed: [00h10'01]                   


### Join information to help record linkage

In [65]:
# Prefix the columns by origin: 'gv' for Geovistory, 'bhp' for the BHP side.
matching_column_names = {
    'pk_entity': 'pk_gv',
    'new_name': 'bhp_name',
    'new_gender': 'bhp_gender',
    'new_birthdate': 'bhp_birthdate',
    'new_deathdate': 'bhp_deathdate',
}
matchings.rename(columns=matching_column_names, inplace=True)


#### Definition information

In [66]:
# BHP: free-text properties of the actors, exposed as the BHP-side definition.
actor_text_property = (
    u.read_df('../../../data/bhp/actor_text_property.csv')[['fk_actor', 'text']]
    .rename(columns={'text': 'bhp_definition'})
)

# Geovistory: definitions attached to the candidate persons.
# A Geovistory key appears once per matching row, so it is de-duplicated before
# being written into the IN (...) list; the query result is unchanged, only the
# SQL text gets shorter.
pks_entity = "(" + ",".join(matchings['pk_gv'].drop_duplicates().astype(str)) + ")"
# NOTE(review): properties 1762 / 1864 presumably are "has definition" and
# "has value" — confirm against the Geovistory ontology.
# The inner join on s2 discards rows where the left join on s1 found nothing,
# so only entities that actually have a definition are returned.
gv_def = db.query(f"""
    select 
        r.pk_entity,
        a.string as gv_definition
    from information.resource r
    left join information.statement s1 on s1.fk_subject_info = r.pk_entity and s1.fk_property = 1762
    inner join information.statement s2 on s2.fk_subject_info = s1.fk_object_info and s2.fk_property = 1864
    inner join information.appellation a on a.pk_entity = s2.fk_object_info
    where r.pk_entity in {pks_entity}
""")

# Left merges keep every matching even when one side has no definition.
matchings = matchings.merge(actor_text_property, left_on='pk_bhp', right_on='fk_actor', how='left').drop(columns=['fk_actor'])
matchings = matchings.merge(gv_def, left_on='pk_gv', right_on='pk_entity', how='left').drop(columns=['pk_entity'])

#### URIs

In [67]:
# BHP
# Switch to the BHP database; the connection string comes from the YELLOW_BHP
# environment variable.
u.db_connect(os.environ.get('YELLOW_BHP'))

# Keep only complete documentation rows that link an actor ('Actr…') to a
# digital object ('DiOb…').
documentation = u.db_execute('select * from bhp.documentation')
documentation = documentation.dropna(subset=['fk_documented_object', 'fk_documenting_entity', 'identifier'])
documentation = documentation[
    documentation['fk_documented_object'].str.contains('Actr')
    & documentation['fk_documenting_entity'].str.contains('DiOb')
]
# Explicit copy: the column assignments below must write into an independent
# frame, not into a filtered view of the query result.
documentation = documentation[['fk_documented_object', 'fk_documenting_entity', 'identifier']].copy()
# Strip the type prefixes so that only the key part remains.
documentation['fk_documented_object'] = documentation['fk_documented_object'].str.replace('Actr', '')
documentation['fk_documenting_entity'] = documentation['fk_documenting_entity'].str.replace('DiOb', '')
# Use the returned frame, as the two calls below do, instead of relying on
# u.parse_df mutating its argument in place.
documentation = u.parse_df(documentation)

resource_address_concat = u.parse_df(u.db_execute('select * from bhp.resource_address_concatenation')[['fk_digital_object', 'fk_resource_address']])
resource_address = u.parse_df(u.db_execute('select * from bhp.resource_address')[['pk_resource_address', 'uri']])

# digital object -> resource address -> base URI
documentation = documentation.merge(resource_address_concat, left_on='fk_documenting_entity', right_on='fk_digital_object', how='left').drop(columns=['fk_documenting_entity', 'fk_digital_object'])
documentation = documentation.merge(resource_address, left_on='fk_resource_address', right_on='pk_resource_address', how='left')

# Full BHP URI = base URI + identifier; rows without a base URI are dropped.
documentation = (
    documentation
    .assign(bhp_uri=documentation['uri'] + documentation['identifier'])
    [['fk_documented_object', 'bhp_uri']]
    .dropna(subset=['bhp_uri'])
)

# Geovistory
# Reconnect to Geovistory with the notebook configuration before querying it.
db.connect(env, pk_project, execute)
# De-duplicate the keys: they repeat once per matching row, and repeating them
# in the IN (...) list only inflates the SQL text.
pks_entity = "(" + ",".join(matchings['pk_gv'].drop_duplicates().astype(str)) + ")"
# NOTE(review): properties 1943 / 1843 presumably are "same as URI" and
# "has value" — confirm against the Geovistory ontology.
gv_uris = db.query(f"""
    select 
        r.pk_entity,
        a.string as gv_uri
    from information.resource r
    left join information.statement s1 on s1.fk_subject_info = r.pk_entity and s1.fk_property = 1943
    inner join information.statement s2 on s2.fk_subject_info = s1.fk_object_info and s2.fk_property = 1843
    inner join information.appellation a on a.pk_entity = s2.fk_object_info
    where r.pk_entity in {pks_entity}
""")

# Left merges keep every matching even when one side has no URI.
matchings = matchings.merge(documentation, left_on='pk_bhp', right_on='fk_documented_object', how='left').drop(columns=['fk_documented_object'])
matchings = matchings.merge(gv_uris, left_on='pk_gv', right_on='pk_entity', how='left').drop(columns=['pk_entity'])

>> Connecting to PGSQL Database ... Connected!
Requests will not be executed
=== Setting STAGING environment ===
>> Connecting to PGSQL Database ... Connected!


In [68]:
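# Disabled: URI columns with the scheme (and '/id' on the Geovistory side) removed, for comparing both sides.
# matchings['bhp_uri_extract'] = matchings['bhp_uri'].str.replace('http', '')
# matchings['gv_uri_extract'] = matchings['gv_uri'].str.replace('https', '').str.replace('/id', '')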

In [69]:
# Persist the candidate matchings ordered by BHP key.
matchings_by_bhp = matchings.sort_values('pk_bhp')
u.save_df(matchings_by_bhp, '../../../data/record-linkage-bhp-actors-geov-persons.csv')

---

In [None]:
# Matchings whose BHP and Geovistory URIs are identical once the scheme (and
# '/id' on the Geovistory side) is removed.
# No executed cell creates the *_uri_extract columns, so they are derived here
# on a temporary frame; `matchings` itself is left untouched.
uri_comparison = matchings.assign(
    bhp_uri_extract=matchings['bhp_uri'].str.replace('http', '', regex=False),
    gv_uri_extract=(
        matchings['gv_uri']
        .str.replace('https', '', regex=False)
        .str.replace('/id', '', regex=False)
    ),
)
# Missing URIs never compare equal, so rows lacking either URI drop out here.
uri_comparison[
    uri_comparison['bhp_uri_extract'] == uri_comparison['gv_uri_extract']
].drop_duplicates(subset=['pk_bhp', 'pk_gv'])

In [None]:
# Matchings for one BHP name. na=False turns missing names into "no match";
# without it the mask contains NaN and boolean indexing raises.
matchings[matchings['bhp_name'].str.contains('josias simler', na=False, regex=False)]

In [None]:
# Matchings for which both sides carry a URI.
# The Geovistory column is named 'gv_uri' (alias used in the SQL query).
matchings[matchings['gv_uri'].notna() & matchings['bhp_uri'].notna()]

In [None]:
# Matchings whose Geovistory definition mentions 'astronome'; rows without a
# definition are treated as non-matching.
mentions_astronome = matchings['gv_definition'].str.contains('astronome', na=False)
matchings[mentions_astronome]