In [1]:
%load_ext autoreload
%autoreload 2

# --- Run configuration -------------------------------------------------------
# Target environment; handed to db.connect_geovistory() in the cells below.
env = 'prod'
# Geovistory project key; scopes the SQL query and is handed to db.connect_geovistory().
pk_project = 6857901
# Handed to db.connect_geovistory(); presumably enables real writes when True — TODO confirm.
execute = True
# Not referenced in the visible cells; presumably labels describing this run — TODO confirm.
metadata_str = 'fix-missing-uri'
import_manner = 'one-shot'

import os
import pandas as pd
import numpy as np
from datetime import datetime
import duckdb
import plotly.express as px

import geovpylib.analysis as a
import geovpylib.database as db
import geovpylib.graphs as graphs
import geovpylib.pks as pks
import geovpylib.recordlinkage as rl
import geovpylib.sparql as sparql
import geovpylib.utils as u

# Not referenced in the visible cells; presumably a progress/ETA helper — TODO confirm.
eta = u.Eta()


# Fix missing URIs in some temporal entities

### Fetch the expected event URIs from BHP

In [2]:
# Load from BHP every dated information in which an object plays role 40
# (labelled 'birth' below) or role 45 (labelled 'death' below), and derive the
# URI each of those events is expected to carry.
# Result: `births_deaths_bhp` with columns `pk_bhp` (Int64), `teen`
# ('birth' / 'death') and `uri_evt` (expected event URI).
bhp_roles_sql = """
    select 
        ir.fk_associated_object as pk_bhp,
        ir.fk_information as fk_info,
        ir.fk_type_role
    from bhp.information_role ir
    inner join bhp.information_date id on ir.fk_information = id.fk_information
    where ir.fk_type_role = 40 or ir.fk_type_role = 45
"""

db.connect_external(os.environ.get('YELLOW_BHP'))
try:
    bhp_roles = db.query(bhp_roles_sql)

    # Keep only roles whose associated object key contains 'Actr'; null keys
    # are discarded by `na=False`.
    is_actor = bhp_roles['pk_bhp'].str.contains('Actr', regex=False, na=False)

    # Build the derived columns on a fresh frame (no assignment on a filtered
    # slice, hence no SettingWithCopyWarning) using vectorized operations.
    births_deaths_bhp = (
        bhp_roles[is_actor]
        .assign(
            pk_bhp=lambda df: df['pk_bhp'].str.replace('Actr', '', regex=False).astype(pd.Int64Dtype()),
            uri_evt=lambda df: 'http://symogih.org/resource/Info' + df['fk_info'].astype(str),
            teen=lambda df: np.where(df['fk_type_role'] == 40, 'birth', 'death'),
        )
        .loc[:, ['pk_bhp', 'teen', 'uri_evt']]
    )

    print('Information nb:', len(births_deaths_bhp))
finally:
    # Always release the connection, even when the query or the
    # post-processing raises.
    db.disconnect()

# 1s

[DB] Connecting to PGSQL Database ... Connected!
Information nb: 28195
[DB] Database correctly disconnected.


### Get Geovistory data

In [3]:
# Load the persons of the project together with their BHP URI and, for their
# birth and death events, the event key and the URI currently attached to it.
# Result: `persons` with columns `pk_gv`, `pk_birth`, `birth_uri`, `pk_death`,
# `death_uri` and `pk_bhp` (int), sorted by `pk_bhp`, without duplicates.
#
# NOTE(review): the `inner join` on ipr4 / ipr5 discards the rows for which
# the preceding `left join` found no birth / death statement, so only persons
# having BOTH a birth and a death statement in the project are returned.
# The query is deliberately left unchanged: the later cells pass `pk_birth` /
# `pk_death` to graphs.add_uris() and rely on them never being null. Confirm
# whether persons with only one of the two events should also be covered.
persons_sql = f"""
    select distinct
        r.pk_entity as pk_gv,
        a3.string as uri,
        s4.fk_subject_info as pk_birth,
        a4.string as birth_uri,
        s5.fk_subject_info as pk_death,
        a5.string as death_uri
    from information.resource r
    inner join projects.info_proj_rel ipr on ipr.fk_entity = r.pk_entity and ipr.fk_project = {pk_project} and ipr.is_in_project = true
    -- URI
    inner join information.statement s1 on s1.fk_subject_info = r.pk_entity and s1.fk_property = {pks.properties.entity_sameAsURI_URI}
    inner join projects.info_proj_rel ipr1 on ipr1.fk_entity = s1.pk_entity and ipr1.fk_project = {pk_project} and ipr1.is_in_project = true
    inner join information.statement s2 on s2.fk_subject_info = s1.fk_object_info and s2.fk_property = {pks.properties.appe_hasValue_string}
    inner join projects.info_proj_rel ipr2 on ipr2.fk_entity = s2.pk_entity and ipr2.fk_project = {pk_project} and ipr2.is_in_project = true
    inner join information.appellation a3 on a3.pk_entity = s2.fk_object_info
    inner join projects.info_proj_rel ipr3 on ipr3.fk_entity = a3.pk_entity and ipr3.fk_project = {pk_project} and ipr3.is_in_project = true
    -- Birth
    left join information.statement s4 on s4.fk_object_info = r.pk_entity and s4.fk_property = {pks.properties.birth_broughtIntoLife_person}
    inner join projects.info_proj_rel ipr4 on ipr4.fk_entity = s4.pk_entity and ipr4.fk_project = {pk_project} and ipr4.is_in_project = true
    left join information.statement s4b on s4b.fk_subject_info = s4.fk_subject_info and s4b.fk_property = {pks.properties.entity_sameAsURI_URI}
    left join information.statement s4c on s4c.fk_subject_info = s4b.fk_object_info and s4c.fk_property = {pks.properties.appe_hasValue_string}
    left join information.appellation a4 on a4.pk_entity = s4c.fk_object_info
    -- Death
    left join information.statement s5 on s5.fk_object_info = r.pk_entity and s5.fk_property = {pks.properties.death_wasDeathOf_person}
    inner join projects.info_proj_rel ipr5 on ipr5.fk_entity = s5.pk_entity and ipr5.fk_project = {pk_project} and ipr5.is_in_project = true
    left join information.statement s5b on s5b.fk_subject_info = s5.fk_subject_info and s5b.fk_property = {pks.properties.entity_sameAsURI_URI}
    left join information.statement s5c on s5c.fk_subject_info = s5b.fk_object_info and s5c.fk_property = {pks.properties.appe_hasValue_string}
    left join information.appellation a5 on a5.pk_entity = s5c.fk_object_info

    where r.fk_class = {pks.classes.person}
"""

db.connect_geovistory(env, pk_project, execute, skip_protection=True)
try:
    persons_raw = db.query(persons_sql)
finally:
    # Read-only cell: release the connection as soon as the rows are fetched,
    # even when the query raises.
    db.disconnect()

# Keep only persons whose URI is a BHP actor URI. Filtering on the exact
# prefix that is stripped below guarantees the remainder is the numeric key;
# a looser 'symogih.org' match would let other URIs through and make
# `astype(int)` raise.
actor_uri_prefix = 'http://symogih.org/resource/Actr'
is_bhp_actor = persons_raw['uri'].str.startswith(actor_uri_prefix, na=False)

persons = (
    persons_raw[is_bhp_actor]
    .assign(pk_bhp=lambda df: df['uri'].str.replace(actor_uri_prefix, '', regex=False).astype(int))
    .drop(columns=['uri'])
    .sort_values('pk_bhp')
    .drop_duplicates()
    .reset_index(drop=True)
)

# 12s

[DB] Connecting to PRODUCTION Database ... Connected!
[DB] Database correctly disconnected.


### Find temporal entities whose URI is missing or wrong

In [4]:
# Pair every BHP birth/death information with the matching Geovistory person
# (same `pk_bhp`), then keep only the pairs whose current URI differs from the
# expected one.
# Columns added:
#   pk_teen    - key of the Geovistory event of the row's type (birth or death)
#   uri_has    - URI currently attached to that event (null when there is none)
#   uri_should - URI expected from BHP (formerly `uri_evt`)
merged = births_deaths_bhp.merge(persons, on='pk_bhp')

# Vectorized selection of the birth or death columns depending on the event
# type of the row (replaces two row-by-row `iterrows()` passes).
is_birth = merged['teen'] == 'birth'
table = (
    merged
    .assign(
        pk_teen=np.where(is_birth, merged['pk_birth'], merged['pk_death']),
        teen_uri=np.where(is_birth, merged['birth_uri'], merged['death_uri']),
    )
    .rename(columns={'uri_evt': 'uri_should', 'teen_uri': 'uri_has'})
)

# A null `uri_has` never equals `uri_should`, so events without any URI are kept.
table = table[table['uri_has'] != table['uri_should']].drop_duplicates()

In [5]:
def _lacks_expected_uri(group):
    """Tell whether a (person, event type) group misses an expected URI.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of `table` sharing the same `pk_bhp` and `teen`; must expose the
        `uri_should` and `uri_has` columns.

    Returns
    -------
    bool
        True when at least one `uri_should` value is absent from the non-null
        `uri_has` values of the group (which includes the case where the group
        has no `uri_has` at all).
    """
    expected = set(group['uri_should'])
    present = set(group['uri_has'].dropna())
    return not expected.issubset(present)


# Select the events that still need a URI.
# A (person, event type) group is a problem when one of its expected URIs is
# not carried by any of the person's events of that type. Grouping once
# replaces the duplicated birth/death code and the per-person scans of `table`.
flagged_index = [
    idx
    for _, group in table.groupby(['pk_bhp', 'teen'], sort=False)
    if _lacks_expected_uri(group)
    for idx in group.index
]

# Boolean selection keeps each row at most once, whereas collecting the
# indexes once per missing URI repeated the rows of a group.
pbs = table.loc[table.index.isin(flagged_index)]

# Only events without any URI get one created.
pbs = pbs[pd.isna(pbs['uri_has'])]

# Rows that differ only by columns of the other event type (e.g. a person
# with one birth and two deaths) describe the same correction; keep a single
# row per (event, expected URI) so no URI is created twice.
pbs = pbs.drop_duplicates(subset=['pk_teen', 'uri_should'])

### Create corrections

In [6]:
# Attach the expected URI (`uri_should`) to every event listed in `pbs`
# (`pk_teen`). With `execute = True` this writes to the connected database.
db.connect_geovistory(env, pk_project, execute)
try:
    graphs.add_uris(pbs['pk_teen'], pbs['uri_should'])
finally:
    # Release the connection whether or not the insertion succeeded, mirroring
    # the db.disconnect() calls of the read cells.
    # NOTE(review): assumes add_uris() commits its own writes before the
    # connection is closed — confirm against geovpylib.
    db.disconnect()

[DB] Connecting to PRODUCTION Database ... Connected!
Creating 330 resources of class [967] ... Done in [00h00'00]
Creating info_proj_rel of 330 entities with project <6857901> ... Done in [00h00'01]
Creating 330 appellations ... Done in [00h00'00]
Creating info_proj_rel of 330 entities with project <6857901> ... Done in [00h00'00]
Creating 330 statements ... Done in [00h00'01]
Creating info_proj_rel of 330 entities with project <6857901> ... Done in [00h00'00]
Creating 330 statements ... Done in [00h00'01]
Creating info_proj_rel of 330 entities with project <6857901> ... Done in [00h00'00]
