In [1]:
%load_ext autoreload
%autoreload 2

# --- Run configuration -------------------------------------------------------
env = 'prod'  # target environment; passed to db.connect_geovistory
pk_project = 6857901  # Geovistory project key; used in the SQL filters and when connecting
execute = True  # passed to db.connect_geovistory; presumably toggles real writes vs. dry run — TODO confirm
metadata_str = 'bhp-date-reliability'  # not referenced in the visible cells; presumably a tag for this import — TODO confirm
import_manner = 'one-shot'  # not referenced in the visible cells — TODO confirm purpose

# --- Imports -----------------------------------------------------------------
# NOTE(review): several of these imports (e.g. datetime, duckdb, plotly.express)
# are not used in the visible cells; they are kept in case other cells need them.
import os
import pandas as pd
import numpy as np
from datetime import datetime
import duckdb
import plotly.express as px

import geovpylib.analysis as a
import geovpylib.database as db
import geovpylib.graphs as graphs
import geovpylib.pks as pks
import geovpylib.recordlinkage as rl
import geovpylib.sparql as sparql
import geovpylib.utils as u

# Helper instance from geovpylib.utils; not used in the visible cells.
eta = u.Eta()

# Birth and death date complements (date reliability)

For context, see the GitHub issue ["Import actors - Fields"](https://github.com/geovistory/symogih/issues/6).

## Fetch Geovistory infos

In [2]:
# Fetch every person of the project that carries a URI, together with the pk
# of its birth and death events in Geovistory. Only symogih.org URIs are kept
# afterwards, and the BHP actor number is extracted from them.
#
# NOTE(review): each `left join` on s4/s5 is immediately followed by an
# `inner join` on ipr4/ipr5, which discards the NULL rows the left join would
# have kept. As written, only persons having BOTH a birth and a death statement
# in the project are returned — confirm this is intended. The SQL is left
# unchanged because the following cells assume non-null pk_birth / pk_death.
persons_sql = f"""
    select distinct
        r.pk_entity as pk_gv,
        a3.string as uri,
        s4.fk_subject_info as pk_birth,
        s5.fk_subject_info as pk_death
    from information.resource r
    inner join projects.info_proj_rel ipr on ipr.fk_entity = r.pk_entity and ipr.fk_project = {pk_project} and ipr.is_in_project = true
    -- URI
    inner join information.statement s1 on s1.fk_subject_info = r.pk_entity and s1.fk_property = {pks.properties.entity_sameAsURI_URI}
    inner join projects.info_proj_rel ipr1 on ipr1.fk_entity = s1.pk_entity and ipr1.fk_project = {pk_project} and ipr1.is_in_project = true
    inner join information.statement s2 on s2.fk_subject_info = s1.fk_object_info and s2.fk_property = {pks.properties.appe_hasValue_string}
    inner join projects.info_proj_rel ipr2 on ipr2.fk_entity = s2.pk_entity and ipr2.fk_project = {pk_project} and ipr2.is_in_project = true
    inner join information.appellation a3 on a3.pk_entity = s2.fk_object_info
    inner join projects.info_proj_rel ipr3 on ipr3.fk_entity = a3.pk_entity and ipr3.fk_project = {pk_project} and ipr3.is_in_project = true
    -- Birth
    left join information.statement s4 on s4.fk_object_info = r.pk_entity and s4.fk_property = {pks.properties.birth_broughtIntoLife_person}
    inner join projects.info_proj_rel ipr4 on ipr4.fk_entity = s4.pk_entity and ipr4.fk_project = {pk_project} and ipr4.is_in_project = true
    -- Death
    left join information.statement s5 on s5.fk_object_info = r.pk_entity and s5.fk_property = {pks.properties.death_wasDeathOf_person}
    inner join projects.info_proj_rel ipr5 on ipr5.fk_entity = s5.pk_entity and ipr5.fk_project = {pk_project} and ipr5.is_in_project = true
    where r.fk_class = {pks.classes.person}
"""

db.connect_geovistory(env, pk_project, execute, skip_protection=True)
try:
    persons_raw = db.query(persons_sql)
finally:
    # Always release the connection, even when the query raises.
    db.disconnect()

# Literal (non-regex) match so that the '.' in the host name is not a wildcard;
# missing URIs are treated as non-matching.
symogih_uri_prefix = 'http://symogih.org/resource/Actr'
is_symogih = persons_raw['uri'].str.contains('symogih.org', regex=False, na=False)

# Build the final frame in one chain from the raw query result, so the cell is
# idempotent and never assigns onto a filtered slice.
persons = (
    persons_raw.loc[is_symogih]
    .assign(pk_bhp=lambda df: df['uri'].str.replace(symogih_uri_prefix, '', regex=False).astype(int))
    .drop(columns=['uri'])
    .drop_duplicates()
    .sort_values('pk_bhp')
    .reset_index(drop=True)
    .loc[:, ['pk_bhp', 'pk_gv', 'pk_birth', 'pk_death']]
)

a.infos(persons)

# Took ~14 s when last run.

[DB] Connecting to PRODUCTION Database ... Connected!
Shape:  (59655, 4) - extract:


Unnamed: 0,pk_bhp,pk_gv,pk_birth,pk_death
0,1,6532778,7489054,7726515
1,2,6499432,7489055,7726516
2,3,783602,806773,7726517
3,4,6509333,7489056,7726518
4,5,6511070,7489057,7726519


[DB] Database correctly disconnected.


## Fetch BHP infos

In [3]:
# Fetch, from the BHP database, the date-certainty code attached to every
# information role of type 40 or 45 (used further down as birth and death
# respectively: role 40 selects pk_birth, anything else pk_death).
bhp_sql = """
    select 
        ir.fk_associated_object as pk_bhp,
        ir.fk_type_role,
        id.certainty_date
    from bhp.information_role ir
    inner join bhp.information_date id on ir.fk_information = id.fk_information
    where ir.fk_type_role = 40 or ir.fk_type_role = 45
"""

db.connect_external(os.environ.get('YELLOW_BHP'))
try:
    bhp_roles_raw = db.query(bhp_sql)
finally:
    # Always release the connection, even when the query raises.
    db.disconnect()

# Keep only rows whose associated object is an actor key ('Actr<number>');
# `na=False` also drops rows without an associated object.
is_actor_key = bhp_roles_raw['pk_bhp'].str.contains('Actr', regex=False, na=False)

# Derive the numeric actor id in a new frame instead of assigning onto a
# filtered slice (which triggers SettingWithCopyWarning).
births_deaths_bhp = bhp_roles_raw.loc[is_actor_key].assign(
    pk_bhp=lambda df: df['pk_bhp'].str.replace('Actr', '', regex=False).astype(pd.Int64Dtype())
)

a.infos(births_deaths_bhp)

# Took ~1 s when last run.

[DB] Connecting to PGSQL Database ... Connected!
Shape:  (28195, 3) - extract:


Unnamed: 0,pk_bhp,fk_type_role,certainty_date
0,44082,40,1
1,42382,45,1
2,4380,40,1
3,43805,40,1
4,44135,45,1


[DB] Database correctly disconnected.


## Merge infos

In [6]:
# Attach the BHP certainty code to each Geovistory person. The join key is made
# explicit so that a future shared column name cannot silently change the merge.
themerge = persons.merge(births_deaths_bhp, on='pk_bhp')

# Filter out certain dates (certainty code 1): they need no reliability comment.
# `.copy()` gives an independent frame so the column assignments below are safe.
themerge = themerge[themerge['certainty_date'] != 1].copy()

# Target entity of the comment: the birth event for role 40, else the death event.
is_birth_role = themerge['fk_type_role'] == 40
themerge['pk_teen'] = themerge['pk_birth'].where(is_birth_role, themerge['pk_death'])

# Certainty code 2 -> postulated date; every other remaining code -> reconstituted date.
# NOTE(review): a missing certainty code also ends up as 'Date reconstituée' — confirm.
themerge['comment'] = np.where(themerge['certainty_date'] == 2, 'Date postulée', 'Date reconstituée')

a.infos(themerge, random=True)

# `persons` holds one row per (birth, death) pair, so the same event can appear
# several times with the same comment. Deduplicate so that each comment is only
# created once, and reset the index so created pks line up positionally.
table = (
    themerge[['pk_teen', 'comment']]
    .drop_duplicates()
    .reset_index(drop=True)
)

Shape:  (3394, 8) - extract:


Unnamed: 0,pk_bhp,pk_gv,pk_birth,pk_death,fk_type_role,certainty_date,pk_teen,comment
25132,59735,6539201,7543578,7782166,40,3,7543578,Date reconstituée
22070,50714,6503668,7535448,7773246,40,2,7535448,Date postulée
25795,60323,6537192,7544164,7782753,40,2,7544164,Date postulée
19153,44564,6497784,7529707,7767485,40,2,7529707,Date postulée
28039,63303,6543897,7547128,7785734,40,2,7547128,Date postulée


## Create information

In [5]:
# Open the Geovistory connection used by the creation cell below. Unlike the
# fetch cell, `skip_protection=True` is not passed here.
# NOTE(review): no matching db.disconnect() is visible after the creation cell —
# confirm the connection gets closed. This cell's execution count (In [5]) is
# also lower than the preceding merge cell's (In [6]); re-check with
# Restart Kernel -> Run All.
db.connect_geovistory(env, pk_project, execute)

[DB] Connecting to PRODUCTION Database ... Connected!


In [7]:
# pk passed as the object of the "has text type" statements below.
# Presumably the pk of an existing comment-type instance — TODO confirm.
pk_intance_comment_type = 7953586

# 1. One text resource per row of `table`; it will hold the comment.
created_text_pks = db.resources.create(pks.classes.text, len(table))
table['pk_comment'] = created_text_pks

# 2. One appellation per comment string.
created_appellation_pks = db.appellations.create(table['comment'])
table['pk_appe'] = created_appellation_pks

# 3. Link everything together, in this order:
#    entity -> comment text, comment text -> text type, comment text -> string value.
statement_specs = (
    (table['pk_teen'], pks.properties.entity_hasComment_text, table['pk_comment']),
    (table['pk_comment'], pks.properties.text_hasTextType_textType, pk_intance_comment_type),
    (table['pk_comment'], pks.properties.text_hasValueVersion_string, table['pk_appe']),
)
for subject_pks, property_pk, object_pks in statement_specs:
    db.statements.create(subject_pks, property_pk, object_pks)

Creating 3394 resources of class [785] ... Done in [00h00'01]
Creating info_proj_rel of 3394 entities with project <6857901> ... Done in [00h00'02]
Creating 3394 appellations ... Done in [00h00'01]
Creating info_proj_rel of 3394 entities with project <6857901> ... Done in [00h00'02]
Creating 3394 statements ... Done in [00h00'03]
Creating info_proj_rel of 3394 entities with project <6857901> ... Done in [00h00'02]
Creating 3394 statements ... Done in [00h00'03]
Creating info_proj_rel of 3394 entities with project <6857901> ... Done in [00h00'01]
Creating 3394 statements ... Done in [00h00'03]
Creating info_proj_rel of 3394 entities with project <6857901> ... Done in [00h00'02]
