In [1]:
%load_ext autoreload
%autoreload 2

# Run parameters. `env`, `pk_project` and `execute` are the three positional
# arguments handed to db.connect_geovistory() in the cells below.
env = 'prod'
pk_project = 6857901
# NOTE(review): presumably toggles whether write queries are really executed — confirm in geovpylib.
execute = True
# The two values below are not referenced by any cell shown in this notebook section.
metadata_str = 'bhp-actors-births'
import_manner = 'batch'

import os
import pandas as pd
import numpy as np
from datetime import datetime
import duckdb
import plotly.express as px

import geovpylib.analysis as a
import geovpylib.database as db
import geovpylib.graphs as graphs
import geovpylib.pks as pks
import geovpylib.recordlinkage as rl
import geovpylib.sparql as sparql
import geovpylib.utils as u

# Helper instantiated from geovpylib.utils; it is not used by the cells shown here.
# NOTE(review): presumably a progress/ETA tracker — confirm or remove if unused.
eta = u.Eta()


# Birth and death date complements

See the [GitHub Issue "Import actors - Fields"](https://github.com/geovistory/symogih/issues/6).

## Fetch Geovistory data: pk BHP <=> pk Geovistory

In [2]:
# Build the lookup table BHP actor key <=> Geovistory person, with the keys of
# the birth and death entities attached to each person inside the project.
#
# NOTE(review): `s4` and `s5` are LEFT-joined, but the INNER joins on `ipr4` /
# `ipr5` discard every row where they are NULL, so only persons having both a
# birth and a death in the project are returned — confirm this is intended.
# The SQL text is built before connecting so that the connection is held only
# while it is actually needed.
persons_sql = f"""
    select distinct
        r.pk_entity as pk_gv,
        a3.string as uri,
        s4.fk_subject_info as pk_birth,
        --s4b.pk_entity as pk_stmt_birth_to_date,
        --tp4.julian_day as julian_day_birth,
        --tp4.calendar as calendar_date_birth,
        --ipr4b.is_in_project as iip_birth,
        s5.fk_subject_info as pk_death--,
        --s5b.pk_entity as pk_stmt_death_to_date,
        --tp5.julian_day as julian_day_death,
        --tp5.calendar as calendar_date_death,
        --ipr5b.is_in_project as iip_death
    from information.resource r
    inner join projects.info_proj_rel ipr on ipr.fk_entity = r.pk_entity and ipr.fk_project = {pk_project} and ipr.is_in_project = true
    -- URI
    inner join information.statement s1 on s1.fk_subject_info = r.pk_entity and s1.fk_property = {pks.properties.entity_sameAsURI_URI}
    inner join projects.info_proj_rel ipr1 on ipr1.fk_entity = s1.pk_entity and ipr1.fk_project = {pk_project} and ipr1.is_in_project = true
    inner join information.statement s2 on s2.fk_subject_info = s1.fk_object_info and s2.fk_property = {pks.properties.appe_hasValue_string}
    inner join projects.info_proj_rel ipr2 on ipr2.fk_entity = s2.pk_entity and ipr2.fk_project = {pk_project} and ipr2.is_in_project = true
    inner join information.appellation a3 on a3.pk_entity = s2.fk_object_info
    inner join projects.info_proj_rel ipr3 on ipr3.fk_entity = a3.pk_entity and ipr3.fk_project = {pk_project} and ipr3.is_in_project = true
    -- Birth
    left join information.statement s4 on s4.fk_object_info = r.pk_entity and s4.fk_property = {pks.properties.birth_broughtIntoLife_person}
    inner join projects.info_proj_rel ipr4 on ipr4.fk_entity = s4.pk_entity and ipr4.fk_project = {pk_project} and ipr4.is_in_project = true
    --left join information.statement s4b on s4b.fk_subject_info = s4.fk_subject_info and s4b.fk_property = {pks.properties.timeSpan_atSomeTimeWithin_timePrimitive}
    --left join projects.info_proj_rel ipr4b on ipr4b.fk_entity = s4b.pk_entity and ipr4b.fk_project = {pk_project} and ipr4b.is_in_project = true
    --left join information.time_primitive tp4 on tp4.pk_entity = s4b.fk_object_info
    -- Death
    left join information.statement s5 on s5.fk_object_info = r.pk_entity and s5.fk_property = {pks.properties.death_wasDeathOf_person}
    inner join projects.info_proj_rel ipr5 on ipr5.fk_entity = s5.pk_entity and ipr5.fk_project = {pk_project} and ipr5.is_in_project = true
    --left join information.statement s5b on s5b.fk_subject_info = s5.fk_subject_info and s5b.fk_property = {pks.properties.timeSpan_atSomeTimeWithin_timePrimitive}
    --left join projects.info_proj_rel ipr5b on ipr5b.fk_entity = s5b.pk_entity and ipr5b.fk_project = {pk_project} and ipr5b.is_in_project = true
    --left join information.time_primitive tp5 on tp5.pk_entity = s5b.fk_object_info

    where r.fk_class = {pks.classes.person}
"""

db.connect_geovistory(env, pk_project, execute, skip_protection=True)
try:
    persons_raw = db.query(persons_sql)

    # Keep only the persons identified by a symogih.org URI. The match is a
    # literal one (the '.' must not act as a regex wildcard), null URIs are
    # treated as non-matching, and .copy() makes the column assignment below
    # operate on an independent frame instead of a slice of the query result.
    has_bhp_uri = persons_raw['uri'].str.contains('symogih.org', regex=False, na=False)
    persons = persons_raw[has_bhp_uri].copy()

    # The BHP key is the numeric suffix of the actor URI.
    persons['pk_bhp'] = persons['uri'].str.replace('http://symogih.org/resource/Actr', '', regex=False).astype(int)

    # One de-duplication is enough once the URI column is gone.
    persons = (
        persons[['pk_bhp', 'pk_gv', 'pk_birth', 'pk_death']]
        .sort_values('pk_bhp')
        .drop_duplicates()
        .reset_index(drop=True)
    )

    a.infos(persons)
finally:
    # Always release the production connection, even when the cell fails.
    db.disconnect()

# 12s

[DB] Connecting to PRODUCTION Database ... Connected!
Shape:  (59655, 4) - extract:


Unnamed: 0,pk_bhp,pk_gv,pk_birth,pk_death
0,1,6532778,7489054,7726515
1,2,6499432,7489055,7726516
2,3,783602,806773,7726517
3,4,6509333,7489056,7726518
4,5,6511070,7489057,7726519


[DB] Database correctly disconnected.


## Fetch birth and death dates from BHP

In [3]:
# Fetch the dated BHP informations in which an object plays role 40 or 45
# (the merge cell maps role 40 to the birth entity and any other role to the
# death entity), then keep the actor rows whose date type is 1125 or 1126.
bhp_dates_sql = f"""
    select 
        ir.fk_associated_object as pk_bhp,
        ir.fk_information as fk_info,
        id.year, id.month, id.day,
        id.fk_abob_type_information_date,
        ir.fk_type_role
    from bhp.information_role ir
    inner join bhp.information_date id on ir.fk_information = id.fk_information
    where ir.fk_type_role = 40 or ir.fk_type_role = 45
"""

db.connect_external(os.environ.get('YELLOW_BHP'))
try:
    bhp_dates_raw = db.query(bhp_dates_sql)

    # Only associated objects that are actors ('Actr' prefix); nulls never match.
    is_actor = bhp_dates_raw['pk_bhp'].str.contains('Actr', na=False)
    # Only the two date types handled by this import.
    is_handled_date_type = bhp_dates_raw['fk_abob_type_information_date'].isin([1125, 1126])

    # .copy() so that the column assignments below work on an independent
    # frame instead of a slice of the query result.
    births_deaths_bhp = bhp_dates_raw[is_actor & is_handled_date_type].copy()

    # Numeric BHP key, and nullable integers for the possibly missing date parts.
    births_deaths_bhp['pk_bhp'] = births_deaths_bhp['pk_bhp'].str.replace('Actr', '', regex=False).astype(pd.Int64Dtype())
    for date_part in ('year', 'month', 'day'):
        births_deaths_bhp[date_part] = births_deaths_bhp[date_part].astype(pd.Int64Dtype())

    # (year, month, day) tuple per row; row-wise iteration is kept on purpose so
    # the tuple elements have the same types as before this refactoring.
    births_deaths_bhp['date_bhp'] = [(row.year, row.month, row.day) for _, row in births_deaths_bhp.iterrows()]
    births_deaths_bhp['uri_evt'] = ['http://symogih.org/resource/Info' + str(fk_info) for fk_info in births_deaths_bhp['fk_info']]

    print('Information nb:', len(births_deaths_bhp))
finally:
    # Always release the connection, even when the cell fails.
    db.disconnect()

# 1s

[DB] Connecting to PGSQL Database ... Connected!
Information nb: 258
[DB] Database correctly disconnected.


## Merge data

In [4]:
# Attach the Geovistory keys to every BHP date row (inner join on the BHP key).
table = births_deaths_bhp.merge(persons, on='pk_bhp')

# Target temporal entity: the birth for role 40, the death otherwise.
# Vectorised selection replaces the former row-by-row iterrows() loops.
is_birth_role = table['fk_type_role'] == 40
table['pk_teen'] = table['pk_birth'].where(is_birth_role, table['pk_death'])

# Property of the new statement: date type 1125 => "begin of the begin",
# anything else => "end of the end".
is_begin_date = table['fk_abob_type_information_date'] == 1125
table['pk_property'] = np.where(
    is_begin_date,
    pks.properties.timespan_beginOfTheBegin_timePrim,
    pks.properties.timespan_endOfTheEnd_timePrim,
)
table['date'] = table['date_bhp']

print('Information nb (after merge):', len(table))

# .copy() gives an independent frame, so later cells can modify it safely.
table = table[['pk_teen', 'pk_property', 'date', 'pk_birth', 'pk_death']].copy()

Information nb (after merge): 258


## Create new information

In [5]:
# Re-open the Geovistory connection for the cells below, which update
# info_proj_rel rows and create time primitives and statements.
# Unlike the read-only fetch earlier, no skip_protection flag is passed here.
db.connect_geovistory(env, pk_project, execute)

[DB] Connecting to PRODUCTION Database ... Connected!


**Remove the existing `at some time within` statements from the project**

In [7]:
# Find all the `at some time within` statements that need to be removed from
# the project: those attached to the temporal entities about to receive a
# new date statement.
# Null keys are skipped and keys are rendered as plain integers, so the SQL
# list never contains 'nan' or a float representation.
pks_teen = table['pk_teen'].dropna().unique().tolist()
values = '(' + ','.join([str(int(pk)) for pk in pks_teen]) + ')'

at_some_times_within_sql = f"""
    select 
        s.pk_entity as pk_stmt, ipr.pk_entity as pk_ipr
    from information.resource r
    inner join information.statement s on s.fk_subject_info = r.pk_entity and s.fk_property = {pks.properties.timeSpan_atSomeTimeWithin_timePrimitive}
    inner join projects.info_proj_rel ipr on ipr.fk_entity = s.pk_entity and ipr.fk_project = {pk_project} and ipr.is_in_project = true
    where r.pk_entity in {values}
"""

if pks_teen:
    at_some_times_within = db.query(at_some_times_within_sql)
else:
    # `in ()` is not valid SQL: with nothing to look up, skip the query.
    at_some_times_within = pd.DataFrame({'pk_stmt': [], 'pk_ipr': []})

pks_ipr = at_some_times_within['pk_ipr'].tolist()
values_ipr = '(' + ','.join([str(pk) for pk in pks_ipr]) + ')'

remove_from_project_sql = f"""
    update projects.info_proj_rel
        set is_in_project = false           
    where pk_entity in {values_ipr};
"""

# Same guard as above: only run the update when there is something to remove.
if pks_ipr:
    db.execute(remove_from_project_sql)

# Not in place and tolerant of already-dropped columns, so the cell can be re-run.
table = table.drop(columns=['pk_birth', 'pk_death'], errors='ignore')

**Create new `begin of the begin` & `end of the end` statements**

In [8]:
def get_duration(date):
    """Return the duration label matching the precision of a date.

    `date` is indexed as (year, month, day); a missing part is any value for
    which `pd.isna` is true.

    Returns '1 year' when only the year is known, '1 month' when year and
    month are known, '1 day' when the three parts are known, and `pd.NA` for
    any other combination (missing year, or a day without a month).
    """
    # Without a year nothing can be derived; the other parts are not inspected.
    if pd.isna(date[0]):
        return pd.NA
    # Year known, month missing: only valid when the day is missing as well.
    if pd.isna(date[1]):
        return '1 year' if pd.isna(date[2]) else pd.NA
    # Year and month known: the day decides between month and day precision.
    return '1 month' if pd.isna(date[2]) else '1 day'

# Duration label ('1 year' / '1 month' / '1 day' or NA) for each date tuple
durations = [get_duration(date) for date in table['date']]
table['duration'] = durations


# One time primitive per row; the value returned by the helper is stored as the row's key
time_primitive_pks = db.time_primitives.create(table['date'], table['duration'])
table['pk_time_prim'] = time_primitive_pks

# One statement per row: temporal entity --property--> time primitive
db.statements.create(table['pk_teen'], table['pk_property'], table['pk_time_prim'])

Creating 258 time primitives ... Done in [00h00'00]
Creating info_proj_rel of 258 entities with project <6857901> ... Done in [00h00'01]
Creating 258 statements ... Done in [00h00'00]
Creating info_proj_rel of 258 entities with project <6857901> ... Done in [00h00'00]
