In [1]:
%load_ext autoreload
%autoreload 2

env = 'prod'
pk_project = 6857901
execute = True
metadata_str = 'bhp-actors-deaths'
import_manner = 'batch'

import os
import pandas as pd
import numpy as np
from datetime import datetime
import duckdb
import plotly.express as px

import geovpylib.analysis as a
import geovpylib.database as db
import geovpylib.graphs as graphs
import geovpylib.pks as pks
import geovpylib.recordlinkage as rl
import geovpylib.sparql as sparql
import geovpylib.utils as u

eta = u.Eta()

db.connect_geovistory(env, pk_project, execute)
db.set_metadata({'import-id': datetime.today().strftime('%Y%m%d') + '-' + metadata_str})
db.set_insert_manner(import_manner)

>> Connecting to PRODUCTION Database ... Connected!


# Import BHP actors' death into Geovistory

## Fetch data

### Junction table: pk BHP <=> pk Geovistory

In [2]:
db.connect_geovistory('prod', pk_project, execute, skip_protection=True)

persons = db.query(f"""
    select distinct
        r.pk_entity as pk_gv
        ,a3.string as uri
    from information.resource r
    inner join projects.info_proj_rel ipr on ipr.fk_entity = r.pk_entity and ipr.fk_project = {pk_project} and ipr.is_in_project = true
    -- URI
    inner join information.statement s1 on s1.fk_subject_info = r.pk_entity and s1.fk_property = {pks.properties.entity_sameAsURI_URI}
    inner join projects.info_proj_rel ipr1 on ipr1.fk_entity = s1.pk_entity and ipr1.fk_project = {pk_project} and ipr1.is_in_project = true
    inner join information.statement s2 on s2.fk_subject_info = s1.fk_object_info and s2.fk_property = {pks.properties.appe_hasValue_string}
    inner join projects.info_proj_rel ipr2 on ipr2.fk_entity = s2.pk_entity and ipr2.fk_project = {pk_project} and ipr2.is_in_project = true
    inner join information.appellation a3 on a3.pk_entity = s2.fk_object_info
    inner join projects.info_proj_rel ipr3 on ipr3.fk_entity = a3.pk_entity and ipr3.fk_project = {pk_project} and ipr3.is_in_project = true
    where r.fk_class = {pks.classes.person}
""")
db.disconnect()

persons = persons[persons.uri.str.contains('symogih.org')]
persons['pk_bhp'] = persons.uri.str.replace('http://symogih.org/resource/Actr', '', regex=False).astype(int)
persons.drop(columns=['uri'], inplace=True)
persons.sort_values('pk_bhp', inplace=True)
persons.drop_duplicates(inplace=True)
persons.reset_index(inplace=True, drop=True)
persons = persons[['pk_bhp', 'pk_gv']]

a.infos(persons)

# 10s

>> Connecting to PRODUCTION Database ... Connected!
Database correctly disconnected.
Shape:  (59526, 2) - extract:


Unnamed: 0,pk_bhp,pk_gv
0,1,6532778
1,2,6499432
2,3,783602
3,4,6509333
4,5,6511070


### Add infos from BHP - Full deaths

In [3]:
db.connect_external(os.environ.get('YELLOW_BHP'))

actor_list = persons.pk_bhp.tolist()
actor_list = [('Actr' + str(pk)) for pk in actor_list]


real_death_bhp = db.query(f"""
    select 
        ir.fk_associated_object as pk_bhp,
        ir.fk_information as fk_info,
        id.year, id.month, id.day,
        id.fk_abob_type_information_date
    from bhp.information_role ir
    inner join bhp.information_date id on ir.fk_information = id.fk_information
    where ir.fk_type_role = 45
""")
real_death_bhp = real_death_bhp[real_death_bhp['pk_bhp'].str.contains('Actr')]
real_death_bhp['pk_bhp'] = real_death_bhp['pk_bhp'].str.replace('Actr', '', regex=False)
real_death_bhp['pk_bhp'] = real_death_bhp['pk_bhp'].astype(pd.Int64Dtype())
real_death_bhp['year'] = real_death_bhp['year'].astype(pd.Int64Dtype())
real_death_bhp['month'] = real_death_bhp['month'].astype(pd.Int64Dtype())
real_death_bhp['day'] = real_death_bhp['day'].astype(pd.Int64Dtype())
real_death_bhp['date_bhp'] = [(row.year, row.month, row.day) for i, row in real_death_bhp.iterrows()]
real_death_bhp['uri_death'] = ['http://symogih.org/resource/Info' + str(fk_info) for fk_info in real_death_bhp['fk_info']]

# For now we only import unique data ie: id.fk_abob_type_information_date == 246
real_death_bhp = real_death_bhp[real_death_bhp['fk_abob_type_information_date'] == 246]

real_death_bhp.drop(columns=['year', 'month', 'day', 'fk_info', 'fk_abob_type_information_date'], inplace=True)

a.infos(real_death_bhp)

>> Connecting to PGSQL Database ... Connected!
Shape:  (9255, 3) - extract:


Unnamed: 0,pk_bhp,date_bhp,uri_death
0,43740,"(1820, 10, 5)",http://symogih.org/resource/Info87258
1,43871,"(1830, 11, 18)",http://symogih.org/resource/Info87259
3,33850,"(1751, 10, 16)",http://symogih.org/resource/Info87377
4,33851,"(1760, 1, 11)",http://symogih.org/resource/Info87378
5,44135,"(1703, 3, 9)",http://symogih.org/resource/Info87392


In [4]:
# Merge to main table
persons = persons.merge(real_death_bhp, on='pk_bhp', how='left')

a.infos(persons, random=True)

Shape:  (59578, 4) - extract:


Unnamed: 0,pk_bhp,pk_gv,date_bhp,uri_death
36243,39516,6520682,,
45347,49262,6500653,,
58774,62844,6508581,,
34331,36354,6518719,,
42794,46389,6528009,,


### Add infos from BHP - Small death

In [5]:
actors = u.read_df('../../data/bhp/actor.csv')
actors = actors[['pk_actor', 'begin_year']]
actors.rename(inplace=True, columns={'pk_actor':'pk_bhp'})
actors['begin_year'] = [(year, pd.NA, pd.NA) for year in actors.begin_year]

In [6]:
# Merge to main table
persons = persons.merge(actors, on='pk_bhp', how='left')
persons['date_bhp'] = [row['date_bhp'] if pd.notna(row['date_bhp']) else row.begin_year for i, row in persons.iterrows()]
persons.drop(columns=['begin_year'], inplace=True)

a.infos(persons)

Shape:  (59578, 4) - extract:


Unnamed: 0,pk_bhp,pk_gv,date_bhp,uri_death
0,1,6532778,"(1599, <NA>, <NA>)",
1,2,6499432,"(1600, <NA>, <NA>)",
2,3,783602,"(1581, <NA>, <NA>)",
3,4,6509333,"(1600, <NA>, <NA>)",
4,5,6511070,"(1681, <NA>, <NA>)",


### Add existing death from geovistory

In [7]:
values = '(' + ','.join([str(pk) for pk in persons.pk_gv.tolist()]) + ')'

db.connect_geovistory('prod', pk_project, False, skip_protection=True)
gv_deaths = db.query(f"""
    select distinct
        s1.fk_object_info as pk_person
        ,s1.pk_entity as pk_stmt_person_to_death
        ,s1.fk_subject_info as pk_death
    from information.statement s1
    inner join projects.info_proj_rel ipr1 on ipr1.fk_entity = s1.pk_entity and ipr1.is_in_project = true
    left join information.statement s2 on s2.fk_subject_info = s1.fk_subject_info and s2.fk_property = {pks.properties.timeSpan_atSomeTimeWithin_timePrimitive}
    where s1.fk_object_info in {values}
      and s1.fk_property = {pks.properties.death_wasDeathOf_person}
""")

db.disconnect()
a.infos(gv_deaths)

Requests will not be executed
>> Connecting to PRODUCTION Database ... Connected!
Database correctly disconnected.
Shape:  (24, 3) - extract:


Unnamed: 0,pk_person,pk_stmt_person_to_death,pk_death
0,25503,25856,25852
1,27035,870304,870303
2,149826,239672,239671
3,786854,6130322,6130319
4,869088,869100,869099


In [8]:
persons = persons.merge(gv_deaths, left_on='pk_gv', right_on='pk_person', how='left').drop(columns=['pk_person'])
persons['uri_death'] = persons['uri_death'].astype(pd.StringDtype())
persons['pk_stmt_person_to_death'] = persons['pk_stmt_person_to_death'].astype(pd.Int64Dtype())
persons['pk_death'] = persons['pk_death'].astype(pd.Int64Dtype())

persons

Unnamed: 0,pk_bhp,pk_gv,date_bhp,uri_death,pk_stmt_person_to_death,pk_death
0,1,6532778,"(1599, <NA>, <NA>)",,,
1,2,6499432,"(1600, <NA>, <NA>)",,,
2,3,783602,"(1581, <NA>, <NA>)",,,
3,4,6509333,"(1600, <NA>, <NA>)",,,
4,5,6511070,"(1681, <NA>, <NA>)",,,
...,...,...,...,...,...,...
59573,63646,6544154,"(1788, 12, 9)",http://symogih.org/resource/Info166196,,
59574,63647,6543685,"(1790, 2, <NA>)",http://symogih.org/resource/Info166200,,
59575,63648,6543708,"(1790, 2, 18)",http://symogih.org/resource/Info166204,,
59576,63649,6544155,"(1790, 9, <NA>)",http://symogih.org/resource/Info166208,,


## Import data

In [9]:
db.connect_geovistory(env, pk_project, execute)

>> Connecting to PRODUCTION Database ... Connected!


### death - Add existing to project

In [10]:
to_add = []
to_add += persons[pd.notna(persons['pk_stmt_person_to_death'])]['pk_stmt_person_to_death'].tolist()
to_add += persons[pd.notna(persons['pk_death'])]['pk_death'].tolist()
db.info_proj_rels.create(to_add)

Creating info_proj_rel of 48 entities with project <6857901> ... Done in [00h00'01]


### death - Create new

In [11]:
selection = persons[pd.isna(persons['pk_death'])][['pk_gv']].copy()
selection['pk_death_new'] = db.resources.create(pks.classes.death, len(selection))

Batch creation of 59554 entities  is done - Elapsed: [00h31'31]                   


In [12]:
persons = persons.merge(selection, on='pk_gv', how='left')
persons['pk_death'] = [row.pk_death if pd.notna(row.pk_death) else row.pk_death_new for _, row in persons.iterrows()]
persons['pk_death'] = persons['pk_death'].astype(pd.Int64Dtype())
persons.drop(columns=['pk_death_new'], inplace=True)

a.infos(persons)

Shape:  (59722, 6) - extract:


Unnamed: 0,pk_bhp,pk_gv,date_bhp,uri_death,pk_stmt_person_to_death,pk_death
0,1,6532778,"(1599, <NA>, <NA>)",,,7726515
1,2,6499432,"(1600, <NA>, <NA>)",,,7726516
2,3,783602,"(1581, <NA>, <NA>)",,,7726517
3,4,6509333,"(1600, <NA>, <NA>)",,,7726518
4,5,6511070,"(1681, <NA>, <NA>)",,,7726519


### death - Add URI

In [13]:
selection = persons[pd.notna(persons['uri_death'])]
graphs.add_uris(selection['pk_death'].tolist(), selection['uri_death'].tolist())

Batch creation of 9388 entities  is done - Elapsed: [00h04'58]                    
Batch creation of 9388 appellations  is done - Elapsed: [00h05'03]                   
Batch creation of 9388 statements  is done - Elapsed: [00h05'01]                   
Batch creation of 9388 statements  is done - Elapsed: [00h05'03]                   


### death - Add date

In [14]:
def get_duration(date):
    if pd.notna(date[0]) and pd.isna(date[1]) and pd.isna(date[2]): return '1 year'
    if pd.notna(date[0]) and pd.notna(date[1]) and pd.isna(date[2]): return '1 month'
    if pd.notna(date[0]) and pd.notna(date[1]) and pd.notna(date[2]): return '1 day'
    return pd.NA


# Compute duration
selection = persons[['pk_death', 'date_bhp']].copy()
selection['duration'] = [get_duration(d) for d in selection['date_bhp']]
selection.dropna(inplace=True)

# Create Time primitive
selection['pk_time_primitives'] = db.time_primitives.create(selection['date_bhp'].tolist(), selection['duration'].tolist())

# death has time primitive
db.statements.create(
    selection['pk_death'], 
    pks.properties.timeSpan_atSomeTimeWithin_timePrimitive, 
    selection['pk_time_primitives']
)

Batch creation of 36695 time_primitives  is done - Elapsed: [00h19'48]                   
Batch creation of 36695 statements  is done - Elapsed: [00h20'05]                   


### Death - Link death to persons

In [15]:
selection = persons[['pk_gv', 'pk_death', 'pk_stmt_person_to_death']]
selection = selection[pd.isna(selection['pk_stmt_person_to_death'])].drop(columns=['pk_stmt_person_to_death'])

db.statements.create(
    selection['pk_death'], 
    pks.properties.death_wasDeathOf_person, 
    selection['pk_gv']
)

Batch creation of 59698 statements  is done - Elapsed: [00h32'11]                   
