In [14]:
%load_ext autoreload
%autoreload 2

env = 'prod'
pk_project = 6857901
execute = True
metadata_str = 'bhp-actors-deaths'
import_manner = 'batch'

import os
import pandas as pd
import numpy as np
from datetime import datetime
import duckdb
import plotly.express as px

import geovpylib.analysis as a
import geovpylib.database as db
import geovpylib.graphs as graphs
import geovpylib.pks as pks
import geovpylib.recordlinkage as rl
import geovpylib.sparql as sparql
import geovpylib.utils as u

eta = u.Eta()

db.connect_geovistory(env, pk_project, execute)
db.set_metadata({'import-id': datetime.today().strftime('%Y%m%d') + '-' + metadata_str})
db.set_insert_manner(import_manner)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
>> Connecting to PRODUCTION Database ... Connected!


# Import BHP actors' death into Geovistory

## Fetch data

### Junction table: pk BHP <=> pk Geovistory

In [2]:
db.connect_geovistory('prod', pk_project, execute, skip_protection=True)

persons = db.query(f"""
    select distinct
        r.pk_entity as pk_gv
        ,a3.string as uri
    from information.resource r
    inner join projects.info_proj_rel ipr on ipr.fk_entity = r.pk_entity and ipr.fk_project = {pk_project} and ipr.is_in_project = true
    -- URI
    inner join information.statement s1 on s1.fk_subject_info = r.pk_entity and s1.fk_property = {pks.properties.entity_sameAsURI_URI}
    inner join projects.info_proj_rel ipr1 on ipr1.fk_entity = s1.pk_entity and ipr1.fk_project = {pk_project} and ipr1.is_in_project = true
    inner join information.statement s2 on s2.fk_subject_info = s1.fk_object_info and s2.fk_property = {pks.properties.appe_hasValue_string}
    inner join projects.info_proj_rel ipr2 on ipr2.fk_entity = s2.pk_entity and ipr2.fk_project = {pk_project} and ipr2.is_in_project = true
    inner join information.appellation a3 on a3.pk_entity = s2.fk_object_info
    inner join projects.info_proj_rel ipr3 on ipr3.fk_entity = a3.pk_entity and ipr3.fk_project = {pk_project} and ipr3.is_in_project = true
    where r.fk_class = {pks.classes.person}
""")
db.disconnect()

persons = persons[persons.uri.str.contains('symogih.org')]
persons['pk_bhp'] = persons.uri.str.replace('http://symogih.org/resource/Actr', '', regex=False).astype(int)
persons.drop(columns=['uri'], inplace=True)
persons.sort_values('pk_bhp', inplace=True)
persons.drop_duplicates(inplace=True)
persons.reset_index(inplace=True, drop=True)
persons = persons[['pk_bhp', 'pk_gv']]

a.infos(persons)

# 10s

Requests will not be executed
>> Connecting to PRODUCTION Database ... Connected!
Database correctly disconnected.
Shape:  (59526, 2) - extract:


Unnamed: 0,pk_bhp,pk_gv
0,1,6532778
1,2,6499432
2,3,783602
3,4,6509333
4,5,6511070


### Add infos from BHP - Full deaths

In [3]:
db.connect_external(os.environ.get('YELLOW_BHP'))

actor_list = persons.pk_bhp.tolist()
actor_list = [('Actr' + str(pk)) for pk in actor_list]


real_death_bhp = db.query(f"""
    select 
        ir.fk_associated_object as pk_bhp,
        ir.fk_information as fk_info,
        id.year, id.month, id.day,
        id.fk_abob_type_information_date
    from bhp.information_role ir
    inner join bhp.information_date id on ir.fk_information = id.fk_information
    where ir.fk_type_role = 45
""")
real_death_bhp = real_death_bhp[real_death_bhp['pk_bhp'].str.contains('Actr')]
real_death_bhp['pk_bhp'] = real_death_bhp['pk_bhp'].str.replace('Actr', '', regex=False)
real_death_bhp['pk_bhp'] = real_death_bhp['pk_bhp'].astype(pd.Int64Dtype())
real_death_bhp['year'] = real_death_bhp['year'].astype(pd.Int64Dtype())
real_death_bhp['month'] = real_death_bhp['month'].astype(pd.Int64Dtype())
real_death_bhp['day'] = real_death_bhp['day'].astype(pd.Int64Dtype())
real_death_bhp['date_bhp'] = [(row.year, row.month, row.day) for i, row in real_death_bhp.iterrows()]
real_death_bhp['uri_death'] = ['http://symogih.org/resource/Info' + str(fk_info) for fk_info in real_death_bhp['fk_info']]

# For now we only import unique data ie: id.fk_abob_type_information_date == 246
real_death_bhp = real_death_bhp[real_death_bhp['fk_abob_type_information_date'] == 246]

real_death_bhp.drop(columns=['year', 'month', 'day', 'fk_info', 'fk_abob_type_information_date'], inplace=True)

a.infos(real_death_bhp)

>> Connecting to PGSQL Database ... Connected!
Shape:  (9255, 3) - extract:


Unnamed: 0,pk_bhp,date_bhp,uri_death
0,43740,"(1820, 10, 5)",http://symogih.org/resource/Info87258
1,43871,"(1830, 11, 18)",http://symogih.org/resource/Info87259
3,33850,"(1751, 10, 16)",http://symogih.org/resource/Info87377
4,33851,"(1760, 1, 11)",http://symogih.org/resource/Info87378
5,44135,"(1703, 3, 9)",http://symogih.org/resource/Info87392


In [4]:
# Merge to main table
persons = persons.merge(real_death_bhp, on='pk_bhp', how='left')

a.infos(persons, random=True)

Shape:  (59578, 4) - extract:


Unnamed: 0,pk_bhp,pk_gv,date_bhp,uri_death
39346,42887,6495003,,
44508,48274,6542116,,
56562,60623,6511223,,
43868,47475,6532645,,
27105,28286,6513939,,


### Add infos from BHP - Small death

In [5]:
actors = u.read_df('../../data/bhp/actor.csv')
actors = actors[['pk_actor', 'end_year']]
actors.rename(inplace=True, columns={'pk_actor':'pk_bhp'})
actors['end_year'] = [(year, pd.NA, pd.NA) for year in actors['end_year']]

In [6]:
# Merge to main table
persons = persons.merge(actors, on='pk_bhp', how='left')
persons['date_bhp'] = [row['date_bhp'] if pd.notna(row['date_bhp']) else row['end_year'] for i, row in persons.iterrows()]
persons.drop(columns=['end_year'], inplace=True)

a.infos(persons)

Shape:  (59578, 4) - extract:


Unnamed: 0,pk_bhp,pk_gv,date_bhp,uri_death
0,1,6532778,"(1653, <NA>, <NA>)",
1,2,6499432,"(1672, <NA>, <NA>)",
2,3,783602,"(1643, <NA>, <NA>)",
3,4,6509333,"(1635, <NA>, <NA>)",
4,5,6511070,"(1746, <NA>, <NA>)",


### Add existing death from symogih project

In [7]:
db.connect_geovistory(env, pk_project, execute, skip_protection=True)

symogih_deaths = db.query(f"""
    select
        s1.fk_object_info as pk_gv,
        r.pk_entity as pk_death,
        s2.pk_entity as pk_stmt_to_delete,
        ipr2.pk_entity as pk_ipr_to_delete,
        tp.julian_day,
        tp.calendar
    from information.resource r
    inner join projects.info_proj_rel ipr on ipr.fk_entity = r.pk_entity and ipr.fk_project = {pk_project} and ipr.is_in_project = true
    left join information.statement s1 on s1.fk_subject_info = r.pk_entity and s1.fk_property = {pks.properties.death_wasDeathOf_person}
    left join information.statement s2 on s2.fk_subject_info = r.pk_entity and s2.fk_property = {pks.properties.timeSpan_atSomeTimeWithin_timePrimitive}
    left join projects.info_proj_rel ipr2 on ipr2.fk_entity = s2.pk_entity and ipr2.fk_project = {pk_project} and ipr.is_in_project = true
    left join information.time_primitive tp on tp.pk_entity = s2.fk_object_info
    where r.fk_class = {pks.classes.death}
""")

symogih_deaths['date_gv'] = [u.from_julian_day(row['julian_day'], row['calendar']) for _, row in symogih_deaths.iterrows()]
symogih_deaths.drop(columns=['julian_day', 'calendar'], inplace=True)

a.infos(symogih_deaths)

# 3s

Requests will not be executed
>> Connecting to PRODUCTION Database ... Connected!
Shape:  (59629, 5) - extract:


Unnamed: 0,pk_gv,pk_death,pk_stmt_to_delete,pk_ipr_to_delete,date_gv
0,25503,25852,1285990.0,,"(1531, 10, 11)"
1,25503,25852,7830076.0,8264486.0,"(1484, 1, 1)"
2,149826,239671,239673.0,8289344.0,"(1955, 2, 17)"
3,870070,870080,870082.0,,"(1860, 1, 1)"
4,870070,870080,7860278.0,8294683.0,"(1780, 1, 1)"


In [8]:
persons = persons.merge(symogih_deaths, on='pk_gv', how='left')
persons['pk_stmt_to_delete'] = persons['pk_stmt_to_delete'].astype(pd.Int64Dtype())
persons['pk_ipr_to_delete'] = persons['pk_ipr_to_delete'].astype(pd.Int64Dtype())
persons['pk_death'] = persons['pk_death'].astype(pd.Int64Dtype())

persons.drop(columns=['pk_bhp', 'uri_death'], inplace=True)

persons['should_year'] = [t[0] for t in persons['date_bhp']]
persons['have_year'] = [t[0] if pd.notna(t) else pd.NA for t in persons['date_gv']]


persons = persons[persons['should_year'] != persons['have_year']]
persons

Unnamed: 0,pk_gv,date_bhp,pk_death,pk_stmt_to_delete,pk_ipr_to_delete,date_gv,should_year,have_year
0,6532778,"(1653, <NA>, <NA>)",7726515,7829746,8264156,"(1599, 1, 1)",1653,1599
1,6499432,"(1672, <NA>, <NA>)",7726516,7829747,8264157,"(1600, 1, 1)",1672,1600
2,783602,"(1643, <NA>, <NA>)",7726517,7829748,8264158,"(1581, 1, 1)",1643,1581
3,6509333,"(1635, <NA>, <NA>)",7726518,7829749,8264159,"(1600, 1, 1)",1635,1600
4,6511070,"(1746, <NA>, <NA>)",7726519,7829750,8264160,"(1681, 1, 1)",1746,1681
...,...,...,...,...,...,...,...,...
59593,6498886,"(<NA>, <NA>, <NA>)",7785870,7866233,8300638,"(1813, 1, 1)",,1813
59594,6544054,"(<NA>, <NA>, <NA>)",7785871,7866234,8300639,"(1798, 1, 1)",,1798
59698,6544132,"(1844, <NA>, <NA>)",7785975,7866338,8300743,"(1802, 1, 1)",1844,1802
59752,6544135,"(<NA>, <NA>, <NA>)",7786029,,,,,


## Correct data

In [10]:
# Remove wrong statement from project
selection = persons[pd.notna(persons['pk_ipr_to_delete'])]
values = '(' + ','.join(selection['pk_ipr_to_delete'].astype(str)) + ')'
db.execute(f"""
    update projects.info_proj_rel
    set is_in_project = false
    where pk_entity in {values};
""")

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7f92ca605f40>

In [21]:
def get_duration(date):
    if pd.notna(date[0]) and pd.isna(date[1]) and pd.isna(date[2]): return '1 year'
    if pd.notna(date[0]) and pd.notna(date[1]) and pd.isna(date[2]): return '1 month'
    if pd.notna(date[0]) and pd.notna(date[1]) and pd.notna(date[2]): return '1 day'
    return pd.NA

selection = persons[['pk_death', 'date_bhp']].copy()
selection['duration'] = [get_duration(d) for d in selection['date_bhp']]
selection.dropna(inplace=True)

selection['pk_time_primitives'] = db.time_primitives.create(selection['date_bhp'].tolist(), selection['duration'].tolist())

db.statements.create(
    selection['pk_death'], 
    pks.properties.timeSpan_atSomeTimeWithin_timePrimitive, 
    selection['pk_time_primitives']
)

Batch creation of 16650 time_primitives  is done - Elapsed: [00h08'43]                   
Batch creation of 16650 statements  is done - Elapsed: [00h09'00]                   
