In [48]:
%load_ext autoreload
%autoreload 2

env = 'prod'
pk_project = 6857901
execute = True
metadata_str = 'bhp-actors-births'
import_manner = 'batch'

import os
import pandas as pd
import numpy as np
from datetime import datetime
import duckdb
import plotly.express as px

import geovpylib.analysis as a
import geovpylib.database as db
import geovpylib.graphs as graphs
import geovpylib.pks as pks
import geovpylib.recordlinkage as rl
import geovpylib.sparql as sparql
import geovpylib.utils as u

eta = u.Eta()

db.connect_geovistory(env, pk_project, execute)
db.set_metadata({'import-id': datetime.today().strftime('%Y%m%d') + '-' + metadata_str})
db.set_insert_manner(import_manner)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
>> Connecting to PRODUCTION Database ... Connected!


In [49]:
db.globals.metadata

{'import-id': '20230531-bhp-actors-births'}

# Import BHP actors' birth into Geovistory

## Fetch data

### Junction table: pk BHP <=> pk Geovistory

In [2]:
persons = db.query(f"""
    select distinct
        r.pk_entity as pk_gv,
        a3.string as uri
    from information.resource r
    inner join projects.info_proj_rel ipr on ipr.fk_entity = r.pk_entity and ipr.fk_project = {pk_project} and ipr.is_in_project = true
    -- URI
    inner join information.statement s1 on s1.fk_subject_info = r.pk_entity and s1.fk_property = {pks.properties.entity_sameAsURI_URI}
    inner join projects.info_proj_rel ipr1 on ipr1.fk_entity = s1.pk_entity and ipr1.fk_project = {pk_project} and ipr1.is_in_project = true
    inner join information.statement s2 on s2.fk_subject_info = s1.fk_object_info and s2.fk_property = {pks.properties.appe_hasValue_string}
    inner join projects.info_proj_rel ipr2 on ipr2.fk_entity = s2.pk_entity and ipr2.fk_project = {pk_project} and ipr2.is_in_project = true
    inner join information.appellation a3 on a3.pk_entity = s2.fk_object_info
    inner join projects.info_proj_rel ipr3 on ipr3.fk_entity = a3.pk_entity and ipr3.fk_project = {pk_project} and ipr3.is_in_project = true
    where r.fk_class = {pks.classes.person}
""")

persons = persons[persons.uri.str.contains('symogih.org')]
persons['pk_bhp'] = persons.uri.str.replace('http://symogih.org/resource/Actr', '', regex=False).astype(int)
persons.drop(columns=['uri'], inplace=True)
persons.sort_values('pk_bhp', inplace=True)
persons.drop_duplicates(inplace=True)
persons.reset_index(inplace=True, drop=True)
persons = persons[['pk_bhp', 'pk_gv']]

a.infos(persons)

# 2s

Shape:  (59526, 2) - extract:


Unnamed: 0,pk_bhp,pk_gv
0,1,6532778
1,2,6499432
2,3,783602
3,4,6509333
4,5,6511070


### Add infos from BHP - Full births

In [3]:
db.connect_external(os.environ.get('YELLOW_BHP'))

actor_list = persons.pk_bhp.tolist()
actor_list = [('Actr' + str(pk)) for pk in actor_list]

real_birth_bhp = db.query(f"""
    select 
        ir.fk_associated_object as pk_bhp,
        ir.fk_information as fk_info,
        id.year, id.month, id.day,
        id.fk_abob_type_information_date
    from bhp.information_role ir
    inner join bhp.information_date id on ir.fk_information = id.fk_information
    where ir.fk_type_role = 40
""")
real_birth_bhp = real_birth_bhp[pd.notna(real_birth_bhp['pk_bhp'])]                          
real_birth_bhp = real_birth_bhp[real_birth_bhp['pk_bhp'].str.contains('Actr')]
real_birth_bhp['pk_bhp'] = real_birth_bhp['pk_bhp'].str.replace('Actr', '', regex=False)
real_birth_bhp['pk_bhp'] = real_birth_bhp['pk_bhp'].astype(pd.Int64Dtype())
real_birth_bhp['year'] = real_birth_bhp['year'].astype(pd.Int64Dtype())
real_birth_bhp['month'] = real_birth_bhp['month'].astype(pd.Int64Dtype())
real_birth_bhp['day'] = real_birth_bhp['day'].astype(pd.Int64Dtype())
real_birth_bhp['date_bhp'] = [(row.year, row.month, row.day) for i, row in real_birth_bhp.iterrows()]
real_birth_bhp['uri_birth'] = ['http://symogih.org/resource/Info' + str(fk_info) for fk_info in real_birth_bhp['fk_info']]

# For now we only import unique data ie: id.fk_abob_type_information_date == 246
real_birth_bhp = real_birth_bhp[real_birth_bhp['fk_abob_type_information_date'] == 246]

real_birth_bhp.drop(columns=['year', 'month', 'day', 'fk_info', 'fk_abob_type_information_date'], inplace=True)

a.infos(real_birth_bhp)

# 2s

>> Connecting to PGSQL Database ... Connected!
Shape:  (18446, 3) - extract:


Unnamed: 0,pk_bhp,date_bhp,uri_birth
0,59480,"(1884, 8, 27)",http://symogih.org/resource/Info530
1,46711,"(1881, 10, 4)",http://symogih.org/resource/Info531
2,21382,"(1787, 7, 4)",http://symogih.org/resource/Info1125
3,579,"(1650, <NA>, <NA>)",http://symogih.org/resource/Info1162
4,630,"(1619, 10, 12)",http://symogih.org/resource/Info1328


In [4]:
# Merge to main table
persons = persons.merge(real_birth_bhp, on='pk_bhp', how='left')

a.infos(persons, random=True)

Shape:  (59587, 4) - extract:


Unnamed: 0,pk_bhp,pk_gv,date_bhp,uri_birth
6812,7050,6525468,,
20295,20850,6508002,"(1793, 10, 1)",http://symogih.org/resource/Info18312
57052,61106,6537215,,
23512,24191,6505441,"(1714, 10, 16)",http://symogih.org/resource/Info19642
59088,63153,6541101,"(1688, 12, <NA>)",http://symogih.org/resource/Info164097


### Add infos from BHP - Small birth

In [5]:
actors = u.read_df('../../data/bhp/actor.csv')
actors = actors[['pk_actor', 'begin_year']]
actors.rename(inplace=True, columns={'pk_actor':'pk_bhp'})
actors['begin_year'] = [(year, pd.NA, pd.NA) for year in actors.begin_year]

In [6]:
# Merge to main table
persons = persons.merge(actors, on='pk_bhp', how='left')
persons['date_bhp'] = [row['date_bhp'] if pd.notna(row['date_bhp']) else row.begin_year for i, row in persons.iterrows()]
persons.drop(columns=['begin_year'], inplace=True)

a.infos(persons)

# 2s

Shape:  (59587, 4) - extract:


Unnamed: 0,pk_bhp,pk_gv,date_bhp,uri_birth
0,1,6532778,"(1599, <NA>, <NA>)",http://symogih.org/resource/Info8034
1,2,6499432,"(1600, <NA>, <NA>)",
2,3,783602,"(1581, <NA>, <NA>)",
3,4,6509333,"(1600, <NA>, <NA>)",
4,5,6511070,"(1681, <NA>, <NA>)",http://symogih.org/resource/Info32594


### Add existing birth from geovistory

In [7]:
values = '(' + ','.join([str(pk) for pk in persons.pk_gv.tolist()]) + ')'

db.connect_geovistory(env, pk_project, False, skip_protection=True)
gv_births = db.query(f"""
    select distinct
        s1.fk_object_info as pk_person,
        s1.pk_entity as pk_stmt_person_to_birth,
        s1.fk_subject_info as pk_birth
    from information.statement s1
    inner join projects.info_proj_rel ipr1 on ipr1.fk_entity = s1.pk_entity and ipr1.is_in_project = true
    left join information.statement s2 on s2.fk_subject_info = s1.fk_subject_info and s2.fk_property = {pks.properties.timeSpan_atSomeTimeWithin_timePrimitive}
    where s1.fk_object_info in {values}
      and s1.fk_property = {pks.properties.birth_broughtIntoLife_person}
""")

a.infos(gv_births)

# 2s

Requests will not be executed
>> Connecting to PRODUCTION Database ... Connected!
Shape:  (1171, 3) - extract:


Unnamed: 0,pk_person,pk_stmt_person_to_birth,pk_birth
0,786108,813029,809279
1,784475,811396,807646
2,149826,149839,149838
3,786873,813794,810044
4,26811,72101,69101


In [8]:
persons = persons.merge(gv_births, left_on='pk_gv', right_on='pk_person', how='left').drop(columns=['pk_person'])
persons['uri_birth'] = persons['uri_birth'].astype(pd.StringDtype())
persons['pk_stmt_person_to_birth'] = persons['pk_stmt_person_to_birth'].astype(pd.Int64Dtype())
persons['pk_birth'] = persons['pk_birth'].astype(pd.Int64Dtype())

persons

Unnamed: 0,pk_bhp,pk_gv,date_bhp,uri_birth,pk_stmt_person_to_birth,pk_birth
0,1,6532778,"(1599, <NA>, <NA>)",http://symogih.org/resource/Info8034,,
1,2,6499432,"(1600, <NA>, <NA>)",,,
2,3,783602,"(1581, <NA>, <NA>)",,810523,806773
3,4,6509333,"(1600, <NA>, <NA>)",,,
4,5,6511070,"(1681, <NA>, <NA>)",http://symogih.org/resource/Info32594,,
...,...,...,...,...,...,...
59582,63646,6544154,"(1709, <NA>, <NA>)",http://symogih.org/resource/Info166193,,
59583,63647,6543685,"(1711, 5, <NA>)",http://symogih.org/resource/Info166197,,
59584,63648,6543708,"(1728, <NA>, <NA>)",http://symogih.org/resource/Info166201,,
59585,63649,6544155,"(1726, <NA>, <NA>)",http://symogih.org/resource/Info166205,,


## Import data

In [9]:
db.connect_geovistory(env, pk_project, execute)

>> Connecting to PRODUCTION Database ... Connected!


### Birth - Add existing to project

In [10]:
to_add = []
to_add += persons[pd.notna(persons['pk_stmt_person_to_birth'])]['pk_stmt_person_to_birth'].tolist()
to_add += persons[pd.notna(persons['pk_birth'])]['pk_birth'].tolist()
db.info_proj_rels.create(to_add)

Creating info_proj_rel of 2352 entities with project <6857901> ... Done in [00h00'02]


### Birth - Create new

In [11]:
selection = persons[pd.isna(persons['pk_birth'])][['pk_gv']].copy()
selection['pk_birth_new'] = db.resources.create(pks.classes.birth, len(selection))

Batch creation of 58411 entities  is done - Elapsed: [00h32'58]                   


In [12]:
persons = persons.merge(selection, on='pk_gv', how='left')
persons['pk_birth'] = [row['pk_birth'] if pd.notna(row.pk_birth) else row['pk_birth_new'] for _, row in persons.iterrows()]
persons['pk_birth'] = persons['pk_birth'].astype(pd.Int64Dtype())
persons.drop(columns=['pk_birth_new'], inplace=True)

a.infos(persons)

Shape:  (59709, 6) - extract:


Unnamed: 0,pk_bhp,pk_gv,date_bhp,uri_birth,pk_stmt_person_to_birth,pk_birth
0,1,6532778,"(1599, <NA>, <NA>)",http://symogih.org/resource/Info8034,,7489054
1,2,6499432,"(1600, <NA>, <NA>)",,,7489055
2,3,783602,"(1581, <NA>, <NA>)",,810523.0,806773
3,4,6509333,"(1600, <NA>, <NA>)",,,7489056
4,5,6511070,"(1681, <NA>, <NA>)",http://symogih.org/resource/Info32594,,7489057


### Birth - Add URI

In [22]:
selection = persons[pd.notna(persons['uri_birth'])]
graphs.add_uris(selection['pk_birth'].tolist(), selection['uri_birth'].tolist())

Batch creation of 18565 entities  is done - Elapsed: [00h20'50]                   
Batch creation of 18565 appellations  is done - Elapsed: [00h20'09]                   
Batch creation of 18565 statements  is done - Elapsed: [00h20'28]                   
Batch creation of 18565 statements  is done - Elapsed: [00h20'27]                   


### Birth - Add date

In [46]:
def get_duration(date):
    if pd.notna(date[0]) and pd.isna(date[1]) and pd.isna(date[2]): return '1 year'
    if pd.notna(date[0]) and pd.notna(date[1]) and pd.isna(date[2]): return '1 month'
    if pd.notna(date[0]) and pd.notna(date[1]) and pd.notna(date[2]): return '1 day'
    return pd.NA


# Compute durations
selection = persons[['pk_birth', 'date_bhp']].copy()
selection['duration'] = [get_duration(d) for d in selection['date_bhp']]
selection.dropna(inplace=True)

# Create Time primitive
selection['pk_time_primitives'] = db.time_primitives.create(selection['date_bhp'].tolist(), selection['duration'].tolist())

# Birth has time primitive
db.statements.create(
    selection['pk_birth'], 
    pks.properties.timeSpan_atSomeTimeWithin_timePrimitive, 
    selection['pk_time_primitives']
)

Batch creation of 37127 time_primitives  is done - Elapsed: [00h26'34]                   
Batch creation of 37127 statements  - Elapsed: [00h20'59] - ETA [08h18'30] -   4.04%

KeyboardInterrupt: 

### Birth - Link birth to persons

In [50]:
selection = persons[['pk_gv', 'pk_birth', 'pk_stmt_person_to_birth']]
selection = selection[pd.isna(selection['pk_stmt_person_to_birth'])].drop(columns=['pk_stmt_person_to_birth'])

db.statements.create(
    selection['pk_birth'].tolist(), 
    pks.properties.birth_broughtIntoLife_person, 
    selection['pk_gv'].tolist()
)

Batch creation of 58533 statements  is done - Elapsed: [00h32'58]                   
