In [None]:
%load_ext autoreload
%autoreload 2

# Geovistory environment name handed to the db.connect_geovistory calls that precede
# the "insert" and "complete" sections of this notebook
env = 'stag'
# Key of the Geovistory project, handed to every db.connect_geovistory call
pk_project = 6857901
# Third argument of every db.connect_geovistory call.
# NOTE(review): presumably False means "dry run, write nothing" — confirm in geovpylib
execute = False

import os
import pandas as pd
import numpy as np
import duckdb
import plotly.express as px

import geovpylib.analysis as a
import geovpylib.database as db
import geovpylib.graphs as graphs
import geovpylib.pks as pks
import geovpylib.recordlinkage as rl
import geovpylib.sparql as sparql
import geovpylib.utils as u

# Eta helper from geovpylib.utils; it is not referenced again in the visible part of this notebook
eta = u.Eta()

# Import BHP actors into Geovistory

## Record linkage result

In [None]:
# Fetch record linkage and keep only the BHP key and the GEOV key
record_linkage = pd.read_csv('../../data/record-linkage-bhp-actors-geov-persons-filled.csv')
# Keep only the rows whose 'doublon' column equals 'oui'
record_linkage = record_linkage[record_linkage['doublon'] == 'oui']
record_linkage = record_linkage[['pk_bhp', 'pk_gv']]
# NOTE(review): the return value is discarded, so set_types presumably casts in place — confirm
a.set_types(record_linkage, {'pk_bhp':'int', 'pk_gv':'int'})
# Append the pairs of the second linkage file; its 'name' column is dropped before concatenating
record_linkage = pd.concat([record_linkage, u.read_df('../../data/record-linkage-bhp-actors-geov-person-uris.csv').drop(columns=['name'])])

# For usage: prepare lists for filtering
# pk_bhp_to_update: BHP actor keys of linked pairs; pk_gv_to_update: the matching Geovistory keys
pk_bhp_to_update = record_linkage['pk_bhp'].astype(int).tolist()
pk_gv_to_update = record_linkage['pk_gv'].astype(int).tolist()

## Fetch all information from BHP

**pk_actor**

In [None]:
# BHP actor table; later cells read its 'pk_actor' and 'gender_iso' columns
actors = u.read_df('../../data/bhp/actor.csv')

**URIs**

In [None]:
# Symogih URIs
# One URI per actor, built from a fixed prefix followed by the actor key
uris_symogih = pd.DataFrame()
uris_symogih['pk_actor'] = actors['pk_actor']
uris_symogih['uri'] = 'http://symogih.org/resource/Actr' + uris_symogih['pk_actor'].astype(str)

# URIs that are in BHP
# The connection string is read from the YELLOW_BHP environment variable
db.connect_external(os.environ.get('YELLOW_BHP'))
uris_bhp = db.query('select * from bhp.documentation')
# Keep only complete rows: documented object, documenting entity and identifier must all be set
uris_bhp = uris_bhp[pd.notna(uris_bhp['fk_documented_object'])]
uris_bhp = uris_bhp[pd.notna(uris_bhp['fk_documenting_entity'])]
uris_bhp = uris_bhp[pd.notna(uris_bhp['identifier'])]
# Keep rows whose documented object contains 'Actr' and whose documenting entity contains 'DiOb'
uris_bhp = uris_bhp[uris_bhp['fk_documented_object'].str.contains('Actr')]
uris_bhp = uris_bhp[uris_bhp['fk_documenting_entity'].str.contains('DiOb')]
# Strip the textual prefixes so that only the key part of each reference remains
uris_bhp['fk_documented_object'] = uris_bhp['fk_documented_object'].str.replace('Actr', '')
uris_bhp = uris_bhp[['fk_documented_object', 'fk_documenting_entity', 'identifier']]
uris_bhp['fk_documenting_entity'] = uris_bhp['fk_documenting_entity'].str.replace('DiOb', '')
# NOTE(review): the return value is discarded here but used on the next two lines, so parse_df
# presumably converts column types in place and also returns the frame — confirm in geovpylib
u.parse_df(uris_bhp)
resource_address_concat = u.parse_df(db.query('select * from bhp.resource_address_concatenation')[['fk_digital_object', 'fk_resource_address']])
resource_address = u.parse_df(db.query('select * from bhp.resource_address')[['pk_resource_address', 'uri']])
# Resolve documenting entity -> resource address -> base URI (left joins keep unresolved rows for now)
uris_bhp = uris_bhp.merge(resource_address_concat, left_on='fk_documenting_entity', right_on='fk_digital_object', how='left').drop(columns=['fk_documenting_entity', 'fk_digital_object'])
uris_bhp = uris_bhp.merge(resource_address, left_on='fk_resource_address', right_on='pk_resource_address', how='left')
# Full URI = base URI of the resource address followed by the identifier
uris_bhp['uri'] = uris_bhp['uri'] + uris_bhp['identifier']
uris_bhp = uris_bhp[['fk_documented_object', 'uri']]
# Rows whose base URI could not be resolved end up without a URI and are discarded
uris_bhp.dropna(subset=['uri'], inplace=True)
uris_bhp.rename(columns={'fk_documented_object': 'pk_actor'}, inplace=True)

# All together
uris = pd.concat([uris_symogih, uris_bhp]).sort_values('pk_actor').reset_index(drop=True)

**Name**

In [None]:
# BHP actor names, reduced to the columns this notebook uses and renamed to the
# short names the later cells expect (pk_actor, name, lang, begin, end).
names = u.read_df('../../data/bhp/actor_name.csv')[[ 'fk_actor', 'concat_name', 'lang_iso', 'begin_date', 'end_date']]
names.columns = ['pk_actor', 'name', 'lang', 'begin', 'end']
# The literal string 'None' in the language column is replaced by 'fra'.
# The result is assigned back: calling replace(inplace=True) on names['lang'] mutates a
# temporary Series, which is deprecated and has no effect on `names` under pandas Copy-on-Write.
names['lang'] = names['lang'].replace('None', 'fra', regex=False)

**Gender**

In [None]:
# Gender per actor: rows whose gender_iso is 0 are left out, code 1 is labelled 'Male'
# and every other remaining code is labelled 'Female'.
# NOTE(review): a missing or unexpected gender_iso value is therefore labelled 'Female' — confirm this is intended
has_gender_code = actors['gender_iso'] != 0
genders = actors.loc[has_gender_code, ['pk_actor', 'gender_iso']].copy()
genders['gender'] = np.where(genders['gender_iso'] == 1, 'Male', 'Female')
genders = genders.drop(columns=['gender_iso'])

**Definition**

In [None]:
# BHP actor text properties: one row per (actor, language, text)
definitions_raw = u.read_df('../../data/bhp/actor_text_property.csv')
definitions = (
    definitions_raw[['fk_actor', 'lang_iso_code', 'text']]
    .rename(columns={'fk_actor': 'pk_actor', 'lang_iso_code': 'lang'})
)
# Any occurrence of 'None' inside the language code is replaced by 'fra'
definitions['lang'] = definitions['lang'].str.replace('None', 'fra')

## Insert new actors

**Connect to Geovistory database**

In [None]:
# Connect to the environment, project and execute flag set in the first cell;
# the cells of the "Insert new actors" section run against this connection
db.connect_geovistory(env, pk_project, execute)

**Filter out those who only need updates**

In [None]:
# Actors that appear in the record linkage only need updates; all other actors are created.
# isin() does the membership test in one vectorised pass instead of scanning the
# pk_bhp_to_update list once per actor, and .copy() yields an independent frame so that
# the columns added to to_create by later cells are not written onto a slice of `actors`.
is_linked = actors['pk_actor'].isin(pk_bhp_to_update)
to_create = actors.loc[~is_linked, ['pk_actor']].copy()

print('Actor nb to create:', len(to_create))

**Create Persons**

In [None]:
# Ask db.resources.create for len(to_create) resources of class person and store what it
# returns as the pk_person column.
# NOTE(review): presumably one new key per row, in row order — confirm in geovpylib
to_create['pk_person'] = db.resources.create(pks.classes.person, len(to_create))

**Add URIs**

In [None]:
# Pair every new person with the URIs of its BHP actor; duplicated pairs and
# pairs lacking either the person key or the URI are discarded.
uris_to_create = to_create.merge(uris, on="pk_actor", how="inner")
uris_to_create = uris_to_create.drop_duplicates()
uris_to_create = uris_to_create.dropna(subset=['pk_person', 'uri'])

person_pks = uris_to_create['pk_person'].tolist()
person_uris = uris_to_create['uri'].tolist()
graphs.add_uris(person_pks, person_uris)

**Add person names**

In [None]:
# Names of the new persons: one row per (person, name); duplicated rows and rows
# lacking the person key or the name are discarded.
names_to_create = to_create.merge(names, on="pk_actor", how="inner")
names_to_create = names_to_create.drop_duplicates().dropna(subset=['pk_person', 'name'])

# ISO code -> language key; a missing code is treated as 'fra'
names_to_create['lang'] = [
    pks.languages.from_iso_code('fra' if pd.isna(iso_code) else iso_code)
    for iso_code in names_to_create['lang']
]

# The value returned by add_person_names is stored as pk_paial, one per name
names_to_create['pk_paial'] = graphs.add_person_names(
    names_to_create['pk_person'].tolist(),
    names_to_create['name'].tolist(),
    names_to_create['lang'].tolist(),
    return_pk_paial=True
)


# Begin dates: only the names that carry one.
# The stored text is split on ',' after removing its first and last character and
# turned into a tuple of integers.
has_begin = pd.notna(names_to_create['begin'])
names_with_begin = names_to_create[has_begin][['pk_paial', 'begin']]
names_with_begin['begin'] = [
    tuple(int(part.strip()) for part in raw[1:-1].split(','))
    for raw in names_with_begin['begin']
]

begin_dates = names_with_begin['begin'].tolist()
time_prims_begin = db.time_primitives.create(begin_dates, '1 day')
db.statements.create(
    names_with_begin['pk_paial'].tolist(),
    pks.properties.timespan_endOfTheBegin_timePrim,
    time_prims_begin
)


# End dates: same treatment for the names that carry one
has_end = pd.notna(names_to_create['end'])
names_with_end = names_to_create[has_end][['pk_paial', 'end']]
names_with_end['end'] = [
    tuple(int(part.strip()) for part in raw[1:-1].split(','))
    for raw in names_with_end['end']
]

end_dates = names_with_end['end'].tolist()
time_prims_end = db.time_primitives.create(end_dates, '1 day')
db.statements.create(
    names_with_end['pk_paial'].tolist(),
    pks.properties.timespan_beginOfTheEnd_timePrim,
    time_prims_end
)

**Add genders**

In [None]:
# Gender of the new persons; duplicated rows and rows lacking the person key or the
# gender label are discarded.
genders_to_create = to_create.merge(genders, on="pk_actor", how="inner")
genders_to_create = genders_to_create.drop_duplicates().dropna(subset=['pk_person', 'gender'])

# Swap the textual labels for the keys of the corresponding gender entities
gender_pks = genders_to_create['gender'].replace('Male', pks.entities.pk_gender_male)
genders_to_create['gender'] = gender_pks.replace('Female', pks.entities.pk_gender_female)

# One "person has gender" statement per row
subject_pks = genders_to_create['pk_person'].tolist()
object_pks = genders_to_create['gender'].tolist()
db.statements.create(subject_pks, pks.properties.person_hasGender_gender, object_pks)

**Definition**

In [None]:
# Definitions of the new persons; duplicated rows and rows lacking the person key or
# the text are discarded.
definitions_to_create = to_create.merge(definitions, on="pk_actor", how="inner")
definitions_to_create = definitions_to_create.drop_duplicates()
definitions_to_create = definitions_to_create.dropna(subset=['pk_person', 'text'])

# ISO code -> language key; a missing code is treated as 'fra'
definitions_to_create['pk_lang'] = [
    pks.languages.from_iso_code('fra' if pd.isna(iso_code) else iso_code)
    for iso_code in definitions_to_create['lang']
]

graphs.add_definitions(
    definitions_to_create['pk_person'].tolist(),
    definitions_to_create['text'].tolist(),
    definitions_to_create['pk_lang'].tolist()
)

## Fetch information from Geovistory (for existing persons)

**Connect to Geovistory database**

In [None]:
# Connect to production: the only database call made before db.disconnect() is a select.
# NOTE(review): 'prod' is hard-coded here instead of `env`, and skip_protection=True presumably
# bypasses a safety check in geovpylib — confirm that nothing can be written through this connection
db.connect_geovistory('prod', pk_project, execute, skip_protection=True)

**Query database**

In [None]:
# SQL list literal "(pk1,pk2,...)" of the Geovistory person keys from the record linkage.
# NOTE(review): with an empty pk_gv_to_update this renders "()", which the database would presumably reject
values = '(' + ','.join(map(lambda s: str(s), pk_gv_to_update)) + ')'

# One wide result per linked person: its URIs, names (with language and begin/end time
# primitives), gender and definitions, each reached through left joins on information.statement.
# Because the URI, name, gender and definition joins are independent, a person with several
# URIs and several names yields one row per combination; later cells select the columns
# they need and drop duplicates.
persons = db.query(f"""
    select
        r1.pk_entity as pk_person,
        -- URIs
        s1.fk_object_info as pk_uri,
        a1.string as uri,
        s1.pk_entity as uri_pk_statement_1,
        s2.pk_entity as uri_pk_statement_2,
        a1.pk_entity as uri_pk_appellation,
        -- Names
        s3.pk_entity as name_pk_statement_3,
        s3.fk_subject_info as pk_paial,
        s4.pk_entity as name_pk_statement_4,
        a2.pk_entity as name_pk_appellation_2,
        a2.string as name,
        s5.pk_entity as name_pk_statement_5,
        s5.fk_object_info as pk_name_lang,
        s6.pk_entity as name_pk_statement_6,
        tp1.pk_entity as name_pk_tp_begin,
        tp1.julian_day as name_begin_jd,
        s7.pk_entity as name_pk_statement_7,
        tp2.pk_entity as name_pk_tp_end,
        tp2.julian_day as name_end_jd,
        -- Gender
        s8.pk_entity as gender_pk_statement,
        s8.fk_object_info as pk_gender,
        -- Definition
        s10.fk_object_info as pk_def_lang,
        a3.string as definition
    from information.resource r1
    -- URIs
    left join information.statement s1 on s1.fk_subject_info = r1.pk_entity and s1.fk_property = {pks.properties.entity_sameAsURI_URI}
    left join information.statement s2 on s2.fk_subject_info = s1.fk_object_info and s2.fk_property = {pks.properties.appe_hasValue_string}
    left join information.appellation a1 on a1.pk_entity = s2.fk_object_info
    -- Names
    left join information.statement s3 on s3.fk_object_info = r1.pk_entity and s3.fk_property = {pks.properties.apial_isAppelationForLanguageOf_entity}
    left join information.statement s4 on s4.fk_subject_info = s3.fk_subject_info and s4.fk_property = {pks.properties.aial_refersToName_appellation}
    left join information.appellation a2 on a2.pk_entity = s4.fk_object_info
    left join information.statement s5 on s5.fk_subject_info = s3.fk_subject_info and s5.fk_property = {pks.properties.apial_usedInLanguage_language}
    left join information.statement s6 on s6.fk_subject_info = s3.fk_subject_info and s6.fk_property = {pks.properties.timespan_endOfTheBegin_timePrim}
    left join information.time_primitive tp1 on tp1.pk_entity = s6.fk_object_info
    left join information.statement s7 on s7.fk_subject_info = s3.fk_subject_info and s7.fk_property = {pks.properties.timespan_beginOfTheEnd_timePrim}
    left join information.time_primitive tp2 on tp2.pk_entity = s7.fk_object_info
    -- Genders
    left join information.statement s8 on s8.fk_subject_info = r1.pk_entity and s8.fk_property = {pks.properties.person_hasGender_gender}
    -- Definitions
    left join information.statement s9 on s9.fk_subject_info = r1.pk_entity and s9.fk_property = {pks.properties.entity_hasDefinition_text}
    left join information.statement s10 on s10.fk_subject_info = s9.fk_object_info and s10.fk_property = {pks.properties.linguisticObj_hasLanguage_language}
    left join information.statement s11 on s11.fk_subject_info = s9.fk_object_info and s11.fk_property = {pks.properties.text_hasValueVersion_string}
    left join information.appellation a3 on a3.pk_entity = s11.fk_object_info

    where r1.pk_entity in {values}
""")

# Declare the column types of the name, gender and definition columns.
# NOTE(review): the return value is discarded, so set_types presumably casts in place — confirm
a.set_types(persons, {'pk_name_lang':'int', 'name_begin_jd':'int', 'name_end_jd':'int', 'pk_gender':'int', 'pk_def_lang':'int', 'definition':'string'})
# Same for the URI columns
a.set_types(persons, {'pk_uri': 'int', 'uri':'string', 'uri_pk_statement_1':'int', 'uri_pk_statement_2':'int', 'uri_pk_appellation':'int'})

# Join the pk bhp
# Inner merge on the person key adds pk_bhp to every row; pk_gv duplicates pk_person and is dropped
persons = persons.merge(record_linkage, left_on='pk_person', right_on='pk_gv').drop(columns=['pk_gv']).drop_duplicates()

a.infos(persons, random=True)

**Disconnect from production**

In [None]:
# Close the production connection before the notebook reconnects with `env` for the update steps
db.disconnect()

## Complete existing information

**Connect to Geovistory database**

In [None]:
# Reconnect with the environment, project and execute flag set in the first cell;
# the remaining cells of the notebook run against this connection
db.connect_geovistory(env, pk_project, execute)

**Add person to project**

In [None]:
# Hand the Geovistory person keys of the record linkage to info_proj_rels.create.
# NOTE(review): presumably this links each entity to pk_project — confirm in geovpylib
db.info_proj_rels.create(pk_gv_to_update)

**URIs: add the existing ones to the project, create the missing ones**

Existing:

In [None]:
# URIs wanted for the linked persons: every record-linkage pair combined with the URIs of its BHP actor
record_linkage_uri_bhp = (
    record_linkage
    .merge(uris, left_on='pk_bhp', right_on='pk_actor', how='inner')
    .drop(columns=['pk_actor'])
)

# URIs already present in Geovistory, together with the keys of every entity that makes them up
uri_columns = ['pk_bhp', 'pk_person', 'pk_uri', 'uri', 'uri_pk_statement_1', 'uri_pk_statement_2', 'uri_pk_appellation']
uris_gv = persons[uri_columns].dropna().drop_duplicates()

# A wanted URI that matches an existing one (by its string) only has to be added to the project
uris_add_to_project = record_linkage_uri_bhp.merge(
    uris_gv.drop(columns=['pk_bhp', 'pk_person']),
    on='uri',
    how='inner'
)

# Add every part of the existing URIs to the project, in the same order as before
for part in ('pk_uri', 'uri_pk_statement_1', 'uri_pk_statement_2', 'uri_pk_appellation'):
    db.info_proj_rels.create(uris_add_to_project[part].astype(int).tolist())

Non-existing:

In [None]:
# URIs that already exist in Geovistory were added to the project above; only the others are created.
# isin() replaces the per-row `not in list` scan (quadratic) and avoids a loop variable
# named `u`, which reads like the geovpylib.utils alias used elsewhere in the notebook.
uri_existing = uris_add_to_project['uri'].tolist()
is_existing = record_linkage_uri_bhp['uri'].isin(uri_existing)
uris_to_create = record_linkage_uri_bhp[~is_existing]

graphs.add_uris(
    uris_to_create['pk_gv'].astype(int).tolist(),
    uris_to_create['uri'].astype(str).tolist()
)

**Names: add the existing ones to the project, create the missing ones**

In [None]:
# Columns of `persons` that describe one name of a person and the entities it is made of
names_gv = persons[['pk_person', 'pk_paial', 'pk_name_lang', 'name', 'name_begin_jd', 'name_end_jd', 'name_pk_statement_3', 'name_pk_statement_4', 'name_pk_appellation_2', 'name_pk_statement_5', 'name_pk_statement_6', 'name_pk_tp_begin', 'name_pk_statement_7', 'name_pk_tp_end']]

# Output columns, spelled out so that both frames keep their columns even when no row is
# collected (an empty list of records would otherwise give a frame without any column).
add_to_project_columns = ['pk_person', 'name', 'begin', 'end', 'pk_paial', 'pk_tp_begin', 'pk_tp_end', 'stmt3', 'stmt4', 'appe2', 'stmt5', 'stmt6', 'stmt7']
to_create_columns = ['pk_person', 'name', 'pk_lang', 'begin', 'end']

names_add_to_project = []
names_to_create = []

for _, row in record_linkage.iterrows():
    # Names of this pair on both sides. Rows without a name are left out: they cannot be
    # normalised for comparison (a missing value has no .replace) and there is nothing to create.
    select_bhp = names[names['pk_actor'] == row['pk_bhp']].dropna(subset=['name']).drop_duplicates(subset=['name', 'lang'])
    select_gv = names_gv[names_gv['pk_person'] == row['pk_gv']].dropna(subset=['name']).drop_duplicates(subset=['name', 'pk_name_lang'])

    for _, row_bhp in select_bhp.iterrows():
        # A missing language code is treated as 'fra'
        lang = pks.languages.from_iso_code(row_bhp['lang']) if pd.notna(row_bhp['lang']) else pks.languages.from_iso_code('fra')
        # Comparison key: lower-cased words without commas, sorted, followed by the language key,
        # so that "Dupont, Jean" and "Jean Dupont" in the same language compare equal
        name_bhp = ' '.join(sorted(row_bhp['name'].replace(',', '').lower().split(' '))) + ' - ' + str(lang)
        found = False

        # Only undated BHP names are matched against existing names; dated ones are always created
        if pd.isna(row_bhp['begin']) and pd.isna(row_bhp['end']):
            for _, row_gv in select_gv.iterrows():
                # NOTE(review): the keys only match if str(pk_name_lang) renders like str(lang) — confirm the column dtype
                name_gv = ' '.join(sorted(row_gv['name'].replace(',', '').lower().split(' '))) + ' - ' + str(row_gv['pk_name_lang'])
                if name_bhp == name_gv:
                    found = True
                    names_add_to_project.append({
                        'pk_person': row['pk_gv'],
                        'name': row_bhp['name'],
                        'begin': row_bhp['begin'],
                        'end': row_bhp['end'],
                        'pk_paial': row_gv['pk_paial'],
                        'pk_tp_begin': row_gv['name_pk_tp_begin'],
                        'pk_tp_end': row_gv['name_pk_tp_end'],
                        'stmt3': row_gv['name_pk_statement_3'],
                        'stmt4': row_gv['name_pk_statement_4'],
                        'appe2': row_gv['name_pk_appellation_2'],
                        'stmt5': row_gv['name_pk_statement_5'],
                        'stmt6': row_gv['name_pk_statement_6'],
                        'stmt7': row_gv['name_pk_statement_7'],
                    })

        if not found:
            names_to_create.append({
                'pk_person': row['pk_gv'],
                'name': row_bhp['name'],
                'pk_lang': lang,
                'begin': row_bhp['begin'],
                'end': row_bhp['end'],
            })


names_to_create = pd.DataFrame(data=names_to_create, columns=to_create_columns)
names_add_to_project = pd.DataFrame(data=names_add_to_project, columns=add_to_project_columns)

In [None]:
# Create information - new names
# return_pk_paial=True is passed because the result is stored as pk_paial and used below
# as the subject of the date statements (same call shape as for the newly created persons).
names_to_create['pk_paial'] = graphs.add_person_names(
    names_to_create['pk_person'].astype(int).tolist(),
    names_to_create['name'].astype(str).tolist(),
    names_to_create['pk_lang'].astype(int).tolist(),
    return_pk_paial=True
)

# Create information - new names - begin
# Keep the names that have both a pk_paial and a begin date. Filtering on 'begin' (not 'end')
# matters: a name with a begin date but no end date must still get its begin statement.
names_begin = names_to_create.dropna(subset=['pk_paial', 'begin']).copy()
# The stored text is split on ',' after removing its first and last character -> tuple of integers
names_begin['begin'] = [(tuple([int(n.strip()) for n in d[1:-1].split(',')])) for d in names_begin['begin']]
tp_begin = db.time_primitives.create(names_begin['begin'].tolist(), '1 day')
db.statements.create(
    names_begin['pk_paial'].astype(int).tolist(),
    pks.properties.timespan_endOfTheBegin_timePrim,
    tp_begin
)

# Create information - new names - end
names_end = names_to_create.dropna(subset=['pk_paial', 'end']).copy()
names_end['end'] = [(tuple([int(n.strip()) for n in d[1:-1].split(',')])) for d in names_end['end']]
tp_end = db.time_primitives.create(names_end['end'].tolist(), '1 day')
# End dates use the "begin of the end" property, like the end dates of newly created persons;
# reusing the "end of the begin" property here would record them as begin dates.
db.statements.create(
    names_end['pk_paial'].astype(int).tolist(),
    pks.properties.timespan_beginOfTheEnd_timePrim,
    tp_end
)


# Nothing to add when no existing name matched (the frame may then have no columns at all)
if len(names_add_to_project) > 0:
    # Create information - add to project - names
    db.info_proj_rels.create(names_add_to_project['stmt3'].astype(int).tolist())
    db.info_proj_rels.create(names_add_to_project['pk_paial'].astype(int).tolist())
    db.info_proj_rels.create(names_add_to_project['stmt4'].astype(int).tolist())
    db.info_proj_rels.create(names_add_to_project['appe2'].astype(int).tolist())
    db.info_proj_rels.create(names_add_to_project['stmt5'].astype(int).tolist())

    # Create information - add to project - begin
    names_add_to_project_dates_begin = names_add_to_project.dropna(subset=['pk_tp_begin'])
    db.info_proj_rels.create(names_add_to_project_dates_begin['stmt6'].astype(int).tolist())
    db.info_proj_rels.create(names_add_to_project_dates_begin['pk_tp_begin'].astype(int).tolist())

    # Create information - add to project - end
    names_add_to_project_dates_end = names_add_to_project.dropna(subset=['pk_tp_end'])
    db.info_proj_rels.create(names_add_to_project_dates_end['stmt7'].astype(int).tolist())
    db.info_proj_rels.create(names_add_to_project_dates_end['pk_tp_end'].astype(int).tolist())

**Gender: add the existing ones to the project, create the missing ones**

In [None]:
# Gender known in BHP for every linked pair
record_linkage_gender_bhp = record_linkage.merge(genders, left_on='pk_bhp', right_on='pk_actor', how='inner').drop(columns=['pk_actor'])
# Gender statement (if any) that each linked person already has in Geovistory
genders_gv = persons[['pk_person', 'pk_gender', 'gender_pk_statement']].drop_duplicates()

# Match on the person key: pk_gv identifies a person, so it has to be compared with
# pk_person. Comparing it with pk_gender (the key of a gender entity) never matches, which
# made every gender look missing and re-created it for persons that already had one.
genders_to_update = record_linkage_gender_bhp.merge(genders_gv, left_on='pk_gv', right_on='pk_person', how='left')
# Swap the textual labels for the keys of the corresponding gender entities
genders_to_update['gender'] = genders_to_update['gender'].replace('Male', pks.entities.pk_gender_male)
genders_to_update['gender'] = genders_to_update['gender'].replace('Female', pks.entities.pk_gender_female)

# Persons with a gender statement only need it added to the project; the others get a new one
genders_add_to_project = genders_to_update[pd.notna(genders_to_update['pk_gender'])]
genders_to_create = genders_to_update[pd.isna(genders_to_update['pk_gender'])]

# Add existing
if len(genders_add_to_project) > 0: db.info_proj_rels.create(genders_add_to_project['gender_pk_statement'].astype(int).tolist())

# Create new ones
db.statements.create(
    genders_to_create['pk_gv'].astype(int).tolist(),
    pks.properties.person_hasGender_gender,
    genders_to_create['gender'].astype(int).tolist()
)

**Definitions**

In [None]:
# Definitions known in BHP for every linked pair
record_linkage_definition_bhp = record_linkage.merge(definitions, left_on='pk_bhp', right_on='pk_actor', how='inner').drop(columns=['pk_actor'])
# ISO code -> language key. A missing code is treated as 'fra', as is done for the definitions
# of newly created persons; without the fallback a missing value would be passed to from_iso_code.
record_linkage_definition_bhp['pk_lang'] = [
    pks.languages.from_iso_code(code if pd.notna(code) else 'fra')
    for code in record_linkage_definition_bhp['lang']
]

# Create new definitions
# NOTE(review): the definitions fetched from Geovistory (persons['definition']) are not compared
# here, so a definition that already exists would presumably be created a second time — confirm
graphs.add_definitions(
    record_linkage_definition_bhp['pk_gv'].astype(int).tolist(),
    record_linkage_definition_bhp['text'].astype(str).tolist(),
    record_linkage_definition_bhp['pk_lang'].astype(int).tolist(),
)