In [1]:
%load_ext autoreload
%autoreload 2

env = ''
pk_project = 6857901
execute = True

import os
import pandas as pd
import numpy as np
import duckdb
import plotly.express as px

import geovpylib.analysis as a
import geovpylib.database as db
import geovpylib.graphs as graphs
import geovpylib.pks as pks
import geovpylib.recordlinkage as rl
import geovpylib.sparql as sparql
import geovpylib.utils as u

eta = u.Eta()

# Import BHP actors into Geovistory

## Record linkage result

In [2]:
# Fetch record linkage and keep only the BHP key and the GEOV key
record_linkage = pd.read_csv('../../data/record-linkage-bhp-actors-geov-persons-filled.csv')
record_linkage = record_linkage[record_linkage['doublon'] == 'oui']
record_linkage = record_linkage[['pk_bhp', 'pk_gv']]
a.set_types(record_linkage, {'pk_bhp':'int', 'pk_gv':'int'})
record_linkage = pd.concat([record_linkage, u.read_df('../../data/record-linkage-bhp-actors-geov-person-uris.csv').drop(columns=['name'])])

# For usage: prepare lists for filtering
pk_bhp_to_update = record_linkage['pk_bhp'].astype(int).tolist()
pk_gv_to_update = record_linkage['pk_gv'].astype(int).tolist()

## Fetch all information from BHP

**pk_actor**

In [3]:
actors = u.read_df('../../data/bhp/actor.csv')

**URIs**

In [4]:
# Symogih URIs
uris_symogih = pd.DataFrame()
uris_symogih['pk_actor'] = actors['pk_actor']
uris_symogih['uri'] = 'http://symogih.org/resource/Actr' + uris_symogih['pk_actor'].astype(str)

# URIs that are in BHP
db.connect_external(os.environ.get('YELLOW_BHP'))
uris_bhp = db.query('select * from bhp.documentation')
uris_bhp = uris_bhp[pd.notna(uris_bhp['fk_documented_object'])]
uris_bhp = uris_bhp[pd.notna(uris_bhp['fk_documenting_entity'])]
uris_bhp = uris_bhp[pd.notna(uris_bhp['identifier'])]
uris_bhp = uris_bhp[uris_bhp['fk_documented_object'].str.contains('Actr')]
uris_bhp = uris_bhp[uris_bhp['fk_documenting_entity'].str.contains('DiOb')]
uris_bhp['fk_documented_object'] = uris_bhp['fk_documented_object'].str.replace('Actr', '')
uris_bhp = uris_bhp[['fk_documented_object', 'fk_documenting_entity', 'identifier']]
uris_bhp['fk_documenting_entity'] = uris_bhp['fk_documenting_entity'].str.replace('DiOb', '')
u.parse_df(uris_bhp)
resource_address_concat = u.parse_df(db.query('select * from bhp.resource_address_concatenation')[['fk_digital_object', 'fk_resource_address']])
resource_address = u.parse_df(db.query('select * from bhp.resource_address')[['pk_resource_address', 'uri']])
uris_bhp = uris_bhp.merge(resource_address_concat, left_on='fk_documenting_entity', right_on='fk_digital_object', how='left').drop(columns=['fk_documenting_entity', 'fk_digital_object'])
uris_bhp = uris_bhp.merge(resource_address, left_on='fk_resource_address', right_on='pk_resource_address', how='left')
uris_bhp['uri'] = uris_bhp['uri'] + uris_bhp['identifier']
uris_bhp = uris_bhp[['fk_documented_object', 'uri']]
uris_bhp.dropna(subset=['uri'], inplace=True)
uris_bhp.rename(columns={'fk_documented_object': 'pk_actor'}, inplace=True)

# All together
uris = pd.concat([uris_symogih, uris_bhp]).sort_values('pk_actor').reset_index(drop=True)

>> Connecting to PGSQL Database ... Connected!


**Name**

In [5]:
names = u.read_df('../../data/bhp/actor_name.csv')[[ 'fk_actor', 'concat_name', 'lang_iso', 'begin_date', 'end_date']]
names.columns = ['pk_actor', 'name', 'lang', 'begin', 'end']
names['lang'].replace('None', 'fra', inplace=True, regex=False)

**Gender**

In [6]:
genders = actors[['pk_actor', 'gender_iso']]
genders = genders[genders['gender_iso'] != 0]
genders['gender'] = ['Male' if iso == 1 else 'Female' for iso in genders['gender_iso']]
genders.drop(columns=['gender_iso'], inplace=True)

**Definition**

In [7]:
definitions = u.read_df('../../data/bhp/actor_text_property.csv')[['fk_actor', 'lang_iso_code', 'text']]
definitions.rename(columns={'lang_iso_code': 'lang', 'fk_actor':'pk_actor'}, inplace=True)
definitions['lang'] = definitions['lang'].str.replace('None', 'fra')

## Insert new actors

**Connect to Geovistory database**

In [8]:
db.connect_geovistory(env, pk_project, execute)

>> Connecting to PRODUCTION Database ... Connected!


**Filter out those who only need updates**

In [9]:
to_create = pd.DataFrame(columns=['pk_actor'])
to_create['pk_actor'] = actors['pk_actor']
to_create = to_create[[pk not in pk_bhp_to_update for pk in to_create['pk_actor'].tolist()]]

print('Actor nb to create:', len(to_create))

# 1m10s

Actor nb to create: 58344


**Create Persons**

In [10]:
to_create['pk_person'] = db.resources.create(pks.classes.person, len(to_create))

# 14s

Creating 58344 resources of class [21] ... Done in [00h00'04]
Creating info_proj_rel of 58344 entities with project <6857901> ... Done in [00h00'12]


**Add URIs**

In [11]:
uris_to_create = to_create.merge(uris, on="pk_actor", how="inner").drop_duplicates().dropna(subset=['pk_person', 'uri'])

graphs.add_uris(
    uris_to_create['pk_person'].tolist(),
    uris_to_create['uri'].tolist()
)

# 1m50s

Creating 63521 resources of class [967] ... Done in [00h00'05]
Creating info_proj_rel of 63521 entities with project <6857901> ... Done in [00h00'12]
Creating 63521 appellations ... Done in [00h00'31]
Creating info_proj_rel of 63521 entities with project <6857901> ... Done in [00h00'13]
Creating 63521 statements ... Done in [00h00'10]
Creating info_proj_rel of 63521 entities with project <6857901> ... Done in [00h00'13]
Creating 63521 statements ... Done in [00h00'09]
Creating info_proj_rel of 63521 entities with project <6857901> ... Done in [00h00'12]


**Add person names**

In [12]:
# Create Person appellation in a language
names_to_create = to_create.merge(names, on="pk_actor", how="inner").drop_duplicates().dropna(subset=['pk_person', 'name'])
names_to_create['lang'] = [pks.languages.from_iso_code(iso_code if pd.notna(iso_code) else 'fra') for iso_code in names_to_create['lang']]

names_to_create['pk_paial'] = graphs.add_person_names(
    names_to_create['pk_person'].tolist(),
    names_to_create['name'].tolist(),
    names_to_create['lang'].tolist(),
    return_pk_paial=True
)


# For those with a begin date, add begin date
names_with_begin = names_to_create[pd.notna(names_to_create['begin'])][['pk_paial', 'begin']]
names_with_begin['begin'] = [(tuple([int(n.strip()) for n in d[1:-1].split(',')])) for d in names_with_begin['begin']]

time_prims_begin = db.time_primitives.create(
    names_with_begin['begin'].tolist(),
    '1 day'
)
db.statements.create(
    names_with_begin['pk_paial'].tolist(),
    pks.properties.timespan_endOfTheBegin_timePrim,
    time_prims_begin
)


# For those with a end date, add end date
names_with_end = names_to_create[pd.notna(names_to_create['end'])][['pk_paial', 'end']]
names_with_end['end'] = [(tuple([int(n.strip()) for n in d[1:-1].split(',')])) for d in names_with_end['end']]

time_prims_end = db.time_primitives.create(
    names_with_end['end'].tolist(),
    '1 day'
)
db.statements.create(
    names_with_end['pk_paial'].tolist(),
    pks.properties.timespan_beginOfTheEnd_timePrim,
    time_prims_end
)

# 3m3s

Creating 63942 resources of class [868] ... Done in [00h00'04]
Creating info_proj_rel of 63942 entities with project <6857901> ... Done in [00h00'13]
Creating 63942 appellations ... Done in [00h00'15]
Creating info_proj_rel of 63942 entities with project <6857901> ... Done in [00h00'13]
Creating 63942 statements ... Done in [00h00'10]
Creating info_proj_rel of 63942 entities with project <6857901> ... Done in [00h00'13]
Creating 63942 statements ... Done in [00h01'26]
Creating info_proj_rel of 63942 entities with project <6857901> ... Done in [00h00'12]
Creating 63942 statements ... Done in [00h00'10]
Creating info_proj_rel of 63942 entities with project <6857901> ... Done in [00h00'13]
Creating 844 time primitives ... Done in [00h00'00]
Creating info_proj_rel of 844 entities with project <6857901> ... Done in [00h00'00]
Creating 844 statements ... Done in [00h00'00]
Creating info_proj_rel of 844 entities with project <6857901> ... Done in [00h00'01]
Creating 341 time primitives ... Do

**Add genders**

In [13]:
genders_to_create = to_create.merge(genders, on="pk_actor", how="inner").drop_duplicates().dropna(subset=['pk_person', 'gender'])
genders_to_create['gender'] = genders_to_create['gender'].replace('Male', pks.entities.pk_gender_male)
genders_to_create['gender'] = genders_to_create['gender'].replace('Female', pks.entities.pk_gender_female)

db.statements.create(
    genders_to_create['pk_person'].tolist(),
    pks.properties.person_hasGender_gender,
    genders_to_create['gender'].tolist()   
)

# 2m25s

Creating 56672 statements ... Done in [00h02'23]
Creating info_proj_rel of 56672 entities with project <6857901> ... Done in [00h00'11]


**Definition**

In [14]:
definitions_to_create = to_create.merge(definitions, on="pk_actor", how="inner").drop_duplicates().dropna(subset=['pk_person', 'text'])
definitions_to_create.dropna(subset=['pk_person', 'text'], inplace=True)
definitions_to_create['pk_lang'] = [pks.languages.from_iso_code(iso_code if pd.notna(iso_code) else 'fra') for iso_code in definitions_to_create['lang']]

# To be sure
definitions_to_create = definitions_to_create[~definitions_to_create['text'].str.contains('<xml>')]

graphs.add_definitions(
    definitions_to_create['pk_person'].tolist(),
    definitions_to_create['text'].tolist(),
    definitions_to_create['pk_lang'].tolist()
)

# 

Creating 52162 resources of class [899] ... Done in [00h00'03]
Creating info_proj_rel of 52162 entities with project <6857901> ... Done in [00h00'11]
Creating 52162 appellations ... Done in [00h06'41]
Creating info_proj_rel of 52162 entities with project <6857901> ... Done in [00h00'10]
Creating 52162 statements ... Done in [00h00'08]
Creating info_proj_rel of 52162 entities with project <6857901> ... Done in [00h00'11]
Creating 52162 statements ... Done in [00h00'34]
Creating info_proj_rel of 52162 entities with project <6857901> ... Done in [00h00'11]
Creating 52162 statements ... Done in [00h00'08]
Creating info_proj_rel of 52162 entities with project <6857901> ... Done in [00h00'11]


## Fetch information from Geovistory (for existing)

**Connect to Geovistory database**

In [15]:
db.connect_geovistory('prod', pk_project, execute, skip_protection=True)

>> Connecting to PRODUCTION Database ... Connected!


**Query database**

In [16]:
values = '(' + ','.join(map(lambda s: str(s), pk_gv_to_update)) + ')'

persons = db.query(f"""
    select
        r1.pk_entity as pk_person,
        -- URIs
        s1.fk_object_info as pk_uri,
        a1.string as uri,
        s1.pk_entity as uri_pk_statement_1,
        s2.pk_entity as uri_pk_statement_2,
        a1.pk_entity as uri_pk_appellation,
        -- Names
        s3.pk_entity as name_pk_statement_3,
        s3.fk_subject_info as pk_paial,
        s4.pk_entity as name_pk_statement_4,
        a2.pk_entity as name_pk_appellation_2,
        a2.string as name,
        s5.pk_entity as name_pk_statement_5,
        s5.fk_object_info as pk_name_lang,
        s6.pk_entity as name_pk_statement_6,
        tp1.pk_entity as name_pk_tp_begin,
        tp1.julian_day as name_begin_jd,
        s7.pk_entity as name_pk_statement_7,
        tp2.pk_entity as name_pk_tp_end,
        tp2.julian_day as name_end_jd,
        -- Gender
        s8.pk_entity as gender_pk_statement,
        s8.fk_object_info as pk_gender,
        -- Definition
        s9.pk_entity as pk_def_statement1,
        s10.pk_entity as pk_def_statement2,
        s11.pk_entity as pk_def_statement3,
        a3.pk_entity as pk_def_appe,
        s9.fk_object_info as pk_def, 
        s10.fk_object_info as pk_def_lang,
        a3.string as definition
    from information.resource r1
    -- URIs
    left join information.statement s1 on s1.fk_subject_info = r1.pk_entity and s1.fk_property = {pks.properties.entity_sameAsURI_URI}
    left join information.statement s2 on s2.fk_subject_info = s1.fk_object_info and s2.fk_property = {pks.properties.appe_hasValue_string}
    left join information.appellation a1 on a1.pk_entity = s2.fk_object_info
    -- Names
    left join information.statement s3 on s3.fk_object_info = r1.pk_entity and s3.fk_property = {pks.properties.apial_isAppelationForLanguageOf_entity}
    left join information.statement s4 on s4.fk_subject_info = s3.fk_subject_info and s4.fk_property = {pks.properties.aial_refersToName_appellation}
    left join information.appellation a2 on a2.pk_entity = s4.fk_object_info
    left join information.statement s5 on s5.fk_subject_info = s3.fk_subject_info and s5.fk_property = {pks.properties.apial_usedInLanguage_language}
    left join information.statement s6 on s6.fk_subject_info = s3.fk_subject_info and s6.fk_property = {pks.properties.timespan_endOfTheBegin_timePrim}
    left join information.time_primitive tp1 on tp1.pk_entity = s6.fk_object_info
    left join information.statement s7 on s7.fk_subject_info = s3.fk_subject_info and s7.fk_property = {pks.properties.timespan_beginOfTheEnd_timePrim}
    left join information.time_primitive tp2 on tp2.pk_entity = s7.fk_object_info
    -- Genders
    left join information.statement s8 on s8.fk_subject_info = r1.pk_entity and s8.fk_property = {pks.properties.person_hasGender_gender}
    -- Definitions
    left join information.statement s9 on s9.fk_subject_info = r1.pk_entity and s9.fk_property = {pks.properties.entity_hasDefinition_text}
    left join information.statement s10 on s10.fk_subject_info = s9.fk_object_info and s10.fk_property = {pks.properties.linguisticObj_hasLanguage_language}
    left join information.statement s11 on s11.fk_subject_info = s9.fk_object_info and s11.fk_property = {pks.properties.text_hasValueVersion_string}
    left join information.appellation a3 on a3.pk_entity = s11.fk_object_info

    where r1.pk_entity in {values}
""")

a.set_types(persons, {'pk_name_lang':'int', 'name_begin_jd':'int', 'name_end_jd':'int', 'pk_gender':'int', 'pk_def_lang':'int', 'definition':'string'})
# URI                   
a.set_types(persons, {'pk_uri': 'int', 'uri':'string', 'uri_pk_statement_1':'int', 'uri_pk_statement_2':'int', 'uri_pk_appellation':'int'})

# Join the pk bhp
persons = persons.merge(record_linkage, left_on='pk_person', right_on='pk_gv').drop(columns=['pk_gv']).drop_duplicates()

a.infos(persons, random=True)

Shape:  (5628, 29) - extract:


Unnamed: 0,pk_person,pk_uri,uri,uri_pk_statement_1,uri_pk_statement_2,uri_pk_appellation,name_pk_statement_3,pk_paial,name_pk_statement_4,name_pk_appellation_2,...,gender_pk_statement,pk_gender,pk_def_statement1,pk_def_statement2,pk_def_statement3,pk_def_appe,pk_def,pk_def_lang,definition,pk_bhp
5495,787095,6454282,http://d-nb.info/gnd/118974726,6468376,6475423,6461329,802027,790845,805777,795578,...,,,1238413.0,2298910.0,2325694.0,2272126.0,2245342.0,19008,Prêtre. Cartographe. Mathématicien. Professeur...,51730
4115,786290,6452622,http://d-nb.info/gnd/118740016,6466716,6473763,6459669,801222,790040,804972,795450,...,,,1237444.0,2298150.0,2324934.0,2271366.0,2244582.0,19008,Docteur en médecine de l'Université de Paris. ...,43618
4459,786290,6452966,http://wikidata.org/entity/Q347699,6467060,6474107,6460013,801222,790040,804972,795450,...,,,1237444.0,2298150.0,2324934.0,2271366.0,2244582.0,19008,Docteur en médecine de l'Université de Paris. ...,43618
4579,786361,6434140,http://data.bnf.fr/ark:/12148/cb10633692p#about,6441728,6445522,6437934,801293,790111,805043,796528,...,,,1239438.0,2299716.0,2326500.0,2272932.0,2246148.0,19008,Jésuite (à partir de 1628). Professeur de phil...,2458
3728,786290,6452235,http://d-nb.info/gnd/118740016,6466329,6473376,6459282,801222,790040,804972,795450,...,,,1237444.0,2298150.0,2324934.0,2271366.0,2244582.0,19008,Docteur en médecine de l'Université de Paris. ...,43618


**Disconnect from production**

In [17]:
db.disconnect()

Database correctly disconnected.


## Complete existing information

**Connect to Geovistory database**

In [18]:
db.connect_geovistory(env, pk_project, execute)

>> Connecting to PRODUCTION Database ... Connected!


**Add person to project**

In [19]:
db.info_proj_rels.create(pk_gv_to_update)

Creating info_proj_rel of 1222 entities with project <6857901> ... Done in [00h00'01]


**URIs: add to project existing, create non existing**

Existing:

In [20]:
# From the record linkage, we get the URIs that we want to create (or add to the project)
record_linkage_uri_bhp = record_linkage.merge(uris, left_on='pk_bhp', right_on='pk_actor', how='inner').drop(columns=['pk_actor'])

# Filter already existing URIs:
uris_gv = persons[['pk_bhp', 'pk_person', 'pk_uri', 'uri', 'uri_pk_statement_1', 'uri_pk_statement_2', 'uri_pk_appellation']].dropna().drop_duplicates()
uris_add_to_project = record_linkage_uri_bhp.merge(uris_gv.drop(columns=['pk_bhp', 'pk_person']), on='uri', how='inner')

# Add info to project
db.info_proj_rels.create(uris_add_to_project['pk_uri'].astype(int).tolist())
db.info_proj_rels.create(uris_add_to_project['uri_pk_statement_1'].astype(int).tolist())
db.info_proj_rels.create(uris_add_to_project['uri_pk_statement_2'].astype(int).tolist())
db.info_proj_rels.create(uris_add_to_project['uri_pk_appellation'].astype(int).tolist())

Creating info_proj_rel of 1373 entities with project <6857901> ... Done in [00h00'01]
Creating info_proj_rel of 1373 entities with project <6857901> ... Done in [00h00'00]
Creating info_proj_rel of 1373 entities with project <6857901> ... Done in [00h00'01]
Creating info_proj_rel of 1373 entities with project <6857901> ... Done in [00h00'00]


<IntegerArray>
[2188346,  784464,  784870,  784889,  784908,  300539,  785943,  786900,
  787162,  787225,  784001,  784070,  784397,  783808,  783825,  785399,
  785490,  785357,  785378,  785478,  786014,  786027,  786664,  785590,
  786156,  786384,  786479,  786517,  786854,  784036,  786851,  787036]
Length: 32, dtype: Int64

Non existing:

In [21]:
uri_existing = uris_add_to_project['uri'].tolist()
uris_to_create = record_linkage_uri_bhp[[u not in uri_existing for u in record_linkage_uri_bhp['uri']]]

graphs.add_uris(
    uris_to_create['pk_gv'].astype(int).tolist(),
    uris_to_create['uri'].astype(str).tolist()
)

Creating 1313 resources of class [967] ... Done in [00h00'01]
Creating info_proj_rel of 1313 entities with project <6857901> ... Done in [00h00'00]
Creating 1313 appellations ... Done in [00h00'01]
Creating info_proj_rel of 1313 entities with project <6857901> ... Done in [00h00'00]
Creating 1313 statements ... Done in [00h00'01]
Creating info_proj_rel of 1313 entities with project <6857901> ... Done in [00h00'00]
Creating 1313 statements ... Done in [00h00'00]
Creating info_proj_rel of 1313 entities with project <6857901> ... Done in [00h00'01]


**Names: add to project existing, create non existing**

In [22]:
names_gv = persons[['pk_person', 'pk_paial', 'pk_name_lang', 'name', 'name_begin_jd', 'name_end_jd', 'name_pk_statement_3', 'name_pk_statement_4', 'name_pk_appellation_2', 'name_pk_statement_5', 'name_pk_statement_6', 'name_pk_tp_begin', 'name_pk_statement_7', 'name_pk_tp_end']]
names_add_to_project = []
names_to_create = []

for i, row in record_linkage.iterrows():
    select_bhp = names[names['pk_actor'] == row['pk_bhp']].drop_duplicates(subset=['name', 'lang'])
    select_gv = names_gv[names_gv['pk_person'] == row['pk_gv']].drop_duplicates(subset=['name', 'pk_name_lang'])

    for j, row_bhp in select_bhp.iterrows(): 
        lang = pks.languages.from_iso_code(row_bhp['lang']) if pd.notna(row_bhp['lang']) else pks.languages.from_iso_code('fra')
        name_bhp = ' '.join(sorted(row_bhp['name'].replace(',', '').lower().split(' '))) + ' - ' + str(lang)
        # print(name_bhp)
        found = False

        if pd.isna(row_bhp['begin']) and pd.isna(row_bhp['end']):
            for k, row_gv in select_gv.iterrows():
                name_gv = ' '.join(sorted(row_gv['name'].replace(',', '').lower().split(' '))) + ' - ' + str(row_gv['pk_name_lang'])
                # print(name_gv)
                if name_bhp == name_gv:
                    found = True
                    names_add_to_project.append({
                        'pk_person': row['pk_gv'],
                        'name': row_bhp['name'],
                        'begin': row_bhp['begin'],
                        'end': row_bhp['end'],
                        'pk_paial': row_gv['pk_paial'],
                        'pk_tp_begin': row_gv['name_pk_tp_begin'],
                        'pk_tp_end': row_gv['name_pk_tp_end'],
                        'stmt3': row_gv['name_pk_statement_3'],
                        'stmt4': row_gv['name_pk_statement_4'],
                        'appe2': row_gv['name_pk_appellation_2'],
                        'stmt5': row_gv['name_pk_statement_5'],
                        'stmt6': row_gv['name_pk_statement_6'],
                        'stmt7': row_gv['name_pk_statement_7'],
                    })
        
        if not found:
            names_to_create.append({
                'pk_person': row['pk_gv'],
                'name': row_bhp['name'],
                'pk_lang': lang,
                'begin': row_bhp['begin'],
                'end': row_bhp['end'],
            })

    # display(select_bhp)
    # display(select_gv)

    
names_to_create = pd.DataFrame(data=names_to_create)
names_add_to_project = pd.DataFrame(data=names_add_to_project)

In [23]:
# Create information - new names
names_to_create['pk_paial'] = graphs.add_person_names(
    names_to_create['pk_person'].astype(int).tolist(),
    names_to_create['name'].astype(str).tolist(),
    names_to_create['pk_lang'].astype(int).tolist(),
    return_pk_paial = True
)

# Create information - new names - begin
names_begin = names_to_create[pd.notna(names_to_create['begin'])].copy()
names_begin['begin'] = [(tuple([int(n.strip()) for n in d[1:-1].split(',')])) for d in names_begin['begin']]
names_begin.dropna(subset=['pk_paial', 'begin'], inplace=True)
tp_begin = db.time_primitives.create(names_begin['begin'].tolist(), '1 day')
db.statements.create(
    names_begin['pk_paial'].astype(int).tolist(),
    pks.properties.timespan_endOfTheBegin_timePrim,
    tp_begin    
)

# Create information - new names - end
names_end = names_to_create[pd.notna(names_to_create['end'])].copy()
names_end['end'] = [(tuple([int(n.strip()) for n in d[1:-1].split(',')])) for d in names_end['end']]
names_end.dropna(subset=['pk_paial', 'end'], inplace=True)
tp_end = db.time_primitives.create(names_end['end'].tolist(), '1 day')
db.statements.create(
    names_end['pk_paial'].astype(int).tolist(),
    pks.properties.timespan_endOfTheBegin_timePrim,
    tp_end    
)


# Create information - add to project - names
db.info_proj_rels.create(names_add_to_project['stmt3'].astype(int).tolist())
db.info_proj_rels.create(names_add_to_project['pk_paial'].astype(int).tolist())
db.info_proj_rels.create(names_add_to_project['stmt4'].astype(int).tolist())
db.info_proj_rels.create(names_add_to_project['appe2'].astype(int).tolist())
db.info_proj_rels.create(names_add_to_project['stmt5'].astype(int).tolist())

# Create information - add to project - begin
names_add_to_project_dates_begin = names_add_to_project.dropna(subset=['pk_tp_begin'])
db.info_proj_rels.create(names_add_to_project_dates_begin['stmt6'].astype(int).tolist())
db.info_proj_rels.create(names_add_to_project_dates_begin['pk_tp_begin'].astype(int).tolist())

# Create information - add to project - end
names_add_to_project_dates_end = names_add_to_project.dropna(subset=['pk_tp_end'])
db.info_proj_rels.create(names_add_to_project_dates_end['stmt7'].astype(int).tolist())
db.info_proj_rels.create(names_add_to_project_dates_end['pk_tp_end'].astype(int).tolist())

Creating 1336 resources of class [868] ... Done in [00h00'00]
Creating info_proj_rel of 1336 entities with project <6857901> ... Done in [00h00'01]
Creating 1336 appellations ... Done in [00h00'00]
Creating info_proj_rel of 1336 entities with project <6857901> ... Done in [00h00'00]
Creating 1336 statements ... Done in [00h00'01]
Creating info_proj_rel of 1336 entities with project <6857901> ... Done in [00h00'00]
Creating 1336 statements ... Done in [00h00'00]
Creating info_proj_rel of 1336 entities with project <6857901> ... Done in [00h00'01]
Creating 1336 statements ... Done in [00h00'00]
Creating info_proj_rel of 1336 entities with project <6857901> ... Done in [00h00'00]
Creating 13 time primitives ... Done in [00h00'00]
Creating info_proj_rel of 13 entities with project <6857901> ... Done in [00h00'01]
Creating 13 statements ... Done in [00h00'00]
Creating info_proj_rel of 13 entities with project <6857901> ... Done in [00h00'00]
Creating 8 time primitives ... Done in [00h00'00]

**Gender: add to project existing, create non existing**

In [24]:
record_linkage_gender_bhp = record_linkage.merge(genders, left_on='pk_bhp', right_on='pk_actor', how='inner').drop(columns=['pk_actor'])
genders_gv = persons[['pk_person', 'pk_gender', 'gender_pk_statement']].drop_duplicates()

genders_to_update = record_linkage_gender_bhp.merge(genders_gv, left_on='pk_gv', right_on='pk_gender', how='left')
genders_to_update['gender'] = genders_to_update['gender'].replace('Male', pks.entities.pk_gender_male)
genders_to_update['gender'] = genders_to_update['gender'].replace('Female', pks.entities.pk_gender_female)

genders_add_to_project = genders_to_update[pd.notna(genders_to_update['pk_gender'])]
genders_to_create = genders_to_update[pd.isna(genders_to_update['pk_gender'])]

# Add existing
if len(genders_add_to_project) > 0: db.info_proj_rels.create(genders_add_to_project['gender_pk_statement'].astype(int).tolist())

# Create new ones
db.statements.create(
    genders_to_create['pk_gv'].astype(int).tolist(),
    pks.properties.person_hasGender_gender,
    genders_to_create['gender'].astype(int).tolist()
)

Creating 1121 statements ... Done in [00h00'01]
Creating info_proj_rel of 1121 entities with project <6857901> ... Done in [00h00'00]


**Definitions**

In [25]:
record_linkage_definition_bhp = record_linkage.merge(definitions, left_on='pk_bhp', right_on='pk_actor', how='inner').drop(columns=['pk_actor'])
record_linkage_definition_bhp['pk_lang'] = [pks.languages.from_iso_code(code) for code in record_linkage_definition_bhp['lang']]
definitions_gv = persons[['pk_person', 'definition', 'pk_def_lang', 'pk_def', 'pk_def_appe', 'pk_def_statement1', 'pk_def_statement2', 'pk_def_statement3']].drop_duplicates().dropna(subset=['definition'])

In [26]:
definition_add_to_project = []
definition_to_create = []

eta.begin(len(record_linkage_definition_bhp), 'Matching definitions')
for i, row_bhp in record_linkage_definition_bhp.iterrows():
    found = False
    for j, row_gv in definitions_gv.iterrows():
         if row_bhp['pk_gv'] == row_gv['pk_person'] and row_bhp['text'] == row_gv['definition'] and row_bhp['pk_lang'] == row_gv['pk_def_lang']:
              found = True
              definition_add_to_project.append({
                    'pk_def': row_gv['pk_def'],
                    'pk_appe': row_gv['pk_def_appe'],
                    'pk_stmt1': row_gv['pk_def_statement1'],
                    'pk_stmt2': row_gv['pk_def_statement2'],
                    'pk_stmt3': row_gv['pk_def_statement3'],
                    'definition': row_gv['definition'],
                    'pk_person': row_gv['pk_person'],
               })
              break
    
    if not found:
     definition_to_create.append({
          'pk_gv':row_bhp['pk_gv'],
          'text': row_bhp['text'],
          'pk_lang': row_bhp['pk_lang'],
     })
     
     eta.iter()
eta.end()

definition_add_to_project = pd.DataFrame(data=definition_add_to_project).drop_duplicates()
definition_to_create = pd.DataFrame(data=definition_to_create).drop_duplicates()

Matching definitions is done - Elapsed: [00h00'31]                   


In [27]:
# Add existing definitions
db.info_proj_rels.create(definition_add_to_project['pk_def'].astype(int).tolist())
db.info_proj_rels.create(definition_add_to_project['pk_appe'].astype(int).tolist())
db.info_proj_rels.create(definition_add_to_project['pk_stmt1'].astype(int).tolist())
db.info_proj_rels.create(definition_add_to_project['pk_stmt2'].astype(int).tolist())
db.info_proj_rels.create(definition_add_to_project['pk_stmt3'].astype(int).tolist())

Creating info_proj_rel of 3 entities with project <6857901> ... Done in [00h00'00]
Creating info_proj_rel of 3 entities with project <6857901> ... Done in [00h00'00]
Creating info_proj_rel of 3 entities with project <6857901> ... Done in [00h00'00]
Creating info_proj_rel of 3 entities with project <6857901> ... Done in [00h00'00]
Creating info_proj_rel of 3 entities with project <6857901> ... Done in [00h00'00]


In [28]:
# Create new definitions
graphs.add_definitions(
    record_linkage_definition_bhp['pk_gv'].astype(int).tolist(),
    record_linkage_definition_bhp['text'].astype(str).tolist(),
    record_linkage_definition_bhp['pk_lang'].astype(int).tolist(),
)

Creating 1252 resources of class [899] ... Done in [00h00'01]
Creating info_proj_rel of 1252 entities with project <6857901> ... Done in [00h00'00]
Creating 1252 appellations ... Done in [00h00'02]
Creating info_proj_rel of 1252 entities with project <6857901> ... Done in [00h00'01]
Creating 1252 statements ... Done in [00h00'00]
Creating info_proj_rel of 1252 entities with project <6857901> ... Done in [00h00'00]
Creating 1252 statements ... Done in [00h00'01]
Creating info_proj_rel of 1252 entities with project <6857901> ... Done in [00h00'00]
Creating 1252 statements ... Done in [00h00'00]
Creating info_proj_rel of 1252 entities with project <6857901> ... Done in [00h00'00]
