In [None]:
# %load /home/gaetan/Desktop/geovpylib/templates/heading.py
%load_ext autoreload
%autoreload 2

# Common imports
import os
import pandas as pd, numpy as np
import datetime
#import json
#import request
#import duckdb
#import plotly.express as px

# Geovpylib library
import geovpylib.analysis as a
import geovpylib.database as db
import geovpylib.decorators as d
import geovpylib.magics
import geovpylib.pks as pks
import geovpylib.queries as q
import geovpylib.sparql as sparql
import geovpylib.utils as u
eta = u.Eta()  # helper instance from geovpylib.utils; not used in the visible part of this notebook


# --- Run configuration, read by the connection cells below ---
env = 'prod'  # environment handed to db.connect_geovistory
pk_project = pks.projects.symogih  # project key used in every projects.info_proj_rel join and filter
execute = False  # handed to db.connect_geovistory; presumably False means "dry run" -- TODO confirm
metadata_str = 'collective-actor-correction'  # suffix of the 'import-id' metadata value (prefixed with today's date)
import_manner = 'one-shot'  # handed to db.set_insert_manner

# Collective Actors Correction

## 1./ Delete everything

In [None]:
# Open the Geovistory connection, then tag this session with a dated import id
# (YYYYMMDD-<metadata_str>) and select the insert manner.
db.connect_geovistory(env, pk_project, execute)
db.set_metadata({'import-id': f"{datetime.datetime.today().strftime('%Y%m%d')}-{metadata_str}"})
db.set_insert_manner(import_manner)

### 1.0/ Fetch general data

In [None]:
# --- Groups of the project that carry a "same as URI" statement ---
# Each group is returned together with the string value of its URI.
existing_groups = db.query(f"""
    select 
        r1.pk_entity as pk_group,
        a4.string as uri
    from information.resource r1
    inner join projects.info_proj_rel ipr1 on ipr1.fk_entity = r1.pk_entity and ipr1.fk_project = {pk_project} and ipr1.is_in_project = true
    inner join information.statement s2 on s2.fk_subject_info = r1.pk_entity and s2.fk_property = {pks.properties.entity_sameAsURI_URI}
    inner join projects.info_proj_rel ipr2 on ipr2.fk_entity = s2.pk_entity and ipr2.fk_project = {pk_project} and ipr2.is_in_project = true
    inner join information.statement s3 on s3.fk_subject_info = s2.fk_object_info and s3.fk_property = {pks.properties.appe_hasValue_string}
    inner join projects.info_proj_rel ipr3 on ipr3.fk_entity = s3.pk_entity and ipr3.fk_project = {pk_project} and ipr3.is_in_project = true
    inner join information.appellation a4 on a4.pk_entity = s3.fk_object_info
    where r1.fk_class = {pks.classes.group}
""")
# The BHP key is the last path segment of the URI; only keys containing 'CoAc' are kept.
existing_groups = (
    existing_groups
    .assign(pk_bhp=lambda frame: [uri[uri.rfind('/') + 1:] for uri in frame['uri']])
    .drop(columns='uri')
    .loc[lambda frame: frame['pk_bhp'].str.contains('CoAc')]
)

pk_groups_str = u.get_sql_ready_str(existing_groups['pk_group'])

# a.infos(existing_groups)

# --- Formations that formed one of those groups ---
# pk_ipr_has_formed_group is the project relation of the "has formed" statement.
existing_formations = db.query(f"""
    select
        r1.pk_entity as pk_formation,
        ipr2.pk_entity as pk_ipr_has_formed_group
    from information.resource r1
    inner join projects.info_proj_rel ipr1 on ipr1.fk_entity = r1.pk_entity and ipr1.fk_project = {pk_project} and ipr1.is_in_project = true
    inner join information.statement s2 on s2.fk_subject_info = r1.pk_entity and s2.fk_property = {pks.properties.formation_hasFormed_group} and s2.fk_object_info in {pk_groups_str}
    inner join projects.info_proj_rel ipr2 on ipr2.fk_entity = s2.pk_entity and ipr2.fk_project = {pk_project} and ipr2.is_in_project = true
    where r1.fk_class = {pks.classes.formation}
""")
pk_formations_str = u.get_sql_ready_str(existing_formations['pk_formation'])

# a.infos(existing_formations)

# --- Dissolutions that dissolved one of those groups ---
# pk_ipr_has_dissolved_group is the project relation of the "dissolved" statement.
existing_dissolutions = db.query(f"""
    select
        r1.pk_entity as pk_dissolution,
        ipr2.pk_entity as pk_ipr_has_dissolved_group
    from information.resource r1
    inner join projects.info_proj_rel ipr1 on ipr1.fk_entity = r1.pk_entity and ipr1.fk_project = {pk_project} and ipr1.is_in_project = true
    inner join information.statement s2 on s2.fk_subject_info = r1.pk_entity and s2.fk_property = {pks.properties.dissolution_dissolved_group} and s2.fk_object_info in {pk_groups_str}
    inner join projects.info_proj_rel ipr2 on ipr2.fk_entity = s2.pk_entity and ipr2.fk_project = {pk_project} and ipr2.is_in_project = true
    where r1.fk_class = {pks.classes.dissolution}
""")
pk_dissolution_str = u.get_sql_ready_str(existing_dissolutions['pk_dissolution'])

# a.infos(existing_dissolutions)

#### 1.1.a/ Delete `has_formed_group` statement

In [None]:
# Soft-delete the "has formed group" statements: the project relations collected
# by the existing_formations query are flagged as no longer in the project
# (rows are updated, not deleted).
# NOTE(review): presumably u.get_sql_ready_str renders the keys as a SQL "(a, b, ...)"
# list; its output for an empty selection is not visible here -- confirm it stays valid SQL.
pk_ipr_has_formed_group = u.get_sql_ready_str(existing_formations['pk_ipr_has_formed_group'])
db.execute(f"""
    update projects.info_proj_rel 
        set is_in_project = false
        where pk_entity in {pk_ipr_has_formed_group}
""")

#### 1.1.b/ Delete `has_dissolved_group` statement

In [None]:
# Soft-delete the "dissolved group" statements: the project relations collected
# by the existing_dissolutions query are flagged as no longer in the project
# (rows are updated, not deleted).
pk_ipr_has_dissolved_group = u.get_sql_ready_str(existing_dissolutions['pk_ipr_has_dissolved_group'])
db.execute(f"""
    update projects.info_proj_rel 
        set is_in_project = false
        where pk_entity in {pk_ipr_has_dissolved_group}
""")

### 1.2/ Delete time information

In [None]:
# Keys of the six time-span properties, rendered once as a SQL "(a, b, ...)" list
# so the two following cells can use it in an "fk_property in ..." filter.
time_properties = "(" + ", ".join(
    f"{pk_property}"
    for pk_property in (
        pks.properties.timeSpan_atSomeTimeWithin_timePrimitive,
        pks.properties.timespan_beginOfTheBegin_timePrim,
        pks.properties.timespan_endOfTheEnd_timePrim,
        pks.properties.timeSpan_ongoingThroughout_timePrimitive,
        pks.properties.timespan_endOfTheBegin_timePrim,
        pks.properties.timespan_beginOfTheEnd_timePrim,
    )
) + ")"

#### 1.2.a/ Delete time information for formation

In [None]:
# Collect the project relations of every time-span statement whose subject is
# one of the formations found earlier (pk_formations_str) and whose property is
# one of the keys in time_properties.
formation_time_stmts = db.query(f"""
    select
        ipr.pk_entity as pk_ipr_time_info_form
    from information.statement s
    inner join projects.info_proj_rel ipr on ipr.fk_entity = s.pk_entity and ipr.fk_project = {pk_project} and ipr.is_in_project = true
    where s.fk_subject_info in {pk_formations_str} and s.fk_property in {time_properties}
""")
pk_ipr_time_info_form = u.get_sql_ready_str(formation_time_stmts['pk_ipr_time_info_form'])

# Soft-delete: flag those relations as no longer in the project.
db.execute(f"""
    update projects.info_proj_rel 
        set is_in_project = false
        where pk_entity in {pk_ipr_time_info_form}
""")

#### 1.2.b/ Delete time information for dissolution

In [None]:
# Collect the project relations of every time-span statement whose subject is
# one of the dissolutions found earlier (pk_dissolution_str) and whose property
# is one of the keys in time_properties.
dissolution_time_stmts = db.query(f"""
    select
        ipr.pk_entity as pk_ipr_time_info_diss
    from information.statement s
    inner join projects.info_proj_rel ipr on ipr.fk_entity = s.pk_entity and ipr.fk_project = {pk_project} and ipr.is_in_project = true
    where s.fk_subject_info in {pk_dissolution_str} and s.fk_property in {time_properties}
""")
pk_ipr_time_info_diss = u.get_sql_ready_str(dissolution_time_stmts['pk_ipr_time_info_diss'])

# Soft-delete: flag those relations as no longer in the project.
db.execute(f"""
    update projects.info_proj_rel 
        set is_in_project = false
        where pk_entity in {pk_ipr_time_info_diss}
""")

### 1.3/ Delete certainty comment

#### 1.3.a/ Delete certainty comment for formations

In [None]:
# Comments attached to the formations, restricted to one comment type.
# 7953586 is the hard-coded key of that type -- presumably the "certainty" type,
# going by this section's title; TODO confirm.
# For each comment the query returns the project relations of: the comment entity,
# the "has comment" statement, the "has comment type" statement and the
# "has value" statement.
formation_comments = db.query(f"""
    select
        r1.pk_entity as pk_comment,
        ipr1.pk_entity as pk_ipr_comment,
        ipr2.pk_entity as pk_ipr_stmt_has_comment,
        ipr3.pk_entity as pk_ipr_stmt_comment_has_type,
        ipr4.pk_entity as pk_ipr_stmt_comment_has_value
    from information.resource r1
    inner join projects.info_proj_rel ipr1 on ipr1.fk_entity = r1.pk_entity and ipr1.fk_project = {pk_project} and ipr1.is_in_project = true
    inner join information.statement s2 on s2.fk_object_info = r1.pk_entity and s2.fk_property = {pks.properties.entity_hasComment_text} and s2.fk_subject_info in {pk_formations_str}
    inner join projects.info_proj_rel ipr2 on ipr2.fk_entity = s2.pk_entity and ipr2.fk_project = {pk_project} and ipr2.is_in_project = true
    inner join information.statement s3 on s3.fk_subject_info = r1.pk_entity and s3.fk_property = {pks.properties.comment_hasCommentType_CommentType} and s3.fk_object_info = 7953586
    inner join projects.info_proj_rel ipr3 on ipr3.fk_entity = s3.pk_entity and ipr3.fk_project = {pk_project} and ipr3.is_in_project = true
    inner join information.statement s4 on s4.fk_subject_info = r1.pk_entity and s4.fk_property = {pks.properties.text_hasValueVersion_string}
    inner join projects.info_proj_rel ipr4 on ipr4.fk_entity = s4.pk_entity and ipr4.fk_project = {pk_project} and ipr4.is_in_project = true
    where r1.fk_class = {pks.classes.comment}
""")

# All four relation keys of every comment, column after column, as one SQL list.
pk_iprs_cert_comment_form = u.get_sql_ready_str([
    pk_ipr
    for column in ('pk_ipr_comment', 'pk_ipr_stmt_has_comment', 'pk_ipr_stmt_comment_has_type', 'pk_ipr_stmt_comment_has_value')
    for pk_ipr in formation_comments[column].tolist()
])

# Soft-delete: flag those relations as no longer in the project.
db.execute(f"""
    update projects.info_proj_rel 
        set is_in_project = false
        where pk_entity in {pk_iprs_cert_comment_form}
""")

#### 1.3.b/ Delete certainty comment for dissolutions

In [None]:
# Comments attached to the dissolutions, restricted to one comment type.
# 7953586 is the hard-coded key of that type -- presumably the "certainty" type,
# going by this section's title; TODO confirm.
# For each comment the query returns the project relations of: the comment entity,
# the "has comment" statement, the "has comment type" statement and the
# "has value" statement.
dissolution_comments = db.query(f"""
    select
        r1.pk_entity as pk_comment,
        ipr1.pk_entity as pk_ipr_comment,
        ipr2.pk_entity as pk_ipr_stmt_has_comment,
        ipr3.pk_entity as pk_ipr_stmt_comment_has_type,
        ipr4.pk_entity as pk_ipr_stmt_comment_has_value
    from information.resource r1
    inner join projects.info_proj_rel ipr1 on ipr1.fk_entity = r1.pk_entity and ipr1.fk_project = {pk_project} and ipr1.is_in_project = true
    inner join information.statement s2 on s2.fk_object_info = r1.pk_entity and s2.fk_property = {pks.properties.entity_hasComment_text} and s2.fk_subject_info in {pk_dissolution_str}
    inner join projects.info_proj_rel ipr2 on ipr2.fk_entity = s2.pk_entity and ipr2.fk_project = {pk_project} and ipr2.is_in_project = true
    inner join information.statement s3 on s3.fk_subject_info = r1.pk_entity and s3.fk_property = {pks.properties.comment_hasCommentType_CommentType} and s3.fk_object_info = 7953586
    inner join projects.info_proj_rel ipr3 on ipr3.fk_entity = s3.pk_entity and ipr3.fk_project = {pk_project} and ipr3.is_in_project = true
    inner join information.statement s4 on s4.fk_subject_info = r1.pk_entity and s4.fk_property = {pks.properties.text_hasValueVersion_string}
    inner join projects.info_proj_rel ipr4 on ipr4.fk_entity = s4.pk_entity and ipr4.fk_project = {pk_project} and ipr4.is_in_project = true
    where r1.fk_class = {pks.classes.comment}
""")

# All four relation keys of every comment, column after column, as one SQL list.
pk_iprs_cert_comment_diss = u.get_sql_ready_str([
    pk_ipr
    for column in ('pk_ipr_comment', 'pk_ipr_stmt_has_comment', 'pk_ipr_stmt_comment_has_type', 'pk_ipr_stmt_comment_has_value')
    for pk_ipr in dissolution_comments[column].tolist()
])

# Soft-delete: flag those relations as no longer in the project.
db.execute(f"""
    update projects.info_proj_rel 
        set is_in_project = false
        where pk_entity in {pk_iprs_cert_comment_diss}
""")

### 1.4/ Delete URI information

#### 1.4.a/ Delete URI information for formations

In [None]:
# URI entities linked to the formations through a "same as URI" statement.
# For each URI the query returns the project relations of: the URI entity,
# the "same as URI" statement and the "has value" statement.
formation_uri = db.query(f"""
    select
        ipr1.pk_entity as pk_ipr_uri,
        ipr2.pk_entity as pk_ipr_stmt_same_as_uri,
        ipr3.pk_entity as pk_ipr_stmt_has_value
    from information.resource r1
    inner join projects.info_proj_rel ipr1 on ipr1.fk_entity = r1.pk_entity and ipr1.fk_project = {pk_project} and ipr1.is_in_project = true
    inner join information.statement s2 on s2.fk_object_info = r1.pk_entity and s2.fk_property = {pks.properties.entity_sameAsURI_URI} and s2.fk_subject_info in {pk_formations_str}
    inner join projects.info_proj_rel ipr2 on ipr2.fk_entity = s2.pk_entity and ipr2.fk_project = {pk_project} and ipr2.is_in_project = true
    inner join information.statement s3 on s3.fk_subject_info = r1.pk_entity and s3.fk_property = {pks.properties.appe_hasValue_string}
    inner join projects.info_proj_rel ipr3 on ipr3.fk_entity = s3.pk_entity and ipr3.fk_project = {pk_project} and ipr3.is_in_project = true
    where r1.fk_class = {pks.classes.uri}
""")

# All three relation keys of every URI, column after column, as one SQL list.
pk_iprs_uri_form = u.get_sql_ready_str([
    pk_ipr
    for column in ('pk_ipr_uri', 'pk_ipr_stmt_same_as_uri', 'pk_ipr_stmt_has_value')
    for pk_ipr in formation_uri[column].tolist()
])

# Soft-delete: flag those relations as no longer in the project.
db.execute(f"""
    update projects.info_proj_rel 
        set is_in_project = false
        where pk_entity in {pk_iprs_uri_form}
""")

#### 1.4.b/ Delete URI information for dissolutions

In [None]:
# URI entities linked to the dissolutions through a "same as URI" statement.
# For each URI the query returns the project relations of: the URI entity,
# the "same as URI" statement and the "has value" statement.
dissolution_uri = db.query(f"""
    select
        ipr1.pk_entity as pk_ipr_uri,
        ipr2.pk_entity as pk_ipr_stmt_same_as_uri,
        ipr3.pk_entity as pk_ipr_stmt_has_value
    from information.resource r1
    inner join projects.info_proj_rel ipr1 on ipr1.fk_entity = r1.pk_entity and ipr1.fk_project = {pk_project} and ipr1.is_in_project = true
    inner join information.statement s2 on s2.fk_object_info = r1.pk_entity and s2.fk_property = {pks.properties.entity_sameAsURI_URI} and s2.fk_subject_info in {pk_dissolution_str}
    inner join projects.info_proj_rel ipr2 on ipr2.fk_entity = s2.pk_entity and ipr2.fk_project = {pk_project} and ipr2.is_in_project = true
    inner join information.statement s3 on s3.fk_subject_info = r1.pk_entity and s3.fk_property = {pks.properties.appe_hasValue_string}
    inner join projects.info_proj_rel ipr3 on ipr3.fk_entity = s3.pk_entity and ipr3.fk_project = {pk_project} and ipr3.is_in_project = true
    where r1.fk_class = {pks.classes.uri}
""")

# All three relation keys of every URI, column after column, as one SQL list.
pk_iprs_uri_diss = u.get_sql_ready_str([
    pk_ipr
    for column in ('pk_ipr_uri', 'pk_ipr_stmt_same_as_uri', 'pk_ipr_stmt_has_value')
    for pk_ipr in dissolution_uri[column].tolist()
])

# Soft-delete: flag those relations as no longer in the project.
db.execute(f"""
    update projects.info_proj_rel 
        set is_in_project = false
        where pk_entity in {pk_iprs_uri_diss}
""")

### 1.5/ Delete date complements

#### 1.5.a/ Delete date complements on formations

In [None]:
# Comments attached to the formations, restricted to one comment type.
# 8065621 is the hard-coded key of that type -- presumably the "date complement"
# type, going by this section's title; TODO confirm.
# For each comment the query returns the project relations of: the comment entity,
# the "has comment" statement, the "has comment type" statement and the
# "has value" statement.
formation_date_cmplt = db.query(f"""
    select
        r1.pk_entity as pk_comment,
        ipr1.pk_entity as pk_ipr_comment,
        ipr2.pk_entity as pk_ipr_stmt_has_comment,
        ipr3.pk_entity as pk_ipr_stmt_comment_has_type,
        ipr4.pk_entity as pk_ipr_stmt_comment_has_value
    from information.resource r1
    inner join projects.info_proj_rel ipr1 on ipr1.fk_entity = r1.pk_entity and ipr1.fk_project = {pk_project} and ipr1.is_in_project = true
    inner join information.statement s2 on s2.fk_object_info = r1.pk_entity and s2.fk_property = {pks.properties.entity_hasComment_text} and s2.fk_subject_info in {pk_formations_str}
    inner join projects.info_proj_rel ipr2 on ipr2.fk_entity = s2.pk_entity and ipr2.fk_project = {pk_project} and ipr2.is_in_project = true
    inner join information.statement s3 on s3.fk_subject_info = r1.pk_entity and s3.fk_property = {pks.properties.comment_hasCommentType_CommentType} and s3.fk_object_info = 8065621
    inner join projects.info_proj_rel ipr3 on ipr3.fk_entity = s3.pk_entity and ipr3.fk_project = {pk_project} and ipr3.is_in_project = true
    inner join information.statement s4 on s4.fk_subject_info = r1.pk_entity and s4.fk_property = {pks.properties.text_hasValueVersion_string}
    inner join projects.info_proj_rel ipr4 on ipr4.fk_entity = s4.pk_entity and ipr4.fk_project = {pk_project} and ipr4.is_in_project = true
    where r1.fk_class = {pks.classes.comment}
""")

# All four relation keys of every comment, column after column, as one SQL list.
pk_iprs_cert_date_cmplt_form = u.get_sql_ready_str([
    pk_ipr
    for column in ('pk_ipr_comment', 'pk_ipr_stmt_has_comment', 'pk_ipr_stmt_comment_has_type', 'pk_ipr_stmt_comment_has_value')
    for pk_ipr in formation_date_cmplt[column].tolist()
])

# Soft-delete: flag those relations as no longer in the project.
db.execute(f"""
    update projects.info_proj_rel 
        set is_in_project = false
        where pk_entity in {pk_iprs_cert_date_cmplt_form}
""")

#### 1.5.b/ Delete date complements on dissolution

In [None]:
# Comments attached to the dissolutions, restricted to one comment type.
# 8065621 is the hard-coded key of that type -- presumably the "date complement"
# type, going by this section's title; TODO confirm.
# For each comment the query returns the project relations of: the comment entity,
# the "has comment" statement, the "has comment type" statement and the
# "has value" statement.
dissolution_date_cmplt = db.query(f"""
    select
        r1.pk_entity as pk_comment,
        ipr1.pk_entity as pk_ipr_comment,
        ipr2.pk_entity as pk_ipr_stmt_has_comment,
        ipr3.pk_entity as pk_ipr_stmt_comment_has_type,
        ipr4.pk_entity as pk_ipr_stmt_comment_has_value
    from information.resource r1
    inner join projects.info_proj_rel ipr1 on ipr1.fk_entity = r1.pk_entity and ipr1.fk_project = {pk_project} and ipr1.is_in_project = true
    inner join information.statement s2 on s2.fk_object_info = r1.pk_entity and s2.fk_property = {pks.properties.entity_hasComment_text} and s2.fk_subject_info in {pk_dissolution_str}
    inner join projects.info_proj_rel ipr2 on ipr2.fk_entity = s2.pk_entity and ipr2.fk_project = {pk_project} and ipr2.is_in_project = true
    inner join information.statement s3 on s3.fk_subject_info = r1.pk_entity and s3.fk_property = {pks.properties.comment_hasCommentType_CommentType} and s3.fk_object_info = 8065621
    inner join projects.info_proj_rel ipr3 on ipr3.fk_entity = s3.pk_entity and ipr3.fk_project = {pk_project} and ipr3.is_in_project = true
    inner join information.statement s4 on s4.fk_subject_info = r1.pk_entity and s4.fk_property = {pks.properties.text_hasValueVersion_string}
    inner join projects.info_proj_rel ipr4 on ipr4.fk_entity = s4.pk_entity and ipr4.fk_project = {pk_project} and ipr4.is_in_project = true
    where r1.fk_class = {pks.classes.comment}
""")

# All four relation keys of every comment, column after column, as one SQL list.
pk_iprs_cert_date_cmplt_diss = u.get_sql_ready_str([
    pk_ipr
    for column in ('pk_ipr_comment', 'pk_ipr_stmt_has_comment', 'pk_ipr_stmt_comment_has_type', 'pk_ipr_stmt_comment_has_value')
    for pk_ipr in dissolution_date_cmplt[column].tolist()
])

# Soft-delete: flag those relations as no longer in the project.
db.execute(f"""
    update projects.info_proj_rel 
        set is_in_project = false
        where pk_entity in {pk_iprs_cert_date_cmplt_diss}
""")

### 1.6/ Delete note on dates

#### 1.6.a/ Delete note on dates on formations

In [None]:
# Comments attached to the formations, restricted to one comment type.
# 8065632 is the hard-coded key of that type -- presumably the "note on dates"
# type, going by this section's title; TODO confirm.
# For each comment the query returns the project relations of: the comment entity,
# the "has comment" statement, the "has comment type" statement and the
# "has value" statement.
formation_note_dates = db.query(f"""
    select
        r1.pk_entity as pk_comment,
        ipr1.pk_entity as pk_ipr_comment,
        ipr2.pk_entity as pk_ipr_stmt_has_comment,
        ipr3.pk_entity as pk_ipr_stmt_comment_has_type,
        ipr4.pk_entity as pk_ipr_stmt_comment_has_value
    from information.resource r1
    inner join projects.info_proj_rel ipr1 on ipr1.fk_entity = r1.pk_entity and ipr1.fk_project = {pk_project} and ipr1.is_in_project = true
    inner join information.statement s2 on s2.fk_object_info = r1.pk_entity and s2.fk_property = {pks.properties.entity_hasComment_text} and s2.fk_subject_info in {pk_formations_str}
    inner join projects.info_proj_rel ipr2 on ipr2.fk_entity = s2.pk_entity and ipr2.fk_project = {pk_project} and ipr2.is_in_project = true
    inner join information.statement s3 on s3.fk_subject_info = r1.pk_entity and s3.fk_property = {pks.properties.comment_hasCommentType_CommentType} and s3.fk_object_info = 8065632
    inner join projects.info_proj_rel ipr3 on ipr3.fk_entity = s3.pk_entity and ipr3.fk_project = {pk_project} and ipr3.is_in_project = true
    inner join information.statement s4 on s4.fk_subject_info = r1.pk_entity and s4.fk_property = {pks.properties.text_hasValueVersion_string}
    inner join projects.info_proj_rel ipr4 on ipr4.fk_entity = s4.pk_entity and ipr4.fk_project = {pk_project} and ipr4.is_in_project = true
    where r1.fk_class = {pks.classes.comment}
""")

# All four relation keys of every comment, column after column, as one SQL list.
pk_iprs_cert_note_dates_form = u.get_sql_ready_str([
    pk_ipr
    for column in ('pk_ipr_comment', 'pk_ipr_stmt_has_comment', 'pk_ipr_stmt_comment_has_type', 'pk_ipr_stmt_comment_has_value')
    for pk_ipr in formation_note_dates[column].tolist()
])

# Soft-delete: flag those relations as no longer in the project.
db.execute(f"""
    update projects.info_proj_rel 
        set is_in_project = false
        where pk_entity in {pk_iprs_cert_note_dates_form}
""")

#### 1.6.b/ Delete note on dates on dissolution

In [None]:
# Comments attached to the dissolutions, restricted to one comment type.
# 8065632 is the hard-coded key of that type -- presumably the "note on dates"
# type, going by this section's title; TODO confirm.
# For each comment the query returns the project relations of: the comment entity,
# the "has comment" statement, the "has comment type" statement and the
# "has value" statement.
dissolution_note_dates = db.query(f"""
    select
        r1.pk_entity as pk_comment,
        ipr1.pk_entity as pk_ipr_comment,
        ipr2.pk_entity as pk_ipr_stmt_has_comment,
        ipr3.pk_entity as pk_ipr_stmt_comment_has_type,
        ipr4.pk_entity as pk_ipr_stmt_comment_has_value
    from information.resource r1
    inner join projects.info_proj_rel ipr1 on ipr1.fk_entity = r1.pk_entity and ipr1.fk_project = {pk_project} and ipr1.is_in_project = true
    inner join information.statement s2 on s2.fk_object_info = r1.pk_entity and s2.fk_property = {pks.properties.entity_hasComment_text} and s2.fk_subject_info in {pk_dissolution_str}
    inner join projects.info_proj_rel ipr2 on ipr2.fk_entity = s2.pk_entity and ipr2.fk_project = {pk_project} and ipr2.is_in_project = true
    inner join information.statement s3 on s3.fk_subject_info = r1.pk_entity and s3.fk_property = {pks.properties.comment_hasCommentType_CommentType} and s3.fk_object_info = 8065632
    inner join projects.info_proj_rel ipr3 on ipr3.fk_entity = s3.pk_entity and ipr3.fk_project = {pk_project} and ipr3.is_in_project = true
    inner join information.statement s4 on s4.fk_subject_info = r1.pk_entity and s4.fk_property = {pks.properties.text_hasValueVersion_string}
    inner join projects.info_proj_rel ipr4 on ipr4.fk_entity = s4.pk_entity and ipr4.fk_project = {pk_project} and ipr4.is_in_project = true
    where r1.fk_class = {pks.classes.comment}
""")

# All four relation keys of every comment, column after column, as one SQL list.
pk_iprs_cert_note_dates_diss = u.get_sql_ready_str([
    pk_ipr
    for column in ('pk_ipr_comment', 'pk_ipr_stmt_has_comment', 'pk_ipr_stmt_comment_has_type', 'pk_ipr_stmt_comment_has_value')
    for pk_ipr in dissolution_note_dates[column].tolist()
])

# Soft-delete: flag those relations as no longer in the project.
db.execute(f"""
    update projects.info_proj_rel 
        set is_in_project = false
        where pk_entity in {pk_iprs_cert_note_dates_diss}
""")

### 1.7/ Delete Formations and Dissolutions

In [None]:
# Remove the Formation and Dissolution entities themselves from the project.
#
# pk_formations_str and pk_dissolution_str hold *resource* keys (they are built
# from "r1.pk_entity as pk_formation" / "pk_dissolution"), not keys of
# projects.info_proj_rel rows. The relation rows of an entity are therefore
# matched on fk_entity + fk_project, exactly as every query in this notebook
# joins projects.info_proj_rel. Filtering on info_proj_rel.pk_entity (the
# relation's own key) with resource keys does not target those rows.
db.execute(f"""
    update projects.info_proj_rel 
        set is_in_project = false
        where fk_entity in {pk_formations_str} and fk_project = {pk_project}
""")
db.execute(f"""
    update projects.info_proj_rel 
        set is_in_project = false
        where fk_entity in {pk_dissolution_str} and fk_project = {pk_project}
""")

## 2./ Find and create missing Groups

### 2.1/ Find all existing groups in symogih project in Geovistory

In [None]:
# Open the Geovistory connection, then tag this session with a dated import id
# (YYYYMMDD-<metadata_str>) and select the insert manner.
db.connect_geovistory(env, pk_project, execute)
db.set_metadata({'import-id': f"{datetime.datetime.today().strftime('%Y%m%d')}-{metadata_str}"})
db.set_insert_manner(import_manner)

In [None]:
# Find the groups that already exist in the Geovistory project.
# A group is returned only when the group itself, its "same as URI" statement,
# the URI entity and the URI's "has value" statement are all in the project.
groups = db.query(f"""
    select
        r1.pk_entity as pk_group,
        a4.string as uri
    from information.resource r1 
    inner join projects.info_proj_rel ipr1 on ipr1.fk_entity = r1.pk_entity and ipr1.fk_project = {pk_project} and ipr1.is_in_project = true
    inner join information.statement s2 on s2.fk_subject_info = r1.pk_entity and s2.fk_property = {pks.properties.entity_sameAsURI_URI}
    inner join projects.info_proj_rel ipr2a on ipr2a.fk_entity = s2.pk_entity and ipr2a.fk_project = {pk_project} and ipr2a.is_in_project = true
    inner join projects.info_proj_rel ipr2b on ipr2b.fk_entity = s2.fk_object_info and ipr2b.fk_project = {pk_project} and ipr2b.is_in_project = true
    inner join information.statement s3 on s3.fk_subject_info = s2.fk_object_info and s3.fk_property = {pks.properties.appe_hasValue_string}
    inner join projects.info_proj_rel ipr3 on ipr3.fk_entity = s3.pk_entity and ipr3.fk_project = {pk_project} and ipr3.is_in_project = true
    inner join information.appellation a4 on a4.pk_entity = s3.fk_object_info
    where r1.fk_class = {pks.classes.group}
""")
# Keep only URIs mentioning 'CoAc'. The explicit copy makes the filtered frame
# independent, so adding a column below is not a write on a slice of the query
# result (which raises pandas' SettingWithCopyWarning).
groups = groups[groups['uri'].str.contains('CoAc')].copy()
# The collective-actor key is the last path segment of the URI.
groups['pk_coac'] = [string[string.rfind('/') + 1:] for string in groups['uri']]
groups = groups.drop(columns='uri')

# a.infos(groups)

### 2.2/ Find all Collective actors from BHP (CSV file)

In [None]:
# Load the three BHP exports; columns whose names would collide or be ambiguous
# after merging are renamed with a per-file suffix.
coac = (
    u.read_df('../../data/bhp/collective-actor.csv', skip_info=True)
    .rename(columns={'notes':'notes_coac', 'begin_year':'begin_year_coac', 'end_year':'end_year_coac'})
    .drop(columns=['concat_standard_name'])
)
coac_name = (
    u.read_df('../../data/bhp/collective-actor-name.csv', skip_info=True)
    .rename(columns={'notes':'notes_name', 'lang_iso':'lang_name', 'comment_begin_year':'comment_begin_year_name', 'comment_end_year':'comment_end_year_name', 'begin_date':'begin_date_name', 'end_date':'end_date_name'})
)
coac_text_property = (
    u.read_df('../../data/bhp/collective-actor-text-property.csv', skip_info=True)
    .rename(columns={'notes':'notes_text_prop', 'lang_iso_code':'lang_text_prop'})
)

# Left-join names, then text properties, onto the collective actors; the join
# keys and the primary keys of the joined tables are dropped afterwards.
coacs = (
    coac
    .merge(coac_name, left_on='pk_collective_actor', right_on='fk_collective_actor', how='left')
    .drop(columns=['fk_collective_actor', 'pk_collective_actor_name'])
    .merge(coac_text_property, left_on='pk_collective_actor', right_on='fk_collective_actor', how='left')
    .drop(columns=['fk_collective_actor', 'pk_collective_actor_text_property'])
)
coacs['begin_date_name'] = [u.parse_tuple_date(raw_date) for raw_date in coacs['begin_date_name']]
coacs['end_date_name'] = [u.parse_tuple_date(raw_date) for raw_date in coacs['end_date_name']]

# Prefix the key with 'CoAc' and expose it as pk_coac.
coacs = (
    coacs
    .assign(pk_collective_actor='CoAc' + coacs['pk_collective_actor'].astype(str))
    .rename(columns={'pk_collective_actor':'pk_coac'})
)

# a.infos(coacs)

### 2.3/ Get record linkage result (CSV file)

In [None]:
# Load the manually filled record-linkage file and keep the confirmed
# duplicates (Doublon == 'oui') as pairs (pk_coac, pk_group).
# Built as a single chain so that no column is ever assigned on a filtered
# slice of the CSV frame (the previous step-by-step version wrote 'pk_bhp' on
# such a slice, which raises pandas' SettingWithCopyWarning).
record_linkage = (
    pd.read_csv('../../data/record-linkage-collective-actors-filled.csv')
    .loc[lambda frame: frame['Doublon'] == 'oui', ['pk_bhp', 'pk_gv']]
    # Same 'CoAc' prefix as the keys built from the BHP CSV files.
    .assign(pk_bhp=lambda frame: 'CoAc' + frame['pk_bhp'].astype(str))
    .rename(columns={'pk_bhp': 'pk_coac', 'pk_gv': 'pk_group'})
)

# a.infos(record_linkage)

### 2.4/ Find missing ones

In [None]:
# Collective-actor keys already covered, either by a group found in the project
# or by a confirmed record-linkage match.
# NOTE: from here on `existing_groups` is a plain list of keys; it replaces the
# DataFrame of the same name created in section 1.
existing_groups = groups['pk_coac'].tolist()
record_linkage_groups = record_linkage['pk_coac'].tolist()

# Vectorised membership test instead of scanning both lists once per row.
missing_groups = coacs[~coacs['pk_coac'].isin(existing_groups + record_linkage_groups)].reset_index(drop=True)

# Every collective actor should be either existing or in the record linkage.
assert len(missing_groups) == 0, (
    f"{len(missing_groups)} row(s) of coacs have no group in the project and no record-linkage match, "
    f"e.g. {missing_groups['pk_coac'].unique()[:10].tolist()}"
)

### 2.5/ Add record linkage result to project

The actual problem was that all the information about these entities had been added to the project, but not the entities themselves. Dumb me...

In [None]:
# Create project relations for the Geovistory groups matched by the record
# linkage (column pk_group), i.e. add the entities themselves to the project.
# NOTE(review): presumably only written to the database when `execute` is True -- confirm.
db.info_proj_rels.create(record_linkage['pk_group'])

## 3./ Add Formations

### 3.1/ Fetch local file about coacs

In [None]:
# Load the three BHP exports; columns whose names would collide or be ambiguous
# after merging are renamed with a per-file suffix.
coac = (
    u.read_df('../../data/bhp/collective-actor.csv', skip_info=True)
    .rename(columns={'notes':'notes_coac', 'begin_year':'begin_year_coac', 'end_year':'end_year_coac'})
    .drop(columns=['concat_standard_name'])
)
coac_name = (
    u.read_df('../../data/bhp/collective-actor-name.csv', skip_info=True)
    .rename(columns={'notes':'notes_name', 'lang_iso':'lang_name', 'comment_begin_year':'comment_begin_year_name', 'comment_end_year':'comment_end_year_name', 'begin_date':'begin_date_name', 'end_date':'end_date_name'})
)
coac_text_property = (
    u.read_df('../../data/bhp/collective-actor-text-property.csv', skip_info=True)
    .rename(columns={'notes':'notes_text_prop', 'lang_iso_code':'lang_text_prop'})
)

# Left-join names, then text properties, onto the collective actors; the join
# keys and the primary keys of the joined tables are dropped afterwards.
coacs = (
    coac
    .merge(coac_name, left_on='pk_collective_actor', right_on='fk_collective_actor', how='left')
    .drop(columns=['fk_collective_actor', 'pk_collective_actor_name'])
    .merge(coac_text_property, left_on='pk_collective_actor', right_on='fk_collective_actor', how='left')
    .drop(columns=['fk_collective_actor', 'pk_collective_actor_text_property'])
)
coacs['begin_date_name'] = [u.parse_tuple_date(raw_date) for raw_date in coacs['begin_date_name']]
coacs['end_date_name'] = [u.parse_tuple_date(raw_date) for raw_date in coacs['end_date_name']]

# Prefix the key with 'CoAc' and expose it as pk_coac.
coacs = (
    coacs
    .assign(pk_collective_actor='CoAc' + coacs['pk_collective_actor'].astype(str))
    .rename(columns={'pk_collective_actor':'pk_coac'})
)

# a.infos(coacs)

### 3.2/ Fetch Geovistory existing groups

In [None]:
# Open the Geovistory connection, then tag this session with a dated import id
# (YYYYMMDD-<metadata_str>) and select the insert manner.
db.connect_geovistory(env, pk_project, execute)
db.set_metadata({'import-id': f"{datetime.datetime.today().strftime('%Y%m%d')}-{metadata_str}"})
db.set_insert_manner(import_manner)

In [None]:
# Find the groups that already exist in the Geovistory project.
# A group is returned only when the group itself, its "same as URI" statement,
# the URI entity and the URI's "has value" statement are all in the project.
groups = db.query(f"""
    select
        r1.pk_entity as pk_group,
        a4.string as uri
    from information.resource r1 
    inner join projects.info_proj_rel ipr1 on ipr1.fk_entity = r1.pk_entity and ipr1.fk_project = {pk_project} and ipr1.is_in_project = true
    inner join information.statement s2 on s2.fk_subject_info = r1.pk_entity and s2.fk_property = {pks.properties.entity_sameAsURI_URI}
    inner join projects.info_proj_rel ipr2a on ipr2a.fk_entity = s2.pk_entity and ipr2a.fk_project = {pk_project} and ipr2a.is_in_project = true
    inner join projects.info_proj_rel ipr2b on ipr2b.fk_entity = s2.fk_object_info and ipr2b.fk_project = {pk_project} and ipr2b.is_in_project = true
    inner join information.statement s3 on s3.fk_subject_info = s2.fk_object_info and s3.fk_property = {pks.properties.appe_hasValue_string}
    inner join projects.info_proj_rel ipr3 on ipr3.fk_entity = s3.pk_entity and ipr3.fk_project = {pk_project} and ipr3.is_in_project = true
    inner join information.appellation a4 on a4.pk_entity = s3.fk_object_info
    where r1.fk_class = {pks.classes.group}
""")
# Keep only URIs mentioning 'CoAc'. The explicit copy makes the filtered frame
# independent, so adding a column below is not a write on a slice of the query
# result (which raises pandas' SettingWithCopyWarning).
groups = groups[groups['uri'].str.contains('CoAc')].copy()
# The collective-actor key is the last path segment of the URI.
groups['pk_coac'] = [string[string.rfind('/') + 1:] for string in groups['uri']]
groups = groups.drop(columns='uri')

# Keys of the existing groups, rendered for use in the BHP query's "in ..." filter.
coacs_str = u.get_sql_ready_str(groups['pk_coac'])

# a.infos(groups)


### 3.3/ Fetch BHP related formation

In [None]:
# Connect to BHP database
db_url_env_var_name = 'YELLOW_BHP' # Name of an environment variable holding the Postgres database URL
bhp_db_url = os.getenv(db_url_env_var_name)
# Fail early and explicitly: os.getenv returns None when the variable is unset,
# and that None would otherwise be handed to the connection helper as the URL.
if not bhp_db_url:
    raise RuntimeError(f"Environment variable '{db_url_env_var_name}' is not set: cannot connect to the BHP database")
# This connection is always opened with execute=False, whatever the notebook-level `execute` is.
db.connect_external(bhp_db_url, execute=False)

In [None]:
# Dated BHP information rows for the collective actors that exist as groups in
# Geovistory. The numeric filters (fk_type_information = 30, fk_type_role = 49)
# are BHP keys; presumably they select "formation" information -- TODO confirm.
formations_bhp_info = db.query(f"""
    select
        ir.fk_associated_object as pk_coac, 
        i.pk_information, 
        id.year, id.month, id.day,
        id.fk_abob_type_information_date,
        id.complement as complement, 
        id.notes as notes,
        id.certainty_date
    from bhp.information_role ir
    inner join bhp.information i on i.pk_information = ir.fk_information and i.fk_type_information = 30
    inner join bhp.information_date id on id.fk_information = i.pk_information
    where ir.fk_associated_object in {coacs_str} and ir.fk_type_role = 49
""")

# Date parts and the date-type key are cast to pandas' nullable integer dtype.
formations_bhp_info = formations_bhp_info.astype({
    'year': pd.Int64Dtype(),
    'month': pd.Int64Dtype(),
    'day': pd.Int64Dtype(),
    'fk_abob_type_information_date': pd.Int64Dtype(),
})
# One (year, month, day) tuple per row, and the symogih URI of the information.
formations_bhp_info['date_bhp'] = [(record.year, record.month, record.day) for _, record in formations_bhp_info.iterrows()]
formations_bhp_info['uri'] = ['http://symogih.org/resource/Info' + str(pk_information) for pk_information in formations_bhp_info['pk_information']]
formations_bhp_info = formations_bhp_info.drop(columns=['year', 'month', 'day', 'pk_information'])

# Free-text columns: missing or whitespace-only values become <NA>; the blank
# test is done on the raw text, before any markup is removed. Then <p> tags are
# stripped from both columns, and <em> tags from the notes only.
formations_bhp_info['complement'] = [
    pd.NA if pd.isna(text) or text.strip() == '' else text.replace('<p>', '').replace('</p>', '')
    for text in formations_bhp_info['complement']
]
formations_bhp_info['notes'] = [
    pd.NA if pd.isna(text) or text.strip() == '' else text.replace('<p>', '').replace('</p>', '').replace('<em>', '').replace('</em>', '')
    for text in formations_bhp_info['notes']
]

# a.infos(formations_bhp_info)

### 3.4/ Build import table

#### 3.4.1/ Create table, and link with GV id

In [None]:
# One row per collective actor known to either source (BHP CSV or BHP database).
# The merge with `groups` (on the shared pk_coac column) keeps only those that
# exist as groups in Geovistory and brings in pk_group.
formations = (
    pd.DataFrame({'pk_coac': np.unique(coacs['pk_coac'].tolist() + formations_bhp_info['pk_coac'].tolist())})
    .merge(groups)
    .sort_values('pk_coac')
    [['pk_group', 'pk_coac']]
)

# a.infos(formations)

#### 3.4.2/ Add date information

In [None]:
### Begin of the begin ###
# Fills formations['begin_of_the_begin'] with (year, month, day) tuples.
note_begin = 4  # value of coacs['notes_begin'] selecting the CSV rows used here (meaning not visible in this notebook)
property_name = 'begin_of_the_begin'
abob_types = [1125,1321,1322]  # BHP date-type keys mapped onto this property

# Built once: the loops below previously rebuilt and scanned the full list of
# group keys on every iteration.
known_coacs = set(groups['pk_coac'])

formations[property_name] = pd.NA
formations.set_index('pk_coac', inplace=True)

# From CSV file: only a year is available, month and day stay missing
selection = coacs[coacs['notes_begin'] == note_begin][['pk_coac', 'begin_year_coac']]
for _, row in selection.iterrows():
    if row['pk_coac'] in known_coacs and pd.notna(row['begin_year_coac']):
        formations.at[row['pk_coac'], property_name] = (row['begin_year_coac'], pd.NA, pd.NA)

# From BHP information (has prio over CSV: written last, so it overwrites)
selection = formations_bhp_info[formations_bhp_info['fk_abob_type_information_date'].isin(abob_types)][['pk_coac', 'date_bhp']]
for _, row in selection.iterrows():
    if row['pk_coac'] in known_coacs:
        formations.at[row['pk_coac'], property_name] = row['date_bhp']

formations.reset_index(inplace=True)

# a.infos(formations)

In [None]:
### Begin of the end ###
property_name = 'begin_of_the_end'
abob_types = [1290]

# pk_coac values that have a Geovistory group. Built once as a set so the
# per-row membership test does not rebuild and scan a list.
known_coacs = set(groups['pk_coac'])

formations[property_name] = pd.NA
formations.set_index('pk_coac', inplace=True)

# From BHP information (only source used for this property)
selection = formations_bhp_info[formations_bhp_info['fk_abob_type_information_date'].isin(abob_types)][['pk_coac', 'date_bhp']]
for _, row in selection.iterrows():
    if row['pk_coac'] in known_coacs:
        formations.at[row['pk_coac'], property_name] = row['date_bhp']

formations.reset_index(inplace=True)

# a.infos(formations)

In [None]:
### End of the begin ###
property_name = 'end_of_the_begin'
abob_types = [1323]

# pk_coac values that have a Geovistory group. Built once as a set so the
# per-row membership test does not rebuild and scan a list.
known_coacs = set(groups['pk_coac'])

formations[property_name] = pd.NA
formations.set_index('pk_coac', inplace=True)

# From BHP information (only source used for this property)
selection = formations_bhp_info[formations_bhp_info['fk_abob_type_information_date'].isin(abob_types)][['pk_coac', 'date_bhp']]
for _, row in selection.iterrows():
    if row['pk_coac'] in known_coacs:
        formations.at[row['pk_coac'], property_name] = row['date_bhp']

formations.reset_index(inplace=True)

# a.infos(formations)

In [None]:
### End of the end ###
note_begin = 1
property_name = 'end_of_the_end'
abob_types = [256,1126,1128]

# pk_coac values that have a Geovistory group. Built once as a set so the
# per-row membership test does not rebuild and scan a list.
known_coacs = set(groups['pk_coac'])

formations[property_name] = pd.NA
formations.set_index('pk_coac', inplace=True)

# From CSV file: only a year is available, month and day are left as NA
selection = coacs[coacs['notes_begin'] == note_begin][['pk_coac', 'begin_year_coac']]
for _, row in selection.iterrows():
    if row['pk_coac'] in known_coacs and pd.notna(row['begin_year_coac']):
        formations.at[row['pk_coac'], property_name] = (row['begin_year_coac'], pd.NA, pd.NA)

# From BHP information (has priority over CSV: written last, it overwrites)
selection = formations_bhp_info[formations_bhp_info['fk_abob_type_information_date'].isin(abob_types)][['pk_coac', 'date_bhp']]
for _, row in selection.iterrows():
    if row['pk_coac'] in known_coacs:
        formations.at[row['pk_coac'], property_name] = row['date_bhp']

formations.reset_index(inplace=True)

# a.infos(formations)

In [None]:
### Ongoing throughout ###
note_begin = 3
property_name = 'ongoing_throughout'
abob_types = [258]

# pk_coac values that have a Geovistory group. Built once as a set so the
# per-row membership test does not rebuild and scan a list.
known_coacs = set(groups['pk_coac'])

formations[property_name] = pd.NA
formations.set_index('pk_coac', inplace=True)

# From CSV file: only a year is available, month and day are left as NA
selection = coacs[coacs['notes_begin'] == note_begin][['pk_coac', 'begin_year_coac']]
for _, row in selection.iterrows():
    if row['pk_coac'] in known_coacs and pd.notna(row['begin_year_coac']):
        formations.at[row['pk_coac'], property_name] = (row['begin_year_coac'], pd.NA, pd.NA)

# From BHP information (has priority over CSV: written last, it overwrites)
selection = formations_bhp_info[formations_bhp_info['fk_abob_type_information_date'].isin(abob_types)][['pk_coac', 'date_bhp']]
for _, row in selection.iterrows():
    if row['pk_coac'] in known_coacs:
        formations.at[row['pk_coac'], property_name] = row['date_bhp']

formations.reset_index(inplace=True)

# a.infos(formations)

In [None]:
### At some time within ###
note_begin = 2
property_name = 'at_some_time_within'
abob_types = [246]

# pk_coac values that have a Geovistory group. Built once as a set so the
# per-row membership test does not rebuild and scan a list.
known_coacs = set(groups['pk_coac'])

formations[property_name] = pd.NA
formations.set_index('pk_coac', inplace=True)

# From CSV file: only a year is available, month and day are left as NA
selection = coacs[coacs['notes_begin'] == note_begin][['pk_coac', 'begin_year_coac']]
for _, row in selection.iterrows():
    if row['pk_coac'] in known_coacs and pd.notna(row['begin_year_coac']):
        formations.at[row['pk_coac'], property_name] = (row['begin_year_coac'], pd.NA, pd.NA)

# From BHP information (has priority over CSV: written last, it overwrites)
selection = formations_bhp_info[formations_bhp_info['fk_abob_type_information_date'].isin(abob_types)][['pk_coac', 'date_bhp']]
for _, row in selection.iterrows():
    if row['pk_coac'] in known_coacs:
        formations.at[row['pk_coac'], property_name] = row['date_bhp']

formations.reset_index(inplace=True)

# a.infos(formations)

In [None]:
# Default cases: At some time within
# Dates whose note / date type matches none of the dedicated cases are
# stored in the 'at_some_time_within' column.

# pk_coac values that have a Geovistory group (built once, O(1) lookups)
known_coacs = set(groups['pk_coac'])

# BHP date types that are dispatched to a dedicated column elsewhere.
# NOTE(review): 1289 is listed here but no visible cell maps it to a column,
# so dates of that type are not imported at all — confirm this is intended.
handled_abob_types = {246, 1125, 1126, 258, 1289, 1290, 1321, 1322, 1323, 256, 1128}

formations.set_index('pk_coac', inplace=True)

# From CSV file
for _, row in coacs.iterrows():
    note = row['notes_begin']
    if pd.notna(note) and note in [1,2,3,4]:
        continue  # handled by one of the dedicated cases
    if row['pk_coac'] in known_coacs and pd.notna(row['begin_year_coac']):
        formations.at[row['pk_coac'], 'at_some_time_within'] = (row['begin_year_coac'], pd.NA, pd.NA)

# From BHP infos
for _, row in formations_bhp_info.iterrows():
    abob_type = row['fk_abob_type_information_date']
    # Missing type is tested explicitly: comparing pd.NA with the handled
    # values yields pd.NA, whose truth value raises TypeError.
    if pd.notna(abob_type) and abob_type in handled_abob_types:
        continue
    if row['pk_coac'] not in known_coacs:
        continue
    # Skip fully empty dates. A tuple comparison against (NA, NA, NA) cannot
    # be used here: `1900 == pd.NA` is pd.NA and makes the comparison raise.
    if all(pd.isna(part) for part in row['date_bhp']):
        continue
    formations.at[row['pk_coac'], 'at_some_time_within'] = row['date_bhp']

formations.reset_index(inplace=True)

# a.infos(formations)

#### 3.4.3/ Add URI

In [None]:
# URIs only exist for formations documented by a BHP information

formations['uri'] = pd.NA
formations = formations.set_index('pk_coac')

for coac_id, info_uri in zip(formations_bhp_info['pk_coac'], formations_bhp_info['uri']):
    formations.at[coac_id, 'uri'] = info_uri

formations = formations.reset_index()

# a.infos(formations)

#### 3.4.4/ Add certainty comment

In [None]:
# Comment text attached for each certainty code; other codes get no comment
certainty_labels = {2: "Date reconstituée", 3: "Date postulée"}

# pk_coac values that have a Geovistory group. Built once as a set so the
# per-row membership test does not rebuild and scan a list.
known_coacs = set(groups['pk_coac'])

formations['certainty_comment'] = pd.NA

formations.set_index('pk_coac', inplace=True)

# From CSV file
for _, row in coacs.iterrows():
    certainty = row['certainty_begin']
    if pd.notna(certainty) and certainty in certainty_labels and row['pk_coac'] in known_coacs:
        formations.at[row['pk_coac'], 'certainty_comment'] = certainty_labels[certainty]

# From BHP info (written last, so it overwrites the CSV value)
for _, row in formations_bhp_info.iterrows():
    certainty = row['certainty_date']
    if pd.notna(certainty) and certainty in certainty_labels and row['pk_coac'] in known_coacs:
        formations.at[row['pk_coac'], 'certainty_comment'] = certainty_labels[certainty]

formations.reset_index(inplace=True)

# a.infos(formations)

#### 3.4.5/ Add Date complement

In [None]:
formations['date_complement'] = pd.NA

formations = formations.set_index('pk_coac')

# The complement only comes from the BHP information
for coac_id, complement in zip(formations_bhp_info['pk_coac'], formations_bhp_info['complement']):
    formations.at[coac_id, 'date_complement'] = complement

formations = formations.reset_index()

# a.infos(formations)

#### 3.4.6/ Add date note

In [None]:
formations['date_note'] = pd.NA

formations = formations.set_index('pk_coac')

# The note only comes from the BHP information; it is prefixed with '[Note] '
for coac_id, note in zip(formations_bhp_info['pk_coac'], formations_bhp_info['notes']):
    formations.at[coac_id, 'date_note'] = '[Note] ' + note

formations = formations.reset_index()

# a.infos(formations)

### 3.5/ Remove formations that should not be created

In [None]:
# Keep only the formations carrying at least one piece of information
informative_columns = [
    'begin_of_the_begin',
    'begin_of_the_end',
    'end_of_the_begin',
    'end_of_the_end',
    'ongoing_throughout',
    'at_some_time_within',
    'uri',
    'certainty_comment',
    'date_complement',
    'date_note',
]
has_content = formations[informative_columns].notna().any(axis=1)
formations = formations[has_content].reset_index(drop=True)

# a.infos(formations)

### 3.6/ Generate CSV for validation

In [None]:
# Write the import table to disk (validation CSV), then display its summary
validation_csv = './formation.csv'
formations.to_csv(validation_csv, index=False)

a.infos(formations)

## 4./ Add Dissolutions

### 4.1/ Fetch local file about coacs

In [None]:
# Collective actors, with columns renamed so they stay unambiguous after the merges
coac = (
    u.read_df('../../data/bhp/collective-actor.csv', skip_info=True)
    .rename(columns={'notes':'notes_coac', 'begin_year':'begin_year_coac', 'end_year':'end_year_coac'})
    .drop(columns=['concat_standard_name'])
)
# Names of the collective actors
coac_name = (
    u.read_df('../../data/bhp/collective-actor-name.csv', skip_info=True)
    .rename(columns={'notes':'notes_name', 'lang_iso':'lang_name', 'comment_begin_year':'comment_begin_year_name', 'comment_end_year':'comment_end_year_name', 'begin_date':'begin_date_name', 'end_date':'end_date_name'})
)
# Text properties of the collective actors
coac_text_property = (
    u.read_df('../../data/bhp/collective-actor-text-property.csv', skip_info=True)
    .rename(columns={'notes':'notes_text_prop', 'lang_iso_code':'lang_text_prop'})
)

# Left joins: every collective actor is kept, with or without name / text property
coacs = (
    coac
    .merge(coac_name, left_on='pk_collective_actor', right_on='fk_collective_actor', how='left')
    .drop(columns=['fk_collective_actor', 'pk_collective_actor_name'])
    .merge(coac_text_property, left_on='pk_collective_actor', right_on='fk_collective_actor', how='left')
    .drop(columns=['fk_collective_actor', 'pk_collective_actor_text_property'])
)
for date_column in ('begin_date_name', 'end_date_name'):
    coacs[date_column] = [u.parse_tuple_date(raw_date) for raw_date in coacs[date_column]]

# Identifiers get the 'CoAc' prefix and the column is renamed to pk_coac
coacs['pk_collective_actor'] = 'CoAc' + coacs['pk_collective_actor'].astype(str)
coacs = coacs.rename(columns={'pk_collective_actor':'pk_coac'})

# a.infos(coacs)

### 4.2/ Fetch Geovistory existing groups

In [None]:
# Open the Geovistory connection and tag the import with today's date
db.connect_geovistory(env, pk_project, execute)
db.set_metadata({'import-id': f"{datetime.datetime.today():%Y%m%d}-{metadata_str}"})
db.set_insert_manner(import_manner)

In [None]:
# Groups of the project that carry a "same as URI" statement, with that URI
groups = db.query(f"""
    select
        r1.pk_entity as pk_group,
        a4.string as uri
    from information.resource r1 
    inner join projects.info_proj_rel ipr1 on ipr1.fk_entity = r1.pk_entity and ipr1.fk_project = {pk_project} and ipr1.is_in_project = true
    inner join information.statement s2 on s2.fk_subject_info = r1.pk_entity and s2.fk_property = {pks.properties.entity_sameAsURI_URI}
    inner join projects.info_proj_rel ipr2a on ipr2a.fk_entity = s2.pk_entity and ipr2a.fk_project = {pk_project} and ipr2a.is_in_project = true
    inner join projects.info_proj_rel ipr2b on ipr2b.fk_entity = s2.fk_object_info and ipr2b.fk_project = {pk_project} and ipr2b.is_in_project = true
    inner join information.statement s3 on s3.fk_subject_info = s2.fk_object_info and s3.fk_property = {pks.properties.appe_hasValue_string}
    inner join projects.info_proj_rel ipr3 on ipr3.fk_entity = s3.pk_entity and ipr3.fk_project = {pk_project} and ipr3.is_in_project = true
    inner join information.appellation a4 on a4.pk_entity = s3.fk_object_info
    where r1.fk_class = {pks.classes.group}
""")
# Keep the URIs containing 'CoAc'; the identifier is what follows the last '/'
groups = (
    groups[groups['uri'].str.contains('CoAc')]
    .assign(pk_coac=lambda df: [uri[uri.rfind('/') + 1:] for uri in df['uri']])
    .drop(columns='uri')
)

coacs_str = u.get_sql_ready_str(groups['pk_coac'])

# a.infos(groups)


### 4.3/ Fetch BHP related dissolutions

In [None]:
# Connect to the BHP database; its Postgres URL is read from an environment variable
db_url_env_var_name = 'YELLOW_BHP'
db.connect_external(os.environ.get(db_url_env_var_name), execute=False)

In [None]:
# BHP information dates (information type 33, role type 54) attached to the
# collective actors that exist as groups in Geovistory
dissolutions_bhp_info = db.query(f"""
    select
        ir.fk_associated_object as pk_coac, 
        i.pk_information, 
        id.year, id.month, id.day,
        id.fk_abob_type_information_date,
        id.complement as complement, 
        id.notes as notes,
        id.certainty_date
    from bhp.information_role ir
    inner join bhp.information i on i.pk_information = ir.fk_information and i.fk_type_information = 33
    inner join bhp.information_date id on id.fk_information = i.pk_information
    where ir.fk_associated_object in {coacs_str} and ir.fk_type_role = 54
""")

# Cast to pandas' nullable integer dtype
for int_column in ['year', 'month', 'day', 'fk_abob_type_information_date']:
    dissolutions_bhp_info[int_column] = dissolutions_bhp_info[int_column].astype(pd.Int64Dtype())

# One (year, month, day) tuple per date, and the URI of the information
dissolutions_bhp_info['date_bhp'] = [(row.year, row.month, row.day) for _, row in dissolutions_bhp_info.iterrows()]
dissolutions_bhp_info['uri'] = ['http://symogih.org/resource/Info' + str(fk_info) for fk_info in dissolutions_bhp_info['pk_information']]
dissolutions_bhp_info.drop(columns=['year', 'month', 'day', 'pk_information'], inplace=True)

# Missing or blank texts become pd.NA; the others lose their HTML tags.
# The notes are cleaned the same way as the formation notes (<p> and <em>).
html_tags = {'complement': ['<p>', '</p>'], 'notes': ['<p>', '</p>', '<em>', '</em>']}
for text_column, tags in html_tags.items():
    cleaned = []
    for text in dissolutions_bhp_info[text_column]:
        if pd.isna(text) or text.strip() == '':
            cleaned.append(pd.NA)
            continue
        for tag in tags:
            text = text.replace(tag, '')
        cleaned.append(text)
    dissolutions_bhp_info[text_column] = cleaned

a.infos(dissolutions_bhp_info)

### 4.4/ Build import table

#### 4.4.1/ Create table, and link with GV id

In [None]:
# One row per collective actor known either from the CSV or from BHP,
# restricted (by the merge) to those having a group in `groups`.
dissolutions = (
    pd.DataFrame({
        'pk_coac': np.unique(coacs['pk_coac'].tolist() + dissolutions_bhp_info['pk_coac'].tolist())
    })
    .merge(groups)
    .sort_values('pk_coac')
    [['pk_group', 'pk_coac']]
)

# a.infos(dissolutions)

#### 4.4.2/ Add date information

In [None]:
### Begin of the begin ###
note_begin = 4
property_name = 'begin_of_the_begin'
abob_types = [1125,1321,1322]

# pk_coac values that have a Geovistory group. Built once as a set so the
# per-row membership test does not rebuild and scan a list.
known_coacs = set(groups['pk_coac'])

dissolutions[property_name] = pd.NA
dissolutions.set_index('pk_coac', inplace=True)

# From CSV file: only a year is available, month and day are left as NA
# NOTE(review): rows are filtered on 'notes_begin' while 'end_year_coac' is
# read — confirm whether an end-specific notes column should be used instead.
selection = coacs[coacs['notes_begin'] == note_begin][['pk_coac', 'end_year_coac']]
for _, row in selection.iterrows():
    if row['pk_coac'] in known_coacs and pd.notna(row['end_year_coac']):
        dissolutions.at[row['pk_coac'], property_name] = (row['end_year_coac'], pd.NA, pd.NA)

# From BHP information (has priority over CSV: written last, it overwrites)
selection = dissolutions_bhp_info[dissolutions_bhp_info['fk_abob_type_information_date'].isin(abob_types)][['pk_coac', 'date_bhp']]
for _, row in selection.iterrows():
    if row['pk_coac'] in known_coacs:
        dissolutions.at[row['pk_coac'], property_name] = row['date_bhp']

dissolutions.reset_index(inplace=True)

# a.infos(dissolutions)

In [None]:
### Begin of the end ###
property_name = 'begin_of_the_end'
abob_types = [1290]

# pk_coac values that have a Geovistory group. Built once as a set so the
# per-row membership test does not rebuild and scan a list.
known_coacs = set(groups['pk_coac'])

dissolutions[property_name] = pd.NA
dissolutions.set_index('pk_coac', inplace=True)

# From BHP information (only source used for this property)
selection = dissolutions_bhp_info[dissolutions_bhp_info['fk_abob_type_information_date'].isin(abob_types)][['pk_coac', 'date_bhp']]
for _, row in selection.iterrows():
    if row['pk_coac'] in known_coacs:
        dissolutions.at[row['pk_coac'], property_name] = row['date_bhp']

dissolutions.reset_index(inplace=True)

# a.infos(dissolutions)

In [None]:
### End of the begin ###
property_name = 'end_of_the_begin'
abob_types = [1323]

# pk_coac values that have a Geovistory group. Built once as a set so the
# per-row membership test does not rebuild and scan a list.
known_coacs = set(groups['pk_coac'])

dissolutions[property_name] = pd.NA
dissolutions.set_index('pk_coac', inplace=True)

# From BHP information (only source used for this property)
selection = dissolutions_bhp_info[dissolutions_bhp_info['fk_abob_type_information_date'].isin(abob_types)][['pk_coac', 'date_bhp']]
for _, row in selection.iterrows():
    if row['pk_coac'] in known_coacs:
        dissolutions.at[row['pk_coac'], property_name] = row['date_bhp']

dissolutions.reset_index(inplace=True)

# a.infos(dissolutions)

In [None]:
### End of the end ###
note_begin = 1
property_name = 'end_of_the_end'
abob_types = [256,1126,1128]

# pk_coac values that have a Geovistory group. Built once as a set so the
# per-row membership test does not rebuild and scan a list.
known_coacs = set(groups['pk_coac'])

dissolutions[property_name] = pd.NA
dissolutions.set_index('pk_coac', inplace=True)

# From CSV file: only a year is available, month and day are left as NA
# NOTE(review): rows are filtered on 'notes_begin' while 'end_year_coac' is
# read — confirm whether an end-specific notes column should be used instead.
selection = coacs[coacs['notes_begin'] == note_begin][['pk_coac', 'end_year_coac']]
for _, row in selection.iterrows():
    if row['pk_coac'] in known_coacs and pd.notna(row['end_year_coac']):
        dissolutions.at[row['pk_coac'], property_name] = (row['end_year_coac'], pd.NA, pd.NA)

# From BHP information (has priority over CSV: written last, it overwrites)
selection = dissolutions_bhp_info[dissolutions_bhp_info['fk_abob_type_information_date'].isin(abob_types)][['pk_coac', 'date_bhp']]
for _, row in selection.iterrows():
    if row['pk_coac'] in known_coacs:
        dissolutions.at[row['pk_coac'], property_name] = row['date_bhp']

dissolutions.reset_index(inplace=True)

# a.infos(dissolutions)

In [None]:
### Ongoing throughout ###
note_begin = 3
property_name = 'ongoing_throughout'
abob_types = [258]

# pk_coac values that have a Geovistory group. Built once as a set so the
# per-row membership test does not rebuild and scan a list.
known_coacs = set(groups['pk_coac'])

dissolutions[property_name] = pd.NA
dissolutions.set_index('pk_coac', inplace=True)

# From CSV file: only a year is available, month and day are left as NA
# NOTE(review): rows are filtered on 'notes_begin' while 'end_year_coac' is
# read — confirm whether an end-specific notes column should be used instead.
selection = coacs[coacs['notes_begin'] == note_begin][['pk_coac', 'end_year_coac']]
for _, row in selection.iterrows():
    if row['pk_coac'] in known_coacs and pd.notna(row['end_year_coac']):
        dissolutions.at[row['pk_coac'], property_name] = (row['end_year_coac'], pd.NA, pd.NA)

# From BHP information (has priority over CSV: written last, it overwrites)
selection = dissolutions_bhp_info[dissolutions_bhp_info['fk_abob_type_information_date'].isin(abob_types)][['pk_coac', 'date_bhp']]
for _, row in selection.iterrows():
    if row['pk_coac'] in known_coacs:
        dissolutions.at[row['pk_coac'], property_name] = row['date_bhp']

dissolutions.reset_index(inplace=True)

# a.infos(dissolutions)

In [None]:
### At some time within ###
note_begin = 2
property_name = 'at_some_time_within'
abob_types = [246]

# pk_coac values that have a Geovistory group. Built once as a set so the
# per-row membership test does not rebuild and scan a list.
known_coacs = set(groups['pk_coac'])

dissolutions[property_name] = pd.NA
dissolutions.set_index('pk_coac', inplace=True)

# From CSV file: only a year is available, month and day are left as NA
# NOTE(review): rows are filtered on 'notes_begin' while 'end_year_coac' is
# read — confirm whether an end-specific notes column should be used instead.
selection = coacs[coacs['notes_begin'] == note_begin][['pk_coac', 'end_year_coac']]
for _, row in selection.iterrows():
    if row['pk_coac'] in known_coacs and pd.notna(row['end_year_coac']):
        dissolutions.at[row['pk_coac'], property_name] = (row['end_year_coac'], pd.NA, pd.NA)

# From BHP information (has priority over CSV: written last, it overwrites)
selection = dissolutions_bhp_info[dissolutions_bhp_info['fk_abob_type_information_date'].isin(abob_types)][['pk_coac', 'date_bhp']]
for _, row in selection.iterrows():
    if row['pk_coac'] in known_coacs:
        dissolutions.at[row['pk_coac'], property_name] = row['date_bhp']

dissolutions.reset_index(inplace=True)

# a.infos(dissolutions)

In [None]:
# Default cases: At some time within
# Dates whose note / date type matches none of the dedicated cases are
# stored in the 'at_some_time_within' column.

# pk_coac values that have a Geovistory group (built once, O(1) lookups)
known_coacs = set(groups['pk_coac'])

# BHP date types that are dispatched to a dedicated column elsewhere.
# NOTE(review): 1289 is listed here but no visible cell maps it to a column,
# so dates of that type are not imported at all — confirm this is intended.
handled_abob_types = {246, 1125, 1126, 258, 1289, 1290, 1321, 1322, 1323, 256, 1128}

dissolutions.set_index('pk_coac', inplace=True)

# From CSV file
# NOTE(review): the filter uses 'notes_begin' while 'end_year_coac' is read —
# confirm whether an end-specific notes column should be used instead.
for _, row in coacs.iterrows():
    note = row['notes_begin']
    if pd.notna(note) and note in [1,2,3,4]:
        continue  # handled by one of the dedicated cases
    if row['pk_coac'] in known_coacs and pd.notna(row['end_year_coac']):
        dissolutions.at[row['pk_coac'], 'at_some_time_within'] = (row['end_year_coac'], pd.NA, pd.NA)

# From BHP infos
for _, row in dissolutions_bhp_info.iterrows():
    abob_type = row['fk_abob_type_information_date']
    # Missing type is tested explicitly: comparing pd.NA with the handled
    # values yields pd.NA, whose truth value raises TypeError.
    if pd.notna(abob_type) and abob_type in handled_abob_types:
        continue
    if row['pk_coac'] not in known_coacs:
        continue
    # Skip fully empty dates. A tuple comparison against (NA, NA, NA) cannot
    # be used here: `1900 == pd.NA` is pd.NA and makes the comparison raise.
    if all(pd.isna(part) for part in row['date_bhp']):
        continue
    dissolutions.at[row['pk_coac'], 'at_some_time_within'] = row['date_bhp']

dissolutions.reset_index(inplace=True)

# a.infos(dissolutions)

#### 4.4.3/ Add URI

In [None]:
# URIs only exist for dissolutions documented by a BHP information

dissolutions['uri'] = pd.NA
dissolutions = dissolutions.set_index('pk_coac')

for coac_id, info_uri in zip(dissolutions_bhp_info['pk_coac'], dissolutions_bhp_info['uri']):
    dissolutions.at[coac_id, 'uri'] = info_uri

dissolutions = dissolutions.reset_index()

# a.infos(dissolutions)

#### 4.4.4/ Add certainty comment

In [None]:
# Comment text attached for each certainty code; other codes get no comment
certainty_labels = {2: "Date reconstituée", 3: "Date postulée"}

# pk_coac values that have a Geovistory group. Built once as a set so the
# per-row membership test does not rebuild and scan a list.
known_coacs = set(groups['pk_coac'])

dissolutions['certainty_comment'] = pd.NA

dissolutions.set_index('pk_coac', inplace=True)

# From CSV file
# NOTE(review): this reads 'certainty_begin' for a dissolution (end) date —
# confirm whether an end-specific certainty column should be used instead.
for _, row in coacs.iterrows():
    certainty = row['certainty_begin']
    if pd.notna(certainty) and certainty in certainty_labels and row['pk_coac'] in known_coacs:
        dissolutions.at[row['pk_coac'], 'certainty_comment'] = certainty_labels[certainty]

# From BHP info (written last, so it overwrites the CSV value)
for _, row in dissolutions_bhp_info.iterrows():
    certainty = row['certainty_date']
    if pd.notna(certainty) and certainty in certainty_labels and row['pk_coac'] in known_coacs:
        dissolutions.at[row['pk_coac'], 'certainty_comment'] = certainty_labels[certainty]

dissolutions.reset_index(inplace=True)

# a.infos(dissolutions)

#### 4.4.5/ Add Date complement

In [None]:
dissolutions['date_complement'] = pd.NA

dissolutions = dissolutions.set_index('pk_coac')

# The complement only comes from the BHP information
for coac_id, complement in zip(dissolutions_bhp_info['pk_coac'], dissolutions_bhp_info['complement']):
    dissolutions.at[coac_id, 'date_complement'] = complement

dissolutions = dissolutions.reset_index()

# a.infos(dissolutions)

#### 4.4.6/ Add date note

In [None]:
dissolutions['date_note'] = pd.NA

dissolutions = dissolutions.set_index('pk_coac')

# The note only comes from the BHP information; it is prefixed with '[Note] '
for coac_id, note in zip(dissolutions_bhp_info['pk_coac'], dissolutions_bhp_info['notes']):
    dissolutions.at[coac_id, 'date_note'] = '[Note] ' + note

dissolutions = dissolutions.reset_index()

# a.infos(dissolutions)

### 4.5/ Remove dissolutions that should not be created

In [None]:
# Keep only the dissolutions carrying at least one piece of information
informative_columns = [
    'begin_of_the_begin',
    'begin_of_the_end',
    'end_of_the_begin',
    'end_of_the_end',
    'ongoing_throughout',
    'at_some_time_within',
    'uri',
    'certainty_comment',
    'date_complement',
    'date_note',
]
has_content = dissolutions[informative_columns].notna().any(axis=1)
dissolutions = dissolutions[has_content].reset_index(drop=True)

# a.infos(dissolutions)

### 4.6/ Generate CSV for validation

In [None]:
# Write the import table to disk (validation CSV), then display its summary
validation_csv = './dissolutions.csv'
dissolutions.to_csv(validation_csv, index=False)

a.infos(dissolutions)

## 5/ Import new data

In [None]:
# Open the Geovistory connection and tag the import with today's date
db.connect_geovistory(env, pk_project, execute)
db.set_metadata({'import-id': f"{datetime.datetime.today():%Y%m%d}-{metadata_str}"})
db.set_insert_manner(import_manner)

In [None]:
def get_duration(date):
    """Return the duration matching the precision of a (year, month, day) date.

    '1 year' when only the year is set, '1 month' when year and month are set,
    '1 day' when all three are set. Any other combination (missing year, or a
    day without a month) gives pd.NA.
    """
    if pd.isna(date[0]):
        return pd.NA
    if pd.isna(date[1]):
        # Year only, unless a day is given without its month
        return '1 year' if pd.isna(date[2]) else pd.NA
    return '1 month' if pd.isna(date[2]) else '1 day'

### 5.1/ Create formations / dissolutions

In [None]:
# Create as many formation / dissolution resources as there are rows, and keep
# the returned values as a new column (presumably one pk per created
# resource, in row order — TODO confirm against db.resources.create).
formations["pk_formation"] = db.resources.create(pks.classes.formation, len(formations))
dissolutions["pk_dissolution"] = db.resources.create(pks.classes.dissolution, len(dissolutions))

### 5.2/ Link formations / dissolutions to their group

In [None]:
# One statement per row: formation --hasFormed--> group, dissolution --dissolved--> group
db.statements.create(formations['pk_formation'], pks.properties.formation_hasFormed_group, formations['pk_group'])
db.statements.create(dissolutions['pk_dissolution'], pks.properties.dissolution_dissolved_group, dissolutions['pk_group'])

### 5.3/ Begin of the begin

In [None]:
# Same steps for formations, then dissolutions: create a time primitive for
# every filled date and link it to its entity.
timespan_column = 'begin_of_the_begin'
for table, pk_column in ((formations, 'pk_formation'), (dissolutions, 'pk_dissolution')):
    selection = table[pd.notna(table[timespan_column])]
    durations = [get_duration(date) for date in selection[timespan_column]]
    time_prims = db.time_primitives.create(selection[timespan_column], durations)
    db.statements.create(selection[pk_column], pks.properties.timespan_beginOfTheBegin_timePrim, time_prims)

### 5.4/ Begin of the end

In [None]:
# Same steps for formations, then dissolutions: create a time primitive for
# every filled date and link it to its entity.
timespan_column = 'begin_of_the_end'
for table, pk_column in ((formations, 'pk_formation'), (dissolutions, 'pk_dissolution')):
    selection = table[pd.notna(table[timespan_column])]
    durations = [get_duration(date) for date in selection[timespan_column]]
    time_prims = db.time_primitives.create(selection[timespan_column], durations)
    db.statements.create(selection[pk_column], pks.properties.timespan_beginOfTheEnd_timePrim, time_prims)

### 5.5/ End of the begin

In [None]:
# Same steps for formations, then dissolutions: create a time primitive for
# every filled date and link it to its entity.
timespan_column = 'end_of_the_begin'
for table, pk_column in ((formations, 'pk_formation'), (dissolutions, 'pk_dissolution')):
    selection = table[pd.notna(table[timespan_column])]
    durations = [get_duration(date) for date in selection[timespan_column]]
    time_prims = db.time_primitives.create(selection[timespan_column], durations)
    db.statements.create(selection[pk_column], pks.properties.timespan_endOfTheBegin_timePrim, time_prims)

### 5.6/ End of the end

In [None]:
# Same steps for formations, then dissolutions: create a time primitive for
# every filled date and link it to its entity.
timespan_column = 'end_of_the_end'
for table, pk_column in ((formations, 'pk_formation'), (dissolutions, 'pk_dissolution')):
    selection = table[pd.notna(table[timespan_column])]
    durations = [get_duration(date) for date in selection[timespan_column]]
    time_prims = db.time_primitives.create(selection[timespan_column], durations)
    db.statements.create(selection[pk_column], pks.properties.timespan_endOfTheEnd_timePrim, time_prims)

### 5.7/ Ongoing throughout

In [None]:
# Same steps for formations, then dissolutions: create a time primitive for
# every filled date and link it to its entity.
timespan_column = 'ongoing_throughout'
for table, pk_column in ((formations, 'pk_formation'), (dissolutions, 'pk_dissolution')):
    selection = table[pd.notna(table[timespan_column])]
    durations = [get_duration(date) for date in selection[timespan_column]]
    time_prims = db.time_primitives.create(selection[timespan_column], durations)
    db.statements.create(selection[pk_column], pks.properties.timeSpan_ongoingThroughout_timePrimitive, time_prims)

### 5.8/ At some time within

In [None]:
# Same steps for formations, then dissolutions: create a time primitive for
# every filled date and link it to its entity.
timespan_column = 'at_some_time_within'
for table, pk_column in ((formations, 'pk_formation'), (dissolutions, 'pk_dissolution')):
    selection = table[pd.notna(table[timespan_column])]
    durations = [get_duration(date) for date in selection[timespan_column]]
    time_prims = db.time_primitives.create(selection[timespan_column], durations)
    db.statements.create(selection[pk_column], pks.properties.timeSpan_atSomeTimeWithin_timePrimitive, time_prims)

### 5.9/ URI

In [None]:
# Attach the BHP information URI to every formation, then dissolution, that has one
for table, pk_column in ((formations, 'pk_formation'), (dissolutions, 'pk_dissolution')):
    selection = table[pd.notna(table['uri'])]
    db.shortcuts.add_uris(selection[pk_column], selection['uri'])

### 5.10/ Certainty comment

In [None]:
# Value used as the object of the "has comment type" statement
# (presumably the pk of the certainty comment type — TODO confirm)
comment_type = 7953586

# For formations, then dissolutions: create one comment per filled value,
# give it its text and its type, then attach it to the entity.
for table, pk_column in ((formations, 'pk_formation'), (dissolutions, 'pk_dissolution')):
    selection = table[pd.notna(table['certainty_comment'])]
    pk_comments = db.resources.create(pks.classes.comment, len(selection))
    pk_appes = db.appellations.create(selection['certainty_comment'])
    db.statements.create(pk_comments, pks.properties.appe_hasValue_string, pk_appes)
    db.statements.create(pk_comments, pks.properties.comment_hasCommentType_CommentType, comment_type)
    db.statements.create(selection[pk_column], pks.properties.entity_hasComment_text, pk_comments)

### 5.11/ Date_complement

In [None]:
# Value used as the object of the "has comment type" statement
# (presumably the pk of the date complement comment type — TODO confirm)
comment_type = 8065621

# For formations, then dissolutions: create one comment per filled value,
# give it its text and its type, then attach it to the entity.
for table, pk_column in ((formations, 'pk_formation'), (dissolutions, 'pk_dissolution')):
    selection = table[pd.notna(table['date_complement'])]
    pk_comments = db.resources.create(pks.classes.comment, len(selection))
    pk_appes = db.appellations.create(selection['date_complement'])
    db.statements.create(pk_comments, pks.properties.appe_hasValue_string, pk_appes)
    db.statements.create(pk_comments, pks.properties.comment_hasCommentType_CommentType, comment_type)
    db.statements.create(selection[pk_column], pks.properties.entity_hasComment_text, pk_comments)

### 5.12/ Date note

In [None]:
# Value used as the object of the "has comment type" statement
# (presumably the pk of the date note comment type — TODO confirm)
comment_type = 8065632

# For formations, then dissolutions: create one comment per filled value,
# give it its text and its type, then attach it to the entity.
for table, pk_column in ((formations, 'pk_formation'), (dissolutions, 'pk_dissolution')):
    selection = table[pd.notna(table['date_note'])]
    pk_comments = db.resources.create(pks.classes.comment, len(selection))
    pk_appes = db.appellations.create(selection['date_note'])
    db.statements.create(pk_comments, pks.properties.appe_hasValue_string, pk_appes)
    db.statements.create(pk_comments, pks.properties.comment_hasCommentType_CommentType, comment_type)
    db.statements.create(selection[pk_column], pks.properties.entity_hasComment_text, pk_comments)