In [1]:
# %load ~/Desktop/geovpylib/heading.py
%load_ext autoreload
%autoreload 2

env = 'stag'
pk_project = 11850066
execute = True
metadata_str = 'collective-actors'
import_manner = 'one-shot' # 'batch'

import os
import pandas as pd
import numpy as np
from datetime import datetime
import duckdb
import plotly.express as px

import geovpylib.analysis as a
import geovpylib.database as db
import geovpylib.queries as q
import geovpylib.pks as pks
import geovpylib.sparql as sparql
import geovpylib.utils as u

eta = u.Eta()

# db.connect_external(os.getenv(''))
db.connect_geovistory(env, pk_project, execute)
db.set_metadata({'import-id': datetime.today().strftime('%Y%m%d') + '-' + metadata_str})
db.set_insert_manner(import_manner)

[DB] Connecting to STAGING Database ... Connected!


# Import collective actors

## Fetch data

### BHP infos

In [2]:
db.connect_external(os.environ.get('YELLOW_BHP'))

[DB] Connecting to PGSQL Database ... Connected!


In [3]:
coac = u.read_df('../../data/bhp/collective-actor.csv').rename(columns={'notes':'notes_coac', 'begin_year':'begin_year_coac', 'end_year':'end_year_coac'}).drop(columns=['concat_standard_name'])
coac_name = u.read_df('../../data/bhp/collective-actor-name.csv').rename(columns={'notes':'notes_name', 'lang_iso':'lang_name', 'comment_begin_year':'comment_begin_year_name', 'comment_end_year':'comment_end_year_name', 'begin_date':'begin_date_name', 'end_date':'end_date_name'})
coac_text_property = u.read_df('../../data/bhp/collective-actor-text-property.csv').rename(columns={'notes':'notes_text_prop', 'lang_iso_code':'lang_text_prop'})

coacs = coac.merge(coac_name, left_on='pk_collective_actor', right_on='fk_collective_actor', how='left').drop(columns=['fk_collective_actor', 'pk_collective_actor_name'])
coacs = coacs.merge(coac_text_property, left_on='pk_collective_actor', right_on='fk_collective_actor', how='left').drop(columns=['fk_collective_actor', 'pk_collective_actor_text_property'])
coacs['begin_date_name'] = [u.parse_tuple_date(d) for d in coacs['begin_date_name']]
coacs['end_date_name'] = [u.parse_tuple_date(d) for d in coacs['end_date_name']]

# a.infos(coacs)

# For formation and dissolution
values = '(' + ','.join(["'CoAc" + str(e) + "'" for e in coacs['pk_collective_actor'].unique()]) + ')'

Shape:  (22009, 10) - extract:


Unnamed: 0,pk_collective_actor,notes_begin,certainty_begin,notes_end,certainty_end,notes,fk_abob_type_collective_actor,begin_year,end_year,concat_standard_name
0,14725,2.0,1.0,,,,1051.0,1969.0,,Parti Socialiste (PS)
1,9712,2.0,1.0,2.0,1.0,,1051.0,1945.0,1998.0,Conseil national du patronat français (CNPF)
2,13649,,1.0,,1.0,,,,,Parlement des États de Bourgogne
3,14723,,,,,,1051.0,1477.0,1789.0,Bailliage de Chalon
4,14726,2.0,1.0,,,,1051.0,1971.0,,Université Paris 1 (Panthéon-Sorbonne)


Shape:  (24429, 11) - extract:


Unnamed: 0,pk_collective_actor_name,is_standard_name,name,lang_iso,comment_begin_year,comment_end_year,notes,fk_collective_actor,fk_abob_coac_name_type,begin_date,end_date
0,5,True,Collège des Jésuites de Genova,,,,,10,,,
1,6,True,Collège romain,fra,,,,11,,,
2,8,True,Compagnie de Jésus,fra,,,,13,,,
3,9,True,Congrégation de l'Index,fra,,,,14,,,
4,10,True,Congrégation de l'Inquisition,,,,,15,,,


Shape:  (16687, 6) - extract:


Unnamed: 0,pk_collective_actor_text_property,property_type,lang_iso_code,text,notes,fk_collective_actor
0,8718,notice,ita,Comunità valdesi impiantatesi in Calabria dal ...,,14779
1,8679,notice,fra,Chambre syndicale patronale,,14283
2,5857,notice,fra,Chambre syndicale patronale,,9139
3,8678,notice,fra,Chambre syndicale patronale,,14709
4,8680,notice,fra,Syndicat patronal,,14711


In [4]:
formations_info = db.query(f"""
    select
        ir.fk_associated_object as pk_coac, 
        i.pk_information, 
                           i.info_label,
        id.year, id.month, id.day,
        id.fk_abob_type_information_date,
        id.complement as complement, 
        id.notes as notes,
        id.certainty_date
    from bhp.information_role ir
    inner join bhp.information i on i.pk_information = ir.fk_information and i.fk_type_information = 30
    inner join bhp.information_date id on id.fk_information = i.pk_information
    where ir.fk_associated_object in {values}
""")

formations_info['pk_coac'] = formations_info['pk_coac'].str.replace('CoAc', '', regex=False)
formations_info['pk_coac'] = formations_info['pk_coac'].astype(pd.Int64Dtype())
formations_info['year'] = formations_info['year'].astype(pd.Int64Dtype())
formations_info['month'] = formations_info['month'].astype(pd.Int64Dtype())
formations_info['day'] = formations_info['day'].astype(pd.Int64Dtype())
formations_info['fk_abob_type_information_date'] = formations_info['fk_abob_type_information_date'].astype(pd.Int64Dtype())
formations_info['date_bhp'] = [(row.year, row.month, row.day) for i, row in formations_info.iterrows()]
formations_info['uri'] = ['http://symogih.org/resource/Info' + str(fk_info) for fk_info in formations_info['pk_information']]
formations_info.drop(columns=['year', 'month', 'day', 'pk_information'], inplace=True)
formations_info['complement'] = [pd.NA if pd.isna(row['complement']) or row['complement'].strip() == '' else row['complement'] for _,row in formations_info.iterrows()]
formations_info['notes'] = [pd.NA if pd.isna(row['notes']) or row['notes'].strip() == '' else row['notes'] for _,row in formations_info.iterrows()]
formations_info['notes'] = [s.replace('<p>', '').replace('</p>', '') if pd.notna(s) else pd.NA for s in formations_info['notes']]
formations_info['notes'] = [s.replace('<em>', '').replace('</em>', '') if pd.notna(s) else pd.NA for s in formations_info['notes']]
formations_info['complement'] = [e.replace('<p>', '').replace('</p>', '') if pd.notna(e) else pd.NA for e in formations_info['complement']]

# a.infos(formations)

In [5]:
dissolutions_info = db.query(f"""
    select
        ir.fk_associated_object as pk_coac, 
        i.pk_information, 
        id.year, id.month, id.day,
        id.fk_abob_type_information_date,
        id.complement as complement, 
        id.notes as notes,
        id.certainty_date
    from bhp.information_role ir
    inner join bhp.information i on i.pk_information = ir.fk_information and i.fk_type_information = 33
    inner join bhp.information_date id on id.fk_information = i.pk_information
    where ir.fk_associated_object in {values}
""")

dissolutions_info['pk_coac'] = dissolutions_info['pk_coac'].str.replace('CoAc', '', regex=False)
dissolutions_info['pk_coac'] = dissolutions_info['pk_coac'].astype(pd.Int64Dtype())
dissolutions_info['year'] = dissolutions_info['year'].astype(pd.Int64Dtype())
dissolutions_info['month'] = dissolutions_info['month'].astype(pd.Int64Dtype())
dissolutions_info['day'] = dissolutions_info['day'].astype(pd.Int64Dtype())
dissolutions_info['fk_abob_type_information_date'] = dissolutions_info['fk_abob_type_information_date'].astype(pd.Int64Dtype())
dissolutions_info['date_bhp'] = [(row.year, row.month, row.day) for i, row in dissolutions_info.iterrows()]
dissolutions_info['uri'] = ['http://symogih.org/resource/Info' + str(fk_info) for fk_info in dissolutions_info['pk_information']]
dissolutions_info.drop(columns=['year', 'month', 'day', 'pk_information'], inplace=True)
dissolutions_info['complement'] = [pd.NA if pd.isna(row['complement']) or row['complement'].strip() == '' else row['complement'] for _,row in dissolutions_info.iterrows()]
dissolutions_info['notes'] = [pd.NA if pd.isna(row['notes']) or row['notes'].strip() == '' else row['notes'] for _,row in dissolutions_info.iterrows()]
dissolutions_info['complement'] = [e.replace('<p>', '').replace('</p>', '') if pd.notna(e) else pd.NA for e in dissolutions_info['complement']]

# a.infos(dissolutions)

In [6]:
db.disconnect()

[DB] Database correctly disconnected.


### GV infos

In [7]:
db.connect_geovistory('prod', skip_protection=True)

groups = db.query(f"""
    select
        r.pk_entity as pk_gv,
        a3.string as name
    from information.resource r
    inner join information.statement s1 on s1.fk_object_info = r.pk_entity and s1.fk_property = {pks.properties.apial_isAppelationForLanguageOf_entity}
    inner join information.statement s2 on s2.fk_subject_info = s1.fk_subject_info and s2.fk_property = {pks.properties.aial_refersToName_appellation}
    inner join information.appellation a3 on a3.pk_entity = s2.fk_object_info
    where r.fk_class = {pks.classes.group}                  
""")
db.disconnect()

a.infos(groups)

[DB] Requests will not be executed
[DB] Connecting to PRODUCTION Database ... Connected!
[DB] Database correctly disconnected.
Shape:  (7012, 2) - extract:


Unnamed: 0,pk_gv,name
0,648250,Großherzogtum Baden
1,648267,Königreich Württemberg
2,648267,Würtemberg
3,648284,Kanton Aargau
4,837196,ordo senatorius


## Record linkage

In [None]:
# collective_actors = coacs[['pk_collective_actor', 'name']].rename(columns={'pk_collective_actor':'pk_bhp'})

# # Prepare strings
# collective_actors['name_compare'] = [unidecode(s.lower()) for s in collective_actors.name]
# groups['name_compare'] = [unidecode(s.lower()) for s in groups.name]

# threshold = 0.5

# similars = []
# eta.begin(len(collective_actors), 'Finding similars')
# for i, row_bhp in collective_actors.iterrows():
#     for j, row_gv in groups.iterrows():
#         score = u.trigram_similarity(row_bhp['name_compare'], row_gv['name_compare'])
#         if score >= threshold: 
#             # eta.print(f'Found, score {score}: (' + str(row_bhp['pk_bhp']) + ') <' + str(row_bhp['name']) + '> - <' + str(row_gv['name']) + '> (' + str(row_gv['pk_gv']) + ')')
#             similars.append({
#                 'score': score,
#                 'pk_bhp': row_bhp['pk_bhp'],
#                 'name_bhp': row_bhp['name'],
#                 'name_gv': row_gv['name'],
#                 'pk_gv': row_gv['pk_gv']
#             })
#     eta.iter()
# eta.end()

# similars = pd.DataFrame(data=similars)
# similars.sort_values('score', ascending=False, inplace=True)
# similars.drop_duplicates(['pk_bhp', 'pk_gv'], inplace=True)
# u.save_df(similars, '../../data/record-linkage-collective-actors.csv')

# a.infos(similars)

# 2h22m15s

## Existing Groups

In [8]:
record_linkage = u.read_df('../../data/record-linkage-collective-actors-filled.csv', sep=',', skip_info=True)
record_linkage = record_linkage[record_linkage['Doublon'] == "oui"]
# record_linkage = record_linkage[['pk_bhp', 'pk_gv']]

a.infos(record_linkage)
not_to_create = record_linkage['pk_bhp'].tolist()

Shape:  (127, 6) - extract:


Unnamed: 0,score,pk_bhp,Doublon,name_bhp,name_gv,pk_gv
0,1.0,13762,oui,Carmel de Saint-Joseph et de Sainte-Thérèse (N...,Carmel de Saint-Joseph et de Sainte-Thérèse (N...,6141350
1,1.0,13761,oui,Carmel de Notre-Dame des neiges (Nancy I),Carmel de Notre-Dame des Neiges (Nancy I),6141170
2,1.0,13760,oui,Carmel de Morlaix I,Carmel de Morlaix I,6141577
3,1.0,25,oui,Ordo fratrum praedicatorum,Ordo Fratrum Prædicatorum,1859975
4,1.0,13756,oui,Carmel de Saint-Joseph et de Sainte-Thérèse (C...,Carmel de Saint-Joseph et de Sainte-Thérèse (C...,6140898


In [9]:
# Check to verify that a single bhp entity correspond at most at one gv entity
record_linkage[record_linkage.duplicated('pk_bhp', keep=False)].sort_values('pk_bhp')

Unnamed: 0,score,pk_bhp,Doublon,name_bhp,name_gv,pk_gv


## New groups

In [10]:
db.connect_geovistory(env, pk_project, execute)

[DB] Connecting to STAGING Database ... Connected!


### Create Groups

In [11]:
# Prepare
selection = coacs[['pk_collective_actor']].drop_duplicates()

# Remove those already existing (record linkage)
selection = selection[[pk_bhp not in not_to_create for pk_bhp in selection['pk_collective_actor']]]

# Create data
selection['pk_gv'] = db.resources.create(pks.classes.group, len(selection))

# Merge into dataframe - new ones
coacs = coacs.merge(selection, on='pk_collective_actor', how='left')
# Merge into dataframe - from record linkage
coacs = coacs.merge(record_linkage, left_on='pk_collective_actor', right_on='pk_bhp', how='left').drop(columns=['pk_bhp'])

coacs['pk_gv'] = [row['pk_gv_x'] if pd.notna(row['pk_gv_x']) else row['pk_gv_y'] for _,row in coacs.iterrows()]
coacs['pk_gv'] = coacs['pk_gv'].astype(pd.Int64Dtype())
coacs.drop(columns=['pk_gv_x', 'pk_gv_y'], inplace=True)

# 11s

Creating 21882 resources of class [68] ... Done in [00h00'03]
Creating info_proj_rel of 21882 entities with project <11850066> ... Done in [00h00'08]


### Create URIs

In [12]:
# Prepare
coacs['uri'] = 'http://symogih.org/resource/CoAc' + coacs['pk_collective_actor'].astype(str)
selection = coacs[['pk_gv', 'uri']].drop_duplicates()

# Create data
db.shortcuts.add_uris(selection['pk_gv'], selection['uri'])

# 1m20s

Creating 22009 resources of class [967] ... Done in [00h00'03]
Creating info_proj_rel of 22009 entities with project <11850066> ... Done in [00h00'08]
Creating 22009 appellations ... Done in [00h00'19]
Creating 22009 statements ... Updating metadata ... Done in [00h00'18]
Creating info_proj_rel of 22009 entities with project <11850066> ... Done in [00h00'07]
Creating 22009 statements ... Updating metadata ... Done in [00h00'16]
Creating info_proj_rel of 22009 entities with project <11850066> ... Done in [00h00'07]


### Create names

In [13]:
def get_appe_type(fk_abob_type):
    if pd.isna(fk_abob_type): return pd.NA
    if fk_abob_type == 1253: return 1645890
    if fk_abob_type == 1051: return 1645890 # cf discussion avec VA sur Discord
    if fk_abob_type == 1270: return 1661195
    if fk_abob_type == 1063: return 8067077
    return pd.NA

In [14]:
# Prepare
selection = coacs[['pk_gv', 'name', 'lang_name', 'comment_begin_year_name', 'comment_end_year_name', 'notes_name', 'fk_abob_coac_name_type', 'begin_date_name', 'end_date_name']].drop_duplicates(['pk_gv', 'name']).copy()
selection['pk_lang_name'] = [pks.languages.from_iso_code(lang) if pd.notna(lang) else pd.NA for lang in selection['lang_name']]
selection['pk_aial_type'] = [get_appe_type(t) for t in selection.fk_abob_coac_name_type]

# Create - AiaL
selection['pk_aial'] = db.resources.create(pks.classes.aial, len(selection))
db.statements.create(selection['pk_aial'], pks.properties.apial_isAppelationForLanguageOf_entity, selection['pk_gv'])

# Create - Appellation
selection['pk_appe_name'] = db.appellations.create(selection['name'])
db.statements.create(selection['pk_aial'], pks.properties.aial_refersToName_appellation, selection['pk_appe_name'])

# Create - Language
selection_lang = selection[pd.notna(selection['pk_lang_name'])] # Because here, we have names without a language
db.statements.create(selection_lang['pk_aial'], pks.properties.apial_usedInLanguage_language, selection_lang['pk_lang_name'])

# Create - Type
selection_type = selection[pd.notna(selection['pk_aial_type'])]
db.statements.create(selection_type['pk_aial'], pks.properties.aial_hasType_aialType, selection_type['pk_aial_type'])

# Create - Dates
def get_duration(date):
    if pd.notna(date[0]) and pd.isna(date[1]) and pd.isna(date[2]): return '1 year'
    if pd.notna(date[0]) and pd.notna(date[1]) and pd.isna(date[2]): return '1 month'
    if pd.notna(date[0]) and pd.notna(date[1]) and pd.notna(date[2]): return '1 day'
    return pd.NA

# Create - Dates - Begin
selection_date_begin = selection[pd.notna(selection['begin_date_name'])].copy()
selection_date_begin['duration'] = [get_duration(d) for d in selection_date_begin['begin_date_name']]
selection_date_begin['pk_tp'] = db.time_primitives.create(selection_date_begin['begin_date_name'], selection_date_begin['duration'])
db.statements.create(selection_date_begin['pk_aial'], pks.properties.timespan_endOfTheBegin_timePrim, selection_date_begin['pk_tp'])

# Create - Dates - End
selection_date_end = selection[pd.notna(selection['end_date_name'])].copy()
selection_date_end['duration'] = [get_duration(d) for d in selection_date_end['end_date_name']]
selection_date_end['pk_tp'] = db.time_primitives.create(selection_date_end['end_date_name'], selection_date_end['duration'])
db.statements.create(selection_date_end['pk_aial'], pks.properties.timespan_endOfTheEnd_timePrim, selection_date_end['pk_tp'])

# 2m

Creating 23699 resources of class [365] ... Done in [00h00'03]
Creating info_proj_rel of 23699 entities with project <11850066> ... Done in [00h00'08]
Creating 23699 statements ... Updating metadata ... Done in [00h00'18]
Creating info_proj_rel of 23699 entities with project <11850066> ... Done in [00h00'09]
Creating 23699 appellations ... Done in [00h00'21]
Creating 23699 statements ... Updating metadata ... Done in [00h00'19]
Creating info_proj_rel of 23699 entities with project <11850066> ... Done in [00h00'08]
Creating 10598 statements ... Updating metadata ... Done in [00h00'08]
Creating info_proj_rel of 10598 entities with project <11850066> ... Done in [00h00'05]
Creating 8985 statements ... Updating metadata ... Done in [00h00'08]
Creating info_proj_rel of 8985 entities with project <11850066> ... Done in [00h00'04]
Creating 1312 time primitives ... Done in [00h00'00]
Creating 1312 statements ... Updating metadata ... Done in [00h00'01]
Creating info_proj_rel of 1312 entities w

### Create definitions

In [15]:
# Prepare
selection = coacs[['pk_gv', 'notes_coac', 'text', 'property_type', 'lang_text_prop']].copy().drop_duplicates()
selection = selection[pd.notna(selection['notes_coac']) | pd.notna(selection['text'])]
selection = selection[selection['notes_coac'].astype(str) != selection['text'].astype(str)]
selection['note'] = ['[Note] ' + s if pd.notna(s) else pd.NA for s in selection['notes_coac']]
selection['lang_note'] = 'fra'
selection['text_prop'] = ['[Complément] ' + str(row['text']) if pd.notna(row['property_type']) and row['property_type'] == 'complément' else row['text'] for _,row in selection.iterrows()]
selection['lang_text_prop'] = [pd.NA if pd.notna(d) and d == 'None' else d for d in selection['lang_text_prop']]

definitions = pd.concat([
    selection[['pk_gv', 'note', 'lang_note']].dropna(subset='note').rename(columns={'note':'definition', 'lang_note':'lang'}), 
    selection[['pk_gv', 'text_prop', 'lang_text_prop']].dropna(subset='text_prop').rename(columns={'text_prop':'definition', 'lang_text_prop':'lang'})
])
definitions['lang'].fillna('fra', inplace=True)
definitions['pk_lang'] = [pks.languages.from_iso_code(c) for c in definitions['lang']]

# Create data
db.shortcuts.add_definitions(definitions['pk_gv'].tolist(), definitions['definition'].tolist(), definitions['pk_lang'].tolist())

# 17m30

Creating 17516 resources of class [899] ... Done in [00h00'02]
Creating info_proj_rel of 17516 entities with project <11850066> ... Done in [00h00'06]
Creating 17516 appellations ... Done in [00h16'16]
Creating 17516 statements ... Updating metadata ... Done in [00h00'13]
Creating info_proj_rel of 17516 entities with project <11850066> ... Done in [00h00'08]
Creating 17516 statements ... Updating metadata ... Done in [00h00'15]
Creating info_proj_rel of 17516 entities with project <11850066> ... Done in [00h00'05]
Creating 17516 statements ... Updating metadata ... Done in [00h00'15]
Creating info_proj_rel of 17516 entities with project <11850066> ... Done in [00h00'05]


### Create formation

In [24]:
formation = coacs[['pk_gv', 'pk_collective_actor', 'begin_year_coac', 'certainty_begin', 'notes_begin']]
formation = formation.merge(formations_info, left_on='pk_collective_actor', right_on='pk_coac', how='left').drop(columns=['pk_coac'])

# Date
formation['date'] = [row['date_bhp'] if pd.notna(row['date_bhp']) else ((row['begin_year_coac'], pd.NA, pd.NA) if pd.notna(row['begin_year_coac']) else pd.NA) for _,row in formation.iterrows()]
formation.drop(columns=['date_bhp', 'begin_year_coac'], inplace=True)

# Property
def get_property(note_begin, fk_type):
    if pd.notna(fk_type):
        if fk_type == 246: return pks.properties.timeSpan_atSomeTimeWithin_timePrimitive # P82
        if fk_type == 1125: return pks.properties.timespan_beginOfTheBegin_timePrim # P82a
        if fk_type == 1126: return pks.properties.timespan_endOfTheEnd_timePrim # P82b
        if fk_type == 258: return pks.properties.timeSpan_ongoingThroughout_timePrimitive #P81
        if fk_type == 1289: return pks.properties.timespan_endOfTheBegin_timePrim # P81a
        if fk_type == 1290: return pks.properties.timespan_beginOfTheEnd_timePrim # P81b
        if fk_type == 1321: return pks.properties.timespan_beginOfTheBegin_timePrim
        if fk_type == 1322: return pks.properties.timespan_beginOfTheBegin_timePrim
        if fk_type == 1323: return pks.properties.timespan_endOfTheBegin_timePrim
        if fk_type == 256: return pks.properties.timespan_endOfTheEnd_timePrim
        if fk_type == 1127: return pks.properties.timespan_beginOfTheEnd_timePrim
        if fk_type == 1128: return pks.properties.timespan_endOfTheEnd_timePrim
        return pd.NA
    elif pd.notna(note_begin):
        if note_begin == 1: return pks.properties.timespan_endOfTheBegin_timePrim
        if note_begin == 2: return pks.properties.timeSpan_atSomeTimeWithin_timePrimitive
        if note_begin == 3: return pks.properties.timeSpan_ongoingThroughout_timePrimitive
        if note_begin == 4: return pks.properties.timespan_beginOfTheEnd_timePrim
        return pks.properties.timeSpan_atSomeTimeWithin_timePrimitive
    else: return pks.properties.timeSpan_atSomeTimeWithin_timePrimitive
formation['pk_property'] = [get_property(row['notes_begin'], row['fk_abob_type_information_date']) for _,row in formation.iterrows()]
formation.drop(columns=['fk_abob_type_information_date', 'notes_begin'], inplace=True)

# Certainty
formation['certainty'] = [row['certainty_date'] if pd.notna(row['certainty_date']) else row['certainty_begin'] for _,row in formation.iterrows()]
formation.drop(columns=['certainty_begin', 'certainty_date'], inplace=True)

# Notes 
formation['notes'] = ['[Note] ' + str(e) if pd.notna(e) else pd.NA for e in formation['notes']]

# 3s

In [25]:
# Prepare
selection = formation[pd.notna(formation['date'])].copy()

# Create Formation
selection['pk_formation'] = db.resources.create(pks.classes.formation, len(selection))
db.statements.create(selection['pk_formation'], pks.properties.formation_hasFormed_group, selection['pk_collective_actor'])

# Link to date
selection['duration'] = [get_duration(d) for d in selection['date']]
selection['pk_time_prim'] = db.time_primitives.create(selection['date'], selection['duration'])

# Certainty comment
selection_certainty_comment = selection[(selection['certainty'] == 2) | (selection['certainty'] == 3)].copy()
selection_certainty_comment['comment'] = ['Date reconstituée' if c == 2 else 'Date postulée' for c in selection_certainty_comment['certainty']]
selection_certainty_comment['pk_certainty_comment'] = db.resources.create(pks.classes.comment, len(selection_certainty_comment))
selection_certainty_comment['pk_appe'] = db.appellations.create(selection_certainty_comment['comment'])
db.statements.create(selection_certainty_comment['pk_certainty_comment'], pks.properties.text_hasValueVersion_string, selection_certainty_comment['pk_appe'])
db.statements.create(selection_certainty_comment['pk_certainty_comment'], pks.properties.comment_hasCommentType_CommentType, 7953586)

# URI
selection_uri = selection[pd.notna(selection['uri'])]
db.shortcuts.add_uris(selection_uri['pk_formation'], selection_uri['uri'])

# Compléments sur la date
selection_cplmt_date = selection[pd.notna(selection['complement'])].copy()
selection_cplmt_date['pk_comment'] = db.resources.create(pks.classes.comment, len(selection_cplmt_date))
db.statements.create(selection_cplmt_date['pk_comment'], pks.properties.comment_hasCommentType_CommentType, 8065621) # Complément sur la date
selection_cplmt_date['pk_appe'] = db.appellations.create(selection_cplmt_date['complement'])
db.statements.create(selection_cplmt_date['pk_comment'], pks.properties.text_hasValueVersion_string, selection_cplmt_date['pk_appe'])

# Notes sur la date
selection_notes_date = selection[pd.notna(selection['notes'])].copy()
selection_notes_date['pk_comment'] = db.resources.create(pks.classes.comment, len(selection_notes_date))
db.statements.create(selection_notes_date['pk_comment'], pks.properties.comment_hasCommentType_CommentType, 8065632) # Notes sur la date
selection_notes_date['pk_appe'] = db.appellations.create(selection_notes_date['notes'])
db.statements.create(selection_notes_date['pk_comment'], pks.properties.text_hasValueVersion_string, selection_notes_date['pk_appe'])

# 1m22s

Creating 13717 resources of class [60] ... Done in [00h00'02]
Creating info_proj_rel of 13717 entities with project <11850066> ... Done in [00h00'05]
Creating 13717 statements ... Updating metadata ... Done in [00h00'11]
Creating info_proj_rel of 13717 entities with project <11850066> ... Done in [00h00'06]
Creating 13717 time primitives ... Done in [00h00'01]
Creating 4518 resources of class [900] ... Done in [00h00'01]
Creating info_proj_rel of 4518 entities with project <11850066> ... Done in [00h00'02]
Creating 4518 appellations ... Done in [00h00'01]
Creating 4518 statements ... Updating metadata ... Done in [00h00'04]
Creating info_proj_rel of 4518 entities with project <11850066> ... Done in [00h00'02]
Creating 4518 statements ... Updating metadata ... Done in [00h00'04]
Creating info_proj_rel of 4518 entities with project <11850066> ... Done in [00h00'02]
Creating 9307 resources of class [967] ... Done in [00h00'01]
Creating info_proj_rel of 9307 entities with project <11850066

### Create dissolutions

In [26]:
dissolution = coacs[['pk_gv', 'pk_collective_actor', 'end_year_coac', 'certainty_end', 'notes_end']]
dissolution = dissolution.merge(dissolutions_info, left_on='pk_collective_actor', right_on='pk_coac', how='left').drop(columns=['pk_coac'])

# Date
dissolution['date'] = [row['date_bhp'] if pd.notna(row['date_bhp']) else ((row['end_year_coac'], pd.NA, pd.NA) if pd.notna(row['end_year_coac']) else pd.NA) for _,row in dissolution.iterrows()]
dissolution.drop(columns=['date_bhp', 'end_year_coac'], inplace=True)

# Property
def get_property(note_end, fk_type):
    if pd.notna(fk_type):
        if fk_type == 246: return pks.properties.timeSpan_atSomeTimeWithin_timePrimitive # P82
        if fk_type == 1125: return pks.properties.timespan_beginOfTheBegin_timePrim # P82a
        if fk_type == 1126: return pks.properties.timespan_endOfTheEnd_timePrim # P82b
        if fk_type == 258: return pks.properties.timeSpan_ongoingThroughout_timePrimitive #P81
        if fk_type == 1289: return pks.properties.timespan_endOfTheBegin_timePrim # P81a
        if fk_type == 1290: return pks.properties.timespan_beginOfTheEnd_timePrim # P81b
        if fk_type == 1321: return pks.properties.timespan_beginOfTheBegin_timePrim
        if fk_type == 1322: return pks.properties.timespan_beginOfTheBegin_timePrim
        if fk_type == 1323: return pks.properties.timespan_endOfTheBegin_timePrim
        if fk_type == 256: return pks.properties.timespan_endOfTheEnd_timePrim
        if fk_type == 1127: return pks.properties.timespan_beginOfTheEnd_timePrim
        if fk_type == 1128: return pks.properties.timespan_endOfTheEnd_timePrim
        return pd.NA
    elif pd.notna(note_end):
        if note_end == 1: return pks.properties.timespan_endOfTheBegin_timePrim
        if note_end == 2: return pks.properties.timeSpan_atSomeTimeWithin_timePrimitive
        if note_end == 3: return pks.properties.timeSpan_ongoingThroughout_timePrimitive
        if note_end == 4: return pks.properties.timespan_beginOfTheEnd_timePrim
        return pks.properties.timeSpan_atSomeTimeWithin_timePrimitive
    else: return pks.properties.timeSpan_atSomeTimeWithin_timePrimitive
dissolution['pk_property'] = [get_property(row['notes_end'], row['fk_abob_type_information_date']) for _,row in dissolution.iterrows()]
dissolution.drop(columns=['fk_abob_type_information_date', 'notes_end'], inplace=True)

# Certainty
dissolution['certainty'] = [row['certainty_date'] if pd.notna(row['certainty_date']) else row['certainty_end'] for _,row in dissolution.iterrows()]
dissolution.drop(columns=['certainty_end', 'certainty_date'], inplace=True)

# Notes 
dissolution['notes'] = ['[Note] ' + str(e) if pd.notna(e) else pd.NA for e in dissolution['notes']]

# 3s

In [29]:

selection_notes_date = selection[pd.notna(selection['notes'])].copy()
selection_notes_date

Unnamed: 0,pk_gv,pk_collective_actor,complement,notes,uri,date,pk_property,certainty,pk_dissolution,duration,pk_time_prim
18972,11460498,15408,,[Note] <p>Non documenté après 1935</p>,http://symogih.org/resource/Info112695,"(1935, <NA>, <NA>)",72,3.0,11846228,1 year,742444
18973,11460498,15408,,[Note] <p>Non documenté après 1935</p>,http://symogih.org/resource/Info112695,"(1935, <NA>, <NA>)",72,3.0,11846229,1 year,742444


In [27]:
# Prepare
selection = dissolution[pd.notna(dissolution['date'])].copy()

# Create dissolution
selection['pk_dissolution'] = db.resources.create(pks.classes.dissolution, len(selection))
db.statements.create(selection['pk_dissolution'], pks.properties.dissolution_dissolved_group, selection['pk_collective_actor'])

# Link to date
selection['duration'] = [get_duration(d) for d in selection['date']]
selection['pk_time_prim'] = db.time_primitives.create(selection['date'], selection['duration'])

# Certainty comment
selection_certainty_comment = selection[(selection['certainty'] == 2) | (selection['certainty'] == 3)].copy()
selection_certainty_comment['comment'] = ['Date reconstituée' if c == 2 else 'Date postulée' for c in selection_certainty_comment['certainty']]
selection_certainty_comment['pk_certainty_comment'] = db.resources.create(pks.classes.comment, len(selection_certainty_comment))
selection_certainty_comment['pk_appe'] = db.appellations.create(selection_certainty_comment['comment'])
db.statements.create(selection_certainty_comment['pk_certainty_comment'], pks.properties.text_hasValueVersion_string, selection_certainty_comment['pk_appe'])
db.statements.create(selection_certainty_comment['pk_certainty_comment'], pks.properties.comment_hasCommentType_CommentType, 7953586)

# URI
selection_uri = selection[pd.notna(selection['uri'])]
db.shortcuts.add_uris(selection_uri['pk_dissolution'], selection_uri['uri'])

# Compléments sur la date
selection_cplmt_date = selection[pd.notna(selection['complement'])].copy()
selection_cplmt_date['pk_comment'] = db.resources.create(pks.classes.comment, len(selection_cplmt_date))
db.statements.create(selection_cplmt_date['pk_comment'], pks.properties.comment_hasCommentType_CommentType, 8065621) # Complément sur la date
selection_cplmt_date['pk_appe'] = db.appellations.create(selection_cplmt_date['complement'])
db.statements.create(selection_cplmt_date['pk_comment'], pks.properties.text_hasValueVersion_string, selection_cplmt_date['pk_appe'])

# Notes sur la date
selection_notes_date = selection[pd.notna(selection['notes'])].copy()
selection_notes_date['pk_comment'] = db.resources.create(pks.classes.comment, len(selection_notes_date))
db.statements.create(selection_notes_date['pk_comment'], pks.properties.comment_hasCommentType_CommentType, 8065632) # Notes sur la date
selection_notes_date['pk_appe'] = db.appellations.create(selection_notes_date['notes'])
db.statements.create(selection_notes_date['pk_comment'], pks.properties.text_hasValueVersion_string, selection_notes_date['pk_appe'])

Creating 7592 resources of class [62] ... Done in [00h00'01]
Creating info_proj_rel of 7592 entities with project <11850066> ... Done in [00h00'03]
Creating 7592 statements ... Updating metadata ... Done in [00h00'06]
Creating info_proj_rel of 7592 entities with project <11850066> ... Done in [00h00'03]
Creating 7592 time primitives ... Done in [00h00'00]
Creating 4235 resources of class [900] ... Done in [00h00'01]
Creating info_proj_rel of 4235 entities with project <11850066> ... Done in [00h00'01]
Creating 4235 appellations ... Done in [00h00'02]
Creating 4235 statements ... Updating metadata ... Done in [00h00'03]
Creating info_proj_rel of 4235 entities with project <11850066> ... Done in [00h00'01]
Creating 4235 statements ... Updating metadata ... Done in [00h00'04]
Creating info_proj_rel of 4235 entities with project <11850066> ... Done in [00h00'01]
Creating 64 resources of class [967] ... Done in [00h00'01]
Creating info_proj_rel of 64 entities with project <11850066> ... Don

ProgrammingError: (psycopg2.errors.SyntaxError) syntax error at or near "returning"
LINE 4:         returning pk_entity;
                ^

[SQL: 
        insert into information.resource
            (fk_class, community_visibility, metadata) values 
        returning pk_entity;
    ]
(Background on this error at: https://sqlalche.me/e/20/f405)

In [33]:
formation

Unnamed: 0,pk_gv,pk_collective_actor,info_label,complement,notes,uri,date,pk_property,certainty
0,11443005,14725,,,,,"(1969, <NA>, <NA>)",72,1
1,11443006,9712,,,,,"(1945, <NA>, <NA>)",72,1
2,11443007,13649,,,,,,72,1
3,11443007,13649,,,,,,72,1
4,11443007,13649,,,,,,72,1
...,...,...,...,...,...,...,...,...,...
25044,11464886,14385,,,,,"(1904, <NA>, <NA>)",72,1
25045,11464886,14385,,,,,"(1904, <NA>, <NA>)",72,1
25046,11464886,14385,,,,,"(1904, <NA>, <NA>)",72,1
25047,11464886,14385,,,,,"(1904, <NA>, <NA>)",72,1
