# Named Places

The goal of this notebook is to import the Symogih named places into Geovistory, following the rules described in [this GitHub issue](https://github.com/geovistory/symogih/issues/8).

In [154]:
# %load /home/gaetan/Desktop/geovpylib/templates/heading.py
%load_ext autoreload
%autoreload 2

# Common imports
import os
import pandas as pd, numpy as np
import datetime
#import time
#import json
#import requests
#import duckdb
#import plotly.express as px
# from multiprocessing import Pool

# Geovpylib library
import geovpylib.analysis as a
import geovpylib.database as db
import geovpylib.decorators as d
import geovpylib.importer as i
import geovpylib.magics
import geovpylib.pks as pks
import geovpylib.queries as q
import geovpylib.record_linkage as rl
import geovpylib.sparql as sparql
import geovpylib.utils as u
eta = u.Eta()

# Specific imports
# ...

# Global variables
# ...

# Connect to Geovistory database read mode
# db.connect_geovistory('prod')

# Settings for the Geovistory write connection.
# They are only consumed in the "Import" section, where
# db.connect_geovistory(env, pk_project, execute) is actually called.
# NOTE(review): env is 'prod' and execute is True, so the import cells presumably
# write to the production database - confirm before running them.
env = 'prod'
pk_project = pks.projects.symogih
execute = True
metadata_str = 'import-named-places'  # suffix of the 'import-id' metadata value
import_manner = 'one-shot'
# db.connect_geovistory(env, pk_project, execute)
# db.set_metadata({'import-id': datetime.datetime.today().strftime('%Y%m%d') + '-' + metadata_str})
# db.set_insert_manner(import_manner)

# Connect to the external (BHP) database; it is queried later through db.query().
db_url_env_var_name = 'YELLOW_BHP' # Name of an environment variable holding the Postgres database URL
# os.getenv returns None when the variable is not set.
# NOTE(review): confirm how connect_external reacts to a None URL.
db.connect_external(os.getenv(db_url_env_var_name), execute=False)

# Connect to a SPARQL endpoint
# sparql.connect_external('url')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
[DB] Requests will not be executed
[DB] Connecting to PGSQL Database ... 

Connected!


# Fetch Symogih data

In [155]:
# Load the BHP named places and keep only the primary key and the place type.
# The type is cast to a nullable integer so that missing values stay missing.
named_places = (
    u.read_df('../../data/bhp/named_place.csv')
    [['pk_named_place', 'fk_abob_type_napl']]
    .rename(columns={'pk_named_place': 'pk_bhp'})
    .astype({'fk_abob_type_napl': pd.Int64Dtype()})
)

# a.infos(named_places)

In [156]:
# Load the names of the named places; columns are renamed to the vocabulary used
# in the rest of the notebook (pk_bhp, name_favorite, name_date*, name_comment_*).
names = (
    u.read_df('../../data/bhp/named_place_name.csv')
    [[
        'fk_named_place', 'name', 'lang_iso', 'fk_abob_napl_name_type', 'is_standard_name',
        'begin_date', 'end_date', 'comment_begin_year', 'comment_end_year',
    ]]
    .rename(columns={
        'fk_named_place': 'pk_bhp',
        'is_standard_name': 'name_favorite',
        'begin_date': 'name_datebegin',
        'end_date': 'name_dateend',
        'comment_begin_year': 'name_comment_begin',
        'comment_end_year': 'name_comment_end',
    })
)

# a.infos(names)

In [157]:
# Load the text properties of the named places; the text itself becomes the
# 'definition' column, the language and property type are handled further down.
definitions = (
    u.read_df('../../data/bhp/named_place_text_property.csv')
    [['fk_named_place', 'text', 'lang_iso_code', 'property_type']]
    .rename(columns={'fk_named_place': 'pk_bhp', 'text': 'definition'})
)

a.infos(definitions)


Shape:  (26549, 4) - extract:


Unnamed: 0,pk_bhp,definition,lang_iso_code,property_type
0,121395,Saarlouis,fra,notice
1,59,"Salvador de Bahia, capitale de l'état du même ...",fra,notice
2,3697,/ Capitale de l'Italie,fra,notice
3,10621,/ Capitale de la Suède,fra,notice
4,10747,/ Genève : doublon de lieu supprimé. Priére de...,fra,notice


## Manual correction

In [158]:
# Place 10747 is excluded from all three tables: its notice (see the extract
# above) marks it as a removed duplicate of Genève.
named_places = named_places.loc[named_places['pk_bhp'].ne(10747)]
names = names.loc[names['pk_bhp'].ne(10747)]
definitions = definitions.loc[definitions['pk_bhp'].ne(10747)]

# Prepare data

## Kinds

In [159]:
# Columns handling

def get_kind(fk_abob_type):
    """Translate a BHP named-place type code into the kind value to import.

    Parameters
    ----------
    fk_abob_type:
        Value of the `fk_abob_type_napl` column. That column is a nullable integer,
        so the value can be `pd.NA`.

    Returns
    -------
    The kind identifier associated with the type code.

    Raises
    ------
    Exception
        If the code is missing or has no planned mapping.
    """
    # The labels in the comments are the ones paired with the same kind values in the
    # record-linkage cell of this notebook.
    kinds = {
        156: 7507554,  # 'Lieu habité'
        245: 7943874,  # 'Région géographique'
        695: 7943844,
        696: 7943924,  # 'Élément géographique naturel'
        697: 7943899,  # 'Territoire'
        698: 7943949,
    }
    # Missing values are tested first: `pd.NA == 156` evaluates to pd.NA, whose truth
    # value is ambiguous, so a plain comparison would fail with an unrelated TypeError
    # instead of the explicit message below.
    if pd.isna(fk_abob_type) or fk_abob_type not in kinds:
        raise Exception(f"Not planned fk_abob_type: {fk_abob_type}")
    return kinds[fk_abob_type]

# Replace the BHP type code with the kind value computed by get_kind().
named_places = (
    named_places
    .assign(kind=[get_kind(type_code) for type_code in named_places['fk_abob_type_napl']])
    .drop(columns=['fk_abob_type_napl'])
)

# a.infos(named_places)

## Names

In [160]:
# Columns handling

# Rows without an actual name cannot be imported
names = names.dropna(subset=['name'])

# Languages: a missing ISO code defaults to 'fra', then each code is turned into a
# language key and the ISO column is discarded
iso_codes = names['lang_iso'].fillna('fra')
names = (
    names
    .assign(name_lang=[pks.languages.from_iso_code(iso_code) for iso_code in iso_codes])
    .drop(columns=['lang_iso'])
)

# Name type: a missing type defaults to code 1253
names = names.assign(fk_abob_napl_name_type=names['fk_abob_napl_name_type'].fillna(1253))
def get_name_type(fk_abob):
    """Return the name-type value associated with a BHP name-type code.

    Codes 1253 and 697 share the same target value. Any other unplanned code
    raises an Exception naming the offending code.
    """
    for source_codes, name_type in (
        ((1063,), 8067077),
        ((1270,), 1661195),
        ((1253, 697), 1645890),
    ):
        if fk_abob in source_codes:
            return name_type
    raise Exception(f'Unknown name type {fk_abob}')
# Build every derived column in one pass, then keep only the columns used by the
# import (this also discards the raw fk_abob_napl_name_type code).
names = names.assign(
    # Name type
    name_type=[get_name_type(type_code) for type_code in names['fk_abob_napl_name_type']],
    # Comment types: one constant for "begin" comments, one for "end" comments
    name_comment_begin_type=8257279,
    name_comment_end_type=8257290,
    # Begin and end dates, parsed by the project helper
    name_datebegin=[u.parse_date(raw_date) for raw_date in names['name_datebegin']],
    name_dateend=[u.parse_date(raw_date) for raw_date in names['name_dateend']],
)[[
    'pk_bhp', 'name', 'name_lang', 'name_type', 'name_favorite',
    'name_datebegin', 'name_dateend',
    'name_comment_begin', 'name_comment_begin_type',
    'name_comment_end', 'name_comment_end_type',
]]

# a.infos(names)

In [161]:
# Attach the names to the places (one row per place/name pair).
# pk_bhp is the only column the two tables share, so it is the join key.
named_places = pd.merge(named_places, names, how='left', on='pk_bhp')

a.infos(named_places)

Shape:  (128446, 12) - extract:


Unnamed: 0,pk_bhp,kind,name,name_lang,name_type,name_favorite,name_datebegin,name_dateend,name_comment_begin,name_comment_begin_type,name_comment_end,name_comment_end_type
0,15922,7943899,Vèbre,19008,1645890,True,,,,8257279,,8257290
1,15923,7943899,Ventenac,19008,1645890,True,,,,8257279,,8257290
2,15924,7943899,Verdun,19008,1645890,True,,,,8257279,,8257290
3,15925,7943899,Vernajoul,19008,1645890,True,,,,8257279,,8257290
4,15926,7943899,Vernaux,19008,1645890,True,,,,8257279,,8257290


## Definitions

In [162]:
# Manual editing

def editing(definition):
    """Remove the leading '/ ' marker that some notices start with.

    Parameters
    ----------
    definition:
        The notice text. Non-string values (for instance the NaN pandas uses for an
        empty cell) are accepted.

    Returns
    -------
    The text without its leading '/ ', or the value unchanged when it does not start
    with that marker or is not a string.
    """
    # Guard on the type: calling .startswith on a float NaN raises AttributeError.
    if isinstance(definition, str) and definition.startswith('/ '):
        return definition[2:]
    return definition

# Clean every notice text with editing()
definitions['definition'] = definitions['definition'].map(editing)

In [163]:
# Columns handling

# Handle languages
# The filled column is assigned back explicitly. Calling
# `definitions['lang_iso_code'].fillna('fra', inplace=True)` acts on a temporary
# selection: it is deprecated in recent pandas versions and leaves `definitions`
# untouched under Copy-on-Write, so missing codes would reach from_iso_code as NaN.
definitions['lang_iso_code'] = definitions['lang_iso_code'].fillna('fra')
definitions['definition_lang'] = [pks.languages.from_iso_code(code) for code in definitions['lang_iso_code']]
definitions = definitions.drop(columns=['lang_iso_code'])

# Definition type
def handle_definition_type(text, type):
    """Format a definition text according to its BHP property type.

    Parameters
    ----------
    text:
        The definition text.
    type:
        The property type: 'notice' or 'complément'.

    Returns
    -------
    The text unchanged for a 'notice', the text prefixed with '[Complément] ' for a
    'complément'.

    Raises
    ------
    Exception
        If the property type is not one of the two planned values. Falling off the end
        of the function would return None, and a None definition is silently dropped
        by the import cell (it keeps only rows with a definition). Raising matches
        what get_kind and get_name_type do for unplanned values.
    """
    if type == 'notice': return text
    if type == 'complément': return '[Complément] ' + text
    raise Exception(f'Unknown definition type {type}')
# Format each text according to its property type, then discard the type column.
definitions = definitions.assign(
    definition=[
        handle_definition_type(text, property_type)
        for text, property_type in zip(definitions['definition'], definitions['property_type'])
    ]
).drop(columns=['property_type'])

# a.infos(definitions)

In [164]:
# Attach the definitions to the places; a place with several definitions gets one
# row per definition. pk_bhp is the only shared column, hence the join key.
named_places = pd.merge(named_places, definitions, how='left', on='pk_bhp')

a.infos(named_places)

Shape:  (132644, 14) - extract:


Unnamed: 0,pk_bhp,kind,name,name_lang,name_type,name_favorite,name_datebegin,name_dateend,name_comment_begin,name_comment_begin_type,name_comment_end,name_comment_end_type,definition,definition_lang
0,15922,7943899,Vèbre,19008,1645890,True,,,,8257279,,8257290,,
1,15923,7943899,Ventenac,19008,1645890,True,,,,8257279,,8257290,,
2,15924,7943899,Verdun,19008,1645890,True,,,,8257279,,8257290,,
3,15925,7943899,Vernajoul,19008,1645890,True,,,,8257279,,8257290,,
4,15926,7943899,Vernaux,19008,1645890,True,,,,8257279,,8257290,,


## URIs

In [165]:
# Symogih URIs: one URI per distinct place, built from its BHP primary key
uris = (
    named_places[['pk_bhp']]
    .drop_duplicates()
    .assign(uri=lambda places: places['pk_bhp'].map(lambda pk: 'http://symogih.org/resource/NaPl' + str(pk)))
)

In [166]:
# External URIs: fetched from the BHP database as a namespace plus an identifier
# for each documented named place

external_uris = db.query(f"""
    select
        np.pk_named_place as pk_bhp,
        ra.uri as namespace,
        d.identifier as id
    from bhp.named_place np
    inner join bhp.documentation d on d.fk_documented_object = np.concat_napl
    inner join bhp.resource_address_concatenation rac on rac.fk_digital_object = cast(substring(d.fk_documenting_entity, 5) as integer)
    inner join bhp.resource_address ra on ra.pk_resource_address = rac.fk_resource_address
""")

# The full URI is the namespace followed by the identifier; the two parts are no
# longer needed afterwards.
external_uris = external_uris.assign(
    uri=[
        namespace + identifier
        for namespace, identifier in zip(external_uris['namespace'], external_uris['id'])
    ]
).drop(columns=['namespace', 'id'])

# a.infos(external_uris, nb=20)

In [167]:
# Merge URIs
# Stack the Symogih URIs and the external URIs into one (pk_bhp, uri) table, so a
# place can carry several URIs. Duplicates are not removed here.

uris = pd.concat([uris, external_uris])
# a.infos(uris)

In [168]:
# Attach the URIs to the places (one row per place/URI pair).
# pk_bhp is the only shared column, hence the join key.
named_places = pd.merge(named_places, uris, how='left', on='pk_bhp')

a.infos(named_places)

Shape:  (132671, 15) - extract:


Unnamed: 0,pk_bhp,kind,name,name_lang,name_type,name_favorite,name_datebegin,name_dateend,name_comment_begin,name_comment_begin_type,name_comment_end,name_comment_end_type,definition,definition_lang,uri
0,15922,7943899,Vèbre,19008,1645890,True,,,,8257279,,8257290,,,http://symogih.org/resource/NaPl15922
1,15923,7943899,Ventenac,19008,1645890,True,,,,8257279,,8257290,,,http://symogih.org/resource/NaPl15923
2,15924,7943899,Verdun,19008,1645890,True,,,,8257279,,8257290,,,http://symogih.org/resource/NaPl15924
3,15925,7943899,Vernajoul,19008,1645890,True,,,,8257279,,8257290,,,http://symogih.org/resource/NaPl15925
4,15926,7943899,Vernaux,19008,1645890,True,,,,8257279,,8257290,,,http://symogih.org/resource/NaPl15926


## Presences

In [169]:
# Spatial location
# Fetch, for every named place, the spatial locations of type 741 linked to it
# through bhp.locate. The "min" coordinates are used as latitude/longitude.
# NOTE(review): presumably type 741 is a single point, for which min and max
# coordinates coincide - confirm against the BHP vocabulary.
# pk_sl is kept because a later cell uses it to pick one location per place.

spa_loc = db.query(f"""
    select
        np.pk_named_place as pk_bhp,
        sl.y_min_decimal as lat,
        sl.x_min_decimal as lng,
        sl.pk_spatial_location as pk_sl
    from bhp.named_place np
    inner join bhp.locate l on l.fk_gazetteer_object = np.concat_napl
    inner join bhp.spatial_location sl on sl.pk_spatial_location = l.fk_spatial_location and sl.fk_abob_type_splo = 741
""")

# a.infos(spa_loc)

In [170]:
# Bounding Box: spatial locations of type 742, described by min/max coordinates

bou_box = db.query(f"""
    select
        np.pk_named_place as pk_bhp,
        sl.y_min_decimal as lat_min,
        sl.y_max_decimal as lat_max,
        sl.x_min_decimal as lng_min,
        sl.x_max_decimal as lng_max,
        sl.pk_spatial_location as pk_sl
    from bhp.named_place np
    inner join bhp.locate l on l.fk_gazetteer_object = np.concat_napl
    inner join bhp.spatial_location sl on sl.pk_spatial_location = l.fk_spatial_location and sl.fk_abob_type_splo = 742
""")

# A box is reduced to its centre (midpoint of the min and max coordinates), so it
# ends up with the same columns as the spatial locations.
bou_box = bou_box.assign(
    lat=lambda box: (box['lat_min'] + box['lat_max']) / 2,
    lng=lambda box: (box['lng_min'] + box['lng_max']) / 2,
).drop(columns=['lat_min', 'lat_max', 'lng_min', 'lng_max'])

# a.infos(bou_box)

In [171]:
# Merge and format coordinates: each location becomes a (lat, lng) tuple

localizations = pd.concat([spa_loc, bou_box])
localizations['place'] = list(zip(localizations['lat'].to_numpy(), localizations['lng'].to_numpy()))

# To keep things as simple as possible, it was decided (Discord discussion) to
# import a single presence per place: the most recently created one, i.e. the one
# with the highest pk_sl. The helper columns are dropped afterwards.
localizations = (
    localizations
    .sort_values('pk_sl')
    .drop_duplicates(subset=['pk_bhp'], keep='last')
    .drop(columns=['lat', 'lng', 'pk_sl'])
)

# a.infos(localizations)

In [172]:
# Attach the single retained location to each place.
# pk_bhp is the only shared column, hence the join key.
named_places = pd.merge(named_places, localizations, how='left', on='pk_bhp')

a.infos(named_places)

Shape:  (132671, 16) - extract:


Unnamed: 0,pk_bhp,kind,name,name_lang,name_type,name_favorite,name_datebegin,name_dateend,name_comment_begin,name_comment_begin_type,name_comment_end,name_comment_end_type,definition,definition_lang,uri,place
0,15922,7943899,Vèbre,19008,1645890,True,,,,8257279,,8257290,,,http://symogih.org/resource/NaPl15922,
1,15923,7943899,Ventenac,19008,1645890,True,,,,8257279,,8257290,,,http://symogih.org/resource/NaPl15923,
2,15924,7943899,Verdun,19008,1645890,True,,,,8257279,,8257290,,,http://symogih.org/resource/NaPl15924,
3,15925,7943899,Vernajoul,19008,1645890,True,,,,8257279,,8257290,,,http://symogih.org/resource/NaPl15925,
4,15926,7943899,Vernaux,19008,1645890,True,,,,8257279,,8257290,,,http://symogih.org/resource/NaPl15926,


In [173]:
# Number of distinct places (the table holds several rows per place)
named_places['pk_bhp'].nunique(dropna=False)

127419

# Record linkage

### Parse the record linkage result

In [174]:
rl_result = pd.read_csv('../../data/record-linkage-bhp-named-place-geov-geo-places.csv')

# Keep only the pairs validated as duplicates. The copy makes the column
# assignments below act on an independent frame rather than on a filtered view.
rl_result = rl_result[rl_result['doublon'] == 'oui'].copy()

# Kind associated with each type label, on the BHP side and on the Geovistory side.
# A label absent from these tables gets no kind (missing value).
bhp_kind_by_type = {
    'Territoire': 7943899,
    'Lieu habité': 7507554,
    'Région géographique': 7943874,
    'Élément géographique naturel': 7943924,
}
gv_kind_by_type = {
    'Geographical area': 7943874,
    'Town': 7507554,
    'City': 7507554,
    'Village': 7507554,
    'Municipului': 7507554,
}

# One dictionary lookup per column replaces nine full row-by-row passes.
# The nullable integer dtype keeps unmapped labels as missing values.
rl_result['bhp_kind'] = rl_result['bhp_type'].map(bhp_kind_by_type).astype(pd.Int64Dtype())
rl_result['gv_kind'] = rl_result['gv_type'].map(gv_kind_by_type).astype(pd.Int64Dtype())

# A pair is kept only when both kinds are known and identical; a comparison that
# involves a missing kind yields a missing value, counted here as "not equal".
same_kind = rl_result['bhp_kind'].eq(rl_result['gv_kind']).fillna(False).astype(bool)
rl_result = rl_result[same_kind]

a.infos(rl_result)

rl_result = rl_result[['pk_bhp', 'pk_gv']]

Shape:  (367, 16) - extract:


Unnamed: 0,pk_bhp,pk_gv,bhp_name,gv_name,doublon,bhp_lat,gv_lat,bhp_lng,gv_lng,distance,bhp_definition,gv_definition,bhp_type,gv_type,bhp_kind,gv_kind
11,311,1751619,durazzo,durazzo,oui,41.326944,41.324167,19.454167,19.455833,0.3,Albania,.,Lieu habité,Town,7507554,7507554
13,475,762347,villach,villach,oui,46.615,46.6,13.8475,13.833333,2.0,Austria,,Lieu habité,Town,7507554,7507554
14,475,762347,villach,villaco,oui,46.615,46.6,13.8475,13.833333,2.0,Austria,,Lieu habité,Town,7507554,7507554
15,480,25774,wien,wien,oui,48.201944,48.20849,16.320833,16.37208,3.9,Austria,,Lieu habité,Town,7507554,7507554
16,480,25774,wien,wien,oui,48.201944,48.20849,16.320833,16.37208,3.9,Capitale de l'Autriche.,,Lieu habité,Town,7507554,7507554


### Put the record linkage into the main table

In [175]:
# The record-linkage table can list the same (pk_bhp, pk_gv) pair several times
# (the parsing output above shows pair 475 / 762347 on two rows). It is de-duplicated
# before the join, otherwise every row of a matched place is multiplied.
named_places = named_places.merge(rl_result.drop_duplicates(), how='left', on='pk_bhp')

# Nullable integer: most places have no matching Geovistory geographical place
named_places['pk_gv'] = named_places['pk_gv'].astype(pd.Int64Dtype())
named_places = named_places.rename(columns={'pk_gv': 'pk_geoplace'})

# Import

In [183]:
# Open the Geovistory connection with the settings defined in the first cell
db.connect_geovistory(env, pk_project, execute)

# Tag what gets created with an import id of the form YYYYMMDD-<metadata_str>
db.set_metadata({'import-id': f"{datetime.datetime.today():%Y%m%d}-{metadata_str}"})
db.set_insert_manner(import_manner)

# Turn the row number into an 'index' column: it is used below as a row key to
# merge created keys back into the table
named_places = named_places.reset_index()

[DB] Connecting to PRODUCTION Database ... Connected!


## Create resources

In [48]:
# Add existing ones
# Places matched by the record linkage already have a pk_geoplace; they only need
# to be related to the project.
existings = named_places['pk_geoplace'].dropna().drop_duplicates()
db.info_proj_rels.create(existings)

# Create new ones
# One new resource per distinct pk_bhp that has no pk_geoplace yet.
selection = named_places[pd.isna(named_places['pk_geoplace'])][['pk_bhp', 'pk_geoplace']] \
                .drop_duplicates(subset=['pk_bhp']).copy()

# NOTE(review): assumes db.resources.create returns one key per requested resource,
# in an order that can be assigned positionally to `selection` - confirm.
selection['pk_geoplace'] = db.resources.create(pks.classes.geoPlace, len(selection))
# Both frames hold a pk_geoplace column, so the merge produces pk_geoplace_x
# (linked key) and pk_geoplace_y (newly created key).
named_places = named_places.merge(selection, on='pk_bhp', how='left')
# Coalesce: keep the linked key when there is one, else the newly created key.
named_places['pk_geoplace'] = [row['pk_geoplace_x'] if pd.notna(row['pk_geoplace_x']) else row['pk_geoplace_y'] for _, row in named_places.iterrows()]
# Safety check: every row must now have a geographical place.
if len(named_places[pd.isna(named_places['pk_geoplace'])]) != 0: raise Exception('Geo place not fully created')
named_places.drop(columns=['pk_geoplace_x', 'pk_geoplace_y'], inplace=True)
named_places['pk_geoplace'] = named_places['pk_geoplace'].astype(pd.Int64Dtype())

# Add kind
# One statement per distinct (geographical place, kind) pair.
selection = named_places[['pk_geoplace', 'kind']].dropna().drop_duplicates()
db.statements.create(selection['pk_geoplace'], pks.properties.geoPlace_hasKind_geoPlaceKind, selection['kind'])

# Save in case of crashes/problem
# The file keeps the created pk_geoplace values; it is read back further down.
u.write_df(named_places, 'save.csv')

[DB] Creating info_proj_rel of 289 entities with project <6857901> ... Done in [00h00m01s]
[DB] Creating 127137 resources of class [363] ... Done in [00h00m16s]
[DB] Creating info_proj_rel of 127137 entities with project <6857901> ... Done in [00h00m41s]
[DB] Creating 127426 statements ... Updating metadata ... Done in [00h01m36s]
[DB] Creating info_proj_rel of 127426 entities with project <6857901> ... Done in [00h00m57s]


## Create Geographical places names

In [49]:
# Prepare
# One name per distinct (geographical place, name, language). 'index' is kept as
# the row key used to merge the created keys back.
selection = named_places[['index', 'pk_geoplace', 'name', 'name_lang']] \
                .dropna().drop_duplicates(subset=['pk_geoplace', 'name', 'name_lang']).copy()

# Create informations
# NOTE(review): assumes add_entity_names returns one key per row of `selection`,
# in the same order - confirm.
selection["pk_aial_name"] = db.shortcuts.add_entity_names(selection['pk_geoplace'], selection['name'], selection['name_lang'], return_pk_aial=True)

# Merge back
# Only the first row of each (pk_geoplace, name, name_lang) group is in `selection`,
# so only that row receives a pk_aial_name; the other rows of the group keep a
# missing value.
named_places = named_places.merge(selection[['index', 'pk_aial_name']], on='index', how='left')
named_places['pk_aial_name'] = named_places['pk_aial_name'].astype(pd.Int64Dtype())

# Save in case of crashes/problem
# Overwrites the previous save; the file now also holds the pk_aial_name column.
u.write_df(named_places, 'save.csv')


[DB] Creating 128423 resources of class [365] ... Done in [00h00m16s]
[DB] Creating info_proj_rel of 128423 entities with project <6857901> ... Done in [00h00m45s]
[DB] Creating 128423 appellations ... Done in [00h00m40s]
[DB] Creating 128423 statements ... Updating metadata ... Done in [00h01m39s]
[DB] Creating info_proj_rel of 128423 entities with project <6857901> ... Done in [00h00m43s]
[DB] Creating 128423 statements ... Updating metadata ... Done in [00h01m37s]
[DB] Creating info_proj_rel of 128423 entities with project <6857901> ... Done in [00h00m47s]
[DB] Creating 128423 statements ... Updating metadata ... Done in [00h01m39s]
[DB] Creating info_proj_rel of 128423 entities with project <6857901> ... Done in [00h00m44s]


## Add type to Geographical place names

In [50]:
# Prepare
# One (name, type) pair per created name; rows without a pk_aial_name are dropped.
selection = named_places[['pk_aial_name', 'name_type']] \
                .dropna().drop_duplicates().copy()

# Create information
# One "has type" statement per pair.
db.statements.create(selection['pk_aial_name'], pks.properties.aial_hasType_aialType, selection['name_type'])

[DB] Creating 128423 statements ... Updating metadata ... Done in [00h01m42s]
[DB] Creating info_proj_rel of 128423 entities with project <6857901> ... Done in [00h00m46s]


## Set favorite name

In [61]:
# Reset all favorites
# Fetch the project relations of every "appellation for language of" statement that
# links a name to a geographical place in the Symogih project...
iprs = db.query(f"""
    select
        ipr.pk_entity,
        r2.pk_entity as pk_aial_name
    from information.statement s 
    inner join information.resource r1 on r1.pk_entity = s.fk_object_info and r1.fk_class = {pks.classes.geoPlace}
    inner join information.resource r2 on r2.pk_entity = s.fk_subject_info and r2.fk_class = {pks.classes.aial}
    inner join projects.info_proj_rel ipr on ipr.fk_entity = s.pk_entity and ipr.fk_project = {pks.projects.symogih}
    where s.fk_property = {pks.properties.aial_isAppelationForLanguageOf_entity}
""")
# ... and clear their ordering number, so that no name is marked as favorite.
db.execute(f"""
    update projects.info_proj_rel
        set ord_num_of_domain = NULL
    where pk_entity in {u.get_sql_ready_str(iprs['pk_entity'].tolist())};
""")

# Select the favorites
# `name_favorite` is missing for rows that have no name (left merge), and a boolean
# mask containing missing values makes the selection fail. Comparing with True
# yields a clean mask in which missing values count as "not favorite".
selection = named_places[['pk_aial_name', 'name_favorite']]
selection = selection[selection['name_favorite'].eq(True).fillna(False).astype(bool)]
favs = set(selection['pk_aial_name'].dropna().unique().tolist())
iprs_favs = iprs[iprs['pk_aial_name'].isin(favs)]

# Update the favorites
# Ordering number 1 marks the statement of a favorite name.
db.execute(f"""
    update projects.info_proj_rel
        set ord_num_of_domain = 1
    where pk_entity in {u.get_sql_ready_str(iprs_favs['pk_entity'].tolist())};
""")


## Set name dates

---

In [12]:
# Restore the column types of `named_places`.
# NOTE(review): the date and place columns are handled as text here, which suggests
# the table was reloaded from 'save.csv' in a cell that is not part of this
# notebook - confirm, since this cell fails on the in-memory table.
named_places['pk_geoplace'] = named_places['pk_geoplace'].astype(pd.Int64Dtype())
named_places['pk_aial_name'] = named_places['pk_aial_name'].astype(pd.Int64Dtype())
# WARNING: eval() executes whatever expression the cell contains. It is tolerable
# only because the file was written by this notebook; never point it at a file from
# another source.
named_places['name_datebegin'] = [eval(t) if pd.notna(t) else pd.NA for t in named_places['name_datebegin']]
named_places['name_dateend'] = [eval(t) if pd.notna(t) else pd.NA for t in named_places['name_dateend']]
named_places['name_comment_begin_type'] = named_places['name_comment_begin_type'].astype(pd.Int64Dtype())
named_places['name_comment_end_type'] = named_places['name_comment_end_type'].astype(pd.Int64Dtype())
# Same warning as above: the place text is evaluated as Python code.
named_places['place'] = [eval(t) if pd.notna(t) else pd.NA for t in named_places['place']]
named_places['definition_lang'] = named_places['definition_lang'].astype(pd.Int64Dtype())

In [13]:
# Prepare Begin
# One (name, begin date) pair per name that has a begin date.
selection = named_places[['pk_aial_name', 'name_datebegin']] \
                .dropna().drop_duplicates()

# Create information
# A time primitive per date, then a "begin of the begin" statement on the name.
# NOTE(review): assumes the created keys come back in the order of `selection`.
pk_date_begin = db.time_primitives.create(selection['name_datebegin'])
db.statements.create(selection['pk_aial_name'], pks.properties.timespan_beginOfTheBegin_timePrim, pk_date_begin)


# Prepare End
# Same procedure for the end date.
selection = named_places[['pk_aial_name', 'name_dateend']] \
                .dropna().drop_duplicates()

# Create information
# A time primitive per date, then an "end of the end" statement on the name.
pk_date_end = db.time_primitives.create(selection['name_dateend'])
db.statements.create(selection['pk_aial_name'], pks.properties.timespan_endOfTheEnd_timePrim, pk_date_end)

[DB] Creating 4479 time primitives ... Done in [00h00m01s]
[DB] Creating 4479 statements ... Updating metadata ... Done in [00h00m03s]
[DB] Creating info_proj_rel of 4479 entities with project <6857901> ... Done in [00h00m02s]
[DB] Creating 4457 time primitives ... Done in [00h00m01s]
[DB] Creating 4457 statements ... Updating metadata ... Done in [00h00m04s]
[DB] Creating info_proj_rel of 4457 entities with project <6857901> ... Done in [00h00m01s]


## Set name comment begin

In [14]:
# Prepare
# One comment per distinct (name, begin comment) pair; rows missing any of the
# three columns are dropped.
selection = named_places[['pk_aial_name', 'name_comment_begin', 'name_comment_begin_type']] \
            .dropna().drop_duplicates(subset=['pk_aial_name', 'name_comment_begin'])

# Create a comment resource and an appellation (the text) for each pair.
# NOTE(review): the statements below pair these lists with the columns of
# `selection` positionally - confirm that both calls preserve the input order.
pk_comments = db.resources.create(pks.classes.comment, len(selection))
pk_appes = db.appellations.create(selection['name_comment_begin'])
# comment -> its text
db.statements.create(pk_comments, pks.properties.text_hasValueVersion_string, pk_appes)
# comment -> its type (the "begin" comment type)
db.statements.create(pk_comments, pks.properties.comment_hasCommentType_CommentType, selection['name_comment_begin_type'])
# name -> its comment
db.statements.create(selection['pk_aial_name'], pks.properties.entity_hasComment_text, pk_comments)

[DB] Creating 4477 resources of class [900] ... Done in [00h00m01s]
[DB] Creating info_proj_rel of 4477 entities with project <6857901> ... Done in [00h00m02s]
[DB] Creating 4477 appellations ... Done in [00h00m03s]
[DB] Creating 4477 statements ... Updating metadata ... Done in [00h00m04s]
[DB] Creating info_proj_rel of 4477 entities with project <6857901> ... Done in [00h00m02s]
[DB] Creating 4477 statements ... Updating metadata ... Done in [00h00m03s]
[DB] Creating info_proj_rel of 4477 entities with project <6857901> ... Done in [00h00m02s]
[DB] Creating 4477 statements ... Updating metadata ... Done in [00h00m04s]
[DB] Creating info_proj_rel of 4477 entities with project <6857901> ... Done in [00h00m02s]


## Set name comment end

In [15]:
# Prepare
# One comment per distinct (name, end comment) pair; rows missing any of the three
# columns are dropped. Same procedure as for the begin comments.
selection = named_places[['pk_aial_name', 'name_comment_end', 'name_comment_end_type']] \
            .dropna().drop_duplicates(subset=['pk_aial_name', 'name_comment_end'])

# Create resources
# A comment resource and an appellation (the text) for each pair.
# NOTE(review): the statements below pair these lists with the columns of
# `selection` positionally - confirm that both calls preserve the input order.
pk_comments = db.resources.create(pks.classes.comment, len(selection))
pk_appes = db.appellations.create(selection['name_comment_end'])

# Create statements
# comment -> its text, comment -> its type (the "end" comment type), name -> comment
db.statements.create(pk_comments, pks.properties.text_hasValueVersion_string, pk_appes)
db.statements.create(pk_comments, pks.properties.comment_hasCommentType_CommentType, selection['name_comment_end_type'])
db.statements.create(selection['pk_aial_name'], pks.properties.entity_hasComment_text, pk_comments)

[DB] Creating 4472 resources of class [900] ... Done in [00h00m01s]
[DB] Creating info_proj_rel of 4472 entities with project <6857901> ... Done in [00h00m02s]
[DB] Creating 4472 appellations ... Done in [00h00m06s]
[DB] Creating 4472 statements ... Updating metadata ... Done in [00h00m04s]
[DB] Creating info_proj_rel of 4472 entities with project <6857901> ... Done in [00h00m02s]
[DB] Creating 4472 statements ... Updating metadata ... Done in [00h00m04s]
[DB] Creating info_proj_rel of 4472 entities with project <6857901> ... Done in [00h00m02s]
[DB] Creating 4472 statements ... Updating metadata ... Done in [00h00m05s]
[DB] Creating info_proj_rel of 4472 entities with project <6857901> ... Done in [00h00m01s]


## Set geographical places presence

In [16]:
# Prepare
# One presence per distinct (geographical place, coordinates) pair.
selection = named_places[['pk_geoplace', 'place']] \
                .dropna().drop_duplicates()

# Create resources (and values)
# A place value per coordinate tuple and a presence resource per row.
# This is by far the slowest step of the import (see the log below).
pk_places = db.places.create(selection['place'])
pk_presences = db.resources.create(pks.classes.presence, len(selection))

# Create statements
# presence -> where it was, then presence -> the geographical place it belongs to.
# NOTE(review): lists and columns are paired positionally - confirm the order is kept.
db.statements.create(pk_presences, pks.properties.presence_wasAt_place, pk_places)
db.statements.create(pk_presences, pks.properties.presence_wasPresenceOf_spacetimeVolume, selection['pk_geoplace'])


[DB] Creating 81391 places ... Done in [01h04m49s]
[DB] Creating 81391 resources of class [84] ... Done in [00h00m12s]
[DB] Creating info_proj_rel of 81391 entities with project <6857901> ... Done in [00h00m34s]
[DB] Creating 81391 statements ... Updating metadata ... Done in [00h01m05s]
[DB] Creating info_proj_rel of 81391 entities with project <6857901> ... Done in [00h00m29s]
[DB] Creating 81391 statements ... Updating metadata ... Done in [00h01m08s]
[DB] Creating info_proj_rel of 81391 entities with project <6857901> ... Done in [00h00m31s]


## Set geographical places URIs

In [17]:
# Prepare
# One row per distinct (geographical place, URI) pair; a place can have several
# URIs (its Symogih URI plus external ones).
selection = named_places[['pk_geoplace', 'uri']] \
                .dropna().drop_duplicates()

# Create uris
db.shortcuts.add_uris(selection['pk_geoplace'], selection['uri'])

[DB] Creating 127445 resources of class [967] ... Done in [00h00m18s]
[DB] Creating info_proj_rel of 127445 entities with project <6857901> ... Done in [00h00m45s]
[DB] Creating 127445 appellations ... Done in [00h01m50s]
[DB] Creating 127445 statements ... Updating metadata ... Done in [00h01m47s]
[DB] Creating info_proj_rel of 127445 entities with project <6857901> ... Done in [00h00m46s]
[DB] Creating 127445 statements ... Updating metadata ... Done in [00h01m44s]
[DB] Creating info_proj_rel of 127445 entities with project <6857901> ... Done in [00h00m53s]


## Set geographical places definitions

In [18]:
# Prepare
# One row per distinct (geographical place, definition, language). Rows whose
# definition is missing are dropped here, so a definition that ended up empty
# during preparation is not imported.
selection = named_places[['pk_geoplace', 'definition', 'definition_lang']] \
                .dropna(subset=['definition']).drop_duplicates()

# The language key must be an integer (nullable, in case it is missing)
selection['definition_lang'] = selection['definition_lang'].astype(pd.Int64Dtype())

# Create definitions
db.shortcuts.add_definitions(selection['pk_geoplace'], selection['definition'], selection['definition_lang'])

[DB] Creating 10127 resources of class [899] ... Done in [00h00m02s]
[DB] Creating info_proj_rel of 10127 entities with project <6857901> ... Done in [00h00m03s]
[DB] Creating 10127 appellations ... Done in [00h17m04s]
[DB] Creating 10127 statements ... Updating metadata ... Done in [00h00m10s]
[DB] Creating info_proj_rel of 10127 entities with project <6857901> ... Done in [00h00m07s]
[DB] Creating 10127 statements ... Updating metadata ... Done in [00h00m09s]
[DB] Creating info_proj_rel of 10127 entities with project <6857901> ... Done in [00h00m04s]
[DB] Creating 10127 statements ... Updating metadata ... Done in [00h00m09s]
[DB] Creating info_proj_rel of 10127 entities with project <6857901> ... Done in [00h00m04s]


---

All definitions of type "complément" are missing from the import above; the cell below adds them:

In [184]:
# Get the ones that are missing
# `definitions` is the table prepared in the "Definitions" section; the
# "complément" texts are recognised by the '[Complément]' prefix added there.
definitions_ = definitions[['[Complément]' in defi for defi in definitions['definition']]]
# Reload the saved table to recover the pk_geoplace created for each pk_bhp.
named_places = u.read_df('save.csv')

# Select only the right data
# The inner merge (on pk_bhp, the only shared column) keeps the places that have a
# "complément"; duplicates come from the several rows each place has in the table.
selection = named_places[['pk_bhp', 'pk_geoplace']].merge(definitions_, how='inner')
selection = selection[['pk_geoplace', 'definition', 'definition_lang']]
selection.drop_duplicates(inplace=True)

# Create definitions
db.shortcuts.add_definitions(selection['pk_geoplace'], selection['definition'], selection['definition_lang'])

[DB] Creating 16422 resources of class [899] ... Done in [00h00m02s]
[DB] Creating info_proj_rel of 16422 entities with project <6857901> ... Done in [00h00m06s]
[DB] Creating 16422 appellations ... Done in [00h00m07s]
[DB] Creating 16422 statements ... Updating metadata ... Done in [00h00m12s]
[DB] Creating info_proj_rel of 16422 entities with project <6857901> ... Done in [00h00m05s]
[DB] Creating 16422 statements ... Updating metadata ... Done in [00h00m12s]
[DB] Creating info_proj_rel of 16422 entities with project <6857901> ... Done in [00h00m06s]
[DB] Creating 16422 statements ... Updating metadata ... Done in [00h00m14s]
[DB] Creating info_proj_rel of 16422 entities with project <6857901> ... Done in [00h00m05s]


In [185]:
# Sanity check: list the imported "complément" texts from shortest to longest
selection['length'] = selection['definition'].map(len)

selection.sort_values('length')

Unnamed: 0,pk_geoplace,definition,definition_lang,length
9469,11063695,[Complément] Iran,19008,17
9212,11062215,[Complément] 73122,19008,18
9214,11062239,[Complément] 73123,19008,18
9215,11062244,[Complément] 93131,19008,18
9216,11062245,[Complément] 93132,19008,18
...,...,...,...,...
6368,11052907,[Complément] http://it.wikipedia.org/wiki/Cast...,19008,63
103,11031691,[Complément] http://www.schwabenstaedte-in-bay...,19008,65
20587,11143022,[Complément] Etat de l'empire allemand formé d...,19008,138
20787,11144022,"[Complément] France, chef-lieu du département ...",19008,191
