In [3]:
# Reload edited modules automatically so changes to geovpylib are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

# --- Run configuration (all values are consumed by the db.* calls at the end of this cell) ---
env = 'prod'  # target environment handed to db.connect_geovistory
pk_project = -1  # project key handed to db.connect_geovistory
execute = False  # handed to db.connect_geovistory; NOTE(review): presumably False means a dry run (the recorded output says "Requests will not be executed") - confirm
metadata_str = ''  # free-text suffix of the import-id metadata; when empty the id ends with a trailing '-'
import_manner = 'one-shot' # alternative value noted by the author: 'batch'

import os
import pandas as pd
import numpy as np
from datetime import datetime
import duckdb
import plotly.express as px

import geovpylib.analysis as a
import geovpylib.database as db
import geovpylib.queries as q
import geovpylib.pks as pks
import geovpylib.sparql as sparql
import geovpylib.utils as u

eta = u.Eta()

# External connection is currently disabled (kept from the original notebook):
# db.connect_external(os.getenv(''))
db.connect_geovistory(env, pk_project, execute)
# Tag this run with an import id of the form '<YYYYMMDD>-<metadata_str>'.
db.set_metadata({'import-id': datetime.today().strftime('%Y%m%d') + '-' + metadata_str})
db.set_insert_manner(import_manner)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
[DB] Requests will not be executed
[DB] Connecting to PRODUCTION Database ... Connected!


# Import geographical places

## List record linkage

In [4]:
# Load the BHP <-> Geovistory record linkage and keep, in one pass, the distinct
# (pk_bhp, pk_gv) pairs of the rows whose 'doublon' column equals 'oui'.
record_linkage = (
    pd.read_csv('../../data/record-linkage-bhp-named-place-geov-geo-places-filled.csv')
    .loc[lambda links: links['doublon'] == 'oui', ['pk_bhp', 'pk_gv']]
    .drop_duplicates()
)

a.infos(record_linkage)

Shape:  (453, 2) - extract:


Unnamed: 0,pk_bhp,pk_gv
0,1,205134
1,3,6201188
2,7,2220550
3,13,1485424
4,19,1876219


## Fetch all BHP named places

In [9]:
# Geo places: coordinates, type, notes and begin/end years.
# (author's open question: Geo place + presence + definitions? + Comment?)
named_places = (
    u.read_df('../../data/bhp/named_place.csv', skip_info=True)
    [['pk_named_place', 'standard_latitude', 'standard_longitude', 'fk_abob_type_napl', 'notes', 'begin_year', 'notes_begin', 'end_year', 'notes_end']]
    .rename(columns={'pk_named_place':'pk_bhp','standard_latitude':'lat','standard_longitude':'lng','fk_abob_type_napl':'abob_type_geo_place', 'notes':'notes_geo_place'})
)

# AIAL: the names of each place, keyed by the place they belong to.
# The former 'fk_abob_type_napl' -> 'abob_type' mapping was dropped: that column is never
# selected from this file, so the mapping was a silent no-op.
named_places_names = (
    u.read_df('../../data/bhp/named_place_name.csv', skip_info=True)
    [['fk_named_place', 'name','lang_iso','fk_abob_napl_name_type','begin_date', 'comment_begin_year', 'end_date', 'comment_end_year', 'notes']]
    .rename(columns={'fk_named_place':'pk_bhp','lang_iso':'lang_name','fk_abob_napl_name_type':'abob_type'})
)

# Text properties attached to each place.
named_places_text_prop = (
    u.read_df('../../data/bhp/named_place_text_property.csv', skip_info=True)
    [['fk_named_place','text','lang_iso_code','property_type']]
    .rename(columns={'fk_named_place':'pk_bhp','lang_iso_code':'lang_text_prop','property_type':'type_text_prop'})
)


# Merge: both joins are left joins on pk_bhp, so a place with several names and several
# text properties yields one row per (name, text property) combination.
named_places = named_places.merge(named_places_names, on='pk_bhp', how='left')
named_places = named_places.merge(named_places_text_prop, on='pk_bhp', how='left')


a.infos(named_places)

Shape:  (132681, 20) - extract:


Unnamed: 0,pk_bhp,lat,lng,abob_type_geo_place,notes_geo_place,begin_year,notes_begin,end_year,notes_end,name,lang_name,abob_type,begin_date,comment_begin_year,end_date,comment_end_year,notes,text,lang_text_prop,type_text_prop
0,15922,,,697,Commune existante au 1er janvier 2009. Importé...,,,,,Vèbre,,,,,,,Nom officiel au 1er janvier 2009. Importé à pa...,,,
1,15923,,,697,Commune existante au 1er janvier 2009. Importé...,,,,,Ventenac,,,,,,,Nom officiel au 1er janvier 2009. Importé à pa...,,,
2,15924,,,697,Commune existante au 1er janvier 2009. Importé...,,,,,Verdun,,,,,,,Nom officiel au 1er janvier 2009. Importé à pa...,,,
3,15925,,,697,Commune existante au 1er janvier 2009. Importé...,,,,,Vernajoul,,,,,,,Nom officiel au 1er janvier 2009. Importé à pa...,,,
4,15926,,,697,Commune existante au 1er janvier 2009. Importé...,,,,,Vernaux,,,,,,,Nom officiel au 1er janvier 2009. Importé à pa...,,,


## Precise Data

In [40]:
# Placeholder column: every row starts with a missing kind.
named_places = named_places.assign(pk_kind=pd.NA)

## Prepare data

In [41]:
# Attach the Geovistory key (pk_gv) to every BHP row that has a linkage record;
# rows without a match keep a missing pk_gv.
table = pd.merge(named_places, record_linkage, how='left', on='pk_bhp')
# NOTE(review): the return value is unused, so set_types presumably casts in place - confirm.
a.set_types(table, {'pk_gv': 'int'})

## Existing Geographical places in Geovistory

In [51]:
# Collect what Geovistory already knows (names, URIs, presences, definitions) about the
# geo places matched by the record linkage, aggregated to one row per pk_gv.

# Lookup connection. Positional arguments follow the setup cell's call (env, project key, execute).
# NOTE(review): the meaning of the 4th argument (True) is not visible in this notebook.
db.connect_geovistory(env, -1, False, True)

# Geovistory keys matched by the record linkage.
pk_gv = table['pk_gv'].dropna().unique()

# Render the keys as a SQL tuple. Casting to int guarantees that only integer literals are
# interpolated into the queries. Without any key, "in ()" would be invalid SQL, so fall back
# to "(NULL)", which matches no row.
if len(pk_gv) > 0:
    values = '(' + ','.join(str(int(e)) for e in pk_gv) + ')'
else:
    values = '(NULL)'

# The connection is released even if one of the queries fails.
try:
    # Names: strings of the appellations linked to the place, in-project statements only.
    existing_names = db.query(f"""
    select distinct
        r.pk_entity as pk_gv, a3.string as name
    from information.resource r
    inner join information.statement s1 on s1.fk_object_info = r.pk_entity and s1.fk_property = {pks.properties.apial_isAppelationForLanguageOf_entity}
    inner join projects.info_proj_rel ipr1 on ipr1.fk_entity = s1.pk_entity and ipr1.is_in_project = true
    inner join information.statement s2 on s2.fk_subject_info = s1.fk_subject_info and s2.fk_property = {pks.properties.aial_refersToName_appellation}
    inner join projects.info_proj_rel ipr2 on ipr2.fk_entity = s2.pk_entity and ipr2.is_in_project = true
    inner join information.appellation a3 on a3.pk_entity = s2.fk_object_info
    where r.pk_entity in {values}                          
""")

    # URIs: string value of every "same as URI" statement of the place.
    existing_uris = db.query(f"""
    select distinct
        r.pk_entity as pk_gv, a6.string as uri
    from information.resource r
    inner join information.statement s4 on s4.fk_subject_info = r.pk_entity and s4.fk_property = {pks.properties.entity_sameAsURI_URI}
    inner join projects.info_proj_rel ipr4 on ipr4.fk_entity = s4.pk_entity and ipr4.is_in_project = true
    inner join information.statement s5 on s5.fk_subject_info = s4.fk_object_info and s5.fk_property = {pks.properties.appe_hasValue_string}
    inner join projects.info_proj_rel ipr5 on ipr5.fk_entity = s5.pk_entity and ipr5.is_in_project = true
    inner join information.appellation a6 on a6.pk_entity = s5.fk_object_info
    where r.pk_entity in {values}
""")

    # Presences: latitude / longitude of the places at which the geo place was present.
    existing_presence = db.query(f"""
    select distinct
        r.pk_entity as pk_gv, st_y(p9.geo_point::geometry) as lat, st_x(p9.geo_point::geometry) as lng
    from information.resource r
    inner join information.statement s7 on s7.fk_object_info = r.pk_entity and s7.fk_property = {pks.properties.presence_wasPresenceOf_spacetimeVolume}
    inner join projects.info_proj_rel ipr7 on ipr7.fk_entity = s7.pk_entity and ipr7.is_in_project = true
    inner join information.statement s8 on s8.fk_subject_info = s7.fk_subject_info and s8.fk_property = {pks.properties.presence_wasAt_place}
    inner join projects.info_proj_rel ipr8 on ipr8.fk_entity = s8.pk_entity and ipr8.is_in_project = true
    inner join information.v_place p9 on p9.pk_entity = s8.fk_object_info
    where r.pk_entity in {values}
""")

    # Definitions: string value of the definition texts of the place.
    existing_def = db.query(f"""
    select distinct
        r.pk_entity as pk_gv, a3.string as definition
    from information.resource r
    inner join information.statement s1 on s1.fk_subject_info = r.pk_entity and s1.fk_property = {pks.properties.entity_hasDefinition_text}
    inner join projects.info_proj_rel ipr1 on ipr1.fk_entity = s1.pk_entity and ipr1.is_in_project = true
    inner join information.statement s2 on s2.fk_subject_info = s1.fk_object_info and s2.fk_property = {pks.properties.text_hasValueVersion_string}
    inner join projects.info_proj_rel ipr2 on ipr2.fk_entity = s2.pk_entity and ipr2.is_in_project = true
    inner join information.appellation a3 on a3.pk_entity = s2.fk_object_info
    where r.pk_entity in {values}    
""")
finally:
    db.disconnect()

# Encode each presence as "<lat> <lng>". A pair is kept only when BOTH coordinates are present:
# the previous `or` test produced strings such as "nan 8.55" when a single one was missing.
has_both_coordinates = existing_presence['lat'].notna() & existing_presence['lng'].notna()
existing_presence['presence'] = (
    existing_presence['lat'].astype(str) + ' ' + existing_presence['lng'].astype(str)
).where(has_both_coordinates, pd.NA)
existing_presence = existing_presence.drop(columns=['lat', 'lng'])

# One row per combination of (name, uri, presence, definition) for every linked pk_gv.
existing_gv = table[['pk_gv']].dropna().drop_duplicates().copy()
for known_values in (existing_names, existing_uris, existing_presence, existing_def):
    existing_gv = existing_gv.merge(known_values, on='pk_gv', how='left')

existing_gv = existing_gv.sort_values('pk_gv')

# Replace missing values by '' so they can be joined as text. This is assigned back on the frame:
# `existing_gv[col].fillna('', inplace=True)` works on a temporary column object and is not
# guaranteed to modify the frame (it has no effect under pandas Copy-on-Write).
text_columns = ['uri', 'name', 'definition', 'presence']
existing_gv[text_columns] = existing_gv[text_columns].fillna('')


def _join_unique(separator):
    """Return an aggregator that joins the sorted distinct values of a group with `separator`."""
    return lambda group: separator.join(str(e) for e in np.unique(group))


# Collapse to one row per pk_gv; definitions use ';;' because they are free text.
existing_gv = existing_gv.groupby('pk_gv').agg(
    names=pd.NamedAgg(column='name', aggfunc=_join_unique(';')),
    uris=pd.NamedAgg(column='uri', aggfunc=_join_unique(';')),
    presences=pd.NamedAgg(column='presence', aggfunc=_join_unique(';')),
    defs=pd.NamedAgg(column='definition', aggfunc=_join_unique(';;'))
)
existing_gv = existing_gv.reset_index()
# Only `uris` goes back to a missing value when empty; names, presences and defs stay ''.
existing_gv['uris'] = existing_gv['uris'].replace('', pd.NA)

a.infos(existing_gv)

# Duration noted by the author: 13s

[DB] Requests will not be executed
[DB] Connecting to STAGING Database ... Connected!
[DB] Database correctly disconnected.
Shape:  (437, 5) - extract:


Unnamed: 0,pk_gv,names,uris,presences,defs
0,25494,Zurich;Zürich;Zürich CH,,47.366667 8.55,Publikationsort
1,25540,Arezzo,,43.473333 11.87,
2,25639,Leningrad;Sankt Peterburg;Sankt Petersburg,,,
3,25685,Freiburg,,3.0 3.0;47.9990077 7.8421043,
4,25721,Muri,,47.28 8.34;47.283333 8.35,


## Create data

##### Geo place

In [None]:
# Create a Geovistory geo place for every BHP place that has no linked pk_gv yet.

# One row per BHP place, restricted to the places without a Geovistory key.
selection = table[['pk_gv', 'pk_bhp']].drop_duplicates('pk_bhp')
selection = selection[selection['pk_gv'].isna()].copy()

# NOTE(review): this assumes db.resources.create returns one key per requested resource, in a
# form that is assigned positionally (e.g. a list) - confirm it is not an index-aligned Series.
selection['pk_gv'] = db.resources.create(pks.classes.geoPlace, len(selection))

# Bring the new keys back on every row of the place. Explicit suffixes make the two key columns
# unambiguous; the previous code read a non-existent 'pk_gv_left' column (pandas' default
# suffixes are '_x' / '_y') and raised a KeyError on the first unlinked row.
table = table.merge(selection, on='pk_bhp', how='left', suffixes=('_linked', '_created'))
table['pk_gv'] = [
    linked if pd.notna(linked) else created
    for linked, created in zip(table['pk_gv_linked'], table['pk_gv_created'])
]
table = table.drop(columns=['pk_gv_linked', 'pk_gv_created'])

In [None]:
# Verification: have all geo places been created? Every row must now carry a Geovistory key.
assert table['pk_gv'].notna().all()

##### Geo place Kind

In [None]:
# Author's note: here we do not need to worry whether the geo place already has a kind or not,
# v_statement handles it.

# TODO: set this to the pks.properties.<...> key of the "has kind" property. The original cell
# left the attribute name blank (`pks.properties.`), which is a syntax error.
pk_property_has_kind = None

# Keep only the rows that actually carry a kind. `dropna('kind')` passed the column name as the
# axis argument; the column to test is named through `subset`.
selection = table[['pk_gv', 'pk_kind']].dropna(subset=['pk_kind'])

# NOTE(review): `table` holds several rows per geo place, so the same (pk_gv, pk_kind) pair may
# be sent more than once - confirm that db.statements.create tolerates it.
if not selection.empty:
    if pk_property_has_kind is None:
        raise NotImplementedError('Set pk_property_has_kind before creating geo place kind statements')
    db.statements.create(selection['pk_gv'], pk_property_has_kind, selection['pk_kind']) # HAS KIND

##### Presence

In [52]:
# Only BHP rows carrying both coordinates can be compared with Geovistory presences
# (a missing longitude would otherwise yield a "<lat> nan" string).
selection = table[table['lat'].notna() & table['lng'].notna()]

# Case 1: on geovistory, there is no presence ==> create a new presence
# Case 2: on geovistory, there are presences but none fit the coordinates ==> create a new presence
# Case 4: on geovistory, we found an existing presence which fit the BHP one. ==> Just add the presence to the project
# (case numbering kept from the original notebook; only the last case is computed below)
existing = selection.merge(existing_gv, on='pk_gv')
existing['new_presence'] = existing['lat'].astype(str) + ' ' + existing['lng'].astype(str)

# Compare against the individual ';'-separated presences. A plain `in` on the joined string is
# a substring test and wrongly matched e.g. "7.28 8.34" inside "47.28 8.34".
# NOTE(review): the comparison is on the text form of the coordinates, so values that differ
# only in formatting or precision are not recognised as equal.
is_known_presence = pd.Series(
    [new in known.split(';') for new, known in zip(existing['new_presence'], existing['presences'])],
    index=existing.index,
    dtype=bool,
)
# A boolean Series (rather than a list) keeps this a row filter even when `existing` is empty.
# NOTE: this rebinds `existing_presence`, which earlier held the presences fetched from the database.
existing_presence = existing.loc[is_known_presence].drop_duplicates(['pk_bhp', 'pk_gv'])


##### Definitions

##### Comments