# Import Births and Deaths Places

In [1]:
# %load /home/gaetan/Desktop/geovpylib/templates/heading.py
%load_ext autoreload
%autoreload 2

# Common imports
import os
import pandas as pd, numpy as np
import datetime
#import time
#import json
#import requests
#import duckdb
#import plotly.express as px
# from multiprocessing import Pool

# Geovpylib library
import geovpylib.analysis as a
import geovpylib.database as db
import geovpylib.decorators as d
import geovpylib.importer as i
import geovpylib.magics
import geovpylib.pks as pks
import geovpylib.queries as q
import geovpylib.record_linkage as rl
import geovpylib.sparql as sparql
import geovpylib.utils as u
eta = u.Eta()

# Connect to Geovistory database read mode
# db.connect_geovistory('prod')

# Connect to Geovistory database for insert
env = 'prod' # Database to query: "prod", "stag", "dev", "local"
pk_project = pks.projects.switzerland_and_beyond # The project to query/insert: integer
execute = True # Boolean to prevent to execute directly into databases
metadata_str = 'import-geo-places' # kebab-lower-case or snake-lower-case. 
import_manner = 'one-shot' # 'one-shot' or 'batch'

# Connect to other database
# db_url_env_var_name = 'YELLOW_' # Name of an environment variable holding the Postgres database URL
# execute = False # Boolean to prevent to execute directly into databases
# db.connect_external(os.getenv(db_url_env_var_name), execute=False)

# Connect to a SPARQL endpoint
# sparql.connect_external('url')


# Get infos from wikidata

In [2]:
sparql.connect_external('https://query.wikidata.org/sparql')

births = sparql.query("""
    SELECT ?uri_wikidata_person ?hls_id ?uri_wikidata_place ?uri_wikidata_placeLabel ?placeCoordinates ?placeDescription ?place_classLabel
    WHERE {
        ?uri_wikidata_person wdt:P31 wd:Q5 .
        ?uri_wikidata_person wdt:P902 ?hls_id .
        ?uri_wikidata_person wdt:P19 ?uri_wikidata_place .
        ?uri_wikidata_place wdt:P31 ?place_class .
        optional { ?uri_wikidata_place wdt:P625 ?placeCoordinates .}
        SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
      
        SERVICE wikibase:label { 
            bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" . 
            ?uri_wikidata_place schema:description ?placeDescription .
        }
    }
""")
u.write_df(births, './births-wd.csv')

deaths = sparql.query("""
    SELECT ?uri_wikidata_person ?hls_id ?uri_wikidata_place ?uri_wikidata_placeLabel ?placeCoordinates ?placeDescription ?place_classLabel
    WHERE {
        ?uri_wikidata_person wdt:P31 wd:Q5 .
        ?uri_wikidata_person wdt:P902 ?hls_id .
        ?uri_wikidata_person wdt:P20 ?uri_wikidata_place .
        ?uri_wikidata_place wdt:P31 ?place_class .
        optional { ?uri_wikidata_place wdt:P625 ?placeCoordinates .}
        SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
      
        SERVICE wikibase:label { 
            bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" . 
            ?uri_wikidata_place schema:description ?placeDescription .
        }
    }
""")
u.write_df(births, './deaths-wd.csv')

# 26s


births = u.read_df('./births-wd.csv')
births['hls_id'] = births['hls_id'].astype(pd.Int64Dtype())

deaths = u.read_df('./deaths-wd.csv')
deaths['hls_id'] = deaths['hls_id'].astype(pd.Int64Dtype())

# print(births.shape)
# print(deaths.shape)

>> External SPARQL URL set to <https://query.wikidata.org/sparql>


In [3]:
wd_places = pd.concat([
    births[['uri_wikidata_place', 'placeCoordinates', 'placeDescription', 'uri_wikidata_placeLabel', 'place_classLabel']].copy(),
    deaths[['uri_wikidata_place', 'placeCoordinates', 'placeDescription', 'uri_wikidata_placeLabel', 'place_classLabel']].copy()
]).drop_duplicates()
wd_places.rename(columns={'uri_wikidata_place':'uri', 'placeDescription':'definition', 'uri_wikidata_placeLabel':'name'}, inplace=True)
wd_places['lat'] = [float(coord.split(' ')[1].replace(')', '')) if pd.notna(coord) else pd.NA for coord in wd_places['placeCoordinates']]
wd_places['lng'] = [float(coord.split(' ')[0].replace('Point(', '')) if pd.notna(coord) else pd.NA for coord in wd_places['placeCoordinates']]

def parse_kind(row):

    classLabel = row['place_classLabel']

    if 'settlement' in classLabel: return 'Settlement'
    if 'village' in classLabel: return 'Settlement'
    if 'city' in classLabel: return 'Settlement'
    if 'City' in classLabel: return 'Settlement'
    if 'municipality' in classLabel: return 'Settlement'
    if 'capital' in classLabel: return 'Settlement'
    if 'town' in classLabel: return 'Settlement'
    if 'commune' in classLabel: return 'Settlement'
    if 'populated place' in classLabel: return 'Settlement'
    if 'locality' in classLabel: return 'Settlement'
    if 'hamlet' in classLabel: return 'Settlement'
    if 'Ortschaft' in classLabel: return 'Settlement'
    if 'polis' in classLabel: return 'Settlement'
    if 'principality' in classLabel: return 'Settlement'

    if 'administrative division' in classLabel: return 'Legal territory'
    if 'canton' in classLabel: return 'Legal territory'
    if 'state' in classLabel: return 'Legal territory'
    if 'district' in classLabel: return 'Legal territory'
    if 'Ortsteil' in classLabel: return 'Legal territory'
    if 'Ortsteil' in classLabel: return 'Legal territory'
    if 'quarter' in classLabel: return 'Legal territory'
    if 'country' in classLabel: return 'Legal territory'
    if 'duchy' in classLabel: return 'Legal territory'
    if 'municipal arrondissement' in classLabel: return 'Legal territory'
    if 'uyezd' in classLabel: return 'Legal territory'
    if 'civil parish' in classLabel: return 'Legal territory'
    if 'colonial power' in classLabel: return 'Legal territory'
    if 'colony' in classLabel: return 'Legal territory'
    if 'countship' in classLabel: return 'Legal territory'
    if 'department' in classLabel: return 'Legal territory'
    if 'Ortsbezirk' in classLabel: return 'Legal territory'
    if 'Stadtbezirk' in classLabel: return 'Legal territory'
    if 'ceremonial county ' in classLabel: return 'Legal territory'
    if 'municipal part' in classLabel: return 'Legal territory'
    if 'province' in classLabel: return 'Legal territory'
    if 'republic' in classLabel: return 'Legal territory'
    if 'county' in classLabel: return 'Legal territory'
    if 'territorial entity' in classLabel: return 'Legal territory'
    if 'parish' in classLabel: return 'Legal territory'
    if 'Territory' in classLabel: return 'Legal territory'
    if 'kingdoms' in classLabel: return 'Legal territory'
    if 'realm' in classLabel: return 'Legal territory'
    if 'federative unit of Brazil' in classLabel: return 'Legal territory'
    if 'former arrondissement of Paris' in classLabel: return 'Legal territory'
    if 'governorate' in classLabel: return 'Legal territory'
    if 'nation' in classLabel: return 'Legal territory'
    if 'colonia' in classLabel: return 'Legal territory'

    if 'château' in classLabel: return 'Infrastructure'
    if 'castle' in classLabel: return 'Infrastructure'
    if 'architectural' in classLabel: return 'Infrastructure'
    if 'museum' in classLabel: return 'Infrastructure'
    if 'dog breed' in classLabel: return 'Infrastructure'
    if 'building' in classLabel: return 'Infrastructure'
    if 'rocca' in classLabel: return 'Infrastructure'
    if 'abbey' in classLabel: return 'Infrastructure'
    if 'garden' in classLabel: return 'Infrastructure'
    if 'winery' in classLabel: return 'Infrastructure'
    if 'house' in classLabel: return 'Infrastructure'
    if 'institution' in classLabel: return 'Infrastructure'
    if 'cultural property' in classLabel: return 'Infrastructure'
    if 'local council of Malta' in classLabel: return 'Infrastructure'
    if 'presidential system' in classLabel: return 'Infrastructure'
    if 'presidential system' in classLabel: return 'Infrastructure'

    if 'quartier' in classLabel: return 'Section'
    if 'neighborhood' in classLabel: return 'Section'
    if 'frazione' in classLabel: return 'Section'
    if 'street' in classLabel: return 'Section'

    if 'region' in classLabel: return 'Region'
    if 'Region' in classLabel: return 'Region'
    if 'area' in classLabel: return 'Region'
    if 'community' in classLabel: return 'Region'
    if 'ancient civilization' in classLabel: return 'Region'
    if 'campagne' in classLabel: return 'Region'
    if 'historic site' in classLabel: return 'Region'
    if 'campagne' in classLabel: return 'Region'
    if 'electoral unit' in classLabel: return 'Region'
    if 'geographic location' in classLabel: return 'Region'
    if 'ruins' in classLabel: return 'Region'

    if 'island' in classLabel: return 'Natural element'
    if 'hill' in classLabel: return 'Natural element'
    if 'mountain' in classLabel: return 'Natural element'
    if 'valley' in classLabel: return 'Natural element'


    # Manual kind assignings
    if row['name'] == 'Altona': return 'Settlement'
    if row['name'] == 'Elberfeld': return 'Settlement'
    if row['name'] == 'Rauden': return 'Settlement'
    if row['name'] == 'Shanghai French Concession': return 'Infrastructure'
    if row['name'] == 'Val Bregaglia': return 'Natural element'
    if row['name'] == 'Podgórze': return 'Settlement'
    if row['name'] == 'Auvergne': return 'Region'
    if row['name'] == 'Samiano': return 'Settlement'
    if row['name'] == 'United States of America': return 'Legal territory'
    if row['name'] == 'Val Fex': return 'Natural element'
    if row['name'] == 'Goddelau': return 'Settlement'
    if row['name'] == 'Weilheim': return 'Settlement'
    if row['name'] == 'Sonnborn': return 'Settlement'
    if row['name'] == 'Parish of St Gertrud of Germany': return 'Legal territory'



    return pd.NA

wd_places['kind'] = [parse_kind(row) for i, row in wd_places.iterrows()]

# For each place if it has at least one Settlement kind, we take it as a Settlement
for i, row in wd_places.iterrows():
    kinds = wd_places[wd_places['uri'] == row['uri']]['kind'].dropna().tolist()
    if 'Settlement' in kinds: wd_places.at[i, 'kind'] = 'Settlement'
    
wd_places = wd_places[['name', 'definition', 'lat', 'lng', 'uri', 'place_classLabel', 'kind']].drop_duplicates()


a.infos(wd_places, random=True)

# Left to do
if len(a.group_by_count(wd_places[pd.isna(wd_places['kind'])], 'place_classLabel')) != 0:
    raise Exception('There is places from wikidata with missing Kinds')

Shape:  (5959, 7) - extract:


Unnamed: 0,name,definition,lat,lng,uri,place_classLabel,kind
27795,Kaliningrad,Russian Baltic city between Poland and Lithuania,54.716667,20.5,http://www.wikidata.org/entity/Q1829,big city,Settlement
6183,Untervaz,municipality in Switzerland,46.9275,9.535,http://www.wikidata.org/entity/Q69095,municipality of Switzerland,Settlement
28214,Frankfurt (Oder),"city in Brandenburg, Germany",52.342083,14.551667,http://www.wikidata.org/entity/Q4024,border town,Settlement
38033,Schwabach,"town in Bavaria, Germany",49.329167,11.020833,http://www.wikidata.org/entity/Q14889,urban district of Bavaria,Settlement
273,Näfels,village and former municipality in Glarus Nord...,47.098889,9.062778,http://www.wikidata.org/entity/Q182806,village,Settlement


# Get GV Geographical places

In [4]:
db.connect_geovistory(env, pk_project, execute)
db.set_metadata({'import-id': datetime.datetime.today().strftime('%Y%m%d') + '-' + metadata_str})
db.set_insert_manner(import_manner)

[DB] Connecting to PRODUCTION Database ... Connected!


In [5]:
gv_places = db.query(f"""
    select distinct
        r0.pk_entity as pk_geoplace
    from information.resource r0
    inner join projects.info_proj_rel ipr0 on ipr0.fk_entity = r0.pk_entity and ipr0.is_in_project = true
    where r0.fk_class = {pks.classes.geoPlace}     
""")

gv_kind = db.query(f"""
    select distinct
        r0.pk_entity as pk_geoplace,
        s1.fk_object_info as pk_kind,
        s1.pk_entity as pk_stmt_haskind
    from information.resource r0
    inner join information.statement s1 on s1.fk_subject_info = r0.pk_entity and s1.fk_property = {pks.properties.geoPlace_hasKind_geoPlaceKind}
    where r0.fk_class = {pks.classes.geoPlace}     
""")
gv_kind['kind'] = [pks.entities.get_kind_label(pk_kind) for pk_kind in gv_kind['pk_kind']]
gv_kind['to_add_kind'] = [[row['pk_stmt_haskind']] for i, row in gv_kind.iterrows()]
gv_kind.drop(columns=['pk_kind', 'pk_stmt_haskind'], inplace=True)


gv_places_name = db.query(f"""
    select distinct
        r0.pk_entity as pk_geoplace,
        a3.string as name,
        s1.pk_entity as pk_stmt_isaialof,
        s1.fk_subject_info as pk_aial,
        s2.pk_entity as pk_stmt_referstoname
    from information.resource r0
    inner join information.statement s1 on s1.fk_object_info = r0.pk_entity and s1.fk_property = {pks.properties.aial_isAppelationForLanguageOf_entity}
    inner join information.statement s2 on s2.fk_subject_info = s1.fk_subject_info and s2.fk_property = {pks.properties.aial_refersToName_appellation}
    inner join information.appellation a3 on a3.pk_entity = s2.fk_object_info
    where r0.fk_class = {pks.classes.geoPlace}     
""")
gv_places_name['to_add_name'] = [[row['pk_stmt_isaialof'], row['pk_aial'], row['pk_stmt_referstoname']] for i, row in gv_places_name.iterrows()]
gv_places_name.drop(columns=['pk_stmt_isaialof', 'pk_aial', 'pk_stmt_referstoname'], inplace=True)


gv_places_coordinates = db.query(f"""
    select distinct
        r0.pk_entity as pk_geoplace,
        st_y(p5.geo_point::geometry) as lat, st_x(p5.geo_point::geometry) as lng,
        s3.pk_entity as pk_stmt_waspresenceof,
        s3.fk_subject_info as pk_presence,
        s4.pk_entity as pk_stmt_wasat      
    from information.resource r0
    inner join information.statement s3 on s3.fk_object_info = r0.pk_entity and s3.fk_property = {pks.properties.presence_wasPresenceOf_spacetimeVolume}
    inner join information.statement s4 on s4.fk_subject_info = s3.fk_subject_info and s4.fk_property = {pks.properties.presence_wasAt_place}
    inner join information.place p5 on p5.pk_entity = s4.fk_object_info 
    where r0.fk_class = {pks.classes.geoPlace}                     
""")
gv_places_coordinates['to_add_coord'] = [[row['pk_stmt_waspresenceof'], row['pk_presence'], row['pk_stmt_wasat']] for i, row in gv_places_coordinates.iterrows()]
gv_places_coordinates.drop(columns=['pk_stmt_waspresenceof', 'pk_presence', 'pk_stmt_wasat'], inplace=True)


gv_places_uris = db.query(f"""
    select distinct
        r0.pk_entity as pk_geoplace,
        a8.string as uri,
        s6.pk_entity as pk_stmt_sameas,
        s6.fk_object_info as pk_uri,
        s7.pk_entity as pk_stmt_hasvalue      
    from information.resource r0
    inner join information.statement s6 on s6.fk_subject_info = r0.pk_entity and s6.fk_property = {pks.properties.entity_sameAsURI_URI}
    inner join information.statement s7 on s7.fk_subject_info = s6.fk_object_info and s7.fk_property = {pks.properties.appe_hasValue_string}
    inner join information.appellation a8 on a8.pk_entity = s7.fk_object_info
    where r0.fk_class = {pks.classes.geoPlace}                      
""")
gv_places_uris['to_add_uri'] = [[row['pk_stmt_sameas'], row['pk_uri'], row['pk_stmt_hasvalue']] for i, row in gv_places_uris.iterrows()]
gv_places_uris.drop(columns=['pk_stmt_sameas', 'pk_uri', 'pk_stmt_hasvalue'], inplace=True)


# We only take those who have a kind! ==> inner join
gv_places = gv_places \
                .merge(gv_kind, how='inner', on='pk_geoplace') \
                .merge(gv_places_name, how='left', on='pk_geoplace') \
                .merge(gv_places_coordinates, how='left', on='pk_geoplace') \
                .merge(gv_places_uris, how='left', on='pk_geoplace')

u.write_df(gv_places, './gv-places.csv')

gv_places = u.read_df('./gv-places.csv')
gv_places['pk_geoplace'] = gv_places['pk_geoplace'].astype(pd.Int64Dtype())
gv_places['lat'] = gv_places['lat'].astype(pd.Float64Dtype())
gv_places['lng'] = gv_places['lng'].astype(pd.Float64Dtype())

a.infos(gv_places)

Shape:  (135364, 10) - extract:


Unnamed: 0,pk_geoplace,kind,to_add_kind,name,to_add_name,lat,lng,to_add_coord,uri,to_add_uri
0,25458,Settlement,[739271],Limburg,"[25465, 25460, 25463]",50.398601,8.079578,"[149729.0, 149727.0, 149730.0]",,
1,25458,Settlement,[739271],Leimburg,"[985255, 985249, 985254]",50.398601,8.079578,"[149729.0, 149727.0, 149730.0]",,
2,25485,Settlement,[739272],Kyburg,"[25492, 25487, 25490]",7.438637,46.951081,"[300508.0, 300506.0, 300509.0]",,
3,25485,Settlement,[739272],Kyburg,"[25492, 25487, 25490]",47.458349,8.743733,"[300508.0, 300506.0, 300523.0]",,
4,25485,Settlement,[739272],Kyburg,"[25492, 25487, 25490]",39.611051,52.512393,"[300508.0, 300506.0, 300519.0]",,


# Place Record linkage

In [6]:
# Kind manual verifications
print(wd_places['kind'].unique().tolist())
print(gv_places['kind'].unique().tolist())

['Settlement', 'Legal territory', 'Region', 'Natural element', 'Infrastructure', 'Section']
['Settlement', <NA>, 'Legal territory', 'Region', 'Section', 'Address', 'Natural element', 'Infrastructure']


In [7]:
record_linkage = []

eta.begin(len(wd_places), 'Finding existing places')
for i, wd_place in wd_places.iterrows():

    # Only same kind (or undefined)
    selection = gv_places[pd.isna(gv_places['kind']) | pd.isna(wd_place['kind']) | (gv_places['kind'] == wd_place['kind'])]

    # Only take those around 5 (more or less, so max: diag of 10km squared: max ~14km) km around (since they already have the same kind, it makes sense)
    selection = selection[pd.isna(wd_place['lat']) | pd.isna(selection['lat'])  | (wd_place['lat'] - 0.05 < selection['lat'])]
    selection = selection[pd.isna(wd_place['lat']) | pd.isna(selection['lat'])  | (selection['lat'] < wd_place['lat'] + 0.05)]
    selection = selection[pd.isna(wd_place['lng']) | pd.isna(selection['lng'])  | (wd_place['lng'] - 0.05 < selection['lng'])]
    selection = selection[pd.isna(wd_place['lng']) | pd.isna(selection['lng'])  | (selection['lng'] < wd_place['lng'] + 0.05)]

    # Shorten selection
    selection = selection[['pk_geoplace', 'name', 'uri']].drop_duplicates()

    # Only those with the same name
    selection = selection[[pd.notna(wd_place['name']) and pd.notna(row['name']) and wd_place['name'] == row['name'] for _, row in selection.iterrows()]]

    # To avoid multipling duplicates, we only take the first who matches
    if len(selection) > 0:
        record_linkage.append({'wd_uri': wd_place['uri'], 'pk_gv': selection.iloc[0]['pk_geoplace']})

    eta.iter()
eta.end()

record_linkage = pd.DataFrame(data=record_linkage)
record_linkage.drop_duplicates(inplace=True)
u.write_df(record_linkage, './places-rl.csv')

a.infos(record_linkage)

# 151m

Finding existing places is done - Elapsed: [00h18m16s]                                                                       
Shape:  (910, 2) - extract:


Unnamed: 0,wd_uri,pk_gv
0,http://www.wikidata.org/entity/Q70,80681
9,http://www.wikidata.org/entity/Q72,25494
13,http://www.wikidata.org/entity/Q78,80974
19,http://www.wikidata.org/entity/Q1034,205648
21,http://www.wikidata.org/entity/Q1309,11134943


In [8]:
record_linkage = u.read_df('./places-rl.csv')
record_linkage['pk_gv'] = record_linkage['pk_gv'].astype(int)

In [9]:
data = wd_places.merge(record_linkage, left_on='uri', right_on='wd_uri', how='left').drop(columns=['wd_uri'])
data['pk_gv'] = data['pk_gv'].astype(pd.Int64Dtype())

a.infos(data)

Shape:  (5959, 8) - extract:


Unnamed: 0,name,definition,lat,lng,uri,place_classLabel,kind,pk_gv
0,Bern,"city in Switzerland, capital of the canton of ...",46.94798,7.44743,http://www.wikidata.org/entity/Q70,city,Settlement,80681
1,Bern,"city in Switzerland, capital of the canton of ...",46.94798,7.44743,http://www.wikidata.org/entity/Q70,capital city,Settlement,80681
2,Bern,"city in Switzerland, capital of the canton of ...",46.94798,7.44743,http://www.wikidata.org/entity/Q70,administrative division,Settlement,80681
3,Bern,"city in Switzerland, capital of the canton of ...",46.94798,7.44743,http://www.wikidata.org/entity/Q70,municipality of Switzerland,Settlement,80681
4,Bern,"city in Switzerland, capital of the canton of ...",46.94798,7.44743,http://www.wikidata.org/entity/Q70,federal city,Settlement,80681


# Find existing Geographical place and add them to the project

In [10]:
existing_places = data[pd.notna(data['pk_gv'])].copy()
existing_places.drop(columns=['place_classLabel'], inplace=True)
existing_places.drop_duplicates(inplace=True)

print('Adding entities...')
db.info_proj_rels.create(existing_places['pk_gv'])

# Names
print('Creating Names...')
selection = existing_places[['pk_gv', 'name']].drop_duplicates().dropna()
db.shortcuts.add_entity_names(selection['pk_gv'], selection['name'], pks.languages.english)

# Definition
print('Creating Definitions...')
selection = existing_places[['pk_gv', 'definition']].drop_duplicates().dropna()
db.shortcuts.add_definitions(selection['pk_gv'], selection['definition'], pks.languages.english)

# Presence
print('Creating Presence...')
selection = existing_places[['pk_gv', 'lat', 'lng']].drop_duplicates().dropna()
points = [(float(row['lat']), float(row['lng'])) for _, row in selection.iterrows()]
db.shortcuts.add_geo_coordinates(selection['pk_gv'], points)

# URI
print('Creating URIs...')
selection = existing_places[['pk_gv', 'uri']].drop_duplicates().dropna()
db.shortcuts.add_uris(selection['pk_gv'], selection['uri'])

Adding entities...
[DB] Creating info_proj_rel of 914 entities with project <153> ... Done in [00h00m01s]
Creating Names...
[DB] Creating 906 resources of class [365] ... Done in [00h00m00s]
[DB] Creating info_proj_rel of 906 entities with project <153> ... Done in [00h00m00s]
[DB] Creating 906 appellations ... Done in [00h00m00s]
[DB] Creating 906 statements ... Updating metadata ... Done in [00h00m01s]
[DB] Creating info_proj_rel of 906 entities with project <153> ... Done in [00h00m00s]
[DB] Creating 906 statements ... Updating metadata ... Done in [00h00m00s]
[DB] Creating info_proj_rel of 906 entities with project <153> ... Done in [00h00m00s]
[DB] Creating 906 statements ... Updating metadata ... Done in [00h00m01s]
[DB] Creating info_proj_rel of 906 entities with project <153> ... Done in [00h00m00s]
Creating Definitions...
[DB] Creating 910 resources of class [899] ... Done in [00h00m00s]
[DB] Creating info_proj_rel of 910 entities with project <153> ... Done in [00h00m01s]
[DB

# Create Geographical places

In [11]:
new_places = data[pd.isna(data['pk_gv'])].copy()
new_places.drop(columns=['pk_gv'], inplace=True)

# Entity
print('Creating entities...')
selection = new_places[['uri']].drop_duplicates().copy()
selection['pk_gv'] = db.resources.create(pks.classes.geoPlace, len(selection))
new_places = new_places.merge(selection, how='left')

# Names
print('Creating Names...')
selection = new_places[['pk_gv', 'name']].drop_duplicates().dropna()
db.shortcuts.add_entity_names(selection['pk_gv'], selection['name'], pks.languages.english)

# Definition
print('Creating Definitions...')
selection = new_places[['pk_gv', 'definition']].drop_duplicates().dropna()
db.shortcuts.add_definitions(selection['pk_gv'], selection['definition'], pks.languages.english)

# Presence
print('Creating Presence...')
selection = new_places[['pk_gv', 'lat', 'lng']].drop_duplicates().dropna()
points = [(float(row['lat']), float(row['lng'])) for _, row in selection.iterrows()]
db.shortcuts.add_geo_coordinates(selection['pk_gv'], points)

# URI
print('Creating URIs...')
selection = new_places[['pk_gv', 'uri']].drop_duplicates().dropna()
db.shortcuts.add_uris(selection['pk_gv'], selection['uri'])

Creating entities...
[DB] Creating 2409 resources of class [363] ... Done in [00h00m00s]
[DB] Creating info_proj_rel of 2409 entities with project <153> ... Done in [00h00m00s]
Creating Names...
[DB] Creating 2409 resources of class [365] ... Done in [00h00m01s]
[DB] Creating info_proj_rel of 2409 entities with project <153> ... Done in [00h00m00s]
[DB] Creating 2409 appellations ... Done in [00h00m01s]
[DB] Creating 2409 statements ... Updating metadata ... Done in [00h00m01s]
[DB] Creating info_proj_rel of 2409 entities with project <153> ... Done in [00h00m00s]
[DB] Creating 2409 statements ... Updating metadata ... Done in [00h00m01s]
[DB] Creating info_proj_rel of 2409 entities with project <153> ... Done in [00h00m01s]
[DB] Creating 2409 statements ... Updating metadata ... Done in [00h00m01s]
[DB] Creating info_proj_rel of 2409 entities with project <153> ... Done in [00h00m00s]
Creating Definitions...
[DB] Creating 2409 resources of class [899] ... Done in [00h00m01s]
[DB] Crea

In [12]:
places = pd.concat([existing_places, new_places])

# Link person to their Birth and Death

In [13]:
# First: get the hld_id <=> place table

births = births[['hls_id', 'uri_wikidata_place']].rename(columns={'uri_wikidata_place':'uri'})
births = births.merge(places, how='left', on='uri')
births = births[['hls_id', 'pk_gv']].rename(columns={'pk_gv':'pk_geoplace'})

deaths = deaths[['hls_id', 'uri_wikidata_place']].rename(columns={'uri_wikidata_place':'uri'})
deaths = deaths.merge(places, how='left', on='uri')
deaths = deaths[['hls_id', 'pk_gv']].rename(columns={'pk_gv':'pk_geoplace'})


# Then: Find the pk_person with this hls id

persons = db.query(f"""
    select 
        r0.pk_entity as pk_person,
        a3.string as uri,
        s3.fk_subject_info as pk_birth,
        s4.fk_subject_info as pk_death
    from information.resource r0
    inner join projects.info_proj_rel ipr0 on ipr0.fk_entity = r0.pk_entity and ipr0.fk_project = {pk_project} and ipr0.is_in_project = true
    inner join information.statement s1 on s1.fk_subject_info = r0.pk_entity and s1.fk_property = {pks.properties.entity_sameAsExternalIdentifier_identifier}
    inner join information.statement s2 on s2.fk_subject_info = s1.fk_object_info and s2.fk_property = {pks.properties.appe_hasValue_string}
    inner join information.appellation a3 on a3.pk_entity = s2.fk_object_info
    -- birth
    left join information.statement s3 on s3.fk_object_info = r0.pk_entity and s3.fk_property = {pks.properties.birth_broughtIntoLife_person}
    -- death
    left join information.statement s4 on s4.fk_object_info = r0.pk_entity and s4.fk_property = {pks.properties.death_wasDeathOf_person}
    where r0.fk_class = {pks.classes.person}
""")

persons = persons[persons['uri'].str.contains('https://hls')]
persons['hls_id'] = persons['uri'].str.replace('https://hls-dhs-dss.ch/articles/', '')
persons['hls_id'] = persons['hls_id'].str.replace('https://hls-dhs-dss.ch/de/articles/009053/2015-11-18/', '009053')
persons['hls_id'] = persons['hls_id'].astype(int)
persons['pk_birth'] = persons['pk_birth'].astype(pd.Int64Dtype())
persons['pk_death'] = persons['pk_death'].astype(pd.Int64Dtype())
persons.drop(columns=['uri'], inplace=True)

persons = persons.merge(births, how='left').rename(columns={'pk_geo_place':'pk_geoplace_birth'})
persons = persons.merge(deaths, how='left').rename(columns={'pk_geo_place':'pk_geoplace_death'})

persons.drop(columns=['pk_person', 'hls_id'], inplace=True)

a.infos(persons)

Shape:  (4074976, 3) - extract:


Unnamed: 0,pk_birth,pk_death,pk_geoplace
0,68429,7777816,80974
1,68429,7777816,80974
2,68429,7777816,80974
3,68429,7777816,80974
4,68429,7777816,80974


In [14]:
# Link births to their place
selection = persons[['pk_birth', 'pk_geoplace']].drop_duplicates().dropna()
db.statements.create(selection['pk_birth'], pks.properties.period_tookPlaceOnOrWithin_phyThing, selection['pk_geoplace'])

# Link deaths to their place
selection = persons[['pk_death', 'pk_geoplace']].drop_duplicates().dropna()
db.statements.create(selection['pk_death'], pks.properties.period_tookPlaceOnOrWithin_phyThing, selection['pk_geoplace'])

[DB] Creating 13469 statements ... Updating metadata ... Done in [00h00m06s]
[DB] Creating info_proj_rel of 13469 entities with project <153> ... Done in [00h00m02s]
[DB] Creating 13347 statements ... Updating metadata ... Done in [00h00m06s]
[DB] Creating info_proj_rel of 13347 entities with project <153> ... Done in [00h00m03s]


---