In [2]:
# %load /home/gaetan/Desktop/geovpylib/templates/heading.py
%load_ext autoreload
%autoreload 2

# Common imports
import os
import pandas as pd, numpy as np
import datetime
#import json
import requests
#import duckdb
#import plotly.express as px

# Geovpylib library
import geovpylib.analysis as a
import geovpylib.database as db
import geovpylib.decorators as d
import geovpylib.magics
import geovpylib.pks as pks
import geovpylib.queries as q
import geovpylib.record_linkage as rl
import geovpylib.sparql as sparql
import geovpylib.utils as u
# Shared progress reporter: the long-running loops further down drive it through eta.begin / eta.iter / eta.print / eta.end
eta = u.Eta()

# Specific imports
# ...

# Global variables
# ...

# Connect to Geovistory database
# env = 'prod' # Database to query: "prod", "stag", "dev", "local"
# pk_project = pks.projects. # The project to query/insert: integer
# execute = False # Boolean flag that prevents statements from being executed directly against the database
# metadata_str = '' # kebab-lower-case or snake-lower-case. 
# import_manner = 'one-shot' # 'one-shot' or 'batch'
# db.connect_geovistory(env, pk_project, execute)
# db.set_metadata({'import-id': datetime.datetime.today().strftime('%Y%m%d') + '-' + metadata_str})
# db.set_insert_manner(import_manner)

# Connect to other database
# db_url_env_var_name = 'YELLOW-' # Name of an environment variable holding the Postgres database URL
# execute = False # Boolean flag that prevents statements from being executed directly against the database
# db.connect_external(os.getenv(db_url_env_var_name), execute=False)

# Connect to Wikidata SPARQL endpoint
# The sparql.query(...) calls in this notebook use Wikidata prefixes (wd:/wdt:) — presumably they are
# sent to the URL registered here; verify in geovpylib.sparql
sparql.connect_external('https://query.wikidata.org/sparql')

>> External SPARQL URL set to <https://query.wikidata.org/sparql>


# Import HLS data

## 1/ Harvest data

### 1.1/ Wikidata

In [None]:
# Harvest from Wikidata every human (P31 = Q5) that carries an HLS identifier (P902), with the
# optional gender (P21), birth date/place (P569/P19), death date/place (P570/P20) and description.
hls_persons_raw = sparql.query("""
    SELECT ?hls_id ?uri_wikidataLabel ?uri_wikidata ?genderLabel ?birthdate ?birthplace ?deathdate ?deathplace ?description
    WHERE {
        ?uri_wikidata wdt:P31 wd:Q5 .
        ?uri_wikidata wdt:P902 ?hls_id .
        optional { ?uri_wikidata wdt:P21 ?gender . }
        optional { ?uri_wikidata wdt:P569 ?birthdate . }
        optional { ?uri_wikidata wdt:P19 ?birthplace . }
        optional { ?uri_wikidata wdt:P570 ?deathdate . }
        optional { ?uri_wikidata wdt:P20 ?deathplace . }
        SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }

        SERVICE wikibase:label { 
            bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" . 
            ?uri_wikidata schema:description ?description .
        }
    }
""")

hls_persons = hls_persons_raw.rename(columns={'uri_wikidataLabel': 'name', 'genderLabel': 'gender'})

# Both identifiers of the person (HLS article URL and Wikidata URI) in one ' ; '-separated string
hls_persons['uris'] = 'https://hls-dhs-dss.ch/articles/' + hls_persons['hls_id'] + ' ; ' + hls_persons['uri_wikidata']

# Explicit copy: the columns below are assigned on the selection, which would otherwise be a
# chained assignment on a slice (SettingWithCopyWarning)
hls_persons = hls_persons[['name', 'gender', 'description', 'birthdate', 'birthplace', 'deathdate', 'deathplace', 'hls_id', 'uris']].copy()

# Capitalise the two gender labels; any other label is left as returned by Wikidata
hls_persons['gender'] = hls_persons['gender'].replace({'male': 'Male', 'female': 'Female'})

# Convert the date literals with the geovpylib helper
hls_persons['birthdate'] = [u.parse_date(strdate) for strdate in hls_persons['birthdate']]
hls_persons['deathdate'] = [u.parse_date(strdate) for strdate in hls_persons['deathdate']]

a.infos(hls_persons, random=True)

# 44s

In [None]:
# Places of birth (P19) of every human with an HLS identifier, with their coordinates (P625) when available
hls_birthplaces = sparql.query("""
    SELECT ?place ?placeLabel ?coordplace
    WHERE {
        ?uri_wikidata wdt:P31 wd:Q5 .
        ?uri_wikidata wdt:P902 ?uri_hls .
        ?uri_wikidata wdt:P19 ?place .
        optional { ?place wdt:P625 ?coordplace . }
        SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
    }
""")
# a.infos(hls_birthplaces)

# Same query for the places of death (P20)
hls_deathplaces = sparql.query("""
    SELECT ?place ?placeLabel ?coordplace
    WHERE {
        ?uri_wikidata wdt:P31 wd:Q5 .
        ?uri_wikidata wdt:P902 ?uri_hls .
        ?uri_wikidata wdt:P20 ?place .
        optional { ?place wdt:P625 ?coordplace . }
        SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
    }
""")
# a.infos(hls_deathplaces)

# ignore_index: without it the concatenated frame keeps the labels of both inputs (0..n twice),
# so any later label-based access (.loc / .at) would address two rows at once
hls_places = pd.concat([hls_birthplaces, hls_deathplaces], ignore_index=True).drop_duplicates(ignore_index=True)

# Coordinates are parsed as 'Point(<first> <second>)': the first number is stored as lng, the second as lat.
# Each literal is split once; places without coordinates keep pd.NA in both columns.
lng_lat = [
    point.replace('Point(', '').replace(')', '').split(' ') if pd.notna(point) else None
    for point in hls_places['coordplace']
]
hls_places['lat'] = [parts[1] if parts is not None else pd.NA for parts in lng_lat]
hls_places['lng'] = [parts[0] if parts is not None else pd.NA for parts in lng_lat]
hls_places['lat'] = hls_places['lat'].astype(pd.Float64Dtype())
hls_places['lng'] = hls_places['lng'].astype(pd.Float64Dtype())

hls_places = hls_places.rename(columns={'place': 'uri', 'placeLabel': 'name'})
hls_places['kind'] = 'Settlement'
hls_places = hls_places[['name', 'kind', 'lat', 'lng', 'uri']]
a.infos(hls_places)

# 14s

### 1.2/ Fetch additional person URIs from `metagrid.ch`

In [None]:
def get_meta_grid_uris(name, clues, timeout=30):
    """Fetch the URIs that metagrid.ch groups together with a given entity.

    The metagrid search API is queried with `name`. Since several entities can share a name,
    the right concordance is the first one in which a single resource URI contains every clue.

    Args:
        name: Name of the entity, sent as the `query` parameter of the search.
        clues: Iterable of substrings that must all appear in one and the same resource URI
            (e.g. the provider domain and the identifier at that provider).
        timeout: Seconds to wait for the HTTP response. Without a timeout a stalled
            connection would block the caller forever.

    Returns:
        The list of all resource URIs of the first matching concordance, or an empty list
        when the response has no concordances or none of them matches.

    Raises:
        Whatever `requests.get` / `.json()` raise on network errors, timeouts or non-JSON bodies.
    """

    url = "https://api.metagrid.ch/search"
    response = requests.get(url, params={"group": 1, "query": name}, timeout=timeout).json()

    # A payload without (or with an empty/null) 'concordances' entry means "nothing found"
    concordances = response.get('concordances') if isinstance(response, dict) else None

    for concordance in concordances or []:
        # Tolerate concordances/resources that lack the expected keys instead of raising KeyError
        uris = [
            (resource.get('link') or {}).get('uri')
            for resource in concordance.get('resources') or []
        ]
        uris = [uri for uri in uris if uri is not None]

        # All clues have to be found in the same URI, not spread over several URIs
        if any(all(clue in uri for clue in clues) for uri in uris):
            return uris

    return []

# Test
# get_meta_grid_uris('Camille Guggenheim', ['hls-dhs-dss.ch', '043800'])

In [None]:
%%cache_it hls_persons

# hls_persons = wikidata.copy()
hls_persons['hls_id'] = hls_persons['hls_id'].astype(str)
hls_persons['same_as'] = pd.NA

eta.begin(len(hls_persons), 'Fetching additional URIs')
for i, row in hls_persons.iterrows():
    uris = get_meta_grid_uris(row['name'], ['hls-dhs-dss.ch', row['hls_id']])
    hls_persons.at[i, "same_as"] = ";".join(uris)
    eta.iter()
eta.end()

hls_persons.drop(columns=['uri_wikidata', 'uri_hls', 'hls_id'], inplace=True)

a.infos(hls_persons)

# 50min

### 1.3/ Geovistory data

In [None]:
# Connect to the 'prod' Geovistory database; the db.query(...) call of the next cell depends on it.
# NOTE(review): the template in the first cell calls db.connect_geovistory(env, pk_project, execute);
# only the environment is given here — confirm the defaults for the other two are the intended ones.
db.connect_geovistory('prod')

In [20]:
# Distinct (person, URI) pairs from Geovistory: resources of class `person` that have a
# "same as URI" statement whose object carries a string value (the URI itself).
# Each resource and statement must be flagged `is_in_project` in projects.info_proj_rel.
gv_persons_sql = f"""
    select distinct
        r0.pk_entity as pk_person,
        a3.string as uri
    from information.resource r0 
    inner join projects.info_proj_rel ipr0 on ipr0.fk_entity = r0.pk_entity and ipr0.is_in_project = true
    inner join information.statement s1 on s1.fk_subject_info = r0.pk_entity and s1.fk_property = {pks.properties.entity_sameAsURI_URI}
    inner join projects.info_proj_rel ipr1 on ipr1.fk_entity = s1.pk_entity and ipr1.is_in_project = true
    inner join information.statement s2 on s2.fk_subject_info = s1.fk_object_info and s2.fk_property = {pks.properties.appe_hasValue_string}
    inner join projects.info_proj_rel ipr2 on ipr2.fk_entity = s2.pk_entity and ipr2.is_in_project = true
    inner join information.appellation a3 on a3.pk_entity = s2.fk_object_info
    where r0.fk_class = {pks.classes.person}                   
"""
gv_persons = db.query(gv_persons_sql)

a.infos(gv_persons)

# 17s

In [None]:
# Preview of the Geovistory persons fetched above.
# (The original cell displayed `persons`, a name that is never defined in this notebook and
# therefore raises NameError on a fresh "Restart & Run All".)
gv_persons.head()

## 2/ Record linkage

### 2.1/ Persons

In [None]:
# Overview of the HLS persons already linked to a Geovistory person.
# Guarded because 'pk_gv' is only created by the record-linkage cell that follows: on a fresh
# kernel this cell runs first and would otherwise fail on the missing column.
if 'pk_gv' in hls_persons.columns:
    a.infos(hls_persons[hls_persons['pk_gv'].notna()])
else:
    print("Column 'pk_gv' not available yet: run the record-linkage cell below first.")

In [None]:
%%cache_it hls_persons

hls_persons['pk_gv'] = pd.NA

eta.begin(len(hls_persons), 'HLS/GV record linkage')
for i, row in hls_persons.iterrows():
    uris = row['same_as'].split(';')
    for uri in uris:
        selection = gv_persons[gv_persons['uri'] == uri]
        if len(selection) == 1:
            hls_persons.at[i, 'pk_gv'] = selection.iloc[0]['pk_person']
        if len(selection) > 1: 
            eta.print(f'Multiple matches found for uri {uri}')
    eta.iter()
eta.end()

### 2.2/ Geographical places

In [None]:
# NOTE(review): bare attribute lookup without a call — looks like a leftover of interactive
# exploration of geovpylib.queries. It only displays `q.find` (or raises AttributeError if the
# module has no such attribute); confirm and remove.
q.find

In [None]:
# Run geovpylib's record-linkage helper on the harvested places (columns: name, kind, lat, lng, uri).
# Presumably it looks for matching geographical places already in Geovistory — confirm in geovpylib.record_linkage.
# NOTE(review): the return value is only displayed, not stored; assign it if later steps need it.
rl.find_geo_places(hls_places)

## 3/ Work with the geographical places

In [None]:
# Distinct birth and death place URIs of the HLS persons (np.unique also sorts them)
place_uris = np.unique(
    hls_persons['birthplace'].dropna().tolist()
    + hls_persons['deathplace'].dropna().tolist()
)

# Keep the last path segment of each URI, and only those that start with 'Q'
geo_places = [
    qid
    for qid in (place_uri[place_uri.rindex('/') + 1:] for place_uri in place_uris)
    if qid.startswith('Q')
]

print('Geographical places number:', len(geo_places))

In [None]:
# Fetch label and coordinates (P625) of the geographical places from Wikidata.
# NOTE(review): only the first 100 ids of `geo_places` are queried (slice below) — confirm whether
# this is a deliberate sample or whether the remaining places still have to be fetched.
if len(geo_places) == 0:
    # An empty list would produce the malformed filter '?s = wd:' and a cryptic endpoint error
    raise ValueError('geo_places is empty: there is no place to look up on Wikidata')

filter_str = '?s = wd:' + ' || ?s = wd:'.join(geo_places[:100])
# filter_str = '?s = wd:Q48958 || ?s = wd:Q100123'

wikidata_geo_places = sparql.query("""
    SELECT ?s ?sLabel ?coord
    WHERE {
        ?s wdt:P625 ?coord .
                                   
        filter(""" + filter_str + """)

        SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
    }
""")
wikidata_geo_places = wikidata_geo_places.rename(columns={'sLabel': 'name'})

# Coordinates are parsed as 'Point(<first> <second>)': first number -> lng, second -> lat.
# Each literal is split once and stored as Float64, like the lat/lng columns of `hls_places`.
lng_lat = [point.replace('Point(', '').replace(')', '').split(' ') for point in wikidata_geo_places['coord']]
wikidata_geo_places['lat'] = pd.array([float(parts[1]) for parts in lng_lat], dtype=pd.Float64Dtype())
wikidata_geo_places['lng'] = pd.array([float(parts[0]) for parts in lng_lat], dtype=pd.Float64Dtype())
wikidata_geo_places = wikidata_geo_places.drop(columns=['coord'])

wikidata_geo_places