In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Run configuration -- everything a reader may want to tune lives here.
env: str = 'staging'    # environment name handed to db.connect() further down
pk_project: int = 153   # project key handed to db.connect() further down
debug: bool = True      # when True, icecream (`ic`) output is switched on

import pandas as pd
import duckdb
import threading
from icecream import ic

import geovpylib.utils as u
import geovpylib.database as db
import geovpylib.pks as pks
import geovpylib.sparql as sparql
import geovpylib.graphs as graphs
import geovpylib.find as find

# Helper object from geovpylib.utils.
# NOTE(review): presumably a progress/ETA tracker -- confirm against geovpylib.
eta = u.Eta()

# Prefix every icecream trace, and only emit traces when `debug` is set.
ic.configureOutput(prefix='debug| ')
if debug:
    ic.enable()
else:
    ic.disable()

# Open the database session for the configured environment and project.
# NOTE(review): execute=False looks like a dry-run switch -- confirm before
# expecting the inserts below to be persisted.
db.connect(env, pk_project, execute=False)

## HLS humans found in Wikidata

### Fetch data

In [None]:
# Fetch every Wikidata human (wdt:P31 wd:Q5) that carries an HLS identifier
# (wdt:P902), together with its label, description and -- when present --
# gender, birth date and death date.
sparql.init('https://query.wikidata.org/sparql')
wikidata_raw = sparql.query("""
    SELECT ?uri_hls ?uri_wikidataLabel ?uri_wikidata ?genderLabel ?birthdate ?deathdate ?description
    WHERE {
        ?uri_wikidata wdt:P902 ?uri_hls .
        ?uri_wikidata wdt:P31 wd:Q5 .
        optional { ?uri_wikidata wdt:P21 ?gender . }
        optional { ?uri_wikidata wdt:P569 ?birthdate . }
        optional { ?uri_wikidata wdt:P570 ?deathdate . }
        SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }

        SERVICE wikibase:label { 
            bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" . 
            ?uri_wikidata schema:description ?description .
        }
    }
""")

# Column order of the working frame used by every later cell.
wikidata_columns = ['name', 'description', 'uri_wikidata', 'uri_hls', 'gender', 'birthdate', 'deathdate']

# Build the working frame in one chain and finish with an explicit copy, so
# the column assignments below write to an independent frame instead of a
# slice of the query result (which raises SettingWithCopyWarning).
wikidata = (
    wikidata_raw
    .rename(columns={'uri_wikidataLabel': 'name', 'genderLabel': 'gender'})
    # The query returns the bare HLS identifier; turn it into the article URL.
    .assign(uri_hls=lambda frame: 'https://hls-dhs-dss.ch/articles/' + frame['uri_hls'])
    .loc[:, wikidata_columns]
    .copy()
)

# Capitalise the two gender labels that the insert step knows how to map.
wikidata['gender'] = wikidata['gender'].replace({'male': 'Male', 'female': 'Female'})

# Parse the raw date strings; later cells index the parsed value as
# [0] = year, [1] = month, [2] = day.
wikidata['birthdate'] = [u.parse_date(raw_date) for raw_date in wikidata['birthdate']]
wikidata['deathdate'] = [u.parse_date(raw_date) for raw_date in wikidata['deathdate']]

# NOTE(review): the query can return several rows for one person (e.g. more
# than one birth date statement); no de-duplication happens here -- confirm
# that duplicates are handled before persons are created.
u.infos(wikidata, random=True)

### Find existing persons in Geovistory

In [None]:
# Show the persons that find_persons_by_uri() returns for each kind of URI:
# first the Wikidata URIs, then the HLS article URIs.
for uri_column in ('uri_wikidata', 'uri_hls'):
    display(find.find_persons_by_uri(wikidata[uri_column]))

In [None]:
# Search for persons resembling each Wikidata row, keyed by the HLS URI.
# The result is expected to expose at least `uri_hls` and `pk_entity`.
similars = find.find_persons(wikidata, index_col_name='uri_hls', jobs=10)
u.infos(similars)

# One (uri_hls, pk_person) pair per distinct match.
# NOTE(review): if one uri_hls matches several pk_entity values, the left
# merge below yields several rows for that person -- confirm this is intended.
matches = (
    similars[['uri_hls', 'pk_entity']]
    .drop_duplicates()
    .rename(columns={'pk_entity': 'pk_person'})
)

# Drop any `pk_person` left over from a previous run of this cell first:
# otherwise a re-run adds a second `pk_person` column, after which
# wikidata['pk_person'] returns a DataFrame and the later cells break.
wikidata = (
    wikidata
    .drop(columns='pk_person', errors='ignore')
    .merge(matches, on='uri_hls', how='left')
)
u.infos(wikidata)

In [None]:
# Rows that were matched to an existing person get updated; the others are
# created from scratch.
has_match = wikidata['pk_person'].notna()

# Take explicit copies: a later cell writes a `pk_person` column into
# `to_create`, and writing into a boolean-mask slice of `wikidata` raises
# SettingWithCopyWarning.
to_update = wikidata[has_match].copy()
to_create = wikidata[~has_match].copy()

## Insert data

### Create new persons

In [None]:
# Create one new person resource per unmatched row and keep the returned keys
# in `pk_person`. assign() builds a fresh frame rather than writing into
# `to_create` (a slice of `wikidata`), which avoids SettingWithCopyWarning.
# NOTE(review): presumably every run of this cell creates a new batch of
# resources -- avoid re-running it on the same `to_create`.
to_create = to_create.assign(
    pk_person=db.resources.create(pks.classes.person, len(to_create))
)

#### Definitions

In [None]:
# Keep only the new persons that actually have a description.
selection = to_create.loc[:, ['pk_person', 'description']].dropna()

person_pks = selection['pk_person'].tolist()
descriptions = selection['description'].tolist()

# Attach each description to its person as a definition, tagged as English.
graphs.add_definitions(person_pks, descriptions, pks.languages.english)

#### URIs

In [None]:
# Wikidata: link each new person to its Wikidata URI (rows lacking either
# value are left out).
selection = to_create.loc[:, ['pk_person', 'uri_wikidata']].dropna()

person_pks = selection['pk_person'].tolist()
wikidata_uris = selection['uri_wikidata'].tolist()

graphs.add_uris(person_pks, wikidata_uris)

In [None]:
# HLS: link each new person to its HLS article URL (rows lacking either value
# are left out).
selection = to_create.loc[:, ['pk_person', 'uri_hls']].dropna()

person_pks = selection['pk_person'].tolist()
hls_uris = selection['uri_hls'].tolist()

# Execute
graphs.add_uris(person_pks, hls_uris)

#### Gender

In [None]:
# Prepare: gender labels that have a matching gender entity.
gender_pks = {
    'Male': pks.entities.pk_gender_male,
    'Female': pks.entities.pk_gender_female,
}

selection = to_create[['pk_person', 'gender']].dropna()

# Wikidata holds more gender values than these two. Any other label used to
# pass through untouched and end up as a raw string in the object position of
# the statement, so such rows are now skipped and reported instead.
is_mapped = selection['gender'].isin(list(gender_pks))
if not is_mapped.all():
    ic(selection.loc[~is_mapped, 'gender'].value_counts().to_dict())
selection = selection[is_mapped]

# Execute: one "person has gender" statement per remaining row. Plain lists
# are passed for both ends, as in the other statement-creating cells.
z = db.statements.create(
    selection['pk_person'].tolist(),
    pks.properties.person_hasGender_gender,
    [gender_pks[label] for label in selection['gender']]
)

#### Birthdate

In [None]:
# New persons that have a parsed birth date.
selection = to_create.loc[:, ['pk_person', 'birthdate']].dropna()

# Split each parsed date into its parts: index 0 = year, 1 = month, 2 = day.
years, months, days = (
    [date[part] for date in selection['birthdate']]
    for part in range(3)
)

# One birth resource and one time primitive per person, created in this order.
# NOTE(review): every time primitive is given the '1 day' duration argument,
# whatever the precision of the source date -- confirm this is intended.
pk_births = db.resources.create(pks.classes.birth, len(selection))
pk_time_prim = db.time_primitives.create(years, months, days, '1 day')

# birth --broughtIntoLife--> person
person_pks = selection['pk_person'].tolist()
z = db.statements.create(pk_births, pks.properties.birth_broughtIntoLife_person, person_pks)

# birth --atSomeTimeWithin--> time primitive
z = db.statements.create(pk_births, pks.properties.timeSpan_atSomeTimeWithin_timePrimitive, pk_time_prim)

#### Deathdate

In [None]:
# New persons that have a parsed death date.
selection = to_create.loc[:, ['pk_person', 'deathdate']].dropna()

# Split each parsed date into its parts: index 0 = year, 1 = month, 2 = day.
years, months, days = (
    [date[part] for date in selection['deathdate']]
    for part in range(3)
)

# One death resource and one time primitive per person, created in this order.
# NOTE(review): every time primitive is given the '1 day' duration argument,
# whatever the precision of the source date -- confirm this is intended.
pk_deaths = db.resources.create(pks.classes.death, len(selection))
pk_time_prim = db.time_primitives.create(years, months, days, '1 day')

# death --wasDeathOf--> person
person_pks = selection['pk_person'].tolist()
z = db.statements.create(pk_deaths, pks.properties.death_wasDeathOf_person, person_pks)

# death --atSomeTimeWithin--> time primitive
z = db.statements.create(pk_deaths, pks.properties.timeSpan_atSomeTimeWithin_timePrimitive, pk_time_prim)