# Documentation

The goal of this notebook is to fetch all HLS persons (with birth and death) that exist on Wikidata: on Wikidata, there is a property ([`wdt:P902`](https://www.wikidata.org/wiki/Property_talk:P902)) that allow us to know if a person is in the HLS dataset, so we use this property to fetch all HLS persons from Wikidata SPARQL endpoint, make a record linkage across Geovistory existing persons **using the URIs, only**. To enhance the record linkage, we also take URIs from `metagrid.ch`, knowing that the HLS is also referenced there.

And finally, we import all the things found: persons, gender, definitions, their birth, their death and their URIs.

In [29]:
# %load /home/gaetan/Desktop/geovpylib/templates/heading.py
%load_ext autoreload
%autoreload 2

# Common imports
import pandas as pd
import datetime
import requests
from multiprocessing import Pool

# Geovpylib library
import geovpylib.analysis as a
import geovpylib.database as db
import geovpylib.magics
import geovpylib.pks as pks
import geovpylib.sparql as sparql
import geovpylib.utils as u
eta = u.Eta()

# Connect to Geovistory database
env = 'prod' 
pk_project = pks.projects.switzerland_and_beyond 
execute = True
metadata_str = 'import-hls-persons' 
import_manner = 'one-shot'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
# Workaround for vscode to recognize variables (otherwise, when using magics vscode does not recognize them)
hls_persons = pd.DataFrame()

# Import HLS persons

## 1./ Harvest data

### 1.1/ Wikidata

In [28]:
%%cache_it hls_persons_wikidata

sparql.connect_external('https://query.wikidata.org/sparql')

hls_persons_wikidata = sparql.query("""
    SELECT ?hls_id ?uri_wikidataLabel ?uri_wikidata ?genderLabel ?description ?birthdate ?deathdate
    WHERE {
        ?uri_wikidata wdt:P31 wd:Q5 .
        ?uri_wikidata wdt:P902 ?hls_id .
        optional { ?uri_wikidata wdt:P21 ?gender . }
        optional { ?uri_wikidata wdt:P569 ?birthdate . }
        optional { ?uri_wikidata wdt:P570 ?deathdate . }
        SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }

        SERVICE wikibase:label { 
            bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" . 
            ?uri_wikidata schema:description ?description .
        }
    }
""")
                                
hls_persons_wikidata.rename(columns={'uri_wikidataLabel': 'name', 'genderLabel': 'gender', 'description': 'definition'}, inplace=True)
hls_persons_wikidata['uri_hls'] = 'https://hls-dhs-dss.ch/articles/' + hls_persons_wikidata['hls_id']
hls_persons_wikidata = hls_persons_wikidata[['hls_id', 'name', 'gender', 'definition', 'birthdate', 'deathdate', 'uri_wikidata', 'uri_hls']]
hls_persons_wikidata['gender'] = hls_persons_wikidata['gender'].replace('male', 'Male')
hls_persons_wikidata['gender'] = hls_persons_wikidata['gender'].replace('female', 'Female')
hls_persons_wikidata['birthdate'] = [u.parse_date(date_str) for date_str in hls_persons_wikidata['birthdate']]
hls_persons_wikidata['deathdate'] = [u.parse_date(date_str) for date_str in hls_persons_wikidata['deathdate']]

a.infos(hls_persons_wikidata, random=True)
hls_persons = hls_persons_wikidata

# 36s

[CACHE] Creation at .
>> External SPARQL URL set to <https://query.wikidata.org/sparql>
Shape:  (25319, 8) - extract:


Unnamed: 0,hls_id,name,gender,definition,birthdate,deathdate,uri_wikidata,uri_hls
21447,14467,Jost Pfyffer,Male,(1531-1610),"(1531, 1, 1)","(1610, 3, 10)",http://www.wikidata.org/entity/Q94808367,https://hls-dhs-dss.ch/articles/014467
24490,49655,Konrad Waldkirch,Male,Swiss publisher,"(1549, 5, 15)","(1616, 1, 1)",http://www.wikidata.org/entity/Q18132220,https://hls-dhs-dss.ch/articles/049655
195,34206,Oscar Peer,Male,Swiss dramatist and playwright (1928-2013),"(1928, 4, 23)","(2013, 12, 22)",http://www.wikidata.org/entity/Q122057,https://hls-dhs-dss.ch/articles/034206
12300,25947,Johann Joachim Girtanner,Male,(1745-1800),"(1745, 5, 23)","(1800, 2, 20)",http://www.wikidata.org/entity/Q78062454,https://hls-dhs-dss.ch/articles/025947
13736,6203,Alois Bommer,Male,swiss politician (1913-1993),"(1913, 3, 3)","(1993, 2, 15)",http://www.wikidata.org/entity/Q99214354,https://hls-dhs-dss.ch/articles/006203


[CACHE] Cell has been executed, and result put in the cache


### 1.2/ metagrid.ch

In [30]:
def get_meta_grid_uris(args):
    """Given the name and some clues (to differentiate between entities that have the same name), fetch the other uris that metagrid has on an entity."""
    name, clues = args

    url = "https://api.metagrid.ch/search"
    response = requests.get(url, params={"group":1, "query":name}).json()

    if 'concordances' not in response: return pd.NA

    for concordance in response['concordances']:
        uris = []
        found = False
        for resource in concordance['resources']:
            uri = resource['link']['uri']
            clues_present = True
            for clue in clues:
                if uri.find(clue) == -1: clues_present = False
            if clues_present: found = True
            uris.append(uri)
        if found: return ' ; '.join(uris)

    return pd.NA

# Test
# get_meta_grid_uris(('Camille Guggenheim', ['hls-dhs-dss.ch', '043800']))

In [31]:
%%cache_it hls_persons_metagrid

hls_persons_metagrid = hls_persons

# Find all URIs listed in metagrid (with parallelization)
with Pool(13) as p:
    iterrable = [(row['name'], ['hls-dhs-dss.ch', str(row['hls_id'])]) for _,row in hls_persons_metagrid.iterrows()]
    hls_persons_metagrid['metagrid_uris'] = p.map(get_meta_grid_uris, iterrable)


# 6m

[CACHE] Existing at .


[CACHE] Cell has been executed, and result put in the cache


In [32]:
hls_persons = hls_persons_metagrid

hls_persons['uri_idref'] = pd.NA
hls_persons['uri_viaf'] = pd.NA

eta.begin(len(hls_persons), 'Finding idref and viaf uris')
for i, row in hls_persons.iterrows():
    if pd.isna(row['metagrid_uris']): continue
    uris = row['metagrid_uris'].split(' ; ')
    for uri in uris:
        if 'idref' in uri: hls_persons.at[i, 'uri_idref'] = uri
        if 'viaf' in uri: hls_persons.at[i, 'uri_viaf'] = uri
    eta.iter()
eta.end()

hls_persons.drop(columns=['metagrid_uris'], inplace=True)
hls_persons.drop_duplicates(subset=['hls_id', 'name', 'gender', 'definition', 'uri_wikidata', 'uri_hls', 'uri_idref', 'uri_viaf'], inplace=True)

a.infos(hls_persons)

Finding idref and viaf uris is done - Elapsed: [00h00m01s]                                                                         
Shape:  (23976, 10) - extract:


Unnamed: 0,hls_id,name,gender,definition,birthdate,deathdate,uri_wikidata,uri_hls,uri_idref,uri_viaf
0,28377,Anatoly Lunacharsky,Male,Russian Marxist revolutionary (1875-1933),"(1875, 11, 23)","(1933, 12, 26)",http://www.wikidata.org/entity/Q18809,https://hls-dhs-dss.ch/articles/028377,http://www.idref.fr/035498730/id,http://viaf.org/viaf/74003029
1,44295,Louis de Rougemont,Male,explorer with false claims (1847–1921),"(1847, 11, 12)","(1921, 6, 9)",http://www.wikidata.org/entity/Q20014,https://hls-dhs-dss.ch/articles/044295,http://www.idref.fr/243997825/id,http://viaf.org/viaf/62631835
2,28424,Élisée Reclus,Male,French geographer and writer,"(1830, 3, 15)","(1905, 7, 4)",http://www.wikidata.org/entity/Q20951,https://hls-dhs-dss.ch/articles/028424,http://www.idref.fr/027092321/id,http://viaf.org/viaf/71396743
3,11946,Hermann Hesse,Male,German writer (1877–1962),"(1877, 7, 2)","(1962, 8, 9)",http://www.wikidata.org/entity/Q25973,https://hls-dhs-dss.ch/articles/011946,http://www.idref.fr/028345614/id,http://viaf.org/viaf/41841418
4,12028,Paul Klee,Male,Swiss artist (1879-1940),"(1879, 12, 18)","(1940, 6, 29)",http://www.wikidata.org/entity/Q44007,https://hls-dhs-dss.ch/articles/012028,http://www.idref.fr/026950480/id,http://viaf.org/viaf/68931085


### 1.3/ Geovistory persons with URIs

In [33]:
db.connect_geovistory(env, pk_project, execute)
db.set_metadata({'import-id': datetime.datetime.today().strftime('%Y%m%d') + '-' + metadata_str})
db.set_insert_manner(import_manner)

[DB] Connecting to PRODUCTION Database ... Connected!


In [34]:
gv_persons = db.query(f"""
    select distinct
        r0.pk_entity as pk_person,
        a3.string as uri
    from information.resource r0
    inner join projects.info_proj_rel ipr0 on ipr0.fk_entity = r0.pk_entity and ipr0.is_in_project = true
    inner join information.statement s1 on s1.fk_subject_info = r0.pk_entity and s1.fk_property = {pks.properties.entity_sameAsURI_URI}
    inner join projects.info_proj_rel ipr1a on ipr1a.fk_entity = s1.pk_entity and ipr1a.is_in_project = true
    inner join projects.info_proj_rel ipr1b on ipr1b.fk_entity = s1.fk_object_info and ipr1b.is_in_project = true
    inner join information.statement s2 on s2.fk_subject_info = s1.fk_object_info and s2.fk_property = {pks.properties.appe_hasValue_string}
    inner join projects.info_proj_rel ipr2 on ipr2.fk_entity = s2.pk_entity and ipr2.is_in_project = true
    inner join information.appellation a3 on a3.pk_entity = s2.fk_object_info
    where r0.fk_class = {pks.classes.person}                   
""")

a.infos(gv_persons)

# 4s

Shape:  (91798, 2) - extract:


Unnamed: 0,pk_person,uri
0,25503,http://symogih.org/resource/Actr385
1,25892,http://dbpedia.org/resource/Quirino_Gasparini
2,25892,http://d-nb.info/gnd/123721644
3,25892,http://viaf.org/viaf/19958814
4,25892,http://www.wikidata.org/entity/Q973108


## 2./ Record linkage

## 2.1 The record linkage itself

In [35]:
%%cache_it record_linkage

record_linkage = []
eta.begin(len(hls_persons), 'Finding same persons')

for i, person in hls_persons.iterrows():
    selection = gv_persons[
            (gv_persons['uri'] == person['uri_wikidata']) |
            (gv_persons['uri'] == person['uri_hls']) |
            (gv_persons['uri'] == person['uri_idref']) |
            (gv_persons['uri'] == person['uri_viaf'])
        ]
    for _, gv_person in selection.iterrows():
        record_linkage.append({'hls_id': person['hls_id'], 'pk_person':gv_person['pk_person']})

    eta.iter()
eta.end()

record_linkage = pd.DataFrame(data=record_linkage).drop_duplicates()
a.infos(record_linkage)

# 4m54s

[CACHE] Existing at .
Finding same persons is done - Elapsed: [00h06m21s]                                                                                
Shape:  (650, 2) - extract:


Unnamed: 0,hls_id,pk_person
0,27786,26856
2,32001,26208
4,17075,26020
6,26027,785542
7,30003,26206


[CACHE] Cell has been executed, and result put in the cache


In [36]:
record_linkage = record_linkage

## 2.2. Create the report file, for validation

In [37]:
# Find namespaces

def uri_namespace(uri):
    """Get the namespace of the given URI."""

    temp = uri.replace('http://', '').replace('https://', '').replace('www.', '')
    return temp[0:temp.index('/')] if '/' in temp else temp

gv_persons['namespace'] = [uri_namespace(uri) for uri in gv_persons['uri']]
selected_namespaces = ['viaf.org', 'wikidata.org', 'idref.fr']

In [38]:
gv_persons_reworked = []

eta.begin(len(gv_persons.pk_person.unique()), 'Reworking Geovistory URIs')
for pk in gv_persons.pk_person.unique():
    selection = gv_persons[gv_persons['pk_person'] == pk]
    selection = selection[[uri_namespace(uri) in selected_namespaces for uri in selection['uri']]]

    if len(selection) == 0: 
        eta.iter()
        continue

    viaf = selection[selection['namespace'] == 'viaf.org']
    wikidata = selection[selection['namespace'] == 'wikidata.org']
    idref = selection[selection['namespace'] == 'idref.fr']

    gv_persons_reworked.append({
        "pk_person": pk,
        "gv_uri_viaf": viaf.iloc[0]['uri'] if len(viaf) != 0 else pd.NA,
        "gv_uri_wikidata": wikidata.iloc[0]['uri'] if len(wikidata) != 0 else pd.NA,
        "gv_uri_idref": idref.iloc[0]['uri'] if len(idref) != 0 else pd.NA
    })

    eta.iter()
eta.end()

gv_persons_reworked = pd.DataFrame(data=gv_persons_reworked)

Reworking Geovistory URIs is done - Elapsed: [00h00m39s]                                                                           


In [39]:
# Record linkage report
record_linkage_report = record_linkage.merge(hls_persons[['hls_id', 'name', 'definition', 'birthdate', 'uri_wikidata', 'uri_hls', 'uri_idref', 'uri_viaf']]).rename(columns={'uri_wikidata':'new_uri_wikidata', 'uri_hls':'uri_hls', 'uri_idref':'new_uri_idref','uri_viaf':'new_uri_viaf'})
record_linkage_report = record_linkage_report.merge(gv_persons_reworked)
record_linkage_report = record_linkage_report[['uri_hls', 'name', 'definition', 'birthdate', 'new_uri_wikidata', 'gv_uri_wikidata', 'new_uri_idref', 'gv_uri_idref', 'new_uri_viaf', 'gv_uri_viaf']]

u.write_df(record_linkage_report, 'hls-person-record-linkage.csv')

## 3./ Import

### 3.1/ Preparation

In [40]:
# List existing URIs for record linkage foundings
for i, row in record_linkage.iterrows():
    selection = gv_persons[gv_persons['pk_person'] == row['pk_person']]
    uris = ' '.join(selection['uri'].tolist())
    record_linkage.at[i, 'existing_uris'] = uris

In [41]:
# We need to mark those who already are in the switzerland and beyond project

pk_persons = record_linkage['pk_person'].unique()

already_in_project = db.query(f"""
    select
        ipr.fk_entity as pk_person
    from projects.info_proj_rel ipr
    where ipr.fk_entity in {u.get_sql_ready_str(pk_persons)} and ipr.fk_project = {pks.projects.switzerland_and_beyond}
""")['pk_person'].astype(int).tolist()

for i, row in record_linkage.iterrows():
    record_linkage.at[i, 'already_in_project'] = row['pk_person'] in already_in_project

In [42]:
# Prepare the working table for imports
table = hls_persons.copy().merge(record_linkage, how='left')
table['pk_person'] = table['pk_person'].astype(pd.Int64Dtype())
table['already_in_project'].fillna(False, inplace=True)

a.infos(table)

Shape:  (23980, 13) - extract:


Unnamed: 0,hls_id,name,gender,definition,birthdate,deathdate,uri_wikidata,uri_hls,uri_idref,uri_viaf,pk_person,existing_uris,already_in_project
0,28377,Anatoly Lunacharsky,Male,Russian Marxist revolutionary (1875-1933),"(1875, 11, 23)","(1933, 12, 26)",http://www.wikidata.org/entity/Q18809,https://hls-dhs-dss.ch/articles/028377,http://www.idref.fr/035498730/id,http://viaf.org/viaf/74003029,,,False
1,44295,Louis de Rougemont,Male,explorer with false claims (1847–1921),"(1847, 11, 12)","(1921, 6, 9)",http://www.wikidata.org/entity/Q20014,https://hls-dhs-dss.ch/articles/044295,http://www.idref.fr/243997825/id,http://viaf.org/viaf/62631835,,,False
2,28424,Élisée Reclus,Male,French geographer and writer,"(1830, 3, 15)","(1905, 7, 4)",http://www.wikidata.org/entity/Q20951,https://hls-dhs-dss.ch/articles/028424,http://www.idref.fr/027092321/id,http://viaf.org/viaf/71396743,,,False
3,11946,Hermann Hesse,Male,German writer (1877–1962),"(1877, 7, 2)","(1962, 8, 9)",http://www.wikidata.org/entity/Q25973,https://hls-dhs-dss.ch/articles/011946,http://www.idref.fr/028345614/id,http://viaf.org/viaf/41841418,,,False
4,12028,Paul Klee,Male,Swiss artist (1879-1940),"(1879, 12, 18)","(1940, 6, 29)",http://www.wikidata.org/entity/Q44007,https://hls-dhs-dss.ch/articles/012028,http://www.idref.fr/026950480/id,http://viaf.org/viaf/68931085,,,False


### 3.2/ Create/add to project persons

#### 3.2.1/ Add to project existing persons

Here we do not need to worry: we can simply add existing to project because `info_proj_rel` won't duplicate records.

So, since we will in this step only add the person to the project, it's fine.

In [43]:
selection = table[pd.notna(table['pk_person'])]

db.info_proj_rels.create(selection['pk_person'])

Creating info_proj_rel of 650 entities with project <153> ... Done in [00h00m00s]


#### 3.2.2/ Create new persons

In [44]:
selection = table[pd.isna(table['pk_person'])][['hls_id']].drop_duplicates().copy()

selection['pk_person'] = db.resources.create(pks.classes.person, len(selection))

table = table.merge(selection, on="hls_id", how='left')
table['pk_person'] = [row['pk_person_x'] if pd.notna(row['pk_person_x']) else row['pk_person_y'] for _, row in table.iterrows()]
table['pk_person'] = table['pk_person'].astype(int)
table.drop(columns=['pk_person_x', 'pk_person_y'], inplace=True)

a.infos(table, random=True)

[DB] Creating 23262 resources of class [21] ... Done in [00h00m04s]
Creating info_proj_rel of 23262 entities with project <153> ... Done in [00h00m08s]
Shape:  (23980, 13) - extract:


Unnamed: 0,hls_id,name,gender,definition,birthdate,deathdate,uri_wikidata,uri_hls,uri_idref,uri_viaf,existing_uris,already_in_project,pk_person
8576,42345,Walter Hauser,Male,American architect,"(1893, 1, 1)","(1959, 1, 1)",http://www.wikidata.org/entity/Q4017933,https://hls-dhs-dss.ch/articles/042345,http://www.idref.fr/084446412/id,http://viaf.org/viaf/80771507,,False,10323472
6202,7231,Antoine Saladin,Male,(1785-1865),"(1785, 10, 14)","(1865, 4, 4)",http://www.wikidata.org/entity/Q107645489,https://hls-dhs-dss.ch/articles/007231,,http://viaf.org/viaf/309601584,,False,10321185
6990,5606,Josef Anton Büchler,Male,(1891-1951),"(1891, 12, 22)","(1951, 12, 24)",http://www.wikidata.org/entity/Q96270042,https://hls-dhs-dss.ch/articles/005606,,http://viaf.org/viaf/309651290,,False,10321952
9073,12880,Berthold of Pfirt,Male,catholic bishop of Basel,"(1250, 1, 1)","(1262, 12, 15)",http://www.wikidata.org/entity/Q828141,https://hls-dhs-dss.ch/articles/012880,,,,False,10323953
12474,29538,Heinrich Remigius Sauerländer,Male,German publisher,"(1776, 12, 13)","(1847, 6, 2)",http://www.wikidata.org/entity/Q1598577,https://hls-dhs-dss.ch/articles/029538,http://www.idref.fr/08691717X/id,http://viaf.org/viaf/95372000,,False,10327267


### 3.3/ Add names to persons

Here we need to differentiate 2 cases: those who do not exists or are in other projects (easy, just add new names) and those who are in the project already, with names.

After looking at data, it seems that entities that are already in the project, already has the right names, so nothing to do with them.

So in the end, we just need to add names for those outside the project, or non existing persons.

In [45]:
selection = table[pd.notna(table['name'])]

# Filter out those already in the project
selection = selection[~selection['already_in_project']]

db.shortcuts.add_person_names(selection['pk_person'], selection['name'], pks.languages.english)

[DB] Creating 23490 resources of class [868] ... Done in [00h00m04s]
Creating info_proj_rel of 23490 entities with project <153> ... Done in [00h00m09s]
[DB] Creating 23490 appellations ... Done in [00h00m08s]
[DB] Creating 23490 statements ... Updating metadata ... Done in [00h00m19s]
Creating info_proj_rel of 23490 entities with project <153> ... Done in [00h00m09s]
[DB] Creating 23490 statements ... Updating metadata ... Done in [00h00m18s]
Creating info_proj_rel of 23490 entities with project <153> ... Done in [00h00m10s]
[DB] Creating 23490 statements ... Updating metadata ... Done in [00h00m18s]
Creating info_proj_rel of 23490 entities with project <153> ... Done in [00h00m09s]


### 3.4/ Add gender to persons

At first, we might think that we have the same problem as for names, but here, after looking at data, we see that no gender has been attributed to person for now. So we can just add them, independantly of if the person already exists, of if the person exists and is in the project

In [46]:
table['pk_gender'] = pd.NA
table['pk_gender'] = [pks.entities.pk_gender_male if pd.notna(row['gender']) and row['gender'] == 'Male' else row['pk_gender'] for _, row in table.iterrows()]
table['pk_gender'] = [pks.entities.pk_gender_female if pd.notna(row['gender']) and row['gender'] == 'Female' else row['pk_gender'] for _, row in table.iterrows()]

selection = table[pd.notna(table['pk_gender'])]

db.statements.create(selection['pk_person'], pks.properties.person_hasGender_gender, selection['pk_gender'])

[DB] Creating 23859 statements ... Updating metadata ... Done in [00h00m20s]
Creating info_proj_rel of 23859 entities with project <153> ... Done in [00h00m09s]


### 3.5/ Add definitions

Same explication than for gender

In [47]:
selection = table[pd.notna(table['definition'])]

db.shortcuts.add_definitions(selection['pk_person'], selection['definition'], pks.languages.english)

[DB] Creating 19482 resources of class [899] ... Done in [00h00m04s]
Creating info_proj_rel of 19482 entities with project <153> ... Done in [00h00m08s]
[DB] Creating 19482 appellations ... Done in [00h00m12s]
[DB] Creating 19482 statements ... Updating metadata ... Done in [00h00m17s]
Creating info_proj_rel of 19482 entities with project <153> ... Done in [00h00m08s]
[DB] Creating 19482 statements ... Updating metadata ... Done in [00h00m17s]
Creating info_proj_rel of 19482 entities with project <153> ... Done in [00h00m07s]
[DB] Creating 19482 statements ... Updating metadata ... Done in [00h00m19s]
Creating info_proj_rel of 19482 entities with project <153> ... Done in [00h00m08s]


### 3.6/ Add birth information

For entities that are in the project, we trust what is in the project

In [82]:
# table['birthdate'] = [u.parse_date(date_str) for date_str in table['birthdate']]

selection = table[pd.notna(table['birthdate'])]

# Filter out those already in the project
selection = selection[~selection['already_in_project']]

pk_births = db.resources.create(pks.classes.birth, len(selection))
pk_time_prim = db.time_primitives.create(selection['birthdate'])
db.statements.create(pk_births, pks.properties.birth_broughtIntoLife_person, selection['pk_person'])
db.statements.create(pk_births, pks.properties.timeSpan_atSomeTimeWithin_timePrimitive, pk_time_prim)

[DB] Creating 21807 resources of class [61] ... Done in [00h00m04s]
[DB] Creating info_proj_rel of 21807 entities with project <153> ... Done in [00h00m08s]
[DB] Creating 21807 time primitives ... Done in [00h00m03s]
[DB] Creating 21807 statements ... Updating metadata ... Done in [00h00m18s]
[DB] Creating info_proj_rel of 21807 entities with project <153> ... Done in [00h00m07s]
[DB] Creating 21807 statements ... Updating metadata ... Done in [00h00m18s]
[DB] Creating info_proj_rel of 21807 entities with project <153> ... Done in [00h00m08s]


### 3.7/ Add death information

There is no death information in the project, we can add them all

In [54]:
# table['deathdate'] = [u.parse_date(date_str) for date_str in table['deathdate']]

selection = table[pd.notna(table['deathdate'])]

pk_deaths = db.resources.create(pks.classes.death, len(selection))
pk_time_prim = db.time_primitives.create(selection['deathdate'])
db.statements.create(pk_deaths, pks.properties.death_wasDeathOf_person, selection['pk_person'])
db.statements.create(pk_deaths, pks.properties.timeSpan_atSomeTimeWithin_timePrimitive, pk_time_prim)

[DB] Creating 21596 time primitives ... Done in [00h00m04s]
[DB] Creating 21596 statements ... Updating metadata ... Done in [00h00m20s]
Creating info_proj_rel of 21596 entities with project <153> ... Done in [00h00m08s]
[DB] Creating 21596 statements ... Updating metadata ... Done in [00h00m19s]
Creating info_proj_rel of 21596 entities with project <153> ... Done in [00h00m08s]


### 3.8/ Add URIs

#### HLS

In [55]:
selection = table[pd.notna(table['uri_hls'])]

db.shortcuts.add_uris(selection['pk_person'], selection['uri_hls'])

[DB] Creating 23980 resources of class [967] ... Done in [00h00m04s]
Creating info_proj_rel of 23980 entities with project <153> ... Done in [00h00m09s]
[DB] Creating 23980 appellations ... Done in [00h00m22s]
[DB] Creating 23980 statements ... Updating metadata ... Done in [00h00m21s]
Creating info_proj_rel of 23980 entities with project <153> ... Done in [00h00m09s]
[DB] Creating 23980 statements ... Updating metadata ... Done in [00h00m21s]
Creating info_proj_rel of 23980 entities with project <153> ... Done in [00h00m10s]


#### Wikidata

In [66]:
selection = table[pd.notna(table['uri_wikidata'])]
selection = selection[[pd.isna(row['existing_uris']) or row['uri_wikidata'] not in row['existing_uris'] for i, row in selection.iterrows()]]

db.shortcuts.add_uris(selection['pk_person'], selection['uri_wikidata'])

[DB] Creating 23492 resources of class [967] ... Done in [00h00m04s]
Creating info_proj_rel of 23492 entities with project <153> ... Done in [00h00m09s]
[DB] Creating 23492 appellations ... Done in [00h00m23s]
[DB] Creating 23492 statements ... Updating metadata ... Done in [00h00m23s]
Creating info_proj_rel of 23492 entities with project <153> ... Done in [00h00m09s]
[DB] Creating 23492 statements ... Updating metadata ... Done in [00h00m19s]
Creating info_proj_rel of 23492 entities with project <153> ... Done in [00h00m10s]


#### idref

In [68]:
selection = table[pd.notna(table['uri_idref'])]
selection = selection[[pd.isna(row['existing_uris']) or row['uri_idref'] not in row['existing_uris'] for i, row in selection.iterrows()]]

db.shortcuts.add_uris(selection['pk_person'], selection['uri_idref'])

[DB] Creating 6178 resources of class [967] ... Done in [00h00m02s]
Creating info_proj_rel of 6178 entities with project <153> ... Done in [00h00m02s]
[DB] Creating 6178 appellations ... Done in [00h00m05s]
[DB] Creating 6178 statements ... Updating metadata ... Done in [00h00m06s]
Creating info_proj_rel of 6178 entities with project <153> ... Done in [00h00m02s]
[DB] Creating 6178 statements ... Updating metadata ... Done in [00h00m06s]
Creating info_proj_rel of 6178 entities with project <153> ... Done in [00h00m03s]


#### viaf

In [70]:
selection = table[pd.notna(table['uri_viaf'])]
selection = selection[[pd.isna(row['existing_uris']) or row['uri_viaf'] not in row['existing_uris'] for i, row in selection.iterrows()]]

db.shortcuts.add_uris(selection['pk_person'], selection['uri_viaf'])

[DB] Creating 17943 resources of class [967] ... Done in [00h00m03s]
Creating info_proj_rel of 17943 entities with project <153> ... Done in [00h00m07s]
[DB] Creating 17943 appellations ... Done in [00h00m12s]
[DB] Creating 17943 statements ... Updating metadata ... Done in [00h00m16s]
Creating info_proj_rel of 17943 entities with project <153> ... Done in [00h00m06s]
[DB] Creating 17943 statements ... Updating metadata ... Done in [00h00m15s]
Creating info_proj_rel of 17943 entities with project <153> ... Done in [00h00m07s]


In [60]:
table_save.to_csv('temp.csv')

In [83]:
table

Unnamed: 0,hls_id,name,gender,definition,deathdate,uri_wikidata,uri_hls,uri_idref,uri_viaf,existing_uris,already_in_project,pk_person,pk_gender,birthdate
0,028377,Anatoly Lunacharsky,Male,Russian Marxist revolutionary (1875-1933),"(1933, 12, 26)",http://www.wikidata.org/entity/Q18809,https://hls-dhs-dss.ch/articles/028377,http://www.idref.fr/035498730/id,http://viaf.org/viaf/74003029,,False,10315216,739340,"(1875, 11, 23)"
1,044295,Louis de Rougemont,Male,explorer with false claims (1847–1921),"(1921, 6, 9)",http://www.wikidata.org/entity/Q20014,https://hls-dhs-dss.ch/articles/044295,http://www.idref.fr/243997825/id,http://viaf.org/viaf/62631835,,False,10315217,739340,"(1847, 11, 12)"
2,028424,Élisée Reclus,Male,French geographer and writer,"(1905, 7, 4)",http://www.wikidata.org/entity/Q20951,https://hls-dhs-dss.ch/articles/028424,http://www.idref.fr/027092321/id,http://viaf.org/viaf/71396743,,False,10315218,739340,"(1830, 3, 15)"
3,011946,Hermann Hesse,Male,German writer (1877–1962),"(1962, 8, 9)",http://www.wikidata.org/entity/Q25973,https://hls-dhs-dss.ch/articles/011946,http://www.idref.fr/028345614/id,http://viaf.org/viaf/41841418,,False,10315219,739340,"(1877, 7, 2)"
4,012028,Paul Klee,Male,Swiss artist (1879-1940),"(1940, 6, 29)",http://www.wikidata.org/entity/Q44007,https://hls-dhs-dss.ch/articles/012028,http://www.idref.fr/026950480/id,http://viaf.org/viaf/68931085,,False,10315220,739340,"(1879, 12, 18)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24123,032730,Josef Erni,Male,Swiss politician (1827-1907),"(1907, 1, 13)",http://www.wikidata.org/entity/Q99305370,https://hls-dhs-dss.ch/articles/032730,,http://viaf.org/viaf/308709800,,False,10338473,739340,"(1827, 11, 18)"
24124,032824,Albert Morel,Male,Swiss politician (1828-1889),"(1889, 12, 23)",http://www.wikidata.org/entity/Q99305378,https://hls-dhs-dss.ch/articles/032824,,http://viaf.org/viaf/309586961,,False,10338474,739340,"(1828, 5, 24)"
24125,023377,Jean Louis Cannac d'Hauteville,Male,Genevan-French military officer (1740-1815),"(1815, 5, 29)",http://www.wikidata.org/entity/Q107000474,https://hls-dhs-dss.ch/articles/023377,,http://viaf.org/viaf/317092095,,False,10338475,739340,"(1740, 10, 20)"
24126,017533,Felix Brunner,Male,,"(1805, 9, 1)",http://www.wikidata.org/entity/Q107000418,https://hls-dhs-dss.ch/articles/017533,,,,False,10338476,739340,"(1729, 2, 9)"


In [85]:
table[table['already_in_project'] == True].iloc[7]['existing_uris']

'http://data.bnf.fr/ark:/12148/cb122991692#about http://dbpedia.org/resource/Johann_Bernoulli http://d-nb.info/gnd/118509969 http://d-nb.info/gnd/126044759 http://symogih.org/resource/Actr52755 http://viaf.org/viaf/51754496 http://wikidata.org/entity/Q227897 http://www.idref.fr/03185415X/id http://www.wikidata.org/entity/Q227897'