# Import HLS Groups

Here we are going to look at how groups are links to HLS persons on dbpedia, we are going to fetch those groups, link them with URIs, make a record linkage, and import them into Geovistory.

So first, we need to find properties on wikidata that describes "this person was in this group".

Here is the list that has been manually found:
- https://www.wikidata.org/wiki/Property:P39 (Position held) => https://www.wikidata.org/wiki/Q4164871 (Position) => https://www.wikidata.org/wiki/Property:P361 (Part of) => https://www.wikidata.org/wiki/Q43229 (Organization)

- https://www.wikidata.org/wiki/Property:P108 (Employer) => https://www.wikidata.org/wiki/Q43229 (Organization)

- https://www.wikidata.org/wiki/Property:P69 (Educated at) => https://www.wikidata.org/wiki/Q43229 (Organization)

- https://www.wikidata.org/wiki/Property:P463 (Member of) => https://www.wikidata.org/wiki/Q43229 (Organization)


In [4]:
# %load /home/gaetan/Desktop/geovpylib/templates/heading.py
%load_ext autoreload
%autoreload 2

# Common imports
import os
import pandas as pd, numpy as np
#import datetime
#import time
#import json
#import requests
#import duckdb
#import plotly.express as px
# from multiprocessing import Pool

# Geovpylib library
import geovpylib.analysis as a
import geovpylib.database as db
import geovpylib.decorators as d
import geovpylib.magics
import geovpylib.pks as pks
import geovpylib.queries as q
import geovpylib.record_linkage as rl
import geovpylib.sparql as sparql
import geovpylib.utils as u
eta = u.Eta()

# Specific imports
# ...

# Global variables
# ...

# Connect to Geovistory database read mode
# db.connect_geovistory('prod')

# Connect to Geovistory database for insert
# env = 'prod' # Database to query: "prod", "stag", "dev", "local"
# pk_project = pks.projects. # The project to query/insert: integer
# execute = False # Boolean to prevent to execute directly into databases
# metadata_str = '' # kebab-lower-case or snake-lower-case. 
# import_manner = 'one-shot' # 'one-shot' or 'batch'
# db.connect_geovistory(env, pk_project, execute)
# db.set_metadata({'import-id': datetime.datetime.today().strftime('%Y%m%d') + '-' + metadata_str})
# db.set_insert_manner(import_manner)

# Connect to other database
# db_url_env_var_name = 'YELLOW-' # Name of an environment variable holding the Postgres database URL
# execute = False # Boolean to prevent to execute directly into databases
# db.connect_external(os.getenv(db_url_env_var_name), execute=False)

# Connect to a SPARQL endpoint
sparql.connect_external(os.getenv('SPARQL_WIKIDATA_URL'))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
>> External SPARQL URL set to <https://query.wikidata.org/sparql>


# 1. Find information available for HLS persons on wikidata

In [5]:
hls_persons = sparql.query("""
    SELECT ?uri_person ?uri_personLabel ?uri_employer ?uri_employerLabel ?uri_institution ?uri_institutionLabel ?uri_organization ?uri_organizationLabel ?uri_organization2 ?uri_organization2Label
    WHERE {
        ?uri_person wdt:P31 wd:Q5 .
        ?uri_person wdt:P902 ?hls_id .
                           
        optional {
            ?uri_person wdt:P108 ?uri_employer .
            ?uri_employer wdt:P31 wd:Q43229 .
        }
                       
        optional {           
            ?uri_person wdt:P69 ?uri_institution .
            ?uri_institution wdt:P31 wd:Q43229 .
        }
                       
        optional {           
            ?uri_person wdt:P463 ?uri_organization .
            ?uri_organization wdt:P31 wd:Q43229 .
        }

        optional {
            ?uri_person wdt:P39 ?position .
            ?position wdt:P361 ?uri_organization2 .
            ?uri_organization2 wdt:P31 wd:Q43229 .
        }

        SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
    }
""")

hls_persons = hls_persons[['uri_person', 'uri_personLabel', 'uri_employer', 'uri_employerLabel', 'uri_institution', 'uri_institutionLabel', 'uri_organization', 'uri_organizationLabel']]
hls_persons.rename(columns={'uri_employerLabel': 'employer', 'uri_institutionLabel':'institution', 'uri_organizationLabel':'organization'}, inplace=True)
a.infos(hls_persons, random=True)

Shape:  (25377, 8) - extract:


Unnamed: 0,uri_person,uri_personLabel,uri_employer,employer,uri_institution,institution,uri_organization,organization
10736,http://www.wikidata.org/entity/Q3188607,Jules Gaudard,,,,,,
8147,http://www.wikidata.org/entity/Q1359332,Ernst Ludwig Lichtenhahn,,,,,,
2382,http://www.wikidata.org/entity/Q3372498,Paul de Pasquier de Franclieu,,,,,,
5103,http://www.wikidata.org/entity/Q99305394,Hans Hilfiker,,,,,,
8800,http://www.wikidata.org/entity/Q1559912,Götz Schultheiss unter dem Schopf,,,,,,


In [8]:
person_group_stmt = len(hls_persons.dropna(subset='uri_employer')) + len(hls_persons.dropna(subset='uri_institution')) + len(hls_persons.dropna(subset='uri_organization'))
print('Number of person <-> group statements:', person_group_stmt)

Number of person <-> group statements: 235


# 2. Extract groups information

In [9]:
groups = pd.concat([
    hls_persons[['uri_employer', 'employer']].dropna().rename(columns={'uri_employer':'uri','employer':'name'}).drop_duplicates(),
    hls_persons[['uri_institution', 'institution']].dropna().rename(columns={'uri_institution':'uri','institution':'name'}).drop_duplicates(),
    hls_persons[['uri_organization', 'organization']].dropna().rename(columns={'uri_organization':'uri','organization':'name'}).drop_duplicates()
]).drop_duplicates().reset_index(drop=True)

a.infos(groups)

Shape:  (92, 2) - extract:


Unnamed: 0,uri,name
0,http://www.wikidata.org/entity/Q672791,Pro Juventute
1,http://www.wikidata.org/entity/Q11524928,Q11524928
2,http://www.wikidata.org/entity/Q11524602,Tokyo Chamber of Commerce and Industry
3,http://www.wikidata.org/entity/Q994171,Imperial Theatre
4,http://www.wikidata.org/entity/Q477269,Schweizer Spende
