- Vérifier les matchings des geo places
- Vérifier l'algo sur les types des geo places
- Geo type "Département of France"?
- Vérifier l'ordre de gestion des types (['AdministrativeRegion', 'ArchitecturalStructure', 'Building',
       'Castle', 'City', 'Country', 'Czech_lands',
       'Departments_of_France', 'Gemeinde', 'Imperial city', 'Island',
       'Location', 'Mountain', 'MountainRange', 'MusicalArtist',
       'NaturalPlace', 'Organisation', 'Person', 'Place',
       'PopulatedPlace', 'Region', 'Settlement', 'SoccerClub', 'Stadt',
       'Town', 'Village', 'arr', 'line'])
- Mettre à jour les places géographiques existantes (manuellement?)

# Existing births should have a place

For more info, see the related ticket: https://github.com/geovistory/switzerland-and-beyond/issues/1

In [1]:
# Run configuration, handed to `db.connect` at the end of this cell.
# `pk_project` is also used further down to build the SPARQL endpoint URL.
# With `execute = False` the connection reports "Requests will not be executed".
env = 'staging'
pk_project = 153
execute = False

import pandas as pd

import geovpylib.database as db
import geovpylib.utils as u
import geovpylib.sparql as sparql
import geovpylib.pks as pks
import geovpylib.graphs as graphs
import geovpylib.find as find

# Connect to the database with the configuration above
db.connect(env, pk_project, execute)

Requests will not be executed
=== Setting STAGING environment ===
>> Connecting to PGSQL Database ... Connected!


## Fetch data

These data describe persons that already exist in Geovistory. The goal is to enrich them by adding a birth place (with geo coordinates when available) and URIs.

In [2]:
# Point the SPARQL helper at the endpoint of the configured project
sparql.init(f"https://sparql.geovistory.org/api_v1_project_{pk_project}")

# Federated query: the outer part selects resources of class ontome:c21 that have an
# owl:sameAs link, together with the resource pointing at them through ontome:p86
# (bound to ?pk_birth). The SERVICE part asks DBpedia for the birth place of that
# owl:sameAs resource, its types, its coordinates and its Wikidata owl:sameAs URI.
# NOTE(review): geo:lat / geo:long are not optional, so birth places without
# coordinates are not returned; likewise the FILTER on ?birthPlace_type2 should
# discard rows where that optional type is unbound - confirm this is intended.
data = sparql.query("""
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX ontome: <https://ontome.net/ontology/>
    PREFIX dbo: <http://dbpedia.org/ontology/>
    PREFIX dbp: <http://dbpedia.org/property/>
    PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>


    SELECT 
        ?pk_person ?dbpedia_person_uri ?wikidata_person_uri ?pk_birth ?birthPlace ?birthPlace_type1 ?birthPlace_type2 ?lat ?lng
    WHERE {
        ?pk_person a ontome:c21 .   
        ?pk_person owl:sameAs ?dbpedia_person_uri .
        ?pk_birth ontome:p86 ?pk_person
        { 
            SERVICE <https://dbpedia.org/sparql/> { 
                ?dbpedia_person_uri dbo:birthPlace ?birthPlace .  
                optional {?birthPlace dbp:type ?birthPlace_type1 .}
                optional {?birthPlace rdf:type ?birthPlace_type2 .}
                ?birthPlace geo:lat ?lat .
                ?birthPlace geo:long ?lng .
      			?dbpedia_person_uri owl:sameAs ?wikidata_person_uri .
                FILTER (CONTAINS(STR(?birthPlace_type2), 'http://dbpedia.org/ontology/'))
    			FILTER (CONTAINS(STR(?wikidata_person_uri), 'wikidata'))
            } 

        }
    }
""")
                    
# For URI-valued columns, keep only what follows the last '/'
def _after_last_slash(value, offset=1):
    """Return the end of `value`, starting `offset` characters after its last '/'."""
    return value[value.rindex('/') + offset:]


def _after_last_slash_if_uri(value, offset=1):
    """Like `_after_last_slash`, but missing or slash-free values are returned untouched."""
    if pd.isna(value) or '/' not in value:
        return value
    return _after_last_slash(value, offset)


# An offset of 2 also skips the first character after the slash
# (presumably a one-letter prefix in front of the numeric key - TODO confirm).
data['pk_person'] = [_after_last_slash(uri, 2) for uri in data['pk_person']]
data['birthPlace'] = [_after_last_slash(uri) for uri in data['birthPlace']]
for column, offset in (('birthPlace_type1', 1), ('birthPlace_type2', 1), ('pk_birth', 2)):
    data[column] = [_after_last_slash_if_uri(value, offset) for value in data[column]]


# Aggregate the birth place types of each (person, birth place) couple
def _join_distinct(values):
    """Join the distinct non-missing values with ', '."""
    return ', '.join(values.dropna().drop_duplicates())


place_key = ['pk_person', 'birthPlace']
birthplaces_t1 = data.groupby(place_key)['birthPlace_type1'].apply(_join_distinct).reset_index()
birthplaces_t2 = data.groupby(place_key)['birthPlace_type2'].apply(_join_distinct).reset_index()
birthplaces_types = birthplaces_t1.merge(birthplaces_t2, on=place_key)

# When the first list is empty the concatenation starts with ', ': strip it
combined_types = birthplaces_types['birthPlace_type1'] + ', ' + birthplaces_types['birthPlace_type2']
birthplaces_types['birthplace_types'] = [
    label[2:] if label.startswith(', ') else label
    for label in combined_types
]

# Bring the aggregated types back onto every row
data = data.merge(birthplaces_types, on=place_key)

# Keep only the columns of interest, without exact duplicate rows
kept_columns = [
    'pk_person', 'dbpedia_person_uri', 'wikidata_person_uri', 'pk_birth',
    'birthPlace', 'birthplace_types', 'lat', 'lng',
]
data = data[kept_columns].drop_duplicates()

u.infos(data, random=True)

Shape:  (1799, 8) - extract:


Unnamed: 0,pk_person,dbpedia_person_uri,wikidata_person_uri,pk_birth,birthPlace,birthplace_types,lat,lng
3835,26268,http://dbpedia.org/resource/Claude_Bouhier_de_...,http://www.wikidata.org/entity/Q1096539,68364,Dijon,"Location, City, Place, Settlement, PopulatedPlace",47.3167,5.01667
6419,27368,http://dbpedia.org/resource/Grete_Kellenberger...,http://www.wikidata.org/entity/Q21634516,68432,Zürich,"City, Location, PopulatedPlace, Place, Settlement",47.3744,8.54111
1288,26081,http://dbpedia.org/resource/Toni_Wiedemann,http://www.wikidata.org/entity/Q595416,69102,German_Empire,"Location, Country, Place, PopulatedPlace",52.5167,13.4
207,26229,http://dbpedia.org/resource/Jane_Frances_de_Ch...,http://www.wikidata.org/entity/Q234521,68372,France,"Country, Location, Place, PopulatedPlace",48.85,2.0
998,27132,http://dbpedia.org/resource/Rudolf_Wolf,http://www.wikidata.org/entity/Q115675,68448,Fällanden,"Location, Settlement, Place, PopulatedPlace",47.3667,8.63333


## Get information about geographical places

### List them

In [3]:
geo_places = (
    data[['birthPlace', 'birthplace_types', 'lat', 'lng']]
    .rename(columns={'birthPlace': 'name', 'birthplace_types': 'types'})
)


def _merge_type_lists(type_lists):
    """Union of the ', '-separated labels of all entries, sorted and joined with ','."""
    labels = {label for entry in type_lists for label in entry.split(', ')}
    return ','.join(sorted(labels))


# Aggregate every type known for a given geo place name
geo_places_types = geo_places.groupby('name')['types'].apply(_merge_type_lists).reset_index()

# Replace the per-row types by the aggregated ones
geo_places = geo_places.drop(columns=['types']).merge(geo_places_types, on='name')

# Identical rows are collapsed
geo_places = geo_places.drop_duplicates()

# Check whether each place is now described by a single row
unicity = geo_places['name'].is_unique
print(f'Unicity: {unicity}, shape with duplicates: {geo_places.shape}')

if not unicity:
    print('Remaining duplicates:')
    display(geo_places[geo_places['name'].duplicated(keep=False)])

Unicity: False, shape with duplicates: (486, 4)
Remaining duplicates:


Unnamed: 0,name,lat,lng,types
424,Italy,41.9,12.4833,"Country,Location,Person,Place,PopulatedPlace"
425,Italy,41.9,12.0,"Country,Location,Person,Place,PopulatedPlace"
426,Italy,43.0,12.4833,"Country,Location,Person,Place,PopulatedPlace"
427,Italy,43.0,12.0,"Country,Location,Person,Place,PopulatedPlace"
655,France,47.0,2.35,"Country,Location,Place,PopulatedPlace"
656,France,47.0,2.0,"Country,Location,Place,PopulatedPlace"
657,France,48.85,2.35,"Country,Location,Place,PopulatedPlace"
658,France,48.85,2.0,"Country,Location,Place,PopulatedPlace"
1360,Austria,48.2,13.3333,"Country,Location,Person,Place,PopulatedPlace"
1361,Austria,48.2,16.35,"Country,Location,Person,Place,PopulatedPlace"


Now that the types have been aggregated, we observe that the remaining duplicates come from geographical places that have several pairs of geo coordinates.

So we just arbitrarily take one.

In [4]:
# Keep the first row (hence the first coordinates pair) found for each place name
geo_places = geo_places.drop_duplicates(subset='name')

u.infos(geo_places)

Shape:  (474, 4) - extract:


Unnamed: 0,name,lat,lng,types
0,Geneva,46.2017,6.14694,"Location,Place,PopulatedPlace,Settlement"
80,German_Empire,52.5167,13.4,"Country,Location,Place,PopulatedPlace"
98,Mulhouse,47.75,7.34,"City,Location,Place,PopulatedPlace,Settlement"
112,Brescia,45.5417,10.2167,"City,Location,Place,PopulatedPlace,Settlement"
116,Corteno_Golgi,46.1669,10.2444,"Location,Place,PopulatedPlace,Settlement,Village"


### Record linkage with existing geographical places inside Geovistory

In [5]:
# Prepare data
geo_places.reset_index(inplace=True)
geo_places['name'] = geo_places['name'].str.lower()
geo_places['name'] = geo_places['name'].str.replace('_', ' ')
geo_places['lat'] = geo_places['lat'].astype(float)
geo_places['lng'] = geo_places['lng'].astype(float)

# Record linkage
matches = find.find_geoplaces(geo_places, 'index')

# drop created columns
geo_places.drop(columns=['index'], inplace=True)

matches

Checking data integrity... Done
Find all geographical places in Geovistory... 14824 found.
Finding similar geographical places is done - Elapsed: [00h00'00]                   


Unnamed: 0,index,pk_gv,new_name,gv_name,new_lat,gv_lat,new_lng,gv_lng,distance
12,112,300222,brescia,brescia,45.5417,45.541553,10.2167,10.211802,0.4
11,112,300222,brescia,brescia,45.5417,45.538889,10.2167,10.220278,0.4
17,118,3184085,switzerland,switzerland,46.95,,7.45,,
18,414,1739219,varese,varese,45.8167,45.816667,8.83333,8.833333,0.0
6,501,80974,basel,basel,47.5547,47.5584,7.59056,7.5733,1.4
7,501,80974,basel,basel,47.5547,47.55814,7.59056,7.58769,0.4
19,646,3150818,niederweningen,niederweningen,47.5,47.51088,8.38333,8.406932,2.1
8,889,80681,bern,bern,46.9481,46.94809,7.4475,7.44744,0.0
9,918,25494,zürich,zürich,47.3744,47.366667,8.54111,8.55,1.1
20,918,1739255,zürich,zürich,47.3744,,8.54111,,


Reading through the previous table, we see that the matches are likely to be correct. 

So we assume they are.

In [None]:
# The record linkage result exposes the Geovistory key as `pk_gv` (there is no
# `pk_entity` column in it); it is renamed to `pk_entity` for the rest of the notebook.
# A name can be matched to several Geovistory places: only one is kept per name, the
# closest one when a distance is known, so that the later merge on `name` does not
# duplicate the geo place.
matches = (
    matches
    .sort_values('distance', na_position='last')
    .rename(columns={'new_name': 'name', 'pk_gv': 'pk_entity'})
    [['name', 'pk_entity']]
    .drop_duplicates(subset=['name'])
)

u.infos(matches)

In [None]:
# Left merge: places without a match keep a missing `pk_entity`, hence the nullable integer dtype
geo_places = (
    geo_places
    .merge(matches, how='left', on='name')
    .astype({'pk_entity': pd.Int64Dtype()})
)

Places that already exist in Geovistory can now be identified: they are the rows of the table that have a `pk_entity` value.

In [None]:
# Show the geo places table, now carrying the `pk_entity` of the matched places
u.infos(geo_places)

## Build the information we want to import about Geographical places

In [None]:
# DBpedia URI
geo_places['dbpedia_uri'] = 'http://dbpedia.org/resource/' + geo_places['name']

# Type
for i, place in geo_places.iterrows():
    if 'Country' in place['types']: geo_places.at[i, 'pk_type'] = pks.entities.pk_geo_place_country
    elif 'City' in place['types']: geo_places.at[i, 'pk_type'] = pks.entities.pk_geo_place_city
    elif 'Village' in place['types']: geo_places.at[i, 'pk_type'] = pks.entities.pk_geo_place_village
    elif 'Town' in place['types']: geo_places.at[i, 'pk_type'] = pks.entities.pk_geo_place_town
    elif 'Departments_of_France' in place['types']: geo_places.at[i, 'pk_type'] = pd.NA
    else: geo_places.at[i, 'pk_type'] = pd.NA
geo_places.drop(columns=['types'], inplace=True)


geo_places = geo_places[['pk_entity', 'name', 'dbpedia_uri', 'pk_type', 'lat', 'lng']]
u.infos(geo_places)

In [None]:
# Places with a `pk_entity` were matched during the record linkage; the others have to be created
already_known = geo_places['pk_entity'].notna()
to_create = geo_places[~already_known].copy()
to_update = geo_places[already_known].copy()

## Create new geographical places

In [None]:
# Create the entity
to_create['pk_entity'] = db.resources.create(pks.classes.geoPlace, len(to_create))

In [None]:
# Add names
selection = to_create[pd.notna(to_create['name'])]

graphs.add_names(
    selection['pk_entity'].tolist(),
    selection['name'].tolist(),
    pks.languages.english
)

In [None]:
# Add URIs
selection = pd.notna(to_create['dbpedia_uri'])

graphs.add_uris(
    selection['pk_entity'].tolist(),
    selection['dbpedia_uri'].tolist(),
)

In [None]:
# Create presences
selection = to_create[pd.notna(to_create['lat'])]
selection = selection[pd.notna(selection['lng'])]

graphs.add_geo_coordinates(
    selection['pk_entity'].tolist(),
    selection['lat'].astype(float).tolist(),
    lngs = selection['lng'].astype(float).tolist()
)

In [None]:
# Add types
selection = to_create[pd.notna(to_create['pk_type'])]

db.statements.create(
    selection['pk_entity'].tolist(),
    pks.properties.geoPlace_hasIdentifyingGeoPlaceType_geoPlaceType,
    selection['pk_type'].tolist()
)

In [None]:
# Update data so that it has the created pk_entity
to_create.rename(columns={'pk_entity':'pk_entity_created'}, inplace=True)
geo_places.merge(to_create[['pk_entity_created', 'name']], on='name')
geo_places['pk_entity'] = [row['pk_entity'] if pd.notna(row['pk_entity']) else row['pk_entity_created'] for _, row in geo_places.iterrows()]
geo_places.drop(columns=['pk_entity_created'], inplace=True)

## Update existing geographical places (manually?)

In [None]:
# Display the places that already exist in Geovistory (rows that had a `pk_entity`)
to_update

## Add data to persons (to their births)

In [None]:
# Prepare data
data['birthPlace'] = data['birthPlace'].str.lower()
data.drop(columns=['birthplace_types', 'lat', 'lng'], inplace=True)
geo_places = geo_places[['name', 'pk_entity']].rename(columns={'pk_entity':'pk_geo_place', 'name':'birthPlace'})

data = data.merge(geo_places, on='birthPlace', how='inner').drop(columns=['birthPlace'])
u.infos(data, random=True)

In [None]:
### Add DBpedia uris
selection = data[['pk_person', 'dbpedia_person_uri']].dropna()

graphs.add_uris(
    selection['pk_person'].tolist(), 
    selection['dbpedia_person_uri'].tolist()
)

In [None]:
### Add Wikidata uris
selection = data[['pk_person', 'wikidata_person_uri']].dropna()

graphs.add_uris(
    selection['pk_person'].tolist(), 
    selection['wikidata_person_uri'].tolist()
)

In [None]:
### Add birth place to births
selection = data[['pk_birth', 'pk_birth_place']].dropna()

db.statements.create(
    selection['pk_birth'].tolist(),
    pks.properties.period_tookPlaceOnOrWithin_phyThing,
    selection['pk_birth_place']
)