In [1]:
%load_ext autoreload
%autoreload 2

env = 'staging'
pk_project = 153
execute = False

import pandas as pd
import numpy as np
import duckdb

import geovpylib.analysis as a
import geovpylib.database as db
import geovpylib.graphs as graphs
import geovpylib.pks as pks
import geovpylib.recordlinkage as rl
import geovpylib.sparql as sparql
import geovpylib.utils as u

eta = u.Eta()

# Fetch all geographical places

## Existing persons birth & death places

From all the person already in the project, we fetch the information in dbpedia in order to have birth places and death places. 

We can have thoes information, because in Geovistory, there already is the dbpedia URI.

In [2]:
sparql.connect_geovistory(pk_project)

>> SPARQL endpoint of project 153 set.


In [3]:
data_existing = pd.concat([
    sparql.query("""
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX owl: <http://www.w3.org/2002/07/owl#>
        PREFIX ontome: <https://ontome.net/ontology/>
        PREFIX dbo: <http://dbpedia.org/ontology/>
        PREFIX dbp: <http://dbpedia.org/property/>
        PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>

        SELECT DISTINCT
            ?place ?type ?lat ?lng
        WHERE {
            ?pk_person a ontome:c21 .   
            ?pk_person owl:sameAs ?dbpedia_person_uri .
            { 
                SERVICE <https://dbpedia.org/sparql/> { 
                    ?dbpedia_person_uri dbo:birthPlace ?place .  
                    optional {?place rdf:type ?type .}
                    ?place geo:lat ?lat .
                    ?place geo:long ?lng .
                    FILTER (CONTAINS(STR(?type2), 'http://dbpedia.org/ontology/'))
                } 
            }
        }
    """), 
    sparql.query("""
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX owl: <http://www.w3.org/2002/07/owl#>
        PREFIX ontome: <https://ontome.net/ontology/>
        PREFIX dbo: <http://dbpedia.org/ontology/>
        PREFIX dbp: <http://dbpedia.org/property/>
        PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>

        SELECT DISTINCT
            ?place ?type ?lat ?lng
        WHERE {
            ?pk_person a ontome:c21 .   
            ?pk_person owl:sameAs ?dbpedia_person_uri .
            { 
                SERVICE <https://dbpedia.org/sparql/> { 
                    ?dbpedia_person_uri dbo:deathPlace ?place .  
                    optional {?place rdf:type ?type .}
                    ?place geo:lat ?lat .
                    ?place geo:long ?lng .
                    FILTER (CONTAINS(STR(?type), 'http://dbpedia.org/ontology/'))
                } 
            }
        }
    """)
])

# Because we might have duplicated places that have been birth places and death places 
data_existing.drop_duplicates(inplace=True)

# Handle types
data_existing['type'] = data_existing['type'].str.replace('http://dbpedia.org/ontology/', '', regex=False)
types = data_existing.groupby('place')['type'].apply(lambda x: ', '.join(x.dropna().drop_duplicates())).reset_index().rename(columns={'type': 'types'})
data_existing['types'] = data_existing['types'].str.lower()
data_existing = data_existing.merge(types, on='place').drop(columns=['type'])
data_existing.drop_duplicates(inplace=True)

# Get the name
data_existing['name'] = data_existing['place'].str.replace('http://dbpedia.org/resource/', '', regex=False).str.replace('_', ' ')

# Rename columns
data_existing.rename(columns={'place':'uri'}, inplace=True)

# Column reordering 
data_existing = data_existing[['name', 'lat', 'lng', 'types', 'uri']]

a.infos(data_existing)

KeyError: 'types'

### HLS birth and death places

In [None]:
sparql.connect_external('https://query.wikidata.org/sparql')

birthplaces = sparql.query("""
    SELECT ?place ?placeLabel ?typeLabel ?lat ?lng
    WHERE {
        ?uri_wikidata wdt:P902 ?uri_hls .
        ?uri_wikidata wdt:P31 wd:Q5 .
        ?uri_wikidata wdt:P19 ?place .
        ?place wdt:P31 ?type .
        ?place p:P625 ?coords .
        ?coords psv:P625 ?coordinate_node.
        ?coordinate_node wikibase:geoLatitude ?lat .
        ?coordinate_node wikibase:geoLongitude ?lng .
        SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en,fr,it". }
    }
""")

deathplaces = sparql.query("""
    SELECT ?place ?placeLabel ?typeLabel ?lat ?lng
    WHERE {
        ?uri_wikidata wdt:P902 ?uri_hls .
        ?uri_wikidata wdt:P31 wd:Q5 .
        ?uri_wikidata wdt:P20 ?place .
        ?place wdt:P31 ?type .
        ?place p:P625 ?coords .
        ?coords psv:P625 ?coordinate_node.
        ?coordinate_node wikibase:geoLatitude ?lat .
        ?coordinate_node wikibase:geoLongitude ?lng .
        SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en,fr,it". }
    }
""")

# Merge birth places and deah places                           
data_hls = pd.concat([birthplaces, deathplaces])
data_hls.drop_duplicates(inplace=True)

# Rename columns
data_hls.columns = ['uri', 'lat', 'lng', 'name', 'type']

# Aggregate types
data_hls['types'] = data_hls.groupby('uri')['type'].transform(lambda x: ', '.join(x))
data_hls['types'] = data_hls['types'].str.lower()
data_hls.drop(columns=['type'], inplace=True)
data_hls.drop_duplicates(subset=['uri'], inplace=True)

# Reorder columns
data_hls = data_hls[['name', 'uri', 'lat', 'lng', 'types']]
data_hls = u.parse_df(data_hls)

a.infos(data_hls)

### Merge both dataset

In [None]:
data = pd.concat([data_existing, data_hls])

a.infos(data, random=True)

---

In [None]:
# Type analysis
types_analysis = []

all_types = np.unique(', '.join(data['types'].tolist()).split(', '))
total_nb = len(data)
for type in all_types:
    nb = data['types'].str.contains(type).sum()
    types_analysis.append({'type': type, 'nb': nb, 'percent': u.percent(nb / total_nb)})

types_analysis = pd.DataFrame(data=types_analysis).sort_values('nb')
types_analysis

Hypothesis:
When those words are present in the types, we take the mentioned geo place type. We also take the priority order as it is displayed:
- When 'Village' then `pk_geo_place_type = 732859`
- When 'Town' then `pk_geo_place_type = 80412`
- When 'City' then `pk_geo_place_type = 80426`
- When 'Country' then `pk_geo_place_type = 919118`
- When 'Region' then `pk_geo_place_type = 3236783`

In [None]:
def get_type(types):
    if 'Village' in types: return 'Village'
    if 'Town' in types: return 'Town'
    if 'City' in types: return 'City'
    if 'Country' in types: return 'Country'
    if 'Region' in types: return 'Region'
    return pd.NA

def get_pk_type(types):
    if 'Village' in types: return 732859
    if 'Town' in types: return 80412
    if 'City' in types: return 80426
    if 'Country' in types: return 919118
    if 'Region' in types: return 3236783


data['type'] = [get_type(types) for types in data['types']]
data['pk_type'] = [get_pk_type(types) for types in data['types']]
data['pk_type'] = data['pk_type'].astype(pd.Int64Dtype())

a.infos(data)

There are still Geo place that could have been parsed:

In [None]:
data[pd.isna(data['type'])]

In [None]:
data = sparql.query("""
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX ontome: <https://ontome.net/ontology/>
    PREFIX dbo: <http://dbpedia.org/ontology/>
    PREFIX dbp: <http://dbpedia.org/property/>
    PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>

    SELECT 
        ?pk_person ?dbpedia_person_uri ?wikidata_person_uri ?pk_birth ?birthPlace ?birthPlace_type1 ?birthPlace_type2 ?lat ?lng
    WHERE {
        ?pk_person a ontome:c21 .   
        ?pk_person owl:sameAs ?dbpedia_person_uri .
        ?pk_birth ontome:p86 ?pk_person
        { 
            SERVICE <https://dbpedia.org/sparql/> { 
                ?dbpedia_person_uri dbo:birthPlace ?birthPlace .  
                optional {?birthPlace dbp:type ?birthPlace_type1 .}
                optional {?birthPlace rdf:type ?birthPlace_type2 .}
                ?birthPlace geo:lat ?lat .
                ?birthPlace geo:long ?lng .
      			?dbpedia_person_uri owl:sameAs ?wikidata_person_uri .
                FILTER (CONTAINS(STR(?birthPlace_type2), 'http://dbpedia.org/ontology/'))
    			FILTER (CONTAINS(STR(?wikidata_person_uri), 'wikidata'))
            } 

        }
    }
""")
                    
# For some columns, remove the URI part, just keep the last part
data['pk_person'] = [text[text.rindex('/') + 2:] for text in data['pk_person']]
data['birthPlace'] = [text[text.rindex('/') + 1:] for text in data['birthPlace']]
data['birthPlace_type1'] = [text[text.rindex('/') + 1:] if pd.notna(text) and '/' in text else text for text in data['birthPlace_type1']]
data['birthPlace_type2'] = [text[text.rindex('/') + 1:] if pd.notna(text) and '/' in text else text for text in data['birthPlace_type2']]
data['pk_birth'] = [text[text.rindex('/') + 2:] if pd.notna(text) and '/' in text else text for text in data['pk_birth']]

# Aggregate birth place types
birthplaces_t1 = data.groupby(['pk_person', 'birthPlace'])['birthPlace_type1'].apply(lambda x: ', '.join(x.dropna().drop_duplicates())).reset_index()
birthplaces_t2 = data.groupby(['pk_person', 'birthPlace'])['birthPlace_type2'].apply(lambda x: ', '.join(x.dropna().drop_duplicates())).reset_index()
birthplaces_types = birthplaces_t1.merge(birthplaces_t2, on=['pk_person', 'birthPlace'])
birthplaces_types['birthplace_types'] =  birthplaces_types['birthPlace_type1'] + ', ' + birthplaces_types['birthPlace_type2']
birthplaces_types['birthplace_types'] = [text[2:] if text.startswith(', ') else text for text in birthplaces_types['birthplace_types']]

# Merge the types
data = data.merge(birthplaces_types, on=['pk_person', 'birthPlace'])

# Select only the column we are interested in
data = data[['pk_person', 'dbpedia_person_uri', 'wikidata_person_uri', 'pk_birth', 'birthPlace', 'birthplace_types', 'lat', 'lng']].drop_duplicates()

# Drop duplicates to only have one record for each [pk_person, birthPlace]
data.drop_duplicates(inplace=True)

a.infos(data, random=True)

In [None]:

a.infos(data, random=True)

## Parse geographical places

In [None]:
geo_places = data[['birthPlace', 'birthplace_types', 'lat', 'lng']]
geo_places.columns = ['name', 'types', 'lat', 'lng']

# We aggregate all types available for a geo place name
geo_places_types = geo_places.groupby('name')['types'].apply(lambda x: ','.join(sorted(list(dict.fromkeys((', '.join(x).split(', '))))))).reset_index()

# We can now go back to initial geo_place date, and fill with aggregated types
geo_places = geo_places.merge(geo_places_types, on='name').drop(columns=['types_x']).rename(columns={'types_y': 'types'})

# We want only one row for each geo place
geo_places.drop_duplicates(inplace=True)

# We make sure that we only have one row for each place
unicity = len(geo_places) == len(geo_places['name'].unique())
print(f'Unicity: {unicity}, shape with duplicates: {geo_places.shape}')

if not unicity:
    gb = geo_places.groupby('name').count().reset_index()
    names = gb[gb['types'] != 1]['name'].tolist()
    print('Remaining duplicates:')
    display(geo_places[geo_places['name'].isin(names)])

Now that filter has been done, we observe that there are duplicates because some geographical places have multiple geo coordinates couples. 

So we just arbitrarily take one.

In [None]:
geo_places.drop_duplicates(subset=['name'], inplace=True)

u.infos(geo_places)

In [None]:
# Prepare data
geo_places.reset_index(inplace=True)
geo_places['name'] = geo_places['name'].str.lower()
geo_places['name'] = geo_places['name'].str.replace('_', ' ')
geo_places['lat'] = geo_places['lat'].astype(float)
geo_places['lng'] = geo_places['lng'].astype(float)

# Record linkage
matches = find.find_geoplaces(geo_places, 'index')

matches.sort_values('new_name')

Questions: 
- Comment on résoud le RL?
- Pour les comparaisonss acceptée, si les coordonnées sont différentes, on ajoute des nouvelles ou on laisse les existantes?
- Si on ajoute, on ajoute au niveau de la présence ou on ajoute une nouvelle présence?