In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules are
# reloaded before each cell runs and edits to them are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# --- Run configuration: everything a reader may want to tune -----------------

# Target of the database connection opened further down in this cell.
env: str = 'staging'
pk_project: int = 153  # presumably the primary key of the target project - confirm

# Behaviour switches.
debug: bool = True     # True enables the icecream (`ic`) debug traces
execute: bool = False  # forwarded to db.connect; with False the connection
                       # reports that requests will not be executed

import pandas as pd
import duckdb
import threading
from icecream import ic

import geovpylib.utils as u
import geovpylib.database as db
import geovpylib.pks as pks
import geovpylib.sparql as sparql
import geovpylib.graphs as graphs
import geovpylib.find as find

# Helper object from the project utilities (presumably a progress / ETA
# tracker used by later cells - confirm).
eta = u.Eta()

# Prefix every icecream trace, then switch tracing on or off according to the
# `debug` flag set above.
ic.configureOutput(prefix='debug| ')
if debug:
    ic.enable()
else:
    ic.disable()

# Open the database connection for the configured environment and project;
# `execute` is passed through unchanged.
db.connect(env, pk_project, execute=execute)

Requests will not be executed
=== Setting STAGING environment ===
>> Connecting to PGSQL Database ... Connected!


# Record linkage of all HLS birth and death places

## Fetch data

In [28]:
sparql.init('https://query.wikidata.org/sparql')

# Query template shared by the birth-place and death-place requests: only the
# Wikidata property linking a person to a place differs between the two.
# It selects humans (P31 = Q5) carrying a P902 identifier, follows the given
# property to a place, and returns that place with its label, the labels of
# its types (P31) and the latitude / longitude of its P625 coordinates.
PLACES_QUERY = """
    SELECT ?place ?placeLabel ?typeLabel ?lat ?lng
    WHERE {
        ?uri_wikidata wdt:P902 ?uri_hls .
        ?uri_wikidata wdt:P31 wd:Q5 .
        ?uri_wikidata wdt:%(place_property)s ?place .
        ?place wdt:P31 ?type .
        ?place p:P625 ?coords .
        ?coords psv:P625 ?coordinate_node.
        ?coordinate_node wikibase:geoLatitude ?lat .
        ?coordinate_node wikibase:geoLongitude ?lng .
        SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en,fr,it". }
    }
"""


def fetch_places(place_property):
    """Run the places query for one person-to-place Wikidata property.

    Parameters
    ----------
    place_property : str
        Wikidata property id without prefix, e.g. 'P19' or 'P20'.

    Returns
    -------
    Whatever `sparql.query` returns for the filled-in template (a DataFrame,
    since the results are concatenated with `pd.concat` below).
    """
    return sparql.query(PLACES_QUERY % {'place_property': place_property})


def join_unique(labels):
    """Join labels with ',' keeping each distinct label once, in first-seen order.

    A place with several coordinate values yields one row per
    (coordinate, type) pair, so a plain join would repeat every type label.
    """
    return ','.join(dict.fromkeys(labels))


birthplaces = fetch_places('P19')
deathplaces = fetch_places('P20')

# Merge birth places and death places
places = pd.concat([birthplaces, deathplaces])

# Rename columns
# NOTE(review): this rename is positional and relies on the column order
# returned by `sparql.query` (uri, lat, lng, name, type), which is not the
# order of the SELECT clause. Confirm the returned column names and switch to
# a rename by name if they are stable.
places.columns = ['uri', 'lat', 'lng', 'name', 'type']

# Drop exact duplicates, aggregate the types of each place into one
# comma-separated string, then keep a single row (the first) per place.
places = (
    places
    .drop_duplicates()
    .assign(types=lambda df: df.groupby('uri')['type'].transform(join_unique))
    .drop(columns=['type'])
    .drop_duplicates(subset=['uri'])
)

# Reorder columns
places = places[['name', 'uri', 'lat', 'lng', 'types']]
# Project helper; presumably normalises column dtypes - confirm.
places = u.parse_df(places)

u.infos(places)

Shape:  (4375, 5) - extract:


Unnamed: 0,name,uri,lat,lng,types
0,Zürich,http://www.wikidata.org/entity/Q72,47.374444444444,8.5411111111111,"municipality of Switzerland,college town,canto..."
4,Basel,http://www.wikidata.org/entity/Q78,47.560555555556,7.5905555555556,"municipality of Switzerland,border town,colleg..."
10,Naples,http://www.wikidata.org/entity/Q2634,40.833333333333,14.25,"city,comune of Italy,big city"
13,Ulm,http://www.wikidata.org/entity/Q3012,48.39841,9.99155,"city,district capital,major regional center,co..."
21,Lucerne,http://www.wikidata.org/entity/Q4191,47.0523,8.3059,"municipality of Switzerland,cantonal capital o..."


## Record linkage

In [29]:
# Prepare `places` for matching: lower-case the names and expose a clean
# 0..n-1 row number as an explicit `index` column (used as the key column in
# the matching call).
#
# Written as a chain that first discards any `index` column left over from a
# previous execution: re-running the cell would otherwise fail, because
# `reset_index()` cannot insert an `index` column that already exists.
places = (
    places
    .drop(columns=['index'], errors='ignore')
    .assign(name=lambda df: df['name'].str.lower())
    .reset_index(drop=True)  # discard the labels inherited from the concat
    .reset_index()           # materialise the new row numbers as `index`
)

u.infos(places)

Shape:  (4375, 6) - extract:


Unnamed: 0,index,name,uri,lat,lng,types
0,0,zürich,http://www.wikidata.org/entity/Q72,47.374444444444,8.5411111111111,"municipality of Switzerland,college town,canto..."
1,1,basel,http://www.wikidata.org/entity/Q78,47.560555555556,7.5905555555556,"municipality of Switzerland,border town,colleg..."
2,2,naples,http://www.wikidata.org/entity/Q2634,40.833333333333,14.25,"city,comune of Italy,big city"
3,3,ulm,http://www.wikidata.org/entity/Q3012,48.39841,9.99155,"city,district capital,major regional center,co..."
4,4,lucerne,http://www.wikidata.org/entity/Q4191,47.0523,8.3059,"municipality of Switzerland,cantonal capital o..."


In [37]:
# Look up, for every fetched place, the similar geographical places known to
# the project helper. According to the recorded output the result has one row
# per candidate pair, with the columns index, pk_gv, new_name, gv_name,
# new_lat, gv_lat, new_lng, gv_lng and distance.
key_column = 'index'  # column of `places` carried over into the result
n_jobs = 13           # presumably the number of parallel workers - confirm
matchings = find.find_geoplaces(places, key_column, jobs=n_jobs)

u.infos(matchings)

Checking data integrity... Done
Find all geographical places in Geovistory... 14824 found.
Finding similar geographical places is done - Elapsed: [00h00'02]                   
Shape:  (255, 9) - extract:


Unnamed: 0,index,pk_gv,new_name,gv_name,new_lat,gv_lat,new_lng,gv_lng,distance
0,0,25494,zürich,zürich,47.374444,47.366667,8.541111,8.55,1.1
1,0,1739255,zürich,zürich,47.374444,,8.541111,,
2,0,3230970,zürich,zürich,47.374444,,8.541111,,
3,1,80974,basel,basel,47.560556,47.5584,7.590556,7.5733,1.3
4,1,80974,basel,basel,47.560556,47.55814,7.590556,7.58769,0.3
