In [1]:
%load_ext autoreload
%autoreload 2

# Target Geovistory environment; passed to `db.connect_geovistory` in the record linkage section.
env = 'staging'
# Geovistory project key, also passed to `db.connect_geovistory`.
# NOTE(review): what project 0 designates is not visible in this notebook — confirm.
pk_project = 0
# Forwarded as `execute=` to `db.connect_geovistory`; with False the connection step
# prints "Requests will not be executed".
execute = False

import pandas as pd
import numpy as np
import duckdb
import os

import geovpylib.analysis as a
import geovpylib.database as db
import geovpylib.graphs as graphs
import geovpylib.pks as pks
import geovpylib.recordlinkage as rl
import geovpylib.sparql as sparql
import geovpylib.utils as u

# Instantiate the geovpylib `Eta` helper.
# NOTE(review): presumably a progress / remaining-time reporter; it is not referenced
# anywhere else in the visible notebook — confirm it is still needed.
eta = u.Eta()

# BHP named places / Geovistory geographical places record linkage

## Prepare named places from BHP fetched and cleaned data

### Read data

In [2]:
# Load the BHP `named_place` export.
named_place = u.read_df('../../../data/bhp/named_place.csv')

# Column -> type name handed to `a.set_types`; its return value is not used,
# so the frame is expected to be updated by the call itself.
named_place_types = {
    "pk_named_place": "int",
    "concat_napl": "string",
    "concat_standard_name": "string",
    "fk_abob_type_napl": "int",
    "modification_time": "datetime",
    "creation_time": "datetime",
    "creator": "int",
    "modifier": "int",
    "standard_longitude": "float",
    "standard_latitude": "float",
    "notes": "string",
    "certainty_end": "int",
    "certainty_begin": "int",
    "begin_year": "int",
    "notes_begin": "string",
    "end_year": "int",
    "notes_end": "string",
}
a.set_types(named_place, named_place_types)

# a.infos(named_place)

In [3]:
# Load the BHP `named_place_name` export (the names attached to named places).
named_place_name = u.read_df('../../../data/bhp/named_place_name.csv')

# Column -> type name handed to `a.set_types`; its return value is not used,
# so the frame is expected to be updated by the call itself.
named_place_name_types = {
    "pk_named_place_name": "int",
    "concat_plna": "string",
    "is_standard_name": "boolean",
    "fk_named_place": "int",
    "creation_time": "datetime",
    "creator": "int",
    "modifier": "int",
    "name": "string",
    "lang_iso": "string",
    "modification_time": "datetime",
    "notes": "string",
    "number": "int",
    "complement_number": "string",
    "fk_abob_napl_name_type": "int",
    "comment_begin_year": "string",
    "comment_end_year": "string",
    "begin_date": "datetime",
    "end_date": "datetime",
}
a.set_types(named_place_name, named_place_name_types)

# a.infos(named_place_name)

In [4]:
# Load the BHP `named_place_text_property` export (free-text properties of named places).
named_place_text_property = u.read_df('../../../data/bhp/named_place_text_property.csv')

# Column -> type name handed to `a.set_types`; its return value is not used,
# so the frame is expected to be updated by the call itself.
named_place_text_property_types = {
    "pk_named_place_text_property": "int",
    "property_type": "string",
    "text": "string",
    "fk_named_place": "int",
    "creation_time": "datetime",
    "concat_nptp": "string",
    "creator": "int",
    "modifier": "int",
    "lang_iso_code": "string",
    "modification_time": "datetime",
    "notes": "string",
}
a.set_types(named_place_text_property, named_place_text_property_types)

# a.infos(named_place_text_property)

### Create places table

In [5]:
# Names: one row per BHP place name, keyed by the owning place (`pk`).
# Names are lower-cased and stripped of surrounding whitespace; rows with a
# missing key or a missing name are discarded.
names = pd.DataFrame({
    'pk': named_place_name['fk_named_place'],
    'name': named_place_name['name'].str.lower().str.strip(),
}).dropna()

In [6]:
# Geo coordinates: place key with its standard latitude / longitude.
# Rows with any missing value (key, latitude or longitude) are discarded.
geocoord = pd.DataFrame({
    'pk': named_place['pk_named_place'],
    'lat': named_place['standard_latitude'],
    'lng': named_place['standard_longitude'],
}).dropna()

In [7]:
# Type: place key with a human-readable label for its BHP type code.

# Label for each `fk_abob_type_napl` code handled here; codes that are not
# listed are left unchanged in the resulting column.
bhp_type_labels = {
    156: 'Lieu habité',
    245: 'Région géographique',
    695: 'Adresse',
    696: 'Élément géographique naturel',
    697: 'Territoire',
    698: 'Surface d\'infrastructure',
}

# NOTE(review): the name `type` shadows the Python builtin. It is kept because the
# cell that assembles `place` merges this frame under that name; rename both together.
type = pd.DataFrame({
    'pk': named_place['pk_named_place'],
    # Cast to object first so integer codes and string labels can share one column,
    # then translate all codes in a single pass.
    'type': named_place['fk_abob_type_napl'].astype(object).replace(bhp_type_labels),
})

In [8]:
# Start from the BHP place keys, then left-join names, coordinates and type on `pk`.
# A place that has several names ends up with one row per name.
place = named_place[['pk_named_place']].rename(columns={'pk_named_place': 'pk'})
for extra in (names, geocoord, type):
    place = place.merge(extra, on='pk', how='left')

a.infos(place, random=True)

Shape:  (128430, 5) - extract:


Unnamed: 0,pk,name,lat,lng,type
14977,29879,suc-sur-erdre,,,Territoire
45161,12912,magnieu,,,Territoire
41379,99787,philippe fabia rue 31,45.72749,4.871214,Adresse
3766,19208,marais,,,Territoire
123586,127456,albissola marina,44.326606,8.501224,Lieu habité


In [9]:
# Spot check: show the row(s) whose normalised name is exactly 'berd'.
place.loc[place['name'].eq('berd')]

Unnamed: 0,pk,name,lat,lng,type
31223,352,berd,40.88,45.390278,Lieu habité


#### Filter out "Surface de commune" and "Surface de canton"

According to the [GitHub issue](https://github.com/geovistory/symogih/issues/2), we exclude them from the record linkage.
They are identified by `classify_napl.fk_abstract_object` being 161 or 163.

In [10]:
# Fetch data
# Connect to the external BHP database; the connection parameter is read from the
# YELLOW_BHP environment variable (None is passed on if the variable is unset).
db.connect_external(os.environ.get('YELLOW_BHP'))
# Request only the two columns used below rather than `select *` followed by a
# column subset, so the rest of the table is not transferred.
classify_napl = db.query('select fk_named_place, fk_abstract_object from bhp.classify_napl')
# a.infos(classify_napl)

>> Connecting to PGSQL Database ... Connected!


In [11]:
# Merge information: attach `fk_abstract_object` to each place row. The right-hand
# key column is dropped after the join because it duplicates `pk`.
place = (
    place
    .merge(classify_napl, left_on="pk", right_on="fk_named_place", how='left')
    .drop(columns=['fk_named_place'])
)
a.set_types(place, {'fk_abstract_object': 'int'})
a.infos(place)

Shape:  (128445, 6) - extract:


Unnamed: 0,pk,name,lat,lng,type,fk_abstract_object
0,15922,vbre,,,Territoire,161
1,15923,ventenac,,,Territoire,161
2,15924,verdun,,,Territoire,161
3,15925,vernajoul,,,Territoire,161
4,15926,vernaux,,,Territoire,161


In [12]:
# Number of place rows that have no `fk_abstract_object` after the left join.
place['fk_abstract_object'].isna().sum()

50775

In [13]:
# Count the rows classified 161 or 163, i.e. the ones about to be filtered out.
codes = place['fk_abstract_object']
nb = (codes.eq(161) | codes.eq(163)).sum()
print('Number of 161 or 163:', nb)

Number of 161 or 163: 40755


In [14]:
# Keep the rows that are unclassified or classified as something other than 161 / 163,
# then discard the helper column.
abstract_object = place['fk_abstract_object']
is_excluded = (abstract_object == 161) | (abstract_object == 163)
# The missing-value test is kept explicit so unclassified rows are retained however
# the comparison treats missing values.
# Filtering and dropping in one non-in-place chain avoids mutating a filtered slice
# in place (the pattern behind pandas' SettingWithCopyWarning).
place = place[abstract_object.isna() | ~is_excluded].drop(columns=['fk_abstract_object'])
a.infos(place)

Shape:  (87690, 5) - extract:


Unnamed: 0,pk,name,lat,lng,type
25,89944,saint-florent-sur-cher,46.983333,2.25,Lieu habité
26,89946,dra ben khedda,36.736667,3.956111,Lieu habité
27,89946,mirabeau,36.736667,3.956111,Lieu habité
28,89942,antsiranana,-12.266667,49.283056,Lieu habité
29,89942,diego suarez,-12.266667,49.283056,Lieu habité


## Record linkage

In [15]:
# Connect to Geovistory using the environment, project and execute flag set at the top.
db.connect_geovistory(env, pk_project, execute=execute)

# Look up Geovistory geographical places similar to the BHP places, keyed by `pk`.
# NOTE(review): `distance` is presumably a maximum distance in km and `jobs` a number
# of parallel workers — confirm against geovpylib.recordlinkage.
record_linkage = rl.find_geoplaces(
    place,
    'pk',
    distance=50,
    jobs=10,
)

Requests will not be executed
>> Connecting to STAGING Database ... Connected!
Checking data integrity... Done
Find all geographical places in Geovistory... 14828 found, in 5 seconds.
Finding similar geographical places is done - Elapsed: [00h01'22]                   


### Join information to help record linkage

In [16]:
# Make the origin of each column explicit: `bhp_*` for BHP, `pk_gv` for Geovistory.
linkage_column_names = {
    'pk': 'pk_bhp',
    'pk_entity': 'pk_gv',
    'new_name': 'bhp_name',
    'new_lat': 'bhp_lat',
    'new_lng': 'bhp_lng',
}
record_linkage = record_linkage.rename(columns=linkage_column_names)

#### Definition

In [17]:
# BHP
# Read the text properties into a dedicated frame instead of rebinding
# `named_place_text_property`, so the typed frame loaded at the top of the notebook
# is not replaced by a two-column, renamed version.
# NOTE(review): every text property is used as a "definition", whatever its
# `property_type`; a place with several text properties yields several rows — confirm.
bhp_definitions = (
    u.read_df('../../../data/bhp/named_place_text_property.csv')[['fk_named_place', 'text']]
    .rename(columns={'text': 'bhp_definition'})
)

# Geovistory
# List each candidate entity once in the IN clause; `pk_gv` may repeat across rows.
pks_entity = "(" + ",".join(record_linkage['pk_gv'].drop_duplicates().astype(str)) + ")"
gv_def = db.query(f"""
    select 
        r.pk_entity,
        a.string as gv_definition
    from information.resource r
    left join information.statement s1 on s1.fk_subject_info = r.pk_entity and s1.fk_property = 1762
    inner join information.statement s2 on s2.fk_subject_info = s1.fk_object_info and s2.fk_property = 1864
    inner join information.appellation a on a.pk_entity = s2.fk_object_info
    where r.pk_entity in {pks_entity}
""")

# Left joins keep every candidate pair even when one side has no definition; the
# right-hand key columns are dropped because they duplicate `pk_bhp` / `pk_gv`.
record_linkage = record_linkage.merge(bhp_definitions, left_on='pk_bhp', right_on='fk_named_place', how='left').drop(columns=['fk_named_place'])
record_linkage = record_linkage.merge(gv_def, left_on='pk_gv', right_on='pk_entity', how='left').drop(columns=['pk_entity'])

#### Types

In [18]:
# BHP
# `place` can hold several rows per `pk` (one per name), so this merge may repeat
# candidate rows; the final `drop_duplicates` below removes the exact repeats.
record_linkage = (
    record_linkage
    .merge(place[['pk', 'type']], left_on='pk_bhp', right_on='pk', how='left')
    .drop(columns=['pk'])
    .rename(columns={'type': 'bhp_type'})
)

# Geovistory
# List each candidate entity once in the IN clause; `pk_gv` may repeat across rows.
pks_entity = "(" + ",".join(record_linkage['pk_gv'].drop_duplicates().astype(str)) + ")"
gv_types = db.query(f"""
    select 
        r.pk_entity,
        a.string as gv_type
    from information.resource r
    left join information.statement s1 on s1.fk_subject_info = r.pk_entity and s1.fk_property = 1110
    inner join information.statement s2 on s2.fk_object_info = s1.fk_object_info and s2.fk_property = 1111
    inner join information.statement s3 on s3.fk_subject_info = s2.fk_subject_info and s3.fk_property = 1113
    inner join information.statement s4 on s4.fk_subject_info = s2.fk_subject_info and s4.fk_property = 1112 and s4.fk_object_info = {pks.languages.english}
    inner join information.appellation a on a.pk_entity = s3.fk_object_info
    where r.pk_entity in {pks_entity}
""")
# Keep a single type label per entity (the first one returned by the query).
gv_types = gv_types.drop_duplicates(subset=['pk_entity'])

record_linkage = (
    record_linkage
    .merge(gv_types, left_on='pk_gv', right_on='pk_entity', how='left')
    .drop(columns=['pk_entity'])
    .drop_duplicates()
)

In [19]:
# Display the shape and an extract of the enriched record linkage table.
a.infos(record_linkage)

Shape:  (838, 13) - extract:


Unnamed: 0,pk_bhp,pk_gv,bhp_name,gv_name,bhp_lat,gv_lat,bhp_lng,gv_lng,distance,bhp_definition,gv_definition,bhp_type,gv_type
0,1,205134,albanie,albania,41.0,41.0,20.0,20.0,0.0,,,Territoire,Geographical area
1,3,6201188,armenie,armenia,39.95,40.383333,44.83333,44.95,49.2,,.,Territoire,Geographical area
2,7,2220550,belgique,belgique,50.833333,,4.0,,,Territoire du nouvel Etat n de la rvolution de...,Pays européen,Territoire,
5,13,1485424,denmark,danmark,56.0,56.0,10.0,10.0,0.0,,.,Territoire,Geographical area
7,19,1876219,allemagne,allemagne,51.5,,10.5,,,,Pays d'Europe,Territoire,


## Save

In [20]:
# Persist the candidate BHP / Geovistory matches for later review.
record_linkage_path = '../../../data/record-linkage-bhp-named-place-geov-geo-places.csv'
u.save_df(record_linkage, record_linkage_path)