In [1]:
%load_ext autoreload
%autoreload 2

env = 'staging'
pk_project = 0
debug = True
execute = False

import pandas as pd 

import geovpylib.utils as u
import geovpylib.analysis as a
import geovpylib.find as find
import geovpylib.database as db

db.connect(env, pk_project, execute=execute)

Requests will not be executed
=== Setting STAGING environment ===
>> Connecting to PGSQL Database ... Connected!


# BHP named places / Geovistory geographical places record linkage

## Prepare named places from BHP fetched and cleaned data

### Read data

In [2]:
named_place = u.read_df('../../../data/bhp/named_place.csv')

a.set_types(named_place, {
        "pk_named_place": "int",
           "concat_napl": "string",
  "concat_standard_name": "string",
     "fk_abob_type_napl": "int",
     "modification_time": "datetime",
         "creation_time": "datetime",
               "creator": "int",
              "modifier": "int",
    "standard_longitude": "float",
     "standard_latitude": "float",
                 "notes": "string",
         "certainty_end": "int",
       "certainty_begin": "int",
            "begin_year": "int",
           "notes_begin": "string",
              "end_year": "int",
             "notes_end": "string"
})

# u.infos(named_place)

In [3]:
named_place_name = u.read_df('../../../data/bhp/named_place_name.csv')

a.set_types(named_place_name, {
   "pk_named_place_name": "int",
           "concat_plna": "string",
      "is_standard_name": "boolean",
        "fk_named_place": "int",
         "creation_time": "datetime",
               "creator": "int",
              "modifier": "int",
                  "name": "string",
              "lang_iso": "string",
     "modification_time": "datetime",
                 "notes": "string",
                "number": "int",
     "complement_number": "string",
"fk_abob_napl_name_type": "int",
    "comment_begin_year": "string",
      "comment_end_year": "string",
            "begin_date": "datetime",
              "end_date": "datetime"
})     

# u.infos(named_place_name)

  df = pd.read_csv(path, sep=';', quoting=2)


In [4]:
named_place_text_property = u.read_df('../../../data/bhp/named_place_text_property.csv')

a.set_types(named_place_text_property, {
    "pk_named_place_text_property": "int",
                   "property_type": "string",
                            "text": "string",
                  "fk_named_place": "int",
                   "creation_time": "datetime",
                     "concat_nptp": "string",
                         "creator": "int",
                        "modifier": "int",
                   "lang_iso_code": "string",
               "modification_time": "datetime",
                           "notes": "string",
})       

# u.infos(named_place_text_property)

### Create places table

In [5]:
# Names
names = pd.DataFrame()
names['pk'] = named_place_name['pk_named_place_name']
names['name'] = named_place_name['name'].str.lower()
names['name'] = names['name'].str.strip()
names.dropna(inplace=True)

In [6]:
# Geo coordinates
geocoord = pd.DataFrame()
geocoord['pk'] = named_place['pk_named_place']
geocoord['lat'] = named_place['standard_latitude']
geocoord['lng'] = named_place['standard_longitude']
geocoord.dropna(inplace=True)

In [7]:
# Type
type = pd.DataFrame()
type['pk'] = named_place['pk_named_place']
type['type'] = named_place['fk_abob_type_napl'].astype(object)
type['type'] = type['type'].replace(156, 'Lieu habité')
type['type'] = type['type'].replace(245, 'Région géographique')
type['type'] = type['type'].replace(695, 'Adresse')
type['type'] = type['type'].replace(696, 'Élément géographique naturel')
type['type'] = type['type'].replace(697, 'Territoire')
type['type'] = type['type'].replace(698, 'Surface d\'infrastructure')

In [8]:
place = pd.DataFrame()
place['pk'] = named_place['pk_named_place']
place = place.merge(names, on='pk', how='left')
place = place.merge(geocoord, on='pk', how='left')
place = place.merge(type, on='pk', how='left')

u.infos(place, random=True)

Shape:  (127420, 5) - extract:


Unnamed: 0,pk,name,lat,lng,type
111219,89924,munich,33.748889,-84.387778,Lieu habité
92650,74523,quelmes,50.4541,2.68565,Lieu habité
83279,68271,sainte-suzanne-sur-vire,49.5169,-1.35733,Lieu habité
120310,11781,karavukovo,50.524167,26.257778,Lieu habité
55701,97574,,45.7496,4.845192,Adresse


## Record linkage

In [17]:
record_linkage = find.find_geoplaces(place, 'pk', distance=100, jobs=10)

Checking data integrity... Done
Find all geographical places in Geovistory... 14824 found.
Finding similar geographical places is done - Elapsed: [00h01'33]                   


### Join information to help record linkage

In [30]:
record_linkage.rename(columns={'pk':'pk_bhp','pk_entity': 'pk_gv', 'new_name':'bhp_name', 'new_lat':'bhp_lat', 'new_lng':'bhp_lng'}, inplace=True)


#### Definition

In [40]:
# BHP
named_place_text_property = u.read_df('../../../data/bhp/named_place_text_property.csv')[['fk_named_place', 'text']]
named_place_text_property.rename(columns={'text': 'bhp_definition'}, inplace=True)

# Geovistory
pks_entity = "(" + ",".join(record_linkage['pk_gv'].astype(str).tolist()) + ")"
gv_def = db.query(f"""
    select 
        r.pk_entity,
        a.string as gv_definition
    from information.resource r
    left join information.statement s1 on s1.fk_subject_info = r.pk_entity and s1.fk_property = 1762
    inner join information.statement s2 on s2.fk_subject_info = s1.fk_object_info and s2.fk_property = 1864
    inner join information.appellation a on a.pk_entity = s2.fk_object_info
    where r.pk_entity in {pks_entity}
""")

record_linkage = record_linkage.merge(named_place_text_property, left_on='pk_bhp', right_on='fk_named_place', how='left').drop(columns=['fk_named_place'])
record_linkage = record_linkage.merge(gv_def, left_on='pk_gv', right_on='pk_entity', how='left').drop(columns=['pk_entity'])

u.infos(record_linkage)

Shape:  (1663, 11) - extract:


Unnamed: 0,pk_bhp,pk_gv,bhp_name,gv_name,bhp_lat,gv_lat,bhp_lng,gv_lng,distance,bhp_definition,gv_definition
0,51,80681,berd,bern,47.0,46.94809,8.014167,7.44744,43.4,,
1,400,3155988,kossovo,kosovo,40.196389,,45.185,,,Armenia,Pays de l'ex-Yougoslavie.
2,862,3162490,knin,nin,44.368333,44.239722,16.3975,15.181111,97.8,Bosnia and Herzegovina,.
3,886,2026051,sisak,sisak,44.764444,45.485077,16.656667,16.373116,83.2,Bosnia and Herzegovina,Ville en Croatie
4,1225,2032213,pello,tello,50.206111,,15.812222,,,Czech Republic,Hauptort des Stadtstaates Lagash


In [42]:
u.save_df(record_linkage, '../../../data/record-linkage-bhp-named-place-geov-geo-places.csv')