In [1]:
# Relative path of the CSV file that the final cell writes the cleaned table to.
named_place_path = '../../data/bhp/named_place.csv'

In [2]:
import os, datetime
import pandas as pd

import geovpylib.utils as u
import geovpylib.analysis as a
import geovpylib.database as db

# Connect to the external database. The connection argument is whatever the
# YELLOW_BHP environment variable holds; os.environ.get yields None when it is unset.
db.connect_external(os.environ.get('YELLOW_BHP'))
# NOTE(review): `eta` is not referenced by any cell visible in this notebook — confirm it is still needed.
eta = u.Eta()

[DB] Requests will not be executed
[DB] Connecting to PGSQL Database ... Connected!


## Fetch full table

In [3]:
# Fetch every row and column of bhp.named_place.
# The result is used as a pandas DataFrame by the following cells (drop, replace, boolean indexing).
named_place = db.query('select * from bhp.named_place')

## Filter unwanted columns

According to the wiki page, the following columns are not needed and can be dropped:

In [4]:
# Columns that are not needed for the import; all of them are removed in one pass.
unwanted_columns = [
    'concat_classes',
    'concat_names',
    'count_text_property',
    'standard_text_property',
    'historical_period',
    'comment_end_year',
    'comment_begin_year',
]
named_place.drop(columns=unwanted_columns, inplace=True)

## Filter only wanted rows

Some rows have been flagged as not to be imported (see this [wiki page](https://github.com/geovistory/symogih/wiki/Liste-des-balises-des-entit%C3%A9s-%C3%A0-ne-pas-importer)).

In [5]:
# Tags in `concat_standard_name` that mark a row as "do not import".
# Raw string on purpose: `\[` and `\]` are regex escapes for literal brackets, but in a
# non-raw literal they are invalid Python escape sequences (warning on recent Python versions).
# Matching is case-sensitive, so both spellings of each tag are listed explicitly.
excluded_tags = r'\[à identifier\]|\[ne pas importer\]|DOUBLON|Doublon|réutiliser|REUTILISER'

len_before = len(named_place)
print(f'Rows number before filter: {len_before}')

# Single pass over the column instead of one pass per tag.
# na=False: a missing name is treated as "no tag", which keeps the row and
# guarantees a purely boolean mask for the `~` negation below.
is_excluded = named_place['concat_standard_name'].str.contains(excluded_tags, regex=True, na=False)
named_place = named_place[~is_excluded]

len_after = len(named_place)

print(f'Rows number after filter: {len_after} ({len_before - len_after} have been removed)')

Rows number before filter: 127466
Rows number after filter: 127420 (46 have been removed)


## Remove binary characters

In [6]:
# Strip CRLF line breaks and HTML paragraph tags from every cell of the frame,
# one regex pattern per pass, in this order.
for markup in ('\r\n', '<p>', '</p>'):
    named_place.replace({markup: ''}, regex=True, inplace=True)

# Project helper applied to the whole frame; its return value is not used.
# NOTE(review): presumably cleans the frame in place — confirm in geovpylib.
u.remove_binary_chars(named_place)

## Discovery

In [7]:
# Print a per-column overview of the frame (share of empty values, number of unique values, examples).
# uniq_ex_nb=5 corresponds to the five example values listed per column in the output below.
a.discover(named_place, uniq_ex_nb=5)

Columns contain:
Total number of rows: 127420
  -       "pk_named_place":   0.00% empty - 127420 (100.00%) uniques (eg: 15922; 15923; 15924; 15925; 15926)
  -          "concat_napl":   0.00% empty - 127420 (100.00%) uniques (eg: NaPl15922; NaPl15923; NaPl15924; NaPl15925; NaPl15926)
  - "concat_standard_name":   0.00% empty -  84574 ( 66.37%) uniques (eg: Vèbre; Ventenac; Verdun; Vernajoul; Vernaux)
  -                "notes":   0.00% empty -   1199 (  0.94%) uniques (eg: Commune ex...; None; Etats-Unis; Bolivie; Pas de Cal...)
  -    "fk_abob_type_napl":   0.00% empty -      6 (  0.00%) uniques (eg: 697.0; 156.0; 696.0; 245.0; 698.0)
  -          "notes_begin":   0.00% empty -      5 (  0.00%) uniques (eg: None; 4; 2; 3; 1)
  -    "modification_time":   0.00% empty -   1499 (  1.18%) uniques (eg: 2014-12-03...; 2014-12-03...; 2014-12-06...; 2014-12-07...; 2014-12-06...)
  -        "creation_time":   0.00% empty -   2541 (  1.99%) uniques (eg: 2009-10-28...; 2010-03-03...; 2010-03-03..

## Handle data

## Type parsing

In [8]:
# Turn placeholder strings into real missing values before casting the columns.
#
# Two deliberate choices:
#   * The cleaned columns are written back through `assign` instead of calling
#     `named_place[col].replace(..., inplace=True)`. That chained form acts on an
#     intermediate Series, so it is not guaranteed to reach `named_place`
#     (pandas warns about it, and it has no effect under Copy-on-Write).
#   * The dict form `{old: None}` is used rather than `replace(old, None)`.
#     With a scalar and value=None, some pandas versions read it as "no value given"
#     and forward-fill the previous entry instead of inserting None.
named_place = named_place.assign(
    certainty_end=named_place['certainty_end'].replace({' ': None, 'None': None}),
    certainty_begin=named_place['certainty_begin'].replace({'None': None}),
)

# Cast each column to its target type (project helper, called on the frame with a column -> type mapping).
a.set_types(named_place, {
        "pk_named_place": "int",
           "concat_napl": "string",
  "concat_standard_name": "string",
     "fk_abob_type_napl": "int",
     "modification_time": "datetime",
         "creation_time": "datetime",
               "creator": "int",
              "modifier": "int",
    "standard_longitude": "float",
     "standard_latitude": "float",
                 "notes": "string",
       "certainty_begin": "int",
         "certainty_end": "int",
            "begin_year": "int",
           "notes_begin": "string",
              "end_year": "int",
             "notes_end": "string"
})

## Save data

In [9]:
# Persist the cleaned frame to the path configured in the first cell (named_place_path).
u.write_df(named_place, named_place_path)