In [5]:
# Output location of the cleaned `named_place` table; this path is handed to
# `u.save_df` in the save step of the notebook.
named_place_path = "../../data/named_place.csv"

In [6]:
import os
import pandas as pd

import geovpylib.utils as u
import geovpylib.analysis as a

# The connection setting comes from the YELLOW_BHP environment variable so
# that nothing sensitive is stored in the notebook itself.
# NOTE(review): `os.environ.get` returns None when the variable is unset, and
# that None is passed straight to `u.db_connect` — confirm how it handles it.
yellow_bhp = os.environ.get('YELLOW_BHP')
u.db_connect(yellow_bhp)

>> Connecting to PGSQL Database ... Connected!


## Fetch full table

In [7]:
# Pull every row and every column of `bhp.named_place`; column pruning and
# row filtering are done afterwards on the fetched table.
named_place_query = 'select * from bhp.named_place'
named_place = u.db_execute(named_place_query)

## Filter unwanted columns

According to the wiki page, we can get rid of those columns:

In [8]:
# Columns that are not needed downstream; they are removed from the frame
# in place, in a single call.
unwanted_columns = [
    'concat_classes',
    'concat_names',
    'count_text_property',
    'standard_text_property',
    'historical_period',
    'comment_end_year',
    'comment_begin_year',
]
named_place.drop(columns=unwanted_columns, inplace=True)

## Filter only wanted rows

Some rows have been identified as not to be imported (see this [wiki page](https://github.com/geovistory/symogih/wiki/Liste-des-balises-des-entit%C3%A9s-%C3%A0-ne-pas-importer)).

In [9]:
len_before = len(named_place)
print(f'Rows number before filter: {len_before}')

# Markers found in `concat_standard_name` that flag a row as "do not import".
# Each entry is a regular expression. The bracketed ones are raw strings because
# `\[` is a regex escape but an invalid escape sequence in a normal Python
# string literal (a warning today, an error in future Python versions).
excluded_markers = [
    r'\[à identifier\]',
    r'\[ne pas importer\]',
    'DOUBLON',
    'Doublon',
    'réutiliser',
    'REUTILISER',
]

# A single alternation scans the column once instead of once per marker.
# `na=False` treats a missing name as "no marker", so such a row is kept and
# the boolean mask never contains NaN (which `~` cannot negate).
has_excluded_marker = named_place['concat_standard_name'].str.contains(
    '|'.join(excluded_markers), regex=True, na=False
)
named_place = named_place[~has_excluded_marker]

len_after = len(named_place)

print(f'Rows number after filter: {len_after} ({len_before - len_after} have been removed)')

Rows number before filter: 127466
Rows number after filter: 127420 (46 have been removed)


## Remove non-ASCII characters, line breaks and `<p>` tags

In [10]:
# Each pattern is treated as a regular expression and removed from every cell
# of the frame. The passes run one after the other, in this order.
# NOTE(review): the first pattern removes every non-ASCII character, which
# includes accented letters in place names — confirm this is intended.
patterns_to_strip = (
    r'[^\x00-\x7F]+',  # runs of characters outside the ASCII range
    '\r\n',            # carriage return + line feed
    '<p>',             # opening paragraph tag
    '</p>',            # closing paragraph tag
)
for pattern in patterns_to_strip:
    named_place.replace({pattern: ''}, regex=True, inplace=True)

## Discovery

In [11]:
# Print a per-column overview of the cleaned table: share of empty values,
# number of unique values and a few example values.
# `uniq_ex_nb=5` presumably sets how many example values are listed per column
# (the captured output shows five) — confirm in `geovpylib.analysis`.
a.discover(named_place, uniq_ex_nb=5)

Columns contain:
Total number of rows: 127420
  -       "pk_named_place":   0.00% empty - 127420 (100.00%) uniques (eg: 15922; 15923; 15924; 15925; 15926)
  -          "concat_napl":   0.00% empty - 127420 (100.00%) uniques (eg: NaPl15922; NaPl15923; NaPl15924; NaPl15925; NaPl15926)
  - "concat_standard_name":   0.00% empty -  84480 ( 66.30%) uniques (eg: Vbre; Ventenac; Verdun; Vernajoul; Vernaux)
  -    "fk_abob_type_napl":   0.00% empty -      6 (  0.00%) uniques (eg: 697.0; 156.0; 696.0; 245.0; 698.0)
  -    "modification_time":   0.00% empty -   1499 (  1.18%) uniques (eg: 2014-12-03...; 2014-12-03...; 2014-12-06...; 2014-12-07...; 2014-12-06...)
  -        "creation_time":   0.00% empty -   2541 (  1.99%) uniques (eg: 2009-10-28...; 2010-03-03...; 2010-03-03...; 2010-03-02...; 2010-03-02...)
  -              "creator":   0.00% empty -     50 (  0.04%) uniques (eg: 4.0; 26.0; 11.0; 70.0; 29.0)
  -             "modifier":   0.00% empty -     48 (  0.04%) uniques (eg: 4.0; 26.0; 11.

## Handle data

## Type parsing

In [13]:
# Turn single-space strings in `certainty_end` into missing values before the
# column is parsed as an integer.
# The result is assigned back to the frame rather than calling
# `.replace(..., inplace=True)` on the selected column: with pandas
# Copy-on-Write such a chained in-place call modifies a temporary object and
# leaves `named_place` untouched.
# The dict form `{' ': None}` is used because a scalar `to_replace` combined
# with `value=None` has behaved differently across pandas versions.
named_place = named_place.assign(
    certainty_end=named_place['certainty_end'].replace({' ': None})
)

# Target type of every remaining column.
# NOTE(review): the return value of `a.set_types` is not used, so it is assumed
# to convert the columns of `named_place` in place — confirm in geovpylib.
column_types = {
    "pk_named_place": "int",
    "concat_napl": "string",
    "concat_standard_name": "string",
    "fk_abob_type_napl": "int",
    "modification_time": "datetime",
    "creation_time": "datetime",
    "creator": "int",
    "modifier": "int",
    "standard_longitude": "float",
    "standard_latitude": "float",
    "notes": "string",
    "certainty_end": "int",
    "certainty_begin": "int",
    "begin_year": "int",
    "notes_begin": "string",
    "end_year": "int",
    "notes_end": "string",
}
a.set_types(named_place, column_types)

## Save data

In [14]:
# Persist the cleaned and typed table at `named_place_path`.
# Presumably written as CSV, given the file extension — confirm in `u.save_df`.
u.save_df(named_place, named_place_path)