In [12]:
# Load IPython's autoreload extension; mode 2 reloads imported modules before
# each cell runs, so edits to local library code are picked up without a restart.
%load_ext autoreload
%autoreload 2

# Notebook-level settings. None of them is read by the cells visible in this
# notebook. NOTE(review): presumably shared template settings for the import
# workflow (target environment, project key, dry-run switch, ...) — confirm
# whether they are still needed here.
env = 'prod'
pk_project = -1
execute = False
metadata_str = ''
import_manner = 'one-shot' # 'batch'

import os
import pandas as pd
import numpy as np
from datetime import datetime
import duckdb
import plotly.express as px

import geovpylib.analysis as a
import geovpylib.database as db
import geovpylib.graphs as graphs
import geovpylib.pks as pks
import geovpylib.recordlinkage as rl
import geovpylib.sparql as sparql
import geovpylib.utils as u

# Instantiate the Eta helper from geovpylib.utils. It is not used by the cells
# visible in this notebook. NOTE(review): presumably a progress/ETA tracker — confirm.
eta = u.Eta()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
# Destination CSV for the cleaned table; handed to u.save_df in the final cell.
# The path is relative — presumably resolved against the notebook's working
# directory; verify before running from another location.
collective_actor_text_property_path = '../../data/bhp/collective-actor-text-property.csv'

## Fetch full table

In [14]:
# Connection settings come from the YELLOW_BHP environment variable
# (None is passed through if the variable is not set).
yellow_bhp_setting = os.environ.get('YELLOW_BHP')
db.connect_external(yellow_bhp_setting)

# Load the complete table; unwanted columns are removed in a later cell.
full_table_sql = 'select * from bhp.collective_actor_text_property'
collective_actor_text_property = db.query(full_table_sql)

[DB] Connecting to PGSQL Database ... Connected!


## Filter unwanted columns

In [15]:
# Columns removed from the frame, in two passes (same order as before):
# first the 'concat_catp' column, then the four record-keeping columns
# (presumably audit metadata — confirm they are not needed downstream).
unwanted_column_groups = (
    ['concat_catp'],
    ['creator', 'modifier', 'creation_time', 'modification_time'],
)
for column_group in unwanted_column_groups:
    # In-place drop: raises KeyError if a column is already gone (e.g. on cell re-run).
    collective_actor_text_property.drop(columns=column_group, inplace=True)


## Remove line breaks, HTML paragraph tags and binary characters

In [16]:
# Strip CRLF line breaks and HTML paragraph tags from every column of the frame.
# The patterns are applied one after the other, in this order, as regular
# expressions, each match being replaced by an empty string.
for markup_pattern in ('\r\n', '<p>', '</p>'):
    collective_actor_text_property.replace({markup_pattern: ''}, regex=True, inplace=True)

# Delegate the remaining cleaning to the geovpylib helper; its return value is
# not used here (presumably it cleans the frame in place — confirm).
u.remove_binary_chars(collective_actor_text_property)

## Discovery

In [17]:
# Print a per-column profile of the frame (share of empty values, number of
# unique values), showing a few example values for each column.
examples_per_column = 3
a.discover(collective_actor_text_property, uniq_ex_nb=examples_per_column)

Columns contain:
Total number of rows: 16687
  - "pk_collective_actor_text_property":   0.00% empty - 16687 (100.00%) uniques (eg: 8718; 8679; 5857)
  -                     "property_type":   0.00% empty -     4 (  0.02%) uniques (eg: notice; notice_web; complément)
  -                     "lang_iso_code":   0.00% empty -     3 (  0.02%) uniques (eg: ita; fra; None)
  -                              "text":   0.00% empty - 12153 ( 72.83%) uniques (eg: Comunità v...; Chambre sy...; Syndicat p...)
  -                             "notes":   0.00% empty -    11 (  0.07%) uniques (eg: None; Annexe du ...; Biver : in...)
  -               "fk_collective_actor":   0.00% empty - 16640 ( 99.72%) uniques (eg: 14779; 14283; 9139)


## Type parsing

In [18]:
# Target type name for each remaining column, as understood by a.set_types.
# Key order is kept as it was, in case the helper processes columns in order.
column_types = {
    'pk_collective_actor_text_property': 'int',
    'property_type': 'string',
    'text': 'text',
    'fk_collective_actor': 'int',
    'lang_iso_code': 'string',
    'notes': 'string',
}
a.set_types(collective_actor_text_property, column_types)

## Handle data: merge 'notice_web' into 'notice'

In [19]:
# According to the wiki page, 'notice_web' and 'notice' are to be merged.
# Both spellings of the web notice ('notice_web' and 'notice web') therefore
# end up as plain 'notice'; every other property type is left untouched.
property_type_merges = {
    'notice_web': 'notice',
    'notice web': 'notice',
}
collective_actor_text_property['property_type'] = (
    collective_actor_text_property['property_type'].replace(property_type_merges)
)

## Save data

In [20]:
# Persist the cleaned frame to the CSV path defined near the top of the notebook,
# using the geovpylib save helper (output format/options are decided by the helper).
u.save_df(collective_actor_text_property, collective_actor_text_property_path)