In [35]:
# Enable IPython's autoreload extension in mode 2 (modules are re-imported before
# code is executed), so edits to the local geovpylib package are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Notebook-level settings. None of them is referenced by the cells visible in this
# notebook; presumably they are shared template/boilerplate parameters read by
# other tooling or other notebooks — TODO confirm before removing or renaming.
env = 'prod'
pk_project = -1
execute = False
metadata_str = ''
import_manner = 'one-shot' # the other value hinted at by the original author is 'batch'

import os
import pandas as pd
import numpy as np
from datetime import datetime
import duckdb
import plotly.express as px

import geovpylib.analysis as a
import geovpylib.database as db
import geovpylib.graphs as graphs
import geovpylib.pks as pks
import geovpylib.recordlinkage as rl
import geovpylib.sparql as sparql
import geovpylib.utils as u

# geovpylib utility object; not used by the cells visible below
# (the name suggests a progress/ETA helper — TODO confirm).
eta = u.Eta()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [36]:
# Relative path of the CSV file that receives the cleaned table; it is passed to
# `u.save_df` in the final "Save data" cell.
collective_actor_name_path = '../../data/bhp/collective-actor-name.csv'

## Fetch full table

In [37]:
# Open the connection to the external BHP database. The connection information is
# read from the YELLOW_BHP environment variable (None is passed on if it is unset).
db.connect_external(os.environ.get('YELLOW_BHP'))

# Load every row and column of the source table; filtering happens in later cells.
full_table_query = 'select * from bhp.collective_actor_name'
collective_actor_name = db.query(full_table_query)

[DB] Connecting to PGSQL Database ... Connected!


## Filter unwanted columns and set column types

In [38]:
# Columns that are not kept for the rest of the notebook. They are removed in the
# same two passes, and in the same order, as before.
unwanted_column_groups = (
    ['concat_cana', 'name_number'],
    ['creator', 'modifier', 'creation_time', 'modification_time'],
)
for unwanted_columns in unwanted_column_groups:
    collective_actor_name.drop(columns=unwanted_columns, inplace=True)

# Requested type for each remaining column, handed to the geovpylib helper.
# The helper's return value is not used, so it presumably converts the frame
# in place — TODO confirm.
column_types = {
    'name': 'string',
    'name_type': 'int',
    'lang_iso': 'string',
    'comment_begin_year': 'string',
    'comment_end_year': 'string',
    'notes': 'string',
    'fk_collective_actor': 'int',
    'begin_year': 'int',
    'end_year': 'int',
    'begin_month': 'int',
    'end_month': 'int',
    'begin_day': 'int',
    'end_day': 'int',
    'fk_abob_coac_name_type': 'int'
}
a.set_types(collective_actor_name, column_types)

## Remove HTML paragraph tags, line breaks and binary characters

In [39]:
# Frame-wide pass: remove Windows line breaks and HTML paragraph tags from every
# column, one regex replacement per fragment, in this order.
for fragment in ('\r\n', '<p>', '</p>'):
    collective_actor_name.replace({fragment: ''}, regex=True, inplace=True)

# Second pass on `notes` only: paragraph tags again, then any remaining carriage
# return is removed and any remaining newline becomes a single space.
notes_substitutions = (
    ('<p>', ''),
    ('</p>', ''),
    ('\r', ''),
    ('\n', ' '),
)
cleaned_notes = collective_actor_name.notes
for old, new in notes_substitutions:
    cleaned_notes = cleaned_notes.str.replace(old, new)
collective_actor_name.notes = cleaned_notes

# Finally let the geovpylib helper strip binary characters from the frame.
u.remove_binary_chars(collective_actor_name)

## Handle data

### begin and end dates

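We create two new columns, `begin_date` and `end_date`. Each holds a `(year, month, day)` tuple built from `begin_year`, `begin_month`, `begin_day` and from `end_year`, `end_month`, `end_day` respectively. When the year is missing, the date is set to `<NA>`. The six source columns are dropped afterwards.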

In [40]:
# Source columns of each composite date column, listed in (year, month, day) order.
date_parts_by_target = {
    'begin_date': ['begin_year', 'begin_month', 'begin_day'],
    'end_date': ['end_year', 'end_month', 'end_day'],
}

# Cast every date part to pandas' nullable integer dtype so that missing parts
# are represented as <NA>.
for part_columns in date_parts_by_target.values():
    for part_column in part_columns:
        collective_actor_name[part_column] = collective_actor_name[part_column].astype(pd.Int64Dtype())


def _as_date_tuple(record, part_columns):
    """Return the (year, month, day) tuple of `record`, or pd.NA when the year is missing.

    `part_columns` lists the year, month and day column names, in that order.
    Month and day are copied as they are, so they may themselves be <NA>.
    """
    year_column = part_columns[0]
    if pd.isna(record[year_column]):
        return pd.NA
    return tuple(record[column] for column in part_columns)


# Build the composite columns row by row (begin_date first, then end_date).
for target_column, part_columns in date_parts_by_target.items():
    collective_actor_name[target_column] = [
        _as_date_tuple(record, part_columns)
        for _, record in collective_actor_name.iterrows()
    ]

# The individual parts are now redundant: drop all six of them.
consumed_columns = [
    column
    for part_columns in date_parts_by_target.values()
    for column in part_columns
]
collective_actor_name.drop(columns=consumed_columns, inplace=True)

### lang_iso

In [41]:
# Treat the three-space placeholder found in `lang_iso` as a missing value.
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `collective_actor_name['lang_iso']`: that chained
# in-place call operates on an intermediate Series, which triggers a
# chained-assignment warning on recent pandas 2.x and leaves the DataFrame
# unchanged once Copy-on-Write is enabled.
collective_actor_name['lang_iso'] = collective_actor_name['lang_iso'].replace('   ', pd.NA)

## Discovery

In [42]:
# Print a per-column overview of the cleaned table (share of empty values, number
# of unique values, example values). With `uniq_ex_nb=3` the recorded output lists
# up to three example values per column.
a.discover(collective_actor_name, uniq_ex_nb=3)

Columns contain:
Total number of rows: 24429
  - "pk_collective_actor_name":   0.00% empty - 24429 (100.00%) uniques (eg: 5; 6; 8)
  -         "is_standard_name":   0.00% empty -     2 (  0.01%) uniques (eg: True; False)
  -                     "name":   0.00% empty - 23800 ( 97.43%) uniques (eg: Collège de...; Collège ro...; Compagnie ...)
  -      "fk_collective_actor":   0.00% empty - 22717 ( 92.99%) uniques (eg: 10; 11; 13)
  -                 "lang_iso":  56.17% empty -    11 (  0.05%) uniques (eg: <NA>; fra; ita)
  -   "fk_abob_coac_name_type":  63.03% empty -     5 (  0.02%) uniques (eg: <NA>; 1253; 1270)
  -       "comment_begin_year":  72.66% empty -    29 (  0.12%) uniques (eg: <NA>; ; 1874)
  -         "comment_end_year":  72.79% empty -    15 (  0.06%) uniques (eg: <NA>; ; 1869)
  -                    "notes":  72.79% empty -    30 (  0.12%) uniques (eg: <NA>; ; Laurium ou...)
  -               "begin_date":  94.51% empty -   368 (  1.51%) uniques (eg: <NA>; (1577, <NA...; 

## Save data

In [43]:
# Write the cleaned table to the path defined near the top of the notebook
# (`collective_actor_name_path`), using the geovpylib save helper.
u.save_df(collective_actor_name, collective_actor_name_path)