In [25]:
# Load IPython's autoreload extension and reload all modules before each cell
# runs, so edits to imported modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

# Notebook-level parameters. None of them is read by the cells visible in
# this notebook.
# NOTE(review): the meanings below are inferred from the names only — confirm.
env = 'prod'  # presumably the target environment
pk_project = -1  # presumably a project primary key; -1 looks like a "no project" placeholder
execute = False  # presumably a guard flag for write operations
metadata_str = ''  # presumably free-text metadata attached to an import
import_manner = 'one-shot' # the other value noted by the author is 'batch'

import os
import pandas as pd
import numpy as np
from datetime import datetime
import duckdb
import plotly.express as px

import geovpylib.analysis as a
import geovpylib.database as db
import geovpylib.graphs as graphs
import geovpylib.pks as pks
import geovpylib.recordlinkage as rl
import geovpylib.sparql as sparql
import geovpylib.utils as u

# Instantiate the Eta helper from geovpylib.utils. It is not used by the cells
# visible in this notebook.
# NOTE(review): presumably a progress / remaining-time tracker — confirm.
eta = u.Eta()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
# Output location (relative to this notebook) handed to u.save_df in the last
# cell, once the table has been filtered and cleaned.
collective_actor_path = '../../data/bhp/collective-actor.csv'

# Fetch table from database

In [27]:
# Connect to the external database using the value of the YELLOW_BHP
# environment variable. os.environ.get returns None when the variable is unset.
# NOTE(review): confirm how db.connect_external behaves when given None.
db.connect_external(os.environ.get('YELLOW_BHP'))
# Load every row and column of bhp.collective_actor. The result is used as a
# pandas DataFrame by the following cells.
collective_actor = db.query('select * from bhp.collective_actor')

[DB] Connecting to PGSQL Database ... Connected!


In [28]:
# Drop the text-property / concatenation columns together with the
# creator / modifier / timestamp columns in a single pass.
collective_actor = collective_actor.drop(
    columns=[
        'standard_text_property',
        'count_text_property',
        'concat_names',
        'concat_coac',
        'creator',
        'modifier',
        'creation_time',
        'modification_time',
    ]
)

# Cast the remaining columns through geovpylib's set_types helper, which is
# called for its effect on the frame (its return value is not used).
a.set_types(
    collective_actor,
    {
        'notes_begin': 'int',
        'certainty_begin': 'int',
        'notes_end': 'int',
        'certainty_end': 'int',
        'notes': 'string',
        'fk_abob_type_collective_actor': 'int',
        'begin_year': 'int',
        'end_year': 'int',
    },
)

## Keep only the wanted rows

In [29]:
# Remove records whose standard name carries one of the "do not use" markers
# (to be identified, do not import, duplicate, reuse), and report how many
# rows were dropped.
len_before = len(collective_actor)
print(f'Rows number before filter: {len_before}')

# Regular-expression fragments matched against `concat_standard_name`.
# They are raw strings so that `\[` reaches the regex engine as an escaped
# bracket; in a plain string literal `\[` is an invalid escape sequence
# (a SyntaxWarning on recent Python versions).
# Matching is case-sensitive, hence both spellings of some markers.
unwanted_name_patterns = [
    r'\[à identifier\]',
    r'\[ne pas importer\]',
    r'DOUBLON',
    r'Doublon',
    r'réutiliser',
    r'REUTILISER',
]

# One alternation removes exactly the rows the six successive filters would:
# a row is dropped as soon as any marker matches its name.
# na=False keeps rows with a missing name instead of producing an NA mask
# that cannot be negated or used for boolean indexing.
unwanted_mask = collective_actor['concat_standard_name'].str.contains(
    '|'.join(unwanted_name_patterns), regex=True, na=False
)
collective_actor = collective_actor[~unwanted_mask]

len_after = len(collective_actor)

print(f'Rows number after filter: {len_after} ({len_before - len_after} have been removed)')

Rows number before filter: 22717
Rows number after filter: 22027 (690 have been removed)


## Filter by type

In [30]:
# Keep rows whose collective-actor type is 1051 or 1052, plus rows where the
# type is missing; every other type is discarded.
collective_actor = collective_actor.loc[
    lambda frame: frame['fk_abob_type_collective_actor'].isin([1051, 1052])
    | frame['fk_abob_type_collective_actor'].isna()
]

## Remove unwanted characters

In [31]:
# Frame-wide pass: strip CRLF line breaks and <p> / </p> tags wherever they
# occur, applying the three regex replacements in the same order as before.
collective_actor = (
    collective_actor
    .replace({'\r\n':''}, regex=True)
    .replace({'<p>':''}, regex=True)
    .replace({'</p>':''}, regex=True)
)

# Notes-only pass: strip the tags again on this column, drop any remaining
# carriage returns and turn remaining line feeds into spaces.
collective_actor.notes = (
    collective_actor.notes
    .str.replace('<p>', '')
    .str.replace('</p>', '')
    .str.replace('\r', '')
    .str.replace('\n', ' ')
)

# Final clean-up delegated to geovpylib.utils; called for its effect on the
# frame (the return value is not used).
u.remove_binary_chars(collective_actor)

## Discovery

In [32]:
# Print a per-column summary of the cleaned table (share of empty values,
# number of unique values, sample values), as shown in the output below.
# NOTE(review): the second argument presumably caps the number of example
# values listed per column (six are shown in the output) — confirm.
a.discover(collective_actor,6)

Columns contain:
Total number of rows: 22009
  -           "pk_collective_actor":   0.00% empty - 22009 (100.00%) uniques (eg: 14725; 9712; 13649; 14723; 14726; 14727)
  -          "concat_standard_name":   0.00% empty - 21522 ( 97.79%) uniques (eg: Parti Soci...; Conseil na...; Parlement ...; Bailliage ...; Université...; La vie des...)
  - "fk_abob_type_collective_actor":   3.51% empty -     3 (  0.01%) uniques (eg: 1051; <NA>; 1052)
  -               "certainty_begin":   3.60% empty -     4 (  0.02%) uniques (eg: 1; <NA>; 3; 2)
  -                 "certainty_end":   4.92% empty -     4 (  0.02%) uniques (eg: <NA>; 1; 3; 2)
  -                    "begin_year":  52.06% empty -   805 (  3.66%) uniques (eg: 1969; 1945; <NA>; 1477; 1971; 2007)
  -                   "notes_begin":  68.69% empty -     6 (  0.03%) uniques (eg: 2; <NA>; 3; 1; 4; 5)
  -                      "end_year":  68.98% empty -   326 (  1.48%) uniques (eg: <NA>; 1998; 1789; 1818; 2005; 1791)
  -                     "no

## Save data

In [33]:
# Persist the filtered and cleaned table at collective_actor_path using
# geovpylib's save helper.
# NOTE(review): presumably written as CSV given the path's extension — confirm.
u.save_df(collective_actor, collective_actor_path)