In [30]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Notebook-level settings. None of them is read by the cells visible in this
# notebook — presumably shared boilerplate consumed elsewhere; TODO confirm.
env = 'prod'
pk_project = -1
execute = False
metadata_str = ''
import_manner = 'one-shot' # alternative value noted by the author: 'batch'

import os
import pandas as pd
import numpy as np
from datetime import datetime
import duckdb
import plotly.express as px

import geovpylib.analysis as a
import geovpylib.database as db
import geovpylib.graphs as graphs
import geovpylib.pks as pks
import geovpylib.recordlinkage as rl
import geovpylib.sparql as sparql
import geovpylib.utils as u

# Instantiate the Eta helper from geovpylib.utils; it is not used by the
# cells visible in this notebook.
eta = u.Eta()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
# CSV location handed to u.save_df() in the final "Save data" cell
# (relative to the notebook's working directory).
collective_actor_path = '../../data/bhp/collective-actor.csv'

# Fetch table from database

In [32]:
# The connection setting comes from the YELLOW_BHP environment variable;
# os.environ.get() yields None when the variable is unset, and that value is
# passed on unchanged.
yellow_bhp_setting = os.environ.get('YELLOW_BHP')
db.connect_external(yellow_bhp_setting)

# Load every row and column of bhp.collective_actor.
fetch_all_sql = 'select * from bhp.collective_actor'
collective_actor = db.query(fetch_all_sql)

[DB] Connecting to PGSQL Database ... Connected!


In [33]:
# Remove, in place and in two passes, the columns that are not kept for the
# rest of the notebook.
for unwanted_columns in (
    ['standard_text_property', 'count_text_property', 'concat_names', 'concat_coac'],
    ['creator', 'modifier', 'creation_time', 'modification_time'],
):
    collective_actor.drop(columns=unwanted_columns, inplace=True)

# Target type for each remaining column, applied by geovpylib's set_types
# helper (key order kept as originally written).
column_types = dict(
    notes_begin='int',
    certainty_begin='int',
    notes_end='int',
    certainty_end='int',
    notes='string',
    fk_abob_type_collective_actor='int',
    begin_year='int',
    end_year='int',
)
a.set_types(collective_actor, column_types)

## Filter out unwanted rows

In [34]:
# Regex fragments marking a record that must not be exported. A row is
# dropped when its 'concat_standard_name' contains ANY of them (case-sensitive).
# Raw strings are used for the bracketed markers so that `\[` / `\]` reach the
# regex engine as escaped brackets instead of being invalid Python escape
# sequences (SyntaxWarning on recent Python versions).
excluded_name_patterns = [
    r'\[à identifier\]',
    r'\[ne pas importer\]',
    'DOUBLON',
    'Doublon',
    'réutiliser',
    'REUTILISER',
]

len_before = len(collective_actor)
print(f'Rows number before filter: {len_before}')

# One pass with an alternation removes exactly the rows the six successive
# filters removed. na=False keeps rows whose name is missing instead of
# failing when the mask is negated.
is_excluded = collective_actor['concat_standard_name'].str.contains(
    '|'.join(excluded_name_patterns), regex=True, na=False
)

# .copy() detaches the result from the unfiltered frame, so the in-place
# clean-up performed in later cells does not raise SettingWithCopyWarning.
collective_actor = collective_actor[~is_excluded].copy()

len_after = len(collective_actor)

print(f'Rows number after filter: {len_after} ({len_before - len_after} have been removed)')

Rows number before filter: 22717
Rows number after filter: 22027 (690 have been removed)


In [35]:
# Bare last expression: the notebook renders the filtered frame (pandas
# truncates the display to the first and last rows).
collective_actor

Unnamed: 0,pk_collective_actor,notes_begin,certainty_begin,notes_end,certainty_end,notes,fk_abob_type_collective_actor,begin_year,end_year,concat_standard_name
0,14725,2,1,,,,1051,1969,,Parti Socialiste (PS)
1,9712,2,1,2,1,,1051,1945,1998,Conseil national du patronat français (CNPF)
2,13649,,1,,1,,,,,Parlement des États de Bourgogne
3,14723,,,,,,1051,1477,1789,Bailliage de Chalon
4,14726,2,1,,,,1051,1971,,Université Paris 1 (Panthéon-Sorbonne)
...,...,...,...,...,...,...,...,...,...,...
22707,10200,,1,,1,,1051,,,Borgnis D. & Cie
22708,22801,2,1,,1,,1051,1881,,Ouest algérien (Compagnie de chemin de fer)
22709,10246,1,1,,1,,1051,1882,,Rosset (S.A.)(Fabrique de crêpes et grenadines)
22710,14264,1,1,,,,1051,1882,,Araud neveu et Eyraud C.et C°


## Filter by type

## Remove unwanted characters

In [48]:
# Frame-wide pass: strip Windows line breaks and paragraph tags from every
# column, using regex replacement, in place.
for unwanted_token in ('\r\n', '<p>', '</p>'):
    collective_actor.replace({unwanted_token: ''}, regex=True, inplace=True)

# Column-specific pass on 'notes': paragraph tags and carriage returns are
# removed, remaining line feeds become a single space.
# NOTE(review): the two tag replacements repeat the frame-wide pass —
# presumably kept because that pass does not reach the 'notes' column; confirm.
for needle, substitute in (('<p>', ''), ('</p>', ''), ('\r', ''), ('\n', ' ')):
    collective_actor.notes = collective_actor.notes.str.replace(needle, substitute)

# geovpylib helper, called for its effect on the frame (its return value is
# not used).
u.remove_binary_chars(collective_actor)

## Discovery

In [49]:
# Print a per-column profile (share of empty values, number of unique values,
# example values) of the cleaned frame via geovpylib's discover helper.
# NOTE(review): the second argument (6) appears to be the number of example
# values listed per column, judging from the cell output — confirm.
a.discover(collective_actor,6)

Columns contain:
Total number of rows: 22027
  -           "pk_collective_actor":   0.00% empty - 22027 (100.00%) uniques (eg: 14725; 9712; 13649; 14723; 14726; 14727)
  -          "concat_standard_name":   0.00% empty - 21540 ( 97.79%) uniques (eg: Parti Soci...; Conseil na...; Parlement ...; Bailliage ...; Université...; La vie des...)
  - "fk_abob_type_collective_actor":   3.50% empty -     5 (  0.02%) uniques (eg: 1051; <NA>; 1052; 153; 1053)
  -               "certainty_begin":   3.60% empty -     4 (  0.02%) uniques (eg: 1; <NA>; 3; 2)
  -                 "certainty_end":   4.91% empty -     4 (  0.02%) uniques (eg: <NA>; 1; 3; 2)
  -                    "begin_year":  52.06% empty -   806 (  3.66%) uniques (eg: 1969; 1945; <NA>; 1477; 1971; 2007)
  -                   "notes_begin":  68.67% empty -     6 (  0.03%) uniques (eg: 2; <NA>; 3; 1; 4; 5)
  -                      "end_year":  69.00% empty -   327 (  1.48%) uniques (eg: <NA>; 1998; 1789; 1818; 2005; 1791)
  -             

## Save data

In [38]:
# Write the cleaned frame to collective_actor_path using geovpylib's save_df
# helper.
u.save_df(collective_actor, collective_actor_path)