In [1]:
# Relative path handed to u.save_df in the final cell to store the processed actor table.
actor_path = '../../data/actor.csv'

In [2]:
import os
import pandas as pd

import geovpylib.utils as u
import geovpylib.analysis as a

# The connection target is read from the YELLOW_BHP environment variable.
# Note that os.environ.get yields None when the variable is not set.
u.db_connect(os.environ.get('YELLOW_BHP'))

>> Connecting to PGSQL Database ... Connected!


## Fetch full table

In [3]:
# Pull every column and every row of bhp.actor through the project's DB helper.
fetch_all_actors_sql = 'select * from bhp.actor'
actor = u.db_execute(fetch_all_actors_sql)

## Filter unwanted columns

According to the wiki page, we can get rid of these columns:
- `standard_text_property`
- `count_text_property`
- `concat_names`

In [4]:
# Columns listed above as not needed for the import.
unwanted_columns = ['standard_text_property', 'count_text_property', 'concat_names']
actor = actor.drop(columns=unwanted_columns)

## Filter only wanted rows

Some rows have been flagged as not to be imported (see this [wiki page](https://github.com/geovistory/symogih/wiki/Liste-des-balises-des-entit%C3%A9s-%C3%A0-ne-pas-importer)).

In [5]:
len_before = len(actor)
print(f'Rows number before filter: {len_before}')

# Markers found in `concat_standard_name` that flag a row as "do not import".
# The bracketed ones are raw strings: in a plain string literal '\[' is an
# invalid escape sequence, which Python reports with a warning.
excluded_markers = [
    r'\[à identifier\]',
    r'\[ne pas importer\]',
    'DOUBLON',
    'Doublon',
    'réutiliser',
    'REUTILISER',
]

# A row is dropped as soon as any marker matches, so the markers are joined
# into one regex alternation and the column is scanned once.
# na=False treats a missing name as "no marker" so the `~` negation below
# always receives a boolean mask.
has_marker = actor['concat_standard_name'].str.contains(
    '|'.join(excluded_markers), regex=True, na=False
)
actor = actor[~has_marker]

len_after = len(actor)

print(f'Rows number after filter: {len_after} ({len_before - len_after} have been removed)')

Rows number before filter: 61556
Rows number after filter: 59526 (2030 have been removed)


## Filter by Actor type

For now we are interested only in persons. 

Persons are the rows whose `fk_abob_type_actor` column equals 104.

In [6]:
type_col = 'fk_abob_type_actor'

# Rows whose actor type differs from 104 are set aside and shown for inspection.
not104 = actor.loc[actor[type_col] != 104]
print(f'Number of not 104 actors: {len(not104)}\n')

display(not104)

# Only type 104 rows are kept; the type column then holds a single value,
# so it is dropped from the frame.
actor = actor.loc[actor[type_col] == 104].drop(columns=[type_col])

Number of not 104 actors: 3



Unnamed: 0,pk_actor,concat_actr,concat_standard_name,begin_year,certainty_begin,notes_begin,end_year,certainty_end,notes_end,gender_iso,notes,fk_abob_type_actor,creator,creation_time,modifier,modification_time
10340,59031,Actr59031,"Forster, James",1830.0,3,3.0,1930.0,3.0,3.0,1,,106.0,81.0,2016-11-29 11:05:00.060,81.0,2016-11-29 11:05:00
28940,60660,Actr60660,"Valjean, Jean",1769.0,1,,1833.0,1.0,,1,,106.0,122.0,2018-10-23 16:48:50.050,122.0,2018-10-23 16:48:50
46002,46914,Actr46914,Dieu (conception chrétienne),,1,,,,,0,,106.0,3.0,2013-07-04 11:43:15.990,3.0,2013-12-18 15:24:16


## Remove non-ASCII characters, line breaks and `<p>` tags

In [7]:
# Each pattern is replaced by an empty string across the whole frame, one
# pattern after the other, in the order listed.
# NOTE(review): the first pattern matches every non-ASCII character, so
# accented letters are removed too; confirm this is intended.
for pattern in (r'[^\x00-\x7F]+', '\r\n', '<p>', '</p>'):
    actor.replace({pattern: ''}, regex=True, inplace=True)

## Discovery

In [8]:
# Print a per-column summary of the frame (share of empty values, number of
# unique values and up to 5 example values per column).
a.discover(actor, uniq_ex_nb=5)

Columns contain:
Total number of rows: 59523
  -             "pk_actor":   0.00% empty - 59523 (100.00%) uniques (eg: 44895; 47015; 47190; 47578; 47630)
  -          "concat_actr":   0.00% empty - 59523 (100.00%) uniques (eg: Actr44895; Actr47015; Actr47190; Actr47578; Actr47630)
  - "concat_standard_name":   0.00% empty - 56542 ( 94.99%) uniques (eg: Sainte-Mar...; Costantino...; Duimio , A...; Zampa, Ang...; Isaresi, P...)
  -           "gender_iso":   0.00% empty -     3 (  0.01%) uniques (eg: 1; 2; 0)
  -        "creation_time":   0.00% empty - 34441 ( 57.86%) uniques (eg: 2012-04-08...; 2013-07-26...; 2013-10-18...; 2013-11-18...; 2013-11-24...)
  -    "modification_time":   0.00% empty - 13973 ( 23.47%) uniques (eg: 2013-12-18...; 2016-10-21...; 2014-09-12...; 2016-10-21...; 2016-10-21...)
  -              "creator":   0.01% empty -    88 (  0.15%) uniques (eg: 43.0; 30.0; 27.0; 2.0; 3.0)
  -             "modifier":   8.92% empty -    85 (  0.14%) uniques (eg: 2.0; 30.0; 50.0; 3.

## Handle data

In [9]:
# Results are assigned back to the frame instead of calling
# `replace(..., inplace=True)` on `actor[col]`: an in-place call on a selected
# column is chained assignment, which pandas deprecates and which leaves
# `actor` unchanged when Copy-on-Write is enabled.

# The ISO gender code must be 0, 1, 2 or 9: a missing gender becomes 0.
actor['gender_iso'] = actor['gender_iso'].replace(pd.NA, 0)

# A certainty that is not filled in becomes '0'.
actor['certainty_begin'] = actor['certainty_begin'].replace(pd.NA, '0')
# For the end certainty, a single-space entry counts as not filled in as well.
actor['certainty_end'] = actor['certainty_end'].replace(' ', pd.NA).replace(pd.NA, '0')

When the FileMaker database was imported into BHP, date certainty was set to 1 by default. So when a date is missing and its certainty is 1, the certainty should actually be 0.

In [10]:
# A missing year cannot carry a certainty: wherever the year is missing, the
# matching certainty is set to '0'. Boolean-mask assignment does this in one
# step per column instead of visiting every row with iterrows().
actor.loc[actor['begin_year'].isna(), 'certainty_begin'] = '0'
actor.loc[actor['end_year'].isna(), 'certainty_end'] = '0'

When a date's certainty is equal to 3, we can remove that date: such dates have been postulated (cf. [comment on issue](https://github.com/geovistory/project-symogih/issues/1#issuecomment-1485216923)).

In [11]:
# A certainty of 3 marks a postulated date: the year is removed and the
# certainty is reset to 0.
# The comparison is made on numeric values so that 3, 3.0 and '3' are all
# recognised; earlier cells write certainties as the string '0', so these
# columns may hold a mix of types. Unparseable entries become NaN and never match.
# Both masks are computed before any write, so each side is decided on the
# values present when the cell starts.
begin_postulated = pd.to_numeric(actor['certainty_begin'], errors='coerce').eq(3).fillna(False)
end_postulated = pd.to_numeric(actor['certainty_end'], errors='coerce').eq(3).fillna(False)

actor.loc[begin_postulated, 'begin_year'] = pd.NA
actor.loc[begin_postulated, 'certainty_begin'] = 0

actor.loc[end_postulated, 'end_year'] = pd.NA
actor.loc[end_postulated, 'certainty_end'] = 0

## Type parsing

In [12]:
# Single-space end certainties are turned into None before the cast.
# The result is assigned back to the frame: `replace(..., inplace=True)` on
# `actor['certainty_end']` is chained assignment, which pandas deprecates and
# which leaves `actor` unchanged when Copy-on-Write is enabled.
# The dict form states explicitly that None is the replacement value.
actor['certainty_end'] = actor['certainty_end'].replace({' ': None})

# Target type of every column, applied by the project's set_types helper.
a.set_types(actor, {
               "pk_actor": 'int', 
      "modification_time": 'datetime',
          "creation_time": 'datetime',
   "concat_standard_name": 'string',
            "concat_actr": 'string',
                "creator": 'int',
             "gender_iso": 'string',
               "modifier": 'int',
        "certainty_begin": 'int',
          "certainty_end": 'int',
             "begin_year": 'int',
               "end_year": 'int',
            "notes_begin": 'int',
              "notes_end": 'int',
                  "notes": 'string'
})

## Save data

In [13]:
# Hand the processed frame and the path defined in the first cell to the project's save helper.
u.save_df(actor, actor_path)