# Exploratory Analysis of table `actor`

In [1]:
# Python lib
import os
import pandas as pd
import csv

# External lib
import plotly.express as px
import plotly.io as pio
# Default plotly renderer: two renderer names joined with '+', so both are used
# whenever a figure is displayed in this notebook.
pio.renderers.default = "plotly_mimetype+notebook"
# Purple gradient, darkest to lightest. The plotting palette walks up the
# gradient and back down again without repeating the lightest shade.
_purple_shades = [
    '#322659', '#44337A', '#553C9A', '#6B46C1', '#805AD5',
    '#9F7AEA', '#B794F4', '#D6BCFA', '#E9D8FD', '#FAF5FF',
]
gv_color_seq = _purple_shades + _purple_shades[-2::-1]

# Local lib
import toolkit as tk

# Open the database connection; the connection setting is read from the
# YELLOW_BHP environment variable (None is passed through if it is unset).
yellow_bhp = os.environ.get('YELLOW_BHP')
tk.db_connect(yellow_bhp, verbose=False)

# Load the full `bhp.actor` table
actor = tk.db_execute('select * from bhp.actor')

## Filter unwanted columns

According to the wiki page, we can get rid of those columns:
- `standard_text_property`
- `count_text_property`
- `concat_names`

In [2]:
# Columns listed as unneeded in the section text above are removed; the result
# is rebound to `actor` rather than mutating the frame in place.
unwanted_columns = ['standard_text_property', 'count_text_property', 'concat_names']
actor = actor.drop(columns=unwanted_columns)

## Table extract

In [3]:
# Show five randomly drawn rows (no seed is set, so the selection differs between runs).
actor.sample(n=5)

Unnamed: 0,pk_actor,concat_actr,concat_standard_name,begin_year,certainty_begin,notes_begin,end_year,certainty_end,notes_end,gender_iso,notes,fk_abob_type_actor,creator,creation_time,modifier,modification_time
56385,61929,Actr61929,"Lenoir, Alfred",1850.0,1,,1920.0,1,,1,,104.0,133.0,2019-01-30 19:07:47.160,133.0,2019-01-30 19:08:32
42367,15629,Actr15629,"Monnier, Pierre [à identifier]",,1,,,1,,1,,104.0,28.0,2008-12-04 16:35:57.000,11.0,2016-03-30 10:24:15
53287,1668,Actr1668,"Zañartu, Juan de",1635.0,3,,1695.0,1,,1,,104.0,27.0,2008-11-09 00:10:48.000,50.0,2016-10-20 11:43:57
24066,30898,Actr30898,"Le Faucheur, Michel",1582.0,1,,1657.0,1,,1,,104.0,9.0,2010-03-30 17:15:41.000,11.0,2013-12-18 15:35:49
11441,2988,Actr2988,"Ortensio, Martino",1605.0,1,,1639.0,1,,1,,104.0,3.0,2008-11-09 00:12:53.000,11.0,2013-12-18 15:35:49


## Filter only wanted rows

Some rows have been flagged as not to be imported (see this [wiki page](https://github.com/geovistory/symogih/wiki/Liste-des-balises-des-entit%C3%A9s-%C3%A0-ne-pas-importer)).

In [4]:
# Markers found in `concat_standard_name` that flag a row as "do not import".
# They are regular expressions: raw strings keep the backslashes that escape the
# literal square brackets (a plain '\[' is an invalid string escape sequence).
excluded_name_markers = [
    r'\[à identifier\]',
    r'\[ne pas importer\]',
    'DOUBLON',
    'Doublon',
    'réutiliser',
    'REUTILISER',
]

len_before = len(actor)
print(f'Rows number before filter: {len_before}')

# A single pass with the alternation of all markers removes exactly the rows the
# marker-by-marker filtering would remove. `na=False` makes a missing name count
# as "no marker" instead of producing a NaN mask that cannot be negated.
has_marker = actor['concat_standard_name'].str.contains(
    '|'.join(excluded_name_markers), regex=True, na=False
)
actor = actor[~has_marker]

len_after = len(actor)

print(f'Rows number after filter: {len_after} ({len_before - len_after} have been removed)')

Rows number before filter: 61556
Rows number after filter: 59526 (2030 have been removed)


## Filter by Actor type

For now we are interested only in persons. 

Persons are the rows whose `fk_abob_type_actor` column equals 104.

In [5]:
# Persons are the actors of type 104; build the mask once and use it both ways.
is_person = actor['fk_abob_type_actor'] == 104

# Everything that is not a person is set aside and shown for inspection.
not104 = actor[~is_person]
print(f'Number of not 104 actors: {len(not104)}\n')

display(not104)

# Keep persons only; the type column then holds a single value, so it is dropped.
actor = actor[is_person].drop(columns=['fk_abob_type_actor'])

Number of not 104 actors: 3



Unnamed: 0,pk_actor,concat_actr,concat_standard_name,begin_year,certainty_begin,notes_begin,end_year,certainty_end,notes_end,gender_iso,notes,fk_abob_type_actor,creator,creation_time,modifier,modification_time
10340,59031,Actr59031,"Forster, James",1830.0,3,3.0,1930.0,3.0,3.0,1,,106.0,81.0,2016-11-29 11:05:00.060,81.0,2016-11-29 11:05:00
28940,60660,Actr60660,"Valjean, Jean",1769.0,1,,1833.0,1.0,,1,,106.0,122.0,2018-10-23 16:48:50.050,122.0,2018-10-23 16:48:50
46002,46914,Actr46914,Dieu (conception chrétienne),,1,,,,,0,,106.0,3.0,2013-07-04 11:43:15.990,3.0,2013-12-18 15:24:16


## Discovery

In [6]:
# Summarise every column of the filtered table with the local toolkit helper.
# Judging by the output below, it reports per column the share of empty values,
# the number of unique values and a few example values (two here, via uniq_ex_nb).
tk.discover(actor, uniq_ex_nb=2)

Columns contain:
Total number of rows: 59523
  -             "pk_actor":   0.00% empty - 59523 (100.00%) uniques (eg: 44895; 47015)
  -          "concat_actr":   0.00% empty - 59523 (100.00%) uniques (eg: Actr44895; Actr47015)
  - "concat_standard_name":   0.00% empty - 56550 ( 95.01%) uniques (eg: Sainte-Mar...; Costantino...)
  -           "gender_iso":   0.00% empty -     3 (  0.01%) uniques (eg: 1; 2)
  -        "creation_time":   0.00% empty - 34441 ( 57.86%) uniques (eg: 2012-04-08...; 2013-07-26...)
  -    "modification_time":   0.00% empty - 13973 ( 23.47%) uniques (eg: 2013-12-18...; 2016-10-21...)
  -              "creator":   0.01% empty -    88 (  0.15%) uniques (eg: 43.0; 30.0)
  -             "modifier":   8.92% empty -    85 (  0.14%) uniques (eg: 2.0; 30.0)
  -      "certainty_begin":   9.42% empty -     4 (  0.01%) uniques (eg: 3; 1)
  -        "certainty_end":  14.48% empty -     5 (  0.01%) uniques (eg: 3; None)
  -           "begin_year":  18.56% empty -   847 (  1.

## Type parsing

Based on the discovery output above, we cast each column to its most meaningful type.

In [7]:
# `certainty_end` contains blank (' ') entries; turn them into None before the
# column is cast. The dict form is used on purpose: `replace(' ', None)` is
# ambiguous (older pandas releases treat the None as "no value given" and
# forward-fill the previous row instead). The result is assigned back because an
# in-place call on a selected column is not guaranteed to update the frame.
actor['certainty_end'] = actor['certainty_end'].replace({' ': None})

# Target type of every remaining column, handed to the local toolkit helper.
# NOTE(review): the exact dtypes behind 'int' / 'string' / 'datetime' are decided
# by tk.set_types, which is not visible here.
column_types = {
    "pk_actor": 'int',
    "modification_time": 'datetime',
    "creation_time": 'datetime',
    "concat_standard_name": 'string',
    "concat_actr": 'string',
    "creator": 'int',
    "gender_iso": 'string',
    "modifier": 'int',
    "certainty_begin": 'int',
    "certainty_end": 'int',
    "begin_year": 'int',
    "end_year": 'int',
    "notes_begin": 'int',
    "notes_end": 'int',
    "notes": 'string',
}
tk.set_types(actor, column_types)


## Columns analysis

Here we report noteworthy findings for individual columns. The analysis is not exhaustive.

For some columns, we will update their value.

### gender_iso

Some gender values are undefined. According to the ISO standard (ISO/IEC 5218), the value should be 0, 1, 2 or 9, so we replace undefined genders with 0.

In [8]:
# Missing genders get the code '0' (a string, since the column was parsed as 'string').
# The filled column is assigned back: calling replace(..., inplace=True) on a
# selected column works on an intermediate object and is not guaranteed to
# update `actor` (it does not under pandas Copy-on-Write).
actor['gender_iso'] = actor['gender_iso'].fillna('0')

# tk.histogram(actor, 'gender_iso', title='Gender distribution', style='bar', colors=gv_color_seq)

### certainty_begin

We replace missing values with 0.

In [9]:
# Missing begin certainties become 0. The filled column is assigned back:
# calling replace(..., inplace=True) on a selected column works on an
# intermediate object and is not guaranteed to update `actor` (it does not
# under pandas Copy-on-Write).
actor['certainty_begin'] = actor['certainty_begin'].fillna(0)

# tk.histogram(actor, 'certainty_begin', title='Begin certainty distribution', style='pie', colors=gv_color_seq)

### begin_year

In [10]:
# px.violin(actor, x='begin_year', color='gender_iso', height=600, title='Violin plot of the "begin_year" column, by gender')

In [11]:
# px.violin(actor.sort_values(by='certainty_begin'), x='begin_year', color='certainty_begin', height=600, title='Violin plot of the "begin_year" column, by certainty')

In [12]:
# px.violin(actor.sort_values(by='certainty_begin'), x='begin_year', y='gender_iso', color='certainty_begin', height=1000, title='Violin plot of the "begin_year" column, by certainty, for each gender')

### certainty_end

We replace missing values with 0.

In [13]:
# Missing end certainties become 0. The filled column is assigned back:
# calling replace(..., inplace=True) on a selected column works on an
# intermediate object and is not guaranteed to update `actor` (it does not
# under pandas Copy-on-Write).
actor['certainty_end'] = actor['certainty_end'].fillna(0)

# tk.histogram(actor, 'certainty_end', title='End certainty distribution', style='pie', colors=gv_color_seq)

### end_year

In [14]:
# px.violin(actor, x='end_year', color='gender_iso', height=600, title='Violin plot of the "end_year" column, by gender')

In [15]:
# px.violin(actor.sort_values(by='certainty_end'), x='end_year', color='certainty_end', height=600, title='Violin plot of the "end_year" column, by certainty')

In [16]:
# px.violin(actor.sort_values(by='certainty_end'), x='end_year', y='gender_iso', color='certainty_end', height=1000, title='Violin plot of the "end_year" column, by certainty, for each gender')

### creation_time

In [17]:
# px.violin(actor, x='creation_time', title='Violin plot of the "creation_time" column')

### creator

In [18]:
# tk.histogram(actor, 'creator', '10 firsts creators distribution', 10)

### notes

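Non-ASCII characters, `\r\n` line breaks and `<p>`/`</p>` tags are removed. Note that the replacement is applied to every column of the table, not only to `notes`.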

In [19]:
# Regex patterns stripped from the table, applied one after the other in this
# order: runs of non-ASCII characters, Windows line breaks, then opening and
# closing paragraph tags. Each pass runs over the whole frame.
stripped_patterns = (r'[^\x00-\x7F]+', '\r\n', '<p>', '</p>')

for pattern in stripped_patterns:
    actor.replace({pattern: ''}, regex=True, inplace=True)

---

In [20]:
# Export the cleaned table as a semicolon-separated file without the index;
# csv.QUOTE_NONNUMERIC (== 2) quotes every non-numeric field.
actor.to_csv(
    '../../data/actor.csv',
    sep=';',
    index=False,
    quoting=csv.QUOTE_NONNUMERIC,
)