# Exploratory Analysis of table `actor`

In [None]:
# Python lib
import os
import pandas as pd
import csv

# External lib
import plotly.express as px
import plotly.io as pio
# Default renderer for every figure in this notebook: the "plotly_mimetype"
# and "notebook" renderers are both enabled ("+" combines renderer names).
pio.renderers.default = "plotly_mimetype+notebook"
# Purple palette going dark -> light -> dark: the ramp below is mirrored
# around the lightest shade ('#FAF5FF'), giving 19 colors in total.
_purple_ramp = [
    '#322659', '#44337A', '#553C9A', '#6B46C1', '#805AD5',
    '#9F7AEA', '#B794F4', '#D6BCFA', '#E9D8FD',
]
gv_color_seq = [*_purple_ramp, '#FAF5FF', *reversed(_purple_ramp)]

# Local lib
import toolkit as tk

# Connect to db: the value of the YELLOW_BHP environment variable is handed
# to the toolkit (None when the variable is not set).
db_url = os.environ.get('YELLOW_BHP')
tk.db_connect(db_url, verbose=False)

# Fetch data: load the whole `bhp.actor` table
actor = tk.db_execute('select * from bhp.actor')

## Filter unwanted columns

According to the wiki page, we can get rid of these columns:
- `standard_text_property`
- `count_text_property`
- `concat_names`

In [None]:
# Columns listed as not needed are removed from the table in place.
unwanted_columns = [
    'standard_text_property',
    'count_text_property',
    'concat_names',
]
actor.drop(columns=unwanted_columns, inplace=True)

## Table extract

In [None]:
# Show 5 randomly drawn rows (no seed is set, so the extract changes per run).
actor.sample(n=5)

## Filter only wanted rows

Some of the rows have been identified as not to be imported (see this [wiki page](https://github.com/geovistory/symogih/wiki/Liste-des-balises-des-entit%C3%A9s-%C3%A0-ne-pas-importer)).

In [None]:
len_before = len(actor)
print(f'Rows number before filter: {len_before}')

# Markers found in `concat_standard_name` that flag a row as "do not import".
# They are regular expressions, hence the raw strings and escaped brackets.
excluded_markers = [
    r'\[à identifier\]',
    r'\[ne pas importer\]',
    'DOUBLON',
    'Doublon',
    'réutiliser',
    'REUTILISER',
]

# One pass with an alternation instead of one pass per marker.
# na=False: a row with a missing name matches no marker and is kept, instead
# of producing a missing value in the mask that cannot be negated or indexed.
has_marker = actor['concat_standard_name'].str.contains(
    '|'.join(excluded_markers), regex=True, na=False
)
actor = actor[~has_marker]

len_after = len(actor)

print(f'Rows number after filter: {len_after} ({len_before - len_after} have been removed)')

## Filter by Actor type

For now we are interested only in persons. 

Persons are the rows whose `fk_abob_type_actor` column equals 104.

In [None]:
# Actor type id that identifies persons in `fk_abob_type_actor`.
PERSON_ACTOR_TYPE = 104

# Show what is about to be discarded: every actor that is not a person.
not104 = actor[actor['fk_abob_type_actor'] != PERSON_ACTOR_TYPE]
print(f'Number of not 104 actors: {len(not104)}\n')

display(not104)

# Keep persons only. The type column is then constant, so it is dropped.
# A non in-place drop returns a new frame, which avoids the
# SettingWithCopyWarning that an in-place drop on a filtered slice can raise.
actor = actor[actor['fk_abob_type_actor'] == PERSON_ACTOR_TYPE].drop(
    columns=['fk_abob_type_actor']
)

## Discovery

In [None]:
# Overview of the remaining columns, produced by the local toolkit.
# NOTE(review): `uniq_ex_nb=2` presumably sets how many example values are
# listed per column; confirm against `toolkit.discover`.
tk.discover(actor, uniq_ex_nb=2)

## Type parsing

Based on the discovery table above, we cast each column to its most meaningful type.

In [None]:
# A single blank in `certainty_end` stands for "no value": turn it into a
# missing value before the column is cast to int.
# The dict form is used on purpose: `replace(' ', None)` with a scalar is
# ambiguous in pandas (value=None can be read as "pad-fill from the previous
# row"), and an in-place call on a column selection may not update `actor`.
actor['certainty_end'] = actor['certainty_end'].replace({' ': None})

# Target type of every remaining column, applied by the local toolkit.
# NOTE(review): `notes_begin` / `notes_end` are cast to int; confirm that they
# really hold numeric values and not free text.
tk.set_types(actor, {
    "pk_actor": 'int',
    "modification_time": 'datetime',
    "creation_time": 'datetime',
    "concat_standard_name": 'string',
    "concat_actr": 'string',
    "creator": 'int',
    "gender_iso": 'string',
    "modifier": 'int',
    "certainty_begin": 'int',
    "certainty_end": 'int',
    "begin_year": 'int',
    "end_year": 'int',
    "notes_begin": 'int',
    "notes_end": 'int',
    "notes": 'string'
})


## Columns analysis

Here we report interesting findings for selected columns. The analysis is not exhaustive.

For some columns, we also update the values.

### gender_iso

We observe that some of the gender values are undefined. According to the ISO standard (ISO/IEC 5218), the value should be 0 (not known), 1, 2 or 9. So we replace the undefined genders with 0.

In [None]:
# Missing gender codes become '0'. The result is assigned back to the column:
# an in-place call on `actor['gender_iso']` is a chained assignment, which is
# deprecated and does not update `actor` under pandas Copy-on-Write.
actor['gender_iso'] = actor['gender_iso'].fillna('0')

tk.histogram(actor, 'gender_iso', title='Gender distribution', style='bar', colors=gv_color_seq)

### certainty_begin

We replace the missing values with 0.

In [None]:
# Missing certainty values become 0. The result is assigned back to the
# column: an in-place call on `actor['certainty_begin']` is a chained
# assignment, which is deprecated and does not update `actor` under pandas
# Copy-on-Write.
actor['certainty_begin'] = actor['certainty_begin'].fillna(0)

tk.histogram(actor, 'certainty_begin', title='Begin certainty distribution', style='pie', colors=gv_color_seq)

### begin_year

In [None]:
# Distribution of `begin_year`, one violin per `gender_iso` value.
px.violin(
    actor,
    x='begin_year',
    color='gender_iso',
    height=600,
    title='Violin plot of the "begin_year" column, by gender',
)

In [None]:
# Distribution of `begin_year`, one violin per `certainty_begin` value.
# Rows are sorted by certainty before plotting (presumably so the groups
# appear in ascending order in the figure).
by_begin_certainty = actor.sort_values(by='certainty_begin')
px.violin(
    by_begin_certainty,
    x='begin_year',
    color='certainty_begin',
    height=600,
    title='Violin plot of the "begin_year" column, by certainty',
)

In [None]:
# `begin_year` split along both dimensions: one row per `gender_iso` value
# (y axis) and one color per `certainty_begin` value.
by_begin_certainty = actor.sort_values(by='certainty_begin')
px.violin(
    by_begin_certainty,
    x='begin_year',
    y='gender_iso',
    color='certainty_begin',
    height=1000,
    title='Violin plot of the "begin_year" column, by certainty, for each gender',
)

### certainty_end

We replace the missing values with 0.

In [None]:
# Missing certainty values become 0. The result is assigned back to the
# column: an in-place call on `actor['certainty_end']` is a chained
# assignment, which is deprecated and does not update `actor` under pandas
# Copy-on-Write.
actor['certainty_end'] = actor['certainty_end'].fillna(0)

tk.histogram(actor, 'certainty_end', title='End certainty distribution', style='pie', colors=gv_color_seq)

### end_year

In [None]:
# Distribution of `end_year`, one violin per `gender_iso` value.
px.violin(
    actor,
    x='end_year',
    color='gender_iso',
    height=600,
    title='Violin plot of the "end_year" column, by gender',
)

In [None]:
# Distribution of `end_year`, one violin per `certainty_end` value.
# Rows are sorted by certainty before plotting (presumably so the groups
# appear in ascending order in the figure).
by_end_certainty = actor.sort_values(by='certainty_end')
px.violin(
    by_end_certainty,
    x='end_year',
    color='certainty_end',
    height=600,
    title='Violin plot of the "end_year" column, by certainty',
)

In [None]:
# `end_year` split along both dimensions: one row per `gender_iso` value
# (y axis) and one color per `certainty_end` value.
by_end_certainty = actor.sort_values(by='certainty_end')
px.violin(
    by_end_certainty,
    x='end_year',
    y='gender_iso',
    color='certainty_end',
    height=1000,
    title='Violin plot of the "end_year" column, by certainty, for each gender',
)

### creation_time

In [None]:
# Distribution of the record creation timestamps.
px.violin(
    actor,
    x='creation_time',
    title='Violin plot of the "creation_time" column',
)

### creator

In [None]:
# Histogram of the `creator` column, drawn by the local toolkit.
# NOTE(review): the 3rd and 4th arguments are passed positionally; they look
# like the chart title and a limit of 10 creators. Confirm against
# `toolkit.histogram`.
tk.histogram(actor, 'creator', '10 firsts creators distribution', 10)

### notes

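Non-ASCII characters, `\r\n` line breaks and the `<p>` / `</p>` HTML tags are removed. Note that the replacements are applied to the whole table, not only to the `notes` column.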

In [None]:
# Regex patterns deleted from the table, applied one after the other in this
# order: runs of non-ASCII characters, CR+LF line breaks, then opening and
# closing paragraph tags.
# NOTE(review): `DataFrame.replace` is called on the whole frame, so columns
# other than `notes` that hold text are cleaned as well; confirm this is
# intended.
patterns_to_strip = (r'[^\x00-\x7F]+', '\r\n', '<p>', '</p>')
for pattern in patterns_to_strip:
    actor.replace({pattern: ''}, regex=True, inplace=True)

---

In [None]:
# Export the cleaned table as a semicolon-separated CSV without the index.
# csv.QUOTE_NONNUMERIC (the named form of the former literal 2) quotes every
# non-numeric field.
actor.to_csv(
    '../../data/actor.csv',
    index=False,
    sep=';',
    quoting=csv.QUOTE_NONNUMERIC,
)