In [1]:
# Python lib
import os
import pandas as pd

# External lib
import plotly.express as px

# Local lib
import toolkit as tk

# Connect to the database: tk.db_connect receives the value of the YELLOW_BHP
# environment variable (os.environ.get yields None when the variable is unset).
yellow_bhp = os.environ.get('YELLOW_BHP')
tk.db_connect(yellow_bhp)


>> Connecting to PGSQL Database ... Connected!


# Get data

In [2]:
# Fetch every row and column of bhp.actor, then call tk.infos on the result
# (in the recorded output it prints the shape followed by 2 rows).
actor_query = 'select * from bhp.actor'
actor = tk.db_execute(actor_query)
tk.infos(actor, 2)

Shape:  (61556, 19)


Unnamed: 0,pk_actor,concat_actr,concat_standard_name,begin_year,certainty_begin,notes_begin,end_year,certainty_end,notes_end,gender_iso,notes,fk_abob_type_actor,creator,creation_time,modifier,modification_time,concat_names,standard_text_property,count_text_property
0,44895,Actr44895,"Sainte-Marie Perrin, Antoine",1870.0,3,3,1930.0,3,3,1,<p>Il s'agit probablement d'Antoine Joseph Sai...,104.0,43.0,2012-04-08 01:11:47.600,2.0,2013-12-18 15:35:49,"sainte-marie perrin, antoine Sainte-Marie Perr...",,0
1,47015,Actr47015,Costantino da Carrara,1506.0,1,2,1545.0,3,4,1,,104.0,30.0,2013-07-26 14:08:34.100,30.0,2013-12-18 15:35:49,costantino da carrara Costantino da Carrara,,0


# Format data

In [3]:
# Shorter names for the raw bhp.actor columns (columns absent from the frame are ignored)
column_names = {
    'concat_actr': 'id',
    'concat_standard_name': 'name',
    'gender_iso': 'gender',
    'begin_year': 'birth_year',
    'certainty_begin': 'birth_year_certainty',
    'notes_begin': 'notes_birth',
    'end_year': 'death_year',
    'certainty_end': 'death_year_certainty',
    'notes_end': 'notes_death',
    'fk_abob_type_actor': 'fk_type',
    'creator': 'fk_creator',
    'modifier': 'fk_modifier',
}
actor = actor.rename(columns=column_names, errors='ignore')

# death_year_certainty: blank (' ') entries are turned into pd.NA before the integer cast
actor['death_year_certainty'] = actor['death_year_certainty'].replace(' ', pd.NA)

# Columns stored as nullable integers (Int64 keeps missing values as pd.NA)
nullable_int_columns = [
    'birth_year', 'birth_year_certainty', 'notes_birth',
    'death_year', 'death_year_certainty', 'notes_death',
    'gender', 'fk_type', 'fk_creator', 'fk_modifier',
]
for column in nullable_int_columns:
    actor[column] = actor[column].astype(pd.Int64Dtype())

# Timestamp columns parsed into datetimes
for column in ('creation_time', 'modification_time'):
    actor[column] = pd.to_datetime(actor[column])

# Use pd.NA as the fill value for the remaining missing entries
actor = actor.fillna(pd.NA)

tk.infos(actor, 2)

Shape:  (61556, 19)


Unnamed: 0,pk_actor,id,name,birth_year,birth_year_certainty,notes_birth,death_year,death_year_certainty,notes_death,gender,notes,fk_type,fk_creator,creation_time,fk_modifier,modification_time,concat_names,standard_text_property,count_text_property
0,44895,Actr44895,"Sainte-Marie Perrin, Antoine",1870,3,3,1930,3,3,1,<p>Il s'agit probablement d'Antoine Joseph Sai...,104,43,2012-04-08 01:11:47.600,2,2013-12-18 15:35:49,"sainte-marie perrin, antoine Sainte-Marie Perr...",,0
1,47015,Actr47015,Costantino da Carrara,1506,1,2,1545,3,4,1,,104,30,2013-07-26 14:08:34.100,30,2013-12-18 15:35:49,costantino da carrara Costantino da Carrara,,0


# Enlever tous les acteurs [à identifier]

In [4]:
# Drop every actor whose name contains the literal marker "[à identifier]"
# ("to be identified"), then renumber the index from 0.
# - The pattern is a raw string: in a normal string literal `\[` and `\]` are invalid
#   Python escape sequences (warning today, error in a future Python version).
# - na=False makes a missing name count as "no match": such rows are kept and the mask
#   stays purely boolean, so negating it and filtering with it cannot fail on missing values.
to_identify = actor['name'].str.contains(r'\[à identifier\]', na=False)
actor = actor[~to_identify].reset_index(drop=True)

tk.infos(actor, 2)

Shape:  (59625, 19)


Unnamed: 0,pk_actor,id,name,birth_year,birth_year_certainty,notes_birth,death_year,death_year_certainty,notes_death,gender,notes,fk_type,fk_creator,creation_time,fk_modifier,modification_time,concat_names,standard_text_property,count_text_property
0,44895,Actr44895,"Sainte-Marie Perrin, Antoine",1870,3,3,1930,3,3,1,<p>Il s'agit probablement d'Antoine Joseph Sai...,104,43,2012-04-08 01:11:47.600,2,2013-12-18 15:35:49,"sainte-marie perrin, antoine Sainte-Marie Perr...",,0
1,47015,Actr47015,Costantino da Carrara,1506,1,2,1545,3,4,1,,104,30,2013-07-26 14:08:34.100,30,2013-12-18 15:35:49,costantino da carrara Costantino da Carrara,,0


# Répartition des types d'acteurs

In [5]:
# Distribution of the actors over the fk_type values, drawn by the project helper
# tk.histogram (third argument is presumably the chart title — confirm in toolkit).
type_column = 'fk_type'
tk.histogram(actor, type_column, 'Type distribution')

## Détail des acteurs non 104

In [6]:
# Show the rows whose fk_type equals 106.
# NOTE(review): the section heading refers to actors that are not of type 104, while this
# filter keeps only type 106 — confirm that 106 is the only other type present.
is_type_106 = actor['fk_type'] == 106
actor[is_type_106]

Unnamed: 0,pk_actor,id,name,birth_year,birth_year_certainty,notes_birth,death_year,death_year_certainty,notes_death,gender,notes,fk_type,fk_creator,creation_time,fk_modifier,modification_time,concat_names,standard_text_property,count_text_property
10331,59031,Actr59031,"Forster, James",1830.0,3,3.0,1930.0,3.0,3.0,1,,106,81,2016-11-29 11:05:00.060,81,2016-11-29 11:05:00,"forster, james Forster, James",<p>Personnage de Jules Verne dans Le Tour du m...,1
28906,60660,Actr60660,"Valjean, Jean",1769.0,1,,1833.0,1.0,,1,,106,122,2018-10-23 16:48:50.050,122,2018-10-23 16:48:50,"valjean, jean Valjean, Jean","<p>Personnage de fiction, héros du roman ""Les ...",1
45842,46914,Actr46914,Dieu (conception chrétienne),,1,,,,,0,,106,3,2013-07-04 11:43:15.990,3,2013-12-18 15:24:16,dieu (conception chretienne) Dieu (conception ...,<p>La divinité dans les religions chétiennes</p>,1


# Analyse des dates de naissance

In [7]:
# Violin plot of birth_year, one trace per gender value, 700 px high
birth_year_violin = px.violin(actor, x='birth_year', color='gender', height=700)
birth_year_violin

# Analyse des dates de mort

In [8]:
# Violin plot of death_year, one trace per gender value, 700 px high
death_year_violin = px.violin(actor, x='death_year', color='gender', height=700)
death_year_violin

# Contribution des créateurs

In [9]:
# Distribution of the actors over the fk_creator values, drawn by tk.histogram.
# max_number is passed as 10 (presumably caps how many creators are shown — confirm in toolkit).
creators_shown = 10
tk.histogram(actor, 'fk_creator', 'Contribution des créateurs', max_number=creators_shown)

# Analyse des dates d'ajout des données

In [10]:
# Violin plot of the creation_time timestamps of the actor records
creation_time_violin = px.violin(actor, x='creation_time')
creation_time_violin