# Exploratory Analysis of table `actor_name`

In [None]:
# Standard library and pandas
import os
import pandas as pd
import datetime

# External lib (plotting)
import plotly.express as px
import plotly.io as pio
# Default renderer for every plotly figure in this notebook; '+' combines several renderers.
pio.renderers.default = "plotly_mimetype+notebook"
# Colour sequence passed to the language pie chart below: 10 shades followed by
# the first 9 again in reverse order (19 entries, symmetric).
gv_color_seq = ['#322659','#44337A','#553C9A','#6B46C1','#805AD5','#9F7AEA','#B794F4','#D6BCFA','#E9D8FD','#FAF5FF','#E9D8FD','#D6BCFA','#B794F4','#9F7AEA','#805AD5','#6B46C1','#553C9A','#44337A','#322659']


# Local lib
import toolkit as tk

# Connect to db
# The connection argument is read from the YELLOW_BHP environment variable;
# os.environ.get returns None when the variable is not set.
tk.db_connect(os.environ.get('YELLOW_BHP'), verbose=False)

# Load the whole bhp.actor_name table; the result is used with DataFrame
# methods (drop, sample, replace, to_csv) in the cells below.
actor_name = tk.db_execute('select * from bhp.actor_name')

## Filter unwanted columns

According to the wiki page, we can get rid of these columns:
- `name_type`
- `name_number`

In [None]:
# Columns listed as unneeded above; removed from the table in place.
unwanted_columns = ['name_type', 'name_number']
actor_name.drop(columns=unwanted_columns, inplace=True)

## Table extract

In [None]:
# Show five randomly chosen rows as a quick look at the data.
actor_name.sample(n=5)

## Discovery

In [None]:
# Column overview produced by the project's toolkit helper.
# NOTE(review): uniq_ex_nb=3 presumably sets how many unique example values are
# shown per column — confirm against tk.discover.
tk.discover(actor_name, uniq_ex_nb=3)

## Type parsing

Based on the discovery table above, we cast each column to its most meaningful type.

In [None]:
# Target type of every column, chosen from the discovery table.
column_types = {
    "pk_actor_name": 'int',
    "concat_acna": 'string',
    "creation_time": 'datetime',
    "is_standard_name": 'bool',
    "fk_actor": 'int',
    "concat_name": 'string',
    "creator": 'int',
    "name": 'string',
    "lang_iso": 'string',
    "modifier": 'int',
    "first_name": 'string',
    "modification_time": 'datetime',
    "fk_abob_name_type": 'int',
    "notes": 'string',
    "comment_begin_year": 'string',
    "comment_end_year": 'string',
    "apposition": 'string',
    "preposition": 'string',
    "particle": 'string',
    "title": 'string',
    "begin_year": 'int',
    "end_year": 'int',
    "ordinal_text": 'string',
    "ordinal_num": 'int',
    "begin_month": 'int',
    "begin_day": 'int',
    "end_month": 'int',
    "end_day": 'int',
}
tk.set_types(actor_name, column_types)

# Second pass: the date parts go through 'int' first and are then turned into
# strings so that they appear correctly and can be concatenated into dates
# further down.
# NOTE(review): presumably the integer pass strips a float rendering such as
# '1850.0' — confirm against tk.set_types.
date_part_columns = [
    "begin_year",
    "end_year",
    "begin_month",
    "end_month",
    "begin_day",
    "end_day",
]
tk.set_types(actor_name, {column: 'string' for column in date_part_columns})

## Columns analysis

Here we report interesting findings on individual columns. The analysis is not exhaustive.

For some of the columns, we will update their values.

### begin_date & end_date

We create two new columns by concatenating `begin_year`, `begin_month`, `begin_day` and `end_year`, `end_month`, `end_day` respectively.

In [None]:
def prefix_date(date):
    """Left-pad a year with zeros so that it is four characters long.

    ``strptime`` only parses years below 1000 when they are zero-filled to
    four digits, so short years have to be padded before the date parts are
    concatenated and parsed.

    Args:
        date: A year as a string or an integer, or a missing value.

    Returns:
        The missing value unchanged; a zero-padded, four-character string for
        years of one to three characters; otherwise ``date`` unchanged.
    """
    if pd.isna(date):
        return date
    text = str(date)
    # Pad every short year, not only three-character ones, so that years
    # below 100 also reach the four characters '%Y' needs.
    if 0 < len(text) < 4:
        return text.rjust(4, '0')
    return date

# Set the length of begin_year and end_year to 4, as '%Y' needs four digits
actor_name['begin_year'] = [prefix_date(d) for d in actor_name['begin_year']]
actor_name['end_year'] = [prefix_date(d) for d in actor_name['end_year']]

# Zero-pad months and days to two digits before concatenating. Without this,
# year 1850 / month 1 / day 12 becomes '1850112', which '%Y%m%d' reads as
# 2 November instead of 12 January. Missing values stay missing.
for part in ('begin_month', 'begin_day', 'end_month', 'end_day'):
    actor_name[part] = actor_name[part].str.zfill(2)

# Concatenate the parts into 'YYYYMMDD' strings, then drop the parts
actor_name['begin_date'] = actor_name['begin_year'] + actor_name['begin_month'] + actor_name['begin_day']
actor_name['end_date'] = actor_name['end_year'] + actor_name['end_month'] + actor_name['end_day']
actor_name.drop(columns=['begin_year', 'begin_month', 'begin_day', 'end_year', 'end_month', 'end_day'], inplace=True)

# Parse into datetime; missing strings become NaT.
# NOTE(review): strptime is kept rather than pd.to_datetime, presumably because
# early years fall outside the range of pandas nanosecond timestamps — confirm.
for column in ('begin_date', 'end_date'):
    actor_name[column] = [
        datetime.datetime.strptime(d, '%Y%m%d') if pd.notna(d) else pd.NaT
        for d in actor_name[column]
    ]

### creation_time 

In [None]:
# Violin plot of the values in the creation_time column.
px.violin(
    data_frame=actor_name,
    x='creation_time',
    title='Violin plot of the "creation_time" column',
)

### creator

In [None]:
# Chart of the 'creator' column drawn with the project's toolkit helper.
# NOTE(review): the trailing positional 10 is presumably a bin count or a
# top-N limit — confirm against tk.histogram's signature.
tk.histogram(actor_name, 'creator', 'Creator distribution', 10)

### lang_iso

This column is cleaned so that it follows ISO 639-2/T (three-letter codes, native form preferred, e.g. 'deu' instead of 'ger').

In [None]:
# Exact-match corrections from the raw codes found in the column.
lang_iso_fixes = {
    '   ': pd.NA,  # three blanks -> missing value
    'fr ': 'fra',
    'Fr ': 'fra',
    'FRA': 'fra',
    'ang': 'eng',
    'gre': 'ell',
    # TODO(review): 'gal' is unresolved (marked '?????' originally). It is
    # still blanked to an empty string, not a missing value — decide on the
    # proper code or switch to pd.NA.
    'gal': '',
}

# Assign the result back to the column. Calling replace(..., inplace=True) on
# actor_name['lang_iso'] acts on an intermediate Series: pandas 2.2 warns about
# it and under Copy-on-Write the DataFrame is left unchanged.
actor_name['lang_iso'] = actor_name['lang_iso'].replace(lang_iso_fixes)

In [None]:
# Pie-style chart of the cleaned 'lang_iso' column, drawn with the project's
# toolkit helper and coloured with the gv_color_seq palette.
tk.histogram(actor_name, 'lang_iso', 'Language distribution', style='pie', colors=gv_color_seq)

### notes

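HTML paragraph tags (`<p>`, `</p>`), non-ASCII characters and Windows line breaks (`\r\n`) are removed. Note that the replacement is applied to every column of the table, not only to `notes`.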

In [None]:
# Patterns removed from the table, applied one after the other in this order:
# runs of non-ASCII characters, CR+LF line breaks, then <p> and </p> tags.
# NOTE(review): replace runs on the whole DataFrame, so every text column
# (names included) loses its non-ASCII characters, not only `notes` — confirm
# this is intended.
patterns_to_strip = (r'[^\x00-\x7F]+', '\r\n', '<p>', '</p>')
for pattern in patterns_to_strip:
    actor_name.replace({pattern: ''}, regex=True, inplace=True)

---

In [None]:
# Write the cleaned table to disk as a semicolon-separated CSV, without the index.
actor_name.to_csv('../../data/actor_name.csv', sep=';', index=False)