In [1]:
# Destination path for the cleaned actor_name table; passed to u.save_df in the final cell.
actor_name_path = '../../data/bhp/actor_name.csv'

In [2]:
import os
import pandas as pd
import datetime

import geovpylib.utils as u
import geovpylib.analysis as a
import geovpylib.database as db

# The value of the YELLOW_BHP environment variable is handed to the project's
# connection helper (None is passed through when the variable is unset).
db.connect_external(os.getenv('YELLOW_BHP'))

# Project helper instance; presumably used for progress/ETA reporting — TODO confirm.
eta = u.Eta()

>> Connecting to PGSQL Database ... Connected!


## Fetch full table

In [3]:
# Load every column and row of bhp.actor_name through the project's db helper.
# The result is used as a pandas DataFrame by the following cells.
actor_name = db.query('select * from bhp.actor_name')

## Filter unwanted columns

According to the wiki page, the following columns can be dropped:
- `name_type`
- `name_number`

In [4]:
# Discard the two columns listed above; the frame is modified in place.
actor_name.drop(labels=['name_type', 'name_number'], axis='columns', inplace=True)

## Remove binary characters

In [5]:
# Strip Windows line breaks and paragraph tags from every cell of the frame.
# Patterns are applied one after the other (each treated as a regex), in place.
for pattern in ('\r\n', '<p>', '</p>'):
    actor_name.replace({pattern: ''}, regex=True, inplace=True)

# Project helper; presumably strips remaining binary/control characters in place — TODO confirm.
u.remove_binary_chars(actor_name)

## Discovery

In [6]:
# Print a per-column summary of the frame (share of empty values, number of unique values).
# uniq_ex_nb=3 appears to set how many example values are listed per column.
a.discover(actor_name, uniq_ex_nb=3)

Columns contain:
Total number of rows: 67293
  -      "pk_actor_name":   0.00% empty - 67293 (100.00%) uniques (eg: 49829; 49830; 49832)
  -        "concat_name":   0.00% empty - 63640 ( 94.57%) uniques (eg: Otte, Bern...; Staud, Joh...; Roma, Giul...)
  -      "creation_time":   0.00% empty - 40469 ( 60.14%) uniques (eg: 2013-02-20...; 2013-02-20...; 2013-02-20...)
  -           "fk_actor":   0.00% empty - 61555 ( 91.47%) uniques (eg: 46706; 46707; 46709)
  -              "notes":   0.00% empty -   420 (  0.62%) uniques (eg: None; ; Se fait ap...)
  -   "comment_end_year":   0.00% empty -    12 (  0.02%) uniques (eg: None; ; Nom parfoi...)
  -         "apposition":   0.00% empty -  1892 (  2.81%) uniques (eg: None; Acquanegra; Loyola)
  -        "preposition":   0.00% empty -    37 (  0.05%) uniques (eg: None; dit de; de)
  -              "title":   0.00% empty -   229 (  0.34%) uniques (eg: None; d'; de)
  - "comment_begin_year":   0.00% empty -    25 (  0.04%) uniques (eg: None; ; E

## Type parsing

Based on the discovery output above, each column is cast to its most meaningful type.

In [7]:
# First pass: cast every column to its target type through the project helper
# (its return value is not used, so it presumably converts in place — TODO confirm).
# The date parts are cast to 'int' here and re-cast to 'string' just below.
a.set_types(actor_name, {
    'pk_actor_name': 'int',
    'concat_acna': 'string',
    'creation_time': 'datetime',
    'is_standard_name': 'bool',
    'fk_actor': 'int',
    'concat_name': 'string',
    'creator': 'int',
    'name': 'string',
    'lang_iso': 'string',
    'modifier': 'int',
    'first_name': 'string',
    'modification_time': 'datetime',
    'fk_abob_name_type': 'int',
    'notes': 'string',
    'comment_begin_year': 'string',
    'comment_end_year': 'string',
    'apposition': 'string',
    'preposition': 'string',
    'particle': 'string',
    'title': 'string',
    'begin_year': 'int',
    'end_year': 'int',
    'ordinal_text': 'string',
    'ordinal_num': 'int',
    'begin_month': 'int',
    'begin_day': 'int',
    'end_month': 'int',
    'end_day': 'int',
})

# Second pass: turn the six date parts back into strings so that they appear
# correctly and can be concatenated into a single date string later on.
a.set_types(actor_name, {
    column: 'string'
    for column in ('begin_year', 'end_year', 'begin_month', 'end_month', 'begin_day', 'end_day')
})

## Handle data

### begin and end dates

We create two new columns, `begin_date` and `end_date`, by concatenating `begin_year`, `begin_month`, `begin_day` and `end_year`, `end_month`, `end_day` respectively. The six original columns are then dropped.

In [8]:
def prefix_date(date):
    """Left-pad a year with zeros so that it is four characters long.

    ``datetime.strptime`` only parses ``%Y`` when the year is zero-filled to
    four digits, so every short year has to be padded: '987' -> '0987',
    '87' -> '0087', '7' -> '0007'. The previous version only handled
    three-character years.

    Args:
        date: The year, as a string (or anything ``str()`` can render), or a
            missing value (None / NaN / pd.NA).

    Returns:
        The missing value unchanged when ``date`` is missing, otherwise the
        year as a string of at least four characters.
    """
    # Missing years must stay missing so that the concatenated date is missing too.
    if pd.isna(date):
        return date
    return str(date).zfill(4)

def _zero_pad(value, width):
    """Return ``value`` as a string left-padded with zeros to ``width``; missing values pass through."""
    return value if pd.isna(value) else str(value).zfill(width)


# Set the length of begin_year and end_year to 4
actor_name['begin_year'] = [prefix_date(d) for d in actor_name['begin_year']]
actor_name['end_year'] = [prefix_date(d) for d in actor_name['end_year']]

# Set the length of months and days to 2. Without this the concatenation below is
# ambiguous: '1987' + '1' + '11' and '1987' + '11' + '1' both give '1987111',
# which '%Y%m%d' can then read as the wrong date.
for part in ('begin_month', 'begin_day', 'end_month', 'end_day'):
    actor_name[part] = [_zero_pad(d, 2) for d in actor_name[part]]

# Replace existing 6 columns by 2 (YYYYMMDD strings).
# A missing year, month or day makes the whole concatenated date missing.
actor_name['begin_date'] = actor_name['begin_year'] + actor_name['begin_month'] + actor_name['begin_day']
actor_name['end_date'] = actor_name['end_year'] + actor_name['end_month'] + actor_name['end_day']
actor_name.drop(columns=['begin_year', 'begin_month', 'begin_day', 'end_year', 'end_month', 'end_day'], inplace=True)

# Parse into datetime; missing dates become NaT.
# NOTE(review): datetime.strptime is kept rather than pd.to_datetime, presumably because
# historical years fall outside the range pandas timestamps can represent — confirm.
actor_name['begin_date'] = [datetime.datetime.strptime(d, '%Y%m%d') if pd.notna(d) else pd.NaT for d in actor_name['begin_date']]
actor_name['end_date'] = [datetime.datetime.strptime(d, '%Y%m%d') if pd.notna(d) else pd.NaT for d in actor_name['end_date']]

### lang_iso

This column is cleaned to conform to ISO 639-2/T (three-letter codes, native form preferred, e.g. 'deu' instead of 'ger').

In [9]:
# Normalize the language codes. The cleaned Series is assigned back to the frame:
# calling replace(..., inplace=True) on actor_name['lang_iso'] is chained assignment,
# which pandas warns about and which does not update the DataFrame when
# Copy-on-Write is enabled.
actor_name['lang_iso'] = actor_name['lang_iso'].replace({
    '   ': pd.NA,   # blank placeholder -> missing value
    'fr ': 'fra',
    'Fr ': 'fra',
    'FRA': 'fra',
    'ang': 'eng',
    'gre': 'ell',
    # NOTE(review): 'gla' is Scottish Gaelic; confirm 'gal' was not meant as Galician ('glg').
    'gal': 'gla',
})

## Save data

In [10]:
# Write the cleaned frame to actor_name_path through the project's save helper.
u.save_df(actor_name, actor_name_path)