In [1]:
import pandas as pd

import geovpylib.utils as u
import geovpylib.analysis as a

# Relative path of the folder from which the BHP CSV files are read below.
bhp_path = '../../../data/bhp'

# Fetch data from BHP (from cleaned files)

In [2]:
# Read the BHP `actor` table.
actor = u.read_df(f'{bhp_path}/actor.csv')

# Expected type of every column of `actor`, in the order handed to set_types.
# NOTE(review): notes_begin / notes_end are declared 'int' while notes is
# 'string' -- confirm these two columns really hold numeric values.
actor_column_types = {
    'pk_actor': 'int',
    'modification_time': 'datetime',
    'creation_time': 'datetime',
    'concat_standard_name': 'string',
    'concat_actr': 'string',
    'creator': 'int',
    'gender_iso': 'int',
    'modifier': 'int',
    'certainty_begin': 'int',
    'certainty_end': 'int',
    'begin_year': 'int',
    'end_year': 'int',
    'notes_begin': 'int',
    'notes_end': 'int',
    'notes': 'string',
}

# The return value is not used; the call is made for its effect on `actor`.
a.set_types(actor, actor_column_types)

In [3]:
# Read the BHP `actor_name` table.
actor_name = u.read_df(f'{bhp_path}/actor_name.csv')

# Expected type of every column of `actor_name`, in the order handed to set_types.
actor_name_column_types = {
    'pk_actor_name': 'int',
    'concat_acna': 'string',
    'creation_time': 'datetime',
    'is_standard_name': 'bool',
    'fk_actor': 'int',
    'concat_name': 'string',
    'creator': 'int',
    'name': 'string',
    'lang_iso': 'string',
    'modifier': 'int',
    'first_name': 'string',
    'modification_time': 'datetime',
    'fk_abob_name_type': 'int',
    'notes': 'string',
    'comment_begin_year': 'string',
    'comment_end_year': 'string',
    'apposition': 'string',
    'preposition': 'string',
    'particle': 'string',
    'title': 'string',
    'begin_date': 'string',
    'end_date': 'string',
    'ordinal_text': 'string',
    'ordinal_num': 'int',
}

# The return value is not used; the call is made for its effect on `actor_name`.
a.set_types(actor_name, actor_name_column_types)

  return pd.read_csv(path, sep=';', quoting=2)


In [4]:
# Read the BHP `actor_text_property` table.
actor_text_property = u.read_df(f'{bhp_path}/actor_text_property.csv')

# Expected type of every column of `actor_text_property`, in the order handed
# to set_types.
actor_text_property_column_types = {
    'pk_actor_text_property': 'int',
    'property_type': 'string',
    'text': 'text',
    'fk_actor': 'int',
    'concat_actp': 'string',
    'creation_time': 'datetime',
    'creator': 'int',
    'lang_iso_code': 'string',
    'modifier': 'int',
    'modification_time': 'datetime',
    'notes': 'string',
}

# The return value is not used; the call is made for its effect on the frame.
a.set_types(actor_text_property, actor_text_property_column_types)

## Create Person table

In [5]:
# Names
# One row per actor_name record: the owning actor's key, the full name
# ("first last"), and each part on its own -- all lower-cased and stripped.
# dropna() then removes every row with a missing value in any column.
names = pd.DataFrame({
    'pk': actor_name['fk_actor'],
    'name': (actor_name['first_name'] + ' ' + actor_name['name']).str.lower().str.strip(),
    'first_name': actor_name['first_name'].str.lower().str.strip(),
    'last_name': actor_name['name'].str.lower().str.strip(),
}).dropna()

print('Number of names available:', len(names))

Number of names available: 59711


In [6]:
def set_gender(code):
    """Translate a gender ISO code into its label.

    Args:
        code: Gender code; 0, 1 and 2 are the recognised values.

    Returns:
        ``pd.NA`` for 0, ``'Male'`` for 1, ``'Female'`` for 2.

    Raises:
        ValueError: If ``code`` matches none of the recognised values.
    """
    # Candidates are compared with `==` in this fixed order.
    for known_code, label in ((0, pd.NA), (1, 'Male'), (2, 'Female')):
        if code == known_code:
            return label
    raise ValueError(f'Gender has code <{code}>')


# Gender
# Label each actor's gender code with set_gender; actors whose label is
# missing (code 0 maps to pd.NA) are removed by dropna().
genders = pd.DataFrame({
    'pk': actor['pk_actor'],
    'gender': list(map(set_gender, actor['gender_iso'])),
}).dropna()

print('Number of genders available:', len(genders))

Number of genders available: 57753


In [7]:
# Inspect the loaded text properties (the notebook renders a truncated preview).
actor_text_property

Unnamed: 0,pk_actor_text_property,property_type,lang_iso_code,text,notes,fk_actor,creator,modifier,creation_time,modification_time,concat_actp
0,29364,notice,fra,Directeur des carrires de la Maison Civet Crou...,,47735,2,2,2013-12-19 10:39:22.650,,AcTP29364
1,29366,notice,fra,Conseiller technique des affaires religieuses ...,,47736,50,50,2013-12-19 15:29:27.680,2013-12-19 16:23:25,AcTP29366
2,17991,notice,fra,Il a t propritaire d'une manufacture de coton ...,,40250,2,2,2010-11-18 11:15:05.000,2013-12-19 16:33:40,AcTP17991
3,29369,notice,fra,Marquis de Galatula et grand chancelier du Roy...,,3221,3,3,2013-12-26 18:32:25.160,,AcTP29369
4,29368,notice,fra,Thologien et abb bndictin,,47754,30,30,2013-12-26 12:46:43.630,2013-12-26 20:25:12,AcTP29368
...,...,...,...,...,...,...,...,...,...,...,...
53882,78376,notice,fra,Religieuse professe du Carmel de la Mre de Die...,,63566,1,1,2022-07-07 12:16:51.020,,AcTP78376
53883,78400,notice,fra,Religieuse professe converse du Carmel de la M...,,63590,1,1,2022-07-07 13:44:11.050,,AcTP78400
53884,78413,notice,fra,Religieuse professe du Carmel de l'Assomption ...,,63603,1,1,2022-10-10 12:10:13.170,,AcTP78413
53885,78418,notice,fra,Religieuse professe du Carmel de l'Assomption ...,,63608,1,1,2022-10-10 13:35:33.390,,AcTP78418


In [8]:
# Definitions
# Each text property becomes a (pk, definition, definition_lang) row; rows
# with any missing value are discarded.
definitions = (
    actor_text_property[['fk_actor', 'text', 'lang_iso_code']]
    .rename(columns={
        'fk_actor': 'pk',
        'text': 'definition',
        'lang_iso_code': 'definition_lang',
    })
    .dropna()
)

print('Number of definitions available:', len(definitions))
# An actor can own several text properties, hence the separate distinct count.
print('Number of unique entities having at least one definition:', definitions['pk'].nunique())

Number of definitions available: 52383
Number of unique entities having at least one definition: 44784


In [9]:
# Births
# The actor's begin_year is used as the birth year; actors without one are dropped.
births = (
    actor[['pk_actor', 'begin_year']]
    .rename(columns={'pk_actor': 'pk', 'begin_year': 'birth_year'})
    .dropna()
)

print('Number of births available:', len(births))

Number of births available: 34657


In [10]:
# Certainty attached to the birth (begin) year, keyed by actor.
births_certainty = (
    actor[['pk_actor', 'certainty_begin']]
    .rename(columns={'pk_actor': 'pk', 'certainty_begin': 'certainty_birth'})
    .dropna()
)

print('Number of births certainty available:', len(births_certainty))

Number of births certainty available: 59523


In [11]:
# Deaths
# The actor's end_year is used as the death year; actors without one are dropped.
deaths = (
    actor[['pk_actor', 'end_year']]
    .rename(columns={'pk_actor': 'pk', 'end_year': 'death_year'})
    .dropna()
)

print('Number of deaths available:', len(deaths))

Number of deaths available: 26205


In [12]:
# Certainty attached to the death (end) year, keyed by actor.
# Deaths are taken from the actor's end_* columns (death_year <- end_year), so
# the matching certainty is `certainty_end`. Reading `certainty_begin` here
# would merely duplicate certainty_birth.
deaths_certainty = (
    actor[['pk_actor', 'certainty_end']]
    .rename(columns={'pk_actor': 'pk', 'certainty_end': 'certainty_death'})
    .dropna()
)

print('Number of death certainty available:', len(deaths_certainty))

Number of death certainty available: 59523


In [13]:
# Merging all together
# Start from every actor key and left-join each attribute table on `pk`.
# `names` and `definitions` may hold several rows per actor, so the joins can
# yield more than one row per pk.
bhp_actors = (
    actor[['pk_actor']]
    .rename(columns={'pk_actor': 'pk'})
    .merge(names, on='pk', how='left')
    .merge(definitions, on='pk', how='left')
    .merge(genders, on='pk', how='left')
    .merge(births, on='pk', how='left')
    .merge(births_certainty, on='pk', how='left')
    .merge(deaths, on='pk', how='left')
    .merge(deaths_certainty, on='pk', how='left')
    # Discard rows that carry none of name / gender / birth year / death year.
    .dropna(subset=['name', 'gender', 'birth_year', 'death_year'], how='all')
    .drop_duplicates()
)

u.infos(bhp_actors, random=True)

Shape:  (70193, 11) - extract:


Unnamed: 0,pk,name,first_name,last_name,definition,definition_lang,gender,birth_year,certainty_birth,death_year,certainty_death
10271,895,claude dupuy,claude,dupuy,,,,1545.0,1,1594.0,1
50353,42072,gaston demoulin,gaston,demoulin,"Diplm des Arts et Mtiers (Chlons 1895), Gaston...",fra,Male,1879.0,2,1945.0,2
62243,61059,lon chevalier,lon,chevalier,Administrateur au conseil d'administration de ...,fra,Male,,0,,0
24691,56335,"etienne, adolphe bartin","etienne, adolphe",bartin,NR,fra,Male,1860.0,1,1948.0,1
30172,55654,christoph gudermann,christoph,gudermann,Mathematiker,deu,Male,1798.0,0,1852.0,0


In [14]:
# Persist the merged actor table through the project's save_df helper
# (target: prepared/bhp_actors.csv).
u.save_df(bhp_actors, '../../../data/prepared/bhp_actors.csv')