In [1]:
import pandas as pd

import geovpylib.utils as u
import geovpylib.analysis as a

# Fetch data from BHP (from cleaned files)

In [2]:
# Load the BHP actor export; the file is semicolon-separated.
actor = pd.read_csv(
    '../../data/actor.csv',
    sep=';',
)

# Declare the expected type of every actor column.
# The return value of `a.set_types` is not used, so it presumably converts
# `actor` in place -- TODO confirm against geovpylib.analysis.
a.set_types(
    actor,
    {
        'pk_actor': 'int',
        'modification_time': 'datetime',
        'creation_time': 'datetime',
        'concat_standard_name': 'string',
        'concat_actr': 'string',
        'creator': 'int',
        'gender_iso': 'int',
        'modifier': 'int',
        'certainty_begin': 'int',
        'certainty_end': 'int',
        'begin_year': 'int',
        'end_year': 'int',
        'notes_begin': 'int',
        'notes_end': 'int',
        'notes': 'string',
    },
)

In [3]:
# Load the BHP actor-name export; the file is semicolon-separated.
# `low_memory=False` makes pandas read the file in one pass instead of in
# chunks, which avoids mixed-type inference within a column.
actor_name = pd.read_csv(
    '../../data/actor_name.csv',
    sep=';',
    low_memory=False,
)

# Declare the expected type of every actor-name column.
# The return value of `a.set_types` is not used, so it presumably converts
# `actor_name` in place -- TODO confirm against geovpylib.analysis.
a.set_types(
    actor_name,
    {
        'pk_actor_name': 'int',
        'concat_acna': 'string',
        'creation_time': 'datetime',
        'is_standard_name': 'bool',
        'fk_actor': 'int',
        'concat_name': 'string',
        'creator': 'int',
        'name': 'string',
        'lang_iso': 'string',
        'modifier': 'int',
        'first_name': 'string',
        'modification_time': 'datetime',
        'fk_abob_name_type': 'int',
        'notes': 'string',
        'comment_begin_year': 'string',
        'comment_end_year': 'string',
        'apposition': 'string',
        'preposition': 'string',
        'particle': 'string',
        'title': 'string',
        'begin_date': 'string',
        'end_date': 'string',
        'ordinal_text': 'string',
        'ordinal_num': 'int',
    },
)

In [4]:
# Load the BHP actor text-property export; the file is semicolon-separated.
# `low_memory=False` makes pandas read the file in one pass instead of in
# chunks, which avoids mixed-type inference within a column.
actor_text_property = pd.read_csv(
    '../../data/actor_text_property.csv',
    sep=';',
    low_memory=False,
)

# Declare the expected type of every text-property column.
# The return value of `a.set_types` is not used, so it presumably converts
# `actor_text_property` in place -- TODO confirm against geovpylib.analysis.
a.set_types(
    actor_text_property,
    {
        'pk_actor_text_property': 'int',
        'property_type': 'string',
        'text': 'text',
        'fk_actor': 'int',
        'concat_actp': 'string',
        'creation_time': 'datetime',
        'creator': 'int',
        'lang_iso_code': 'string',
        'modifier': 'int',
        'modification_time': 'datetime',
        'notes': 'string',
    },
)

## Create Person table

In [5]:
# Names
# One row per `actor_name` record, keyed by the actor it refers to.
#   name       -> "<first_name> <name>", lower-cased then stripped
#   first_name -> lower-cased, stripped first name
#   last_name  -> lower-cased, stripped `name` column
# Rows with a missing value in any of the four columns are removed.
# NOTE(review): a record without a first name gets a missing `name` and is
# therefore dropped here -- confirm this is intended.
names = pd.DataFrame({
    'pk': actor_name['fk_actor'],
    'name': (actor_name['first_name'] + ' ' + actor_name['name']).str.lower().str.strip(),
    'first_name': actor_name['first_name'].str.lower().str.strip(),
    'last_name': actor_name['name'].str.lower().str.strip(),
}).dropna()

print('Number of names available:', names.shape[0])

Number of names available: 59711


In [6]:
def set_gender(code):
    """Translate a numeric gender code into its label.

    Args:
        code: Gender code of an actor. 0 means the gender is not known,
            1 means male, 2 means female. A missing value (``pd.NA``,
            ``NaN``, ``None``) is accepted and treated like "not known".

    Returns:
        ``'Male'`` or ``'Female'`` for codes 1 and 2, ``pd.NA`` when the
        code is 0 or missing.

    Raises:
        ValueError: If the code is present but is not 0, 1 or 2.
    """

    # Missing codes must be caught before any comparison: `pd.NA == 0`
    # evaluates to `pd.NA`, whose truth value is ambiguous and raises.
    if pd.isna(code):
        return pd.NA
    if code == 0:
        return pd.NA
    if code == 1:
        return 'Male'
    if code == 2:
        return 'Female'
    raise ValueError(f'Gender has code <{code}>')


# Gender
# Pair every actor key with its gender label; actors whose label comes back
# as missing from `set_gender` are removed by the `dropna`.
genders = pd.DataFrame({
    'pk': actor['pk_actor'],
    'gender': list(map(set_gender, actor['gender_iso'])),
}).dropna()

print('Number of genders available:', genders.shape[0])

Number of genders available: 57753


In [7]:
# Births
# Actor key plus `begin_year` (exposed as `birth_year`); rows where either
# value is missing are removed.
births = (
    actor[['pk_actor', 'begin_year']]
    .rename(columns={'pk_actor': 'pk', 'begin_year': 'birth_year'})
    .dropna()
)

print('Number of births available:', births.shape[0])

Number of births available: 48476


In [8]:
# Births certainty
# Actor key plus `certainty_begin` (exposed as `certainty_birth`); rows where
# either value is missing are removed.
births_certainty = (
    actor[['pk_actor', 'certainty_begin']]
    .rename(columns={'pk_actor': 'pk', 'certainty_begin': 'certainty_birth'})
    .dropna()
)

print('Number of births certainty available:', births_certainty.shape[0])

Number of births certainty available: 59523


In [9]:
# Deaths
# Actor key plus `end_year` (exposed as `death_year`); rows where either
# value is missing are removed.
deaths = (
    actor[['pk_actor', 'end_year']]
    .rename(columns={'pk_actor': 'pk', 'end_year': 'death_year'})
    .dropna()
)

print('Number of deaths available:', deaths.shape[0])

Number of deaths available: 29366


In [10]:
# Deaths certainty
# Actor key plus the certainty code of the end (death) date. The value must
# come from `certainty_end`: `certainty_begin` is the birth-side code, and
# reading it here would make `certainty_death` a copy of the birth certainty.
deaths_certainty = pd.DataFrame()
deaths_certainty['pk'] = actor['pk_actor']
deaths_certainty['certainty_death'] = actor['certainty_end']
# Keep only the actors for which both the key and the certainty are present.
deaths_certainty.dropna(inplace=True)

print('Number of death certainty available:', len(deaths_certainty))

Number of death certainty available: 59523


In [11]:
# Merging all together
# Start from the full list of actor keys and left-join every attribute table
# on `pk`, so an actor is kept even when some attributes are missing.
# NOTE(review): `names` is built from `actor_name`, which presumably holds
# several records per actor, so one actor may yield several rows -- confirm.
persons_bhp = (
    actor[['pk_actor']]
    .rename(columns={'pk_actor': 'pk'})
    .merge(names, on='pk', how='left')
    .merge(genders, on='pk', how='left')
    .merge(births, on='pk', how='left')
    .merge(births_certainty, on='pk', how='left')
    .merge(deaths, on='pk', how='left')
    .merge(deaths_certainty, on='pk', how='left')
    # Tag every row with the dataset it originates from.
    .assign(dataset='bhp')
    # Discard actors for which none of name, gender, birth year and death
    # year is known.
    .dropna(subset=['name', 'gender', 'birth_year', 'death_year'], how='all')
    # Remove rows that are exact duplicates of an earlier row.
    .drop_duplicates()
)

u.infos(persons_bhp, random=True)

Shape:  (62527, 10)


Unnamed: 0,pk,name,first_name,last_name,gender,birth_year,certainty_birth,death_year,certainty_death,dataset
39649,42691,gustave deval,gustave,deval,Male,1853.0,1,,1,bhp
61586,60109,,,,Female,1650.0,2,1731.0,2,bhp
41617,6359,andr pascal,andr,pascal,Male,1901.0,1,,1,bhp
40373,43622,,,,Male,,0,,0,bhp
23024,55721,joseph friedrich bierbrauer,joseph friedrich,bierbrauer,Male,1694.0,0,,0,bhp


In [12]:
# Persist the merged person table next to the input files.
# NOTE(review): the on-disk format is decided by `u.save_df`; the `.csv`
# target suggests CSV -- confirm in geovpylib.utils.
u.save_df(persons_bhp, '../../data/persons-bhp.csv')