In [27]:
import pandas as pd
import numpy as np

import geovpylib.utils as u
import geovpylib.analysis as a

# BHP actor cleansing

The goal of this notebook is to cleanse the list of prepared BHP actors according to the entity recognition produced by the record linkage, complemented by a manual review of the data.

## Get data

In [28]:
# Load the prepared actors and discard the columns this cleansing step does not use.
bhp_actors = u.read_df('../../../data/prepared/bhp_actors.csv').drop(
    columns=['first_name', 'last_name', 'certainty_birth', 'certainty_death']
)

# u.infos(bhp_actors)

In [29]:
# Load the record-linkage report, keeping only the columns the cleansing logic reads.
entity_recognition = u.read_df('../../../reports/bhp_entity_recognition_mp.csv').loc[
    :, ['pk_l', 'pk_r', 'chosen_one_pk', 'please_import', 'remarks']
]

# u.infos(entity_recognition)

## Translate entity recognitions

In [30]:
# Primary keys of the actors that have to be deleted from the final list.
# Starts empty; the recognition cells below add to it.
blacklist = []

### "complémentaires" recognitions

Here the goal is to merge the two entities into the master one.

In [31]:
# Build one record list per attribute; each list becomes a dataframe keyed by `pk`
# so that the merged values can be left-joined back together afterwards.
names = []
definitions = []
genders = []
birth_years = []
death_years = []


def _first_value(series):
    """Return the first value of `series`, or NaN when `series` has no rows."""
    return series.iloc[0] if len(series) > 0 else np.nan


def _keep_or_fallback(keep_rows, erase_rows, column):
    """Return the first `column` value of `keep_rows`, or of `erase_rows` when that value is missing."""
    value = _first_value(keep_rows[column])
    if pd.isna(value):
        value = _first_value(erase_rows[column])
    return value


# Select only the information we need from the entity recognition file
complementaries = entity_recognition.loc[entity_recognition['remarks'] == 'complémentaires'].rename(
    columns={'chosen_one_pk': 'keep'}
)
# The entity to erase is whichever member of the pair was not chosen.
complementaries['erase'] = complementaries['pk_r'].where(
    complementaries['pk_l'] == complementaries['keep'], complementaries['pk_l']
)
complementaries = complementaries[['keep', 'erase']]


# Apply the "complémentaire" logic
for keep_pk, erase_pk in complementaries.itertuples(index=False, name=None):
    keep = bhp_actors[bhp_actors['pk'] == keep_pk]
    erase = bhp_actors[bhp_actors['pk'] == erase_pk]

    selection = pd.concat([keep, erase])

    # Names: every distinct spelling found on either entity
    names.extend({'pk': keep_pk, 'name': name} for name in selection['name'].unique())

    # Definitions: every distinct (definition, language) couple found on either entity
    distinct_definitions = selection[['definition', 'definition_lang']].drop_duplicates()
    for definition, definition_lang in distinct_definitions.itertuples(index=False, name=None):
        definitions.append({'pk': keep_pk, 'definition': definition, 'definition_lang': definition_lang})

    # Gender, birth year, death year: the master's value wins, the erased entity only fills a gap.
    # The helper returns NaN instead of raising when a pk has no row in `bhp_actors`.
    genders.append({'pk': keep_pk, 'gender': _keep_or_fallback(keep, erase, 'gender')})
    birth_years.append({'pk': keep_pk, 'birth_year': _keep_or_fallback(keep, erase, 'birth_year')})
    death_years.append({'pk': keep_pk, 'death_year': _keep_or_fallback(keep, erase, 'death_year')})

    # Black list
    blacklist.append(erase_pk)

# Into dataframes. Columns are given explicitly so that each frame still has its `pk`
# key column when no "complémentaires" pair was found.
names = pd.DataFrame(names, columns=['pk', 'name'])
definitions = pd.DataFrame(definitions, columns=['pk', 'definition', 'definition_lang'])
genders = pd.DataFrame(genders, columns=['pk', 'gender'])
birth_years = pd.DataFrame(birth_years, columns=['pk', 'birth_year'])
death_years = pd.DataFrame(death_years, columns=['pk', 'death_year'])

### "données pauvres" recognition

Here we are simply going to blacklist both entities.

In [32]:
# Pairs flagged as poor or inconsistent data: keep just the two primary keys of each pair.
poor_data = (
    entity_recognition
    .loc[entity_recognition['remarks'] == 'données pauvres ou incohérentes', ['pk_l', 'pk_r']]
    .rename(columns={'pk_l': 'pk1', 'pk_r': 'pk2'})
)

# Both members of every such pair join the black list, which is then de-duplicated.
blacklist = np.unique(
    blacklist
    + poor_data['pk1'].tolist()
    + poor_data['pk2'].tolist()
).tolist()

### "doute" recognition

For these data, actually there is nothing to do: both entities have to be imported as such. Should they be linked via a "has to be merged" property?

### "prénom inconnu" recognition

The same applies here: if we have no first name, the entities should be created separately.

### "triplon louis palandre 63414" recognition

Here we have only one triple recognition. So, in order to save time, we will handle this one manually.

So, as far as the code is concerned, both of the other entities have to be blacklisted.

We can do that because we made sure that the only difference between the 3 entities is their definition.

So in the end, the work that would need to be done is to add the definition of entity 63405 and 63384 to Louis Palandre.

### "empty" recognition

For all the other recognitions, if we do not have a `chosen_one_pk`, it means that we keep both entities. As previously explained, this results in doing nothing here.

However, when there is a `chosen_one_pk`, we have to blacklist the not chosen one, and do nothing with the chosen one.

In [33]:
# Pairs without any remark but with a chosen entity: only the non-chosen member is discarded.
# `.copy()` makes `empty` an independent frame, so adding a column below does not
# write to a slice of `entity_recognition`.
empty = entity_recognition[
    entity_recognition['remarks'].isna() & entity_recognition['chosen_one_pk'].notna()
].copy()

# Erase the left entity when the right one was chosen, otherwise erase the right one.
empty['erase'] = empty['pk_l'].where(empty['chosen_one_pk'] == empty['pk_r'], empty['pk_r'])

# Keep `blacklist` a plain list: the other cells extend it with `blacklist + [...]`,
# which would turn into element-wise addition if it were a numpy array.
blacklist = np.unique(list(blacklist) + empty['erase'].tolist()).tolist()

## Aggregation

### Result from entity recognition

In [34]:
# Start from every chosen master pk, attach each merged attribute table in turn,
# then keep only fully populated rows, de-duplicate and renumber.
# NOTE(review): `dropna` uses its default `how='any'`, so a merged entity missing a single
# attribute (for example an unknown death year) is discarded here — confirm this is intended.
result = (
    pd.DataFrame({'pk': entity_recognition['chosen_one_pk'].dropna().tolist()})
    .merge(names, on='pk', how='left')
    .merge(definitions, on='pk', how='left')
    .merge(genders, on='pk', how='left')
    .merge(birth_years, on='pk', how='left')
    .merge(death_years, on='pk', how='left')
    .dropna(subset=['name', 'definition', 'definition_lang', 'gender', 'birth_year', 'death_year'])
    .drop_duplicates()
    .reset_index(drop=True)
)
u.parse_df(result)

result

Unnamed: 0,pk,name,definition,definition_lang,gender,birth_year,death_year
0,22087,septimus andreas fabricius,Mdecin Nuremberg,fra,Male,1641,1705
1,22087,septimus andreas fabricius,Arzt,deu,Male,1641,1705
2,22087,s. a. fabricius,Mdecin Nuremberg,fra,Male,1641,1705
3,22087,s. a. fabricius,Arzt,deu,Male,1641,1705
4,22620,jean deriennes,"*Dieppe 1591, La Flche 5.VI.1662. Jsuite, prof...",fra,Male,1591,1662
...,...,...,...,...,...,...,...
145,60554,louis gabriel escher,Industriel mtallurgiste n le 27 novembre 1819 ...,fra,Male,1819,1887
146,60554,louis gabriel oescher,Industriel mtallurgiste. Associ Louis Charles...,fra,Male,1819,1887
147,60554,louis gabriel oescher,Industriel mtallurgiste n le 27 novembre 1819 ...,fra,Male,1819,1887
148,2291,toms maluenda,"*Jtiva (Valence) 1566, 7.V.1628. Clbre dominic...",fra,Male,1565,1628


### Black list

In [35]:
# Report how many primary keys ended up on the black list.
print('Black list length: ', len(blacklist))

Black list length:  438


## Make BHP actors list clean again

In [36]:
# Show the actor table's shape before any row is removed, as a baseline for the steps below.
print('Before cleaning shape:', bhp_actors.shape)

Before cleaning shape: (70193, 7)


### Drop from blacklist

In [37]:
# Remove every actor whose primary key is on the black list.
# `isin` tests membership for the whole column at once instead of scanning the
# black list once per row in Python.
bhp_actors = bhp_actors[~bhp_actors['pk'].isin(blacklist)]

print('After removing blacklist shape:', bhp_actors.shape)

After removing blacklist shape: (69741, 7)


### Drop aggregated entities

In [38]:
# Remove the original rows of every entity that has a merged version in `result`;
# the merged rows are added back in the next step.
# `isin` replaces a comprehension that rebuilt `result['pk'].tolist()` for every actor row.
bhp_actors = bhp_actors[~bhp_actors['pk'].isin(result['pk'])]

print('After removing aggregated entities shape:', bhp_actors.shape)

After removing aggregated entities shape: (69687, 7)


### Add aggregated entities

In [39]:
# Append the merged rows from `result` below the remaining actors.
# NOTE(review): no `ignore_index` is passed, so both frames keep their own row labels and
# index values may repeat — confirm `u.save_df` does not depend on the index.
bhp_actors = pd.concat([bhp_actors, result])

print('After adding aggregated entities shape:', bhp_actors.shape)

After adding aggregated entities shape: (69837, 7)


## Save file

In [40]:
# Write the cleansed actor list to a new file in the same folder as the prepared input.
u.save_df(bhp_actors, '../../../data/prepared/bhp_actors_cleaned.csv')