# Merge Open Sanctions and LittleSis datasets into spaCy KB input file  

In [1]:
import pandas as pd
import pickle
import numpy as np
import re 
import datetime as dt

In [2]:
# Notebook display settings: up to 100 rows/columns, 100-character width,
# and no truncation of long cell contents (max_colwidth=None).
for option_name, option_value in {
    'display.max_rows': 100,
    'display.max_columns': 100,
    'display.width': 100,
    'display.max_colwidth': None,
}.items():
    pd.set_option(option_name, option_value)

In [None]:
def url_generator(id_, name, dataset):
    """
    Build the public knowledge-base URL of an entity.

    Args:
        id_: Entity identifier in the source dataset.
        name: Entity name; only used for 'lilsis' URLs, where it is
            inserted verbatim after the id.
        dataset: Source dataset label, 'open_sanctions' or 'lilsis'.

    Returns:
        The URL string, or None when ``dataset`` is neither of the
        two supported labels.
    """
    url = None
    if dataset == 'open_sanctions':
        url = f'https://www.opensanctions.org/entities/{id_}'
    elif dataset == 'lilsis':
        url = f'https://littlesis.org/person/{id_}-{name}'
    return url

## Import individual datasets 

In [None]:
# Load the two source tables (first CSV column becomes the index) and tag
# every row with the dataset it came from
os_kb_entities = pd.read_csv('../assets/open_sanctions_entities.csv', index_col=0)
os_kb_entities = os_kb_entities.assign(kb_origin='open_sanctions')
ls_kb_entities = pd.read_csv('../assets/lilsis_entities.csv', index_col=0)
ls_kb_entities = ls_kb_entities.assign(kb_origin='lilsis')

In [5]:
# Give both tables the same column names so they can be stacked
os_column_map = {'full_notes': 'desc'}
ls_column_map = {
    'primary_ext': 'schema',
    'end_date': 'deathdate',
    'aliases': 'AKA',
    'start_date': 'birthdate',
    'context': 'desc',
}
os_kb_entities = os_kb_entities.rename(columns=os_column_map)
ls_kb_entities = ls_kb_entities.rename(columns=ls_column_map)

In [6]:
# Stack both tables, then move the old index into a regular column: the
# 'index' column produced by reset_index() is renamed to 'original_index'
kb_entities = pd.concat([os_kb_entities, ls_kb_entities])
kb_entities = kb_entities.reset_index()
kb_entities = kb_entities.rename(columns={'index': 'original_index'})

In [7]:
# Drop columns that are not needed in the KB file.
# The columns are named via the `columns=` keyword: passing the axis
# positionally (`drop(labels, 1)`) is deprecated and rejected by pandas 2.x.
kb_entities.drop(
    columns=['schema', 'notes', 'context', 'types', 'start_date_sentence', 'end_date_sentence'],
    inplace=True,
)

  


In [8]:
# Display (rows, columns) of the merged table after dropping unused columns
kb_entities.shape

(503310, 11)

In [9]:
# Normalise the end of each description: when the text ends with a full stop
# (directly after a word character) followed by extra dots and/or whitespace,
# keep only that single full stop.
# Non-string values (e.g. NaN for a missing description) are passed through
# unchanged instead of making the regex substitution raise a TypeError.
trailing_stop_run = re.compile(r"\b(\.)[\.\s]+$")
kb_entities['desc'] = kb_entities['desc'].apply(
    lambda text: trailing_stop_run.sub(r"\1", text) if isinstance(text, str) else text
)

In [10]:
# Remove entities with no description: missing (NaN), empty, or made only of
# space characters. `.str.len()` yields NaN for missing values, and NaN > 0 is
# False, so those rows are dropped instead of raising as `apply(len)` would.
has_description = kb_entities['desc'].str.replace(' ', '', regex=False).str.len() > 0
# Work on an explicit copy so later in-place operations do not act on a slice
# of the unfiltered frame.
kb_entities = kb_entities[has_description].copy()

In [11]:
# Keep only the first row of every (name, description) pair
kb_entities = kb_entities.drop_duplicates(subset=['name', 'desc'])

In [12]:
# Display (rows, columns) after removing empty descriptions and (name, desc) duplicates
kb_entities.shape

(429953, 11)

In [13]:
# Preview the first two rows of the table
kb_entities.head(2)

Unnamed: 0,original_index,id,name,AKA,birthdate,deathdate,wikidataId,website,desc,kb_origin,birthplace
0,0,acf-00040861bc3f593000830d987d09967ef3503ef1,Kolyvanov Egor,,1980-11-15,,,,"Kolyvanov Egor is a Russian propagandist: host of news program ""Segodnia"" (""Today"") on NTV. This person has held these positions: Federal media employees, Propagandists. This person is a male. This person was born in 1980-11-15.",open_sanctions,
1,1,acf-0011c68a768924609dc5da5707ac7fa4c4d645a2,Shipov Sergei Yurievich,,1966-04-17,,,,"Shipov Sergei Yurievich is a Russian chess player, grandmaster, chess coach, commentator. Publicly supported Russia's war against Ukraine.. This person has held these positions: Athletes, Warmongers. This person is a male. This person was born in 1966-04-17.",open_sanctions,


## Resolve duplicate entity IDs

In [14]:
# Collapse rows that share the same 'id' into a single row per id.
# Step 1: collect every row whose 'id' occurs more than once (keep=False marks
# all occurrences, not just the repeats), ordered by id and then name.
redundant_entities_by_id=kb_entities[kb_entities['id'].duplicated(keep=False)].sort_values(['id','name'])
# Step 2: remove all of those rows from the main table
# (author's note: these were all cases where the ID was taken from Wikidata)
redundant_entities_indices=redundant_entities_by_id.index
kb_entities.drop(redundant_entities_indices, inplace=True)
# Step 3: build one row per duplicated id.
# NOTE(review): GroupBy.first() skips missing values by default, so the
# surviving row may combine fields from several duplicates rather than being
# the literal first row - confirm this consolidation is intended.
# NOTE(review): groupby drops NaN keys by default, so rows with a missing
# 'id' removed in step 2 would not be re-added - verify no 'id' is NaN.
redundant_entities_by_id_consolidated_desc=redundant_entities_by_id.groupby(['id']).first().reset_index()
# Step 4: append the consolidated rows and renumber the index from 0
kb_entities=pd.concat([kb_entities,redundant_entities_by_id_consolidated_desc])
kb_entities.reset_index(drop=True,inplace=True)

In [15]:
# Display (rows, columns) after collapsing rows with duplicate ids
kb_entities.shape

(429861, 11)

In [16]:
# Safety net: abort if any id is repeated, or if the number of distinct ids
# does not match the number of rows
has_repeated_ids = kb_entities['id'].duplicated().any()
id_count_mismatch = kb_entities['id'].nunique() != kb_entities.shape[0]
if has_repeated_ids or id_count_mismatch:
    raise Exception('There might be duplicate entitiy IDs in the KB file.')

## Resolve duplicates on birthdate, deathdate and website

In [17]:
# Merge rows that share a name AND the same value in one of a few identifying
# columns, keeping the values of the row with the longest description.
# Auxiliary column used only for ordering: length of the description field.
# NOTE(review): 'desc_len' is not dropped in this cell, so it stays in the
# table afterwards - confirm whether it should be exported.
kb_entities['desc_len']=kb_entities['desc'].str.len()
# handpicked columns providing contextual identity clues
redundancy_cols=['birthdate','deathdate','website']
for col in redundancy_cols:
    # All rows with a non-missing value in `col` that share (name, col) with at
    # least one other row, sorted by name and description length, both descending
    # (so the longest description comes first within each name)
    redundant_entities_by_col=kb_entities[~(kb_entities[col].isna())&
                (kb_entities.duplicated(['name',col],keep=False))
               ].sort_values(by=['name','desc_len'],ascending=False)

    # Remove every such row from the main table (for the current column,
    # not only birthdate)
    redundant_entities_indices=redundant_entities_by_col.index
    kb_entities.drop(redundant_entities_indices, inplace=True)

    # One row per (name, col) group.
    # NOTE(review): GroupBy.first() skips missing values by default, so fields
    # that are empty in the longest-description row may be filled from another
    # duplicate - confirm this is intended.
    redundant_entities_by_col_consolidated_desc=redundant_entities_by_col.groupby(['name',col]).first().reset_index()

    # Append the consolidated rows and renumber the index from 0 so the next
    # iteration works on unique index labels
    kb_entities=pd.concat([kb_entities,redundant_entities_by_col_consolidated_desc])
    kb_entities.reset_index(drop=True,inplace=True)

In [18]:
# Display (rows, columns) after merging duplicates on birthdate, deathdate and website
kb_entities.shape

(428519, 12)

## Further wrangling

In [19]:
# Entities whose name occurs more than once, ordered by name
shares_name = kb_entities.duplicated(subset=['name'], keep=False)
kb_duplicated = kb_entities[shares_name].sort_values(by=['name'])

In [20]:
# Display (rows, columns); the previous cell only built kb_duplicated and did not modify kb_entities
kb_entities.shape

(428519, 12)

In [21]:
# Standardise names by title-casing every entity name
title_cased_names = kb_entities['name'].str.title()
kb_entities['name'] = title_cased_names

In [22]:
# Ensure person name is included in all descriptions: rows whose description
# does not mention the entity's name (case-insensitive) get the sentence
# "This person is called <name>. " prepended.
naming_string = 'This person is called '
# The mask is built in one pass with a comprehension instead of a row-wise
# DataFrame.apply(axis=1): far cheaper on a table this size, and well-defined
# when no row matches (row-wise apply on an empty selection returns a
# DataFrame rather than a Series).
name_missing_from_desc = np.fromiter(
    (name.lower() not in desc.lower()
     for name, desc in zip(kb_entities['name'], kb_entities['desc'])),
    dtype=bool,
    count=len(kb_entities),
)
# Kept under its original name for any later cell that inspects the affected rows
name_not_in_notes_indices = kb_entities.index[name_missing_from_desc].values
kb_entities.loc[name_missing_from_desc, 'desc'] = (
    naming_string
    + kb_entities.loc[name_missing_from_desc, 'name']
    + '. '
    + kb_entities.loc[name_missing_from_desc, 'desc']
)

In [23]:
# Ensure descriptions end in stop mark (missing descriptions are skipped).
# str.endswith is used instead of indexing the last character so an empty
# description gets a stop mark appended rather than raising IndexError.
has_desc = kb_entities['desc'].notna()
kb_entities.loc[has_desc, 'desc'] = kb_entities.loc[has_desc, 'desc'].apply(
    lambda text: text if text.endswith('.') else text + '.'
)

In [24]:
# Clean up data by collapsing doubled stop marks ("..", ". .") found anywhere
# in a description into ". ".
# The pattern is a raw string: '\.' and '\s' are invalid escape sequences in a
# normal string literal and trigger a DeprecationWarning (SyntaxWarning on
# Python 3.12+).
multi_stopmarks_expr = re.compile(r'\.\s?\.')

for expr in [multi_stopmarks_expr]:
    # Replace expression in string, only for rows that have a description
    has_desc = kb_entities['desc'].notna()
    kb_entities.loc[has_desc, 'desc'] = kb_entities.loc[has_desc, 'desc'].apply(
        lambda text, pattern=expr: pattern.sub('. ', text)
    )

In [25]:
# Add KB URLs: rows originating from Open Sanctions get an Open Sanctions
# URL, every other row gets a LittleSis URL
kb_entities['kb_url'] = kb_entities.apply(
    lambda row: url_generator(
        row['id'],
        row['name'],
        'open_sanctions' if row['kb_origin'] == 'open_sanctions' else 'lilsis',
    ),
    axis=1,
)

## Export KB dataset

In [26]:
# Write the KB table to ../assets, labelled 'full' plus today's date (YYYY_MM_DD)
kb_iteration = dt.datetime.now().strftime('%Y_%m_%d')
dataset = '_'.join(('full', kb_iteration))
output_path = f'../assets/kb_entities_{dataset}.csv'
kb_entities.to_csv(output_path)