In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
# Show full cell contents when rendering frames (no column-width truncation)
pd.options.display.max_colwidth = None

In [4]:
## Pre-process the individual KB datasets

In [16]:
# Load the two knowledge-base exports (first CSV column becomes the index)
# and tag every row with the name of the source it came from.
os_kb_entities = pd.read_csv('kb_entities_open_sanctions.csv', index_col=0).assign(
    kb_origin='open_sanctions'
)
ls_kb_entities = pd.read_csv('kb_entities_lilsis.csv', index_col=0).assign(
    kb_origin='lilsis'
)

In [17]:
# Stack both sources into a single frame. The per-file index is moved into a
# regular column ('index', renamed to 'original_index') and replaced by a
# fresh 0..n-1 index.
kb_entities = (
    pd.concat([os_kb_entities, ls_kb_entities])
    .reset_index()
    .rename(columns={'index': 'original_index'})
)

In [18]:
# Size check of the combined frame: (number of rows, number of columns)
kb_entities.shape

(361048, 6)

In [19]:
# Peek at the first two rows to confirm the column layout of the combined frame
kb_entities.head(2)

Unnamed: 0,original_index,id,name,desc,AKA,kb_origin
0,0,acf-00040861bc3f593000830d987d09967ef3503ef1,Kolyvanov Egor,"Russian propagandist: host of news program ""Segodnia"" (""Today"") on NTV Federal 1980-11-15 male employees media Propagandists","['Kolyvanov Egor', 'Колыванов Егор']",open_sanctions
1,1,acf-0011c68a768924609dc5da5707ac7fa4c4d645a2,Shipov Sergei Yurievich,"Russian chess player, grandmaster, chess coach, commentator. Publicly supported Russia's war against Ukraine. 1966-04-17 male Warmongers Athletes","['Shipov Sergei Yurievich', 'Шипов Сергей Юрьевич']",open_sanctions


In [20]:
## Resolve duplicate entity IDs

In [21]:
# Collect every row whose 'id' occurs more than once (all occurrences are
# kept, not just the repeats), ordered by id and then by name.
redudant_entities_by_id = (
    kb_entities
    .loc[kb_entities['id'].duplicated(keep=False)]
    .sort_values(by=['id', 'name'])
)

In [22]:
# Take all rows with a shared 'id' out of the main frame.
# Original author's note: these were all cases where the ID was taken from Wikidata.
redudant_entities_indices = redudant_entities_by_id.index
kb_entities = kb_entities.drop(index=redudant_entities_indices)

In [23]:
# For each duplicated id, join the 'desc' strings of all its rows with single
# spaces; the result has one row per id with columns ['id', 'desc'].
# NOTE(review): this frame is not referenced again in the cells visible here,
# so the combined descriptions never reach kb_entities - confirm whether it
# was meant to be merged back.
redudant_entities_by_id_consolidated_desc = (
    redudant_entities_by_id
    .groupby('id')['desc']
    .apply(' '.join)
    .reset_index()
)

In [24]:
# Keep a single row per id: the first one. Which row is "first" depends on
# the frame already being sorted by [id, name].
redudant_entities_by_id = redudant_entities_by_id.drop_duplicates(
    subset='id',
    keep='first',
)

In [25]:
# Append the de-duplicated rows to the end of the main entity frame.
# No index reset happens here, so the appended rows keep their index labels.
kb_entities = pd.concat(
    [kb_entities, redudant_entities_by_id],
    axis=0,
)

In [26]:
# Normalise entity names to lower case
kb_entities = kb_entities.assign(name=kb_entities['name'].str.lower())

In [27]:
# Rows that agree on both the (lower-cased) name and the description are
# collapsed to their first occurrence.
kb_entities = kb_entities.drop_duplicates(subset=['name', 'desc'])

In [28]:
## Clean up data

In [29]:
# Remove special characters (Cyrillic, Chinese, Japanese, Arabic, numbers)

In [30]:
# Load the allowed-word list: the 'vocab' column of clean_vocab.csv, held as
# a set so membership tests are cheap.
clean_vocab = set(pd.read_csv('clean_vocab.csv', index_col=0)['vocab'])

In [31]:
# Rebuild each description from only those space-separated tokens that are
# present in clean_vocab; everything else is dropped. A description with no
# surviving token becomes the empty string.
kb_entities['desc'] = kb_entities['desc'].apply(
    lambda text: ' '.join(filter(clean_vocab.__contains__, text.split(' ')))
)

In [32]:
# Display the frame to eyeball the descriptions after the vocabulary filter
kb_entities

Unnamed: 0,original_index,id,name,desc,AKA,kb_origin
0,0,acf-00040861bc3f593000830d987d09967ef3503ef1,kolyvanov egor,Russian host of news program on NTV Federal male employees media Propagandists,"['Kolyvanov Egor', 'Колыванов Егор']",open_sanctions
1,1,acf-0011c68a768924609dc5da5707ac7fa4c4d645a2,shipov sergei yurievich,Russian chess chess Publicly supported war against male Warmongers Athletes,"['Shipov Sergei Yurievich', 'Шипов Сергей Юрьевич']",open_sanctions
2,2,acf-001e7e4c0363f08f1e784c230457960b84a6416f,egorov ivan mikhailovich,Deputy of the State Council of the Republic of Tatarstan from the United Russia major friend and neighbor of the head of the hid property from declaring subjects investigations male Regional,"['Egorov Ivan Mikhailovich', 'Егоров Иван Михайлович']",open_sanctions
3,3,acf-002c208139012c8d93b6298358188d7cadafe648,goreslavsky alexey sergeyevich,Russian journalist and media Helped destroy independent media in Russia when he was appointed the new of the previously popular independent in Russian journalist and media works at the Institute for the Development of Internet that focuses on creating propoganda for the youth of Russia Head of the Internet Development Responsible for organizing censorship in the Russian segment of the Internet political involved Federal subjects Individuals Organizers male Internet in of censors Investigations corruption employees repressions media Propagandists,"['Goreslavsky Alexey Sergeyevich', 'Гореславский Алексей Сергеевич']",open_sanctions
4,4,acf-002cc8fdf8fe41185091a7cb6c598663e7a22eb5,samoilova natalya vladimirovna,Russian Supported the actions of the Russian military during the invasion of leaders female opinion bloggers Sellout influencers and,"['Samoilova Natalya Vladimirovna', 'Самойлова Наталья Владимировна']",open_sanctions
...,...,...,...,...,...,...
131277,240626,Q6959189,nahapet gevorgyan,Armenian politician Armenians the Assembly Armenia member Armavir Gevorgyan of male National,"['Nahapet Gevorgyan', 'Նահապետ Գևորգյան', 'Геворгян, Наапет', 'Геворгян Наапет Багратович', 'Геворгян Наапет', 'Наапет Багратович Геворгян', 'Геворгян, Наапет Багратович', 'Геворгян Н. Б.', 'ناهاپت گوورگیان', 'Геворгян Н.', 'Наапет Геворгян']",open_sanctions
164305,274148,Q7281459,radnaasumbereliyn gonchigdorj,Mongolian politician the Assembly Province Arkhangai of State male Great Member Mongolia,"['Radnaasumbereliyn Gonchigdorj', '拉德那苏木贝尔勒·贡其格道尔吉', 'Раднаасумбэрэлийн Гончигдорж', 'ラドナアスムベレリン・ゴンチグドルジ', 'Раднаасүмбэрэлийн Гончигдорж']",open_sanctions
125067,234301,Q9131042,nyamaagiin enkhbold,Mongolian politician the Assembly Uliastai Defense male of State Affairs Great Minister Foreign Member Mongolia,"['Nyamaagiin Enkhbold', '尼亚马·恩赫包勒德', 'Nyamaa Enkhbold', 'Нямаагийн Энхболд']",open_sanctions
144084,253776,Q9190484,d. o. chaoke,Linguist of Tungusic languages Foreign Studies Congress Tokyo of Minzu University male China deputy National,"['D. O. Chaoke', '朝克', 'Dular Osor Chaoke', 'ドラール・オソル・朝克']",open_sanctions


In [33]:
## Resolve non-id ambiguities

In [34]:
## Add KB URLs

In [35]:
def url_generator(id_, name, dataset):
    """Build the public knowledge-base URL for a single entity.

    Parameters
    ----------
    id_ : entity identifier; interpolated into the URL unchanged.
    name : entity name; only used for the 'lilsis' URL slug.
    dataset : source knowledge base, 'open_sanctions' or 'lilsis'.

    Returns
    -------
    str or None
        The entity URL, or None when ``dataset`` is neither of the two
        supported values.
    """
    # Function-scope import keeps this cell runnable on its own.
    from urllib.parse import quote

    if dataset == 'open_sanctions':
        return f'https://www.opensanctions.org/entities/{id_}'
    if dataset == 'lilsis':
        # A raw name (e.g. 'john doe') would put spaces and other unsafe
        # characters into the URL. Spaces become underscores, everything else
        # that is not URL-safe is percent-encoded.
        # NOTE(review): underscore-for-space is assumed to match LittleSis
        # slugs, and the 'person' path segment is used for every entity -
        # confirm both against the LittleSis site.
        slug = quote(str(name).replace(' ', '_'), safe='')
        return f'https://littlesis.org/person/{id_}-{slug}'
    # Unknown dataset: no URL can be built.
    return None

In [36]:
# Build one URL per row. Rows whose origin is 'open_sanctions' get an
# OpenSanctions URL; every other origin is treated as 'lilsis'.
kb_entities['kb_url'] = kb_entities.apply(
    lambda row: url_generator(
        row['id'],
        row['name'],
        'open_sanctions' if row['kb_origin'] == 'open_sanctions' else 'lilsis',
    ),
    axis=1,
)

In [43]:
# List rows whose 'desc' is missing (NaN). An empty-string description is not
# NaN, so it would not show up in this check.
kb_entities[kb_entities['desc'].isna()]

Unnamed: 0,original_index,id,name,desc,AKA,kb_origin,kb_url


In [40]:
# Export all rows whose name is shared with at least one other row, sorted by
# name so entries with the same name sit next to each other in the CSV.
(
    kb_entities
    .loc[kb_entities.duplicated(subset=['name'], keep=False)]
    .sort_values(by='name')
    .to_csv('duplicate_full_name_aliases.csv')
)

In [44]:
# Persist the final entity table, index included, as CSV
kb_entities.to_csv(path_or_buf='kb_entities_full.csv')