In [1]:
import pandas as pd
import pickle
import numpy as np
import re 
import datetime as dt

In [2]:
# Notebook display limits: up to 100 rows and 100 columns, wrapped at 100 characters.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
pd.options.display.width = 100

In [3]:
# Never truncate the text shown inside a cell.
pd.options.display.max_colwidth = None

In [4]:
# Load the two source knowledge bases and tag every row with the KB it came from.
# low_memory=False makes pandas infer each column's dtype from the whole file rather
# than chunk by chunk, so a column cannot end up holding a mix of parsed types.
# NOTE(review): the truncated stderr captured under this cell looks like the tail of a
# DtypeWarning - confirm; an explicit dtype= mapping would be stricter still.
os_kb_entities = pd.read_csv('open_sanctions_entities.csv', index_col=0, low_memory=False)
os_kb_entities['kb_origin'] = 'open_sanctions'
ls_kb_entities = pd.read_csv('lilsis_entities.csv', index_col=0, low_memory=False)
ls_kb_entities['kb_origin'] = 'lilsis'

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
# Give both frames a shared column vocabulary so they can be stacked.
os_column_map = {'full_notes': 'desc'}
ls_column_map = {
    'primary_ext': 'schema',
    'end_date': 'deathdate',
    'aliases': 'AKA',
    'start_date': 'birthdate',
    'context': 'desc',
}
os_kb_entities = os_kb_entities.rename(columns=os_column_map)
ls_kb_entities = ls_kb_entities.rename(columns=ls_column_map)

In [6]:
# Stack both knowledge bases. reset_index() moves the old index into a column,
# which is then renamed from 'index' to 'original_index'.
stacked_entities = pd.concat([os_kb_entities, ls_kb_entities])
kb_entities = stacked_entities.reset_index().rename(columns={'index': 'original_index'})

In [7]:
# Drop columns that are not used in the rest of the notebook.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which pandas
# deprecated in 1.3 and no longer accepts from 2.0 on.
kb_entities = kb_entities.drop(
    columns=['schema', 'notes', 'context', 'types', 'start_date_sentence', 'end_date_sentence']
)

  


In [8]:
# Row/column count after stacking both sources and dropping the unused columns.
kb_entities.shape

(503310, 11)

In [9]:
# Collapse a trailing run of full stops / whitespace that follows a word-ending "."
# into that single "." (e.g. "text. . " -> "text.").
# The vectorised .str accessor leaves a missing description as NaN, whereas calling
# re.sub row by row raises TypeError on a non-string value.
kb_entities['desc'] = kb_entities['desc'].str.replace(r"\b(\.)[\.\s]+$", r"\1", regex=True)

In [10]:
# Keep only rows whose description contains at least one non-space character.
# .str.len() yields NaN for a missing description, which fails the "> 0" test, so such
# rows are removed as well instead of raising inside len().
# .copy() detaches the result from the unfiltered frame so the in-place edits made in
# later cells do not trigger SettingWithCopyWarning.
has_description = kb_entities['desc'].str.replace(' ', '', regex=False).str.len() > 0
kb_entities = kb_entities[has_description].copy()

In [11]:
# Rows sharing both name and description are exact repeats: keep the first occurrence.
kb_entities = kb_entities.drop_duplicates(subset=['name', 'desc'])

In [12]:
# Size after removing empty descriptions and exact (name, desc) duplicates.
kb_entities.shape

(429953, 11)

In [13]:
## Resolve duplicate entity IDs

In [14]:
# Collect every row whose 'id' occurs more than once, ordered by id and then name.
# Rows without an id are excluded: duplicated() treats missing values as equal to each
# other, and the groupby on 'id' used to consolidate these rows skips NaN keys by
# default, so id-less rows would otherwise be dropped and never added back.
id_is_shared = kb_entities['id'].notna() & kb_entities['id'].duplicated(keep=False)
redundant_entities_by_id = kb_entities[id_is_shared].sort_values(['id', 'name'])

In [15]:
# Remove every row involved in an id clash from the main frame
# (original author's note: these were all cases where the ID was taken from Wikidata).
redundant_entities_indices = redundant_entities_by_id.index
kb_entities = kb_entities.drop(index=redundant_entities_indices)

In [16]:
# Collapse each duplicated id to a single row. GroupBy.first() takes, column by column,
# the first non-null value found in the group, so gaps in the first row are filled
# from the following rows with the same id.
redundant_entities_by_id_consolidated_desc = (
    redundant_entities_by_id.groupby('id', as_index=False).first()
)

In [17]:
# Append the consolidated rows to the main frame and renumber the index from 0.
kb_entities = pd.concat(
    [kb_entities, redundant_entities_by_id_consolidated_desc],
    ignore_index=True,
)

In [18]:
# Size after collapsing rows that shared an id.
kb_entities.shape

(429861, 11)

In [19]:
## Resolve duplicates on birthdate, deathdate and website

In [20]:
# Helper column: description length in characters, used as a sort key when choosing
# which of several duplicate rows comes first.
kb_entities = kb_entities.assign(desc_len=kb_entities['desc'].str.len())

In [21]:
# For each attribute in turn, rows that share both the name and a non-missing value of
# that attribute are treated as the same entity and merged into one row.
redundancy_cols = ['birthdate', 'deathdate', 'website']
for col in redundancy_cols:
    # Candidate rows: value present and the (name, value) pair occurs more than once.
    # Descending sort puts the longest description first within each name.
    has_value = kb_entities[col].notna()
    shares_name_and_value = kb_entities.duplicated(subset=['name', col], keep=False)
    redundant_entities_by_col = (
        kb_entities.loc[has_value & shares_name_and_value]
        .sort_values(by=['name', 'desc_len'], ascending=False)
    )

    # Take all candidate rows out of the main frame ...
    redundant_entities_indices = redundant_entities_by_col.index
    kb_entities = kb_entities.drop(index=redundant_entities_indices)

    # ... reduce each (name, value) group to one row; first() picks the first non-null
    # value per column, following the sort order above ...
    redundant_entities_by_col_consolidated_desc = (
        redundant_entities_by_col.groupby(['name', col], as_index=False).first()
    )

    # ... and put the merged rows back, renumbering the index from 0.
    kb_entities = pd.concat(
        [kb_entities, redundant_entities_by_col_consolidated_desc],
        ignore_index=True,
    )

In [22]:
# Size after de-duplicating on name + birthdate/deathdate/website
# (the desc_len helper accounts for the extra column).
kb_entities.shape

(428519, 12)

In [23]:
## Further duplicate inspection 

In [24]:
# Rows that still share a name with at least one other row, sorted by name so that
# candidates sit next to each other for manual inspection.
name_is_shared = kb_entities['name'].duplicated(keep=False)
kb_duplicated = kb_entities.loc[name_is_shared].sort_values(by='name')

In [25]:
# Unchanged from the previous check: the cell above only builds kb_duplicated.
kb_entities.shape

(428519, 12)

In [26]:
# Standardise names: str.title() upper-cases the first letter of each word and
# lower-cases the rest.
# NOTE(review): this runs after the name-based de-duplication steps earlier in the
# notebook, so names differing only in letter case were compared as distinct there -
# confirm that is intended.
kb_entities['name']=kb_entities['name'].str.title()

In [27]:
# Make sure every description mentions the entity's name (case-insensitive check);
# where it does not, prepend a short naming sentence.
# Both steps are vectorised instead of using row-wise DataFrame.apply, which is slow on
# a frame this size. NOTE(review): apply(axis=1) on an empty selection appears to return
# a DataFrame rather than a Series - the concatenation below avoids that case entirely.
names_lower = kb_entities['name'].str.lower()
descs_lower = kb_entities['desc'].str.lower()
name_missing = np.array(
    [name not in desc for name, desc in zip(names_lower, descs_lower)],
    dtype=bool,
)
name_not_in_notes_indices = kb_entities.index[name_missing].values

naming_string = 'This person is called '
kb_entities.loc[name_not_in_notes_indices, 'desc'] = (
    naming_string
    + kb_entities.loc[name_not_in_notes_indices, 'name']
    + '. '
    + kb_entities.loc[name_not_in_notes_indices, 'desc']
)

In [28]:
# Make sure every present description ends with a full stop.
# endswith() is used instead of indexing the last character (x[-1]), which raises
# IndexError on an empty string; an empty description is left unchanged.
has_desc = kb_entities['desc'].notna()
kb_entities.loc[has_desc, 'desc'] = kb_entities.loc[has_desc, 'desc'].apply(
    lambda text: text if (not text or text.endswith('.')) else text + '.'
)

In [29]:
# Replace every pair of full stops (optionally separated by one whitespace character),
# anywhere in the description, with a single ". ".
# The pattern is a raw string: in a plain literal "\." and "\s" are invalid escape
# sequences, which newer Python versions warn about.
multi_stopmarks_expr = re.compile(r'\.\s?\.')

for expr in [multi_stopmarks_expr]:
    # .str.replace accepts a compiled pattern and leaves missing descriptions as NaN.
    kb_entities['desc'] = kb_entities['desc'].str.replace(expr, '. ', regex=True)

In [30]:
## Add KB URLs

In [31]:
def url_generator(id_, name, dataset):
    """Build the public knowledge-base URL for an entity.

    Args:
        id_: Entity identifier, interpolated into the URL as-is.
        name: Entity name; only used for the 'lilsis' URL.
        dataset: Source knowledge base, 'open_sanctions' or 'lilsis'.

    Returns:
        The URL string, or None when ``dataset`` is neither of the two known values.
    """
    # NOTE(review): `name` is inserted verbatim, so spaces or other reserved characters
    # end up unescaped in the 'lilsis' URL - confirm whether that is acceptable.
    url = None
    if dataset == 'lilsis':
        url = f'https://littlesis.org/person/{id_}-{name}'
    elif dataset == 'open_sanctions':
        url = f'https://www.opensanctions.org/entities/{id_}'
    return url

In [32]:
def _kb_url_for(entity):
    """Return the KB URL for one row; any origin other than 'open_sanctions' is treated as 'lilsis'."""
    origin = 'open_sanctions' if entity['kb_origin'] == 'open_sanctions' else 'lilsis'
    return url_generator(entity['id'], entity['name'], origin)


# One URL per entity, computed row by row.
kb_entities['kb_url'] = kb_entities.apply(_kb_url_for, axis=1)

In [33]:
# Write the cleaned knowledge base to a CSV stamped with today's date (YYYY_MM_DD).
kb_iteration = dt.datetime.now().strftime('%Y_%m_%d')
dataset = '_'.join(['full', kb_iteration])
kb_entities.to_csv(f'kb_entities_{dataset}.csv')

In [34]:
# Load two previously exported snapshots (26 Oct 2022 and 7 Nov 2022) for comparison.
snapshot_paths = ('kb_entities_full_2022_10_26.csv', 'kb_entities_full_2022_11_07.csv')
data_26, data_07 = map(pd.read_csv, snapshot_paths)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [35]:
# Shape of the 2022-10-26 snapshot.
data_26.shape

(428519, 14)

In [36]:
# Shape of the 2022-11-07 snapshot.
data_07.shape

(428519, 14)

In [41]:
# Line up both snapshots' descriptions by entity id; an outer join keeps ids present in
# only one snapshot (desc_x = 2022-10-26, desc_y = 2022-11-07).
merge = pd.merge(
    data_26[['id', 'desc']],
    data_07[['id', 'desc']],
    how='outer',
    on='id',
)

In [42]:
# Show the ids whose description differs between the two snapshots.
desc_changed = merge['desc_x'].ne(merge['desc_y'])
merge[desc_changed]

Unnamed: 0,id,desc_x,desc_y
0,acf-00040861bc3f593000830d987d09967ef3503ef1,"Kolyvanov Egor is a Russian propagandist: host of news program ""Segodnia"" (""Today"") on NTV. This person has held these positions: Propagandists, Federal media employees. This person is a male. This person was born in 1980-11-15.","Kolyvanov Egor is a Russian propagandist: host of news program ""Segodnia"" (""Today"") on NTV. This person has held these positions: Federal media employees, Propagandists. This person is a male. This person was born in 1980-11-15."
3,acf-002c208139012c8d93b6298358188d7cadafe648,"Goreslavsky Alexey Sergeyevich is a Russian journalist and media manager. Helped destroy independent media in Russia when he was appointed the new editor-in-chief of the previously popular independent Lenta.ru in 2014 . Russian propagandist: journalist and media manager; works at the state-owned Institute for the Development of Internet that focuses on creating propoganda for the youth of Russia Head of the Internet Development Institute. Responsible for organizing censorship in the Russian segment of the Internet. This person has held these positions: Propagandists, Investigations subjects, Internet censors, Federal media employees, Individuals involved in corruption, Organizers of political repressions. This person is a male. This person was born in 1977-07-13.","Goreslavsky Alexey Sergeyevich is a Russian journalist and media manager. Helped destroy independent media in Russia when he was appointed the new editor-in-chief of the previously popular independent Lenta.ru in 2014 . Russian propagandist: journalist and media manager; works at the state-owned Institute for the Development of Internet that focuses on creating propoganda for the youth of Russia Head of the Internet Development Institute. Responsible for organizing censorship in the Russian segment of the Internet. This person has held these positions: Individuals involved in corruption, Organizers of political repressions, Federal media employees, Investigations subjects, Internet censors, Propagandists. This person is a male. This person was born in 1977-07-13."
4,acf-002cc8fdf8fe41185091a7cb6c598663e7a22eb5,"Samoilova Natalya Vladimirovna is a Russian singer, composer. Supported the actions of the Russian military during the invasion of Ukraine. This person has held these positions: Celebrities influencers and bloggers, Sellout opinion leaders. This person is a female. This person was born in 1987-06-24.","Samoilova Natalya Vladimirovna is a Russian singer, composer. Supported the actions of the Russian military during the invasion of Ukraine. This person has held these positions: Sellout opinion leaders, Celebrities influencers and bloggers. This person is a female. This person was born in 1987-06-24."
12,acf-00776ee02694fb36ef3d6f42fa95ddcdab7f76c8,"Vasilev Vladimir Lvovich is a Director and Editor-in-Chief of the newspaper ""Sovetskaia Chuvashiia"" (owned by the Government of the Chuvash Republic). This person has held these positions: Propagandists, Regional press. This person is a male.","Vasilev Vladimir Lvovich is a Director and Editor-in-Chief of the newspaper ""Sovetskaia Chuvashiia"" (owned by the Government of the Chuvash Republic). This person has held these positions: Regional press, Propagandists. This person is a male."
13,acf-008076c876c08dd7c54a05cbec8f82eeaa0af206,"Semchenkov Iurii Evgenevich is a Editor-in-Chief of the newspaper ""Smolenskaia gazeta"" (owned by the Government of the Smolensk region). This person has held these positions: Propagandists, Regional press. This person is a male.","Semchenkov Iurii Evgenevich is a Editor-in-Chief of the newspaper ""Smolenskaia gazeta"" (owned by the Government of the Smolensk region). This person has held these positions: Regional press, Propagandists. This person is a male."
...,...,...,...
428463,Q98819671,"Vadim Dmitrievich Ipatov is a Deputy Chairperson, Central Electoral Commission (CEC). As a Member of the CEC, he was responsible for the violations of international electoral standards in the Presidential elections on 19 Dec 2010 and in the Parliamentary elections of September 2012. Travel ban according to article 3 paragraph 1 and financial sanctions according to article 1 do not apply until 15 March 2016. En tant que vice-président de la CEC, il est responsable des fautes commises par la CEC au cours du processus électoral présidentiel de 2020, de la non-conformité de ce processus avec les règles internationales élémentaires d’équité et de transparence, et de la falsification par la CEC des résultats du scrutin. La CEC et ses dirigeants ont notamment organisé le rejet de certains candidats de l’opposition pour des motifs fallacieux, ainsi que la mise en place de restrictions disproportionnées pour les observateurs dans les bureaux de vote. La CEC a également veillé à ce que la composition des commissions électorales sous sa supervision soit déséquilibrée. This person has held these positions: Deputy Chairman of the Central Elec­toral Commission CEC, Vice-président de la commission électorale centrale CEC, Центральная комиссия Республики Беларусь по выборам член комиссии заместитель председателя, First Deputy Head of the Chief State legal department of the administration of the President of Belarus, Central Election Commission of the Republic of Belarus member of the commission Vice Chairperson. This person is a male. This person was born in 1964-10-30. This person belongs to these countries: Belarus,Ukraine. Associated with Politician. Associated with Sanctioned entity. This person was born in Коломыя, Ивано-Франковская область, Украина, Kolomyia, Ivano-Frankivsk Region, Ukraine, Kolomyia, Kolomyya, Ukraine, Kolomyia, oblast d'Ivano-Frankivsk. 
This person has these nationalities: Belarus.","Vadim Dmitrievich Ipatov is a Deputy Chairperson, Central Electoral Commission (CEC). As a Member of the CEC, he was responsible for the violations of international electoral standards in the Presidential elections on 19 Dec 2010 and in the Parliamentary elections of September 2012. Travel ban according to article 3 paragraph 1 and financial sanctions according to article 1 do not apply until 15 March 2016. En tant que vice-président de la CEC, il est responsable des fautes commises par la CEC au cours du processus électoral présidentiel de 2020, de la non-conformité de ce processus avec les règles internationales élémentaires d’équité et de transparence, et de la falsification par la CEC des résultats du scrutin. La CEC et ses dirigeants ont notamment organisé le rejet de certains candidats de l’opposition pour des motifs fallacieux, ainsi que la mise en place de restrictions disproportionnées pour les observateurs dans les bureaux de vote. La CEC a également veillé à ce que la composition des commissions électorales sous sa supervision soit déséquilibrée. This person has held these positions: Deputy Chairman of the Central Elec­toral Commission CEC, Центральная комиссия Республики Беларусь по выборам член комиссии заместитель председателя, Vice-président de la commission électorale centrale CEC, First Deputy Head of the Chief State legal department of the administration of the President of Belarus, Central Election Commission of the Republic of Belarus member of the commission Vice Chairperson. This person is a male. This person was born in 1964-10-30. This person belongs to these countries: Belarus,Ukraine. Associated with Politician. Associated with Sanctioned entity. This person was born in Коломыя, Ивано-Франковская область, Украина, Kolomyia, Ivano-Frankivsk Region, Ukraine, Kolomyia, Kolomyya, Ukraine, Kolomyia, oblast d'Ivano-Frankivsk. This person has these nationalities: Belarus."
428467,Q7913689,"Van Taylor is a Businessman and politician. This person has held these positions: United States representative 2021-, United States representative , member of the State Senate of Texas, House of Representatives member 2019-, member of the Texas House of Representatives. This person is a male. This person was born in 1972-08-01. This person belongs to these countries: United States. Associated with Politician. This person was born in Dallas. This person has these nationalities: United States.","Van Taylor is a Businessman and politician. This person has held these positions: United States representative 2021-, member of the Texas House of Representatives, House of Representatives member 2019-, member of the State Senate of Texas, United States representative . This person is a male. This person was born in 1972-08-01. This person belongs to these countries: United States. Associated with Politician. This person was born in Dallas. This person has these nationalities: United States."
428474,Q470975,"Vladimir Lisin is a Russian businessman It is in first place in the ranking of Russian billionaires according to Forbes. Performs commercial activities in sectors of the economy that provide a high revenue part of the budget of the Russian Federation, which is responsible for the war in Ukraine, that is, it is a significant source of income for waging war. Has close ties with the regime, which prepared and is being conducted contrary to the statutory documents of the United Nations Nations war of conquest against Ukraine, in which crimes against humanity are committed and the genocide of the Ukrainian people takes place. Thus, the subject is responsible for material or financial support for actions that undermine or threaten the territorial integrity, sovereignty and independence of Ukraine. Знаходиться на першому місці у рейтингу російських мільярдерів за версією Форбс. Здійснює комерційну діяльність у секторах економіки, що забезпечують високу доходну частину бюджету Російської Федерації, яка відповідає за війну в Україні, тобто є значним джерелом доходів для ведення війни. Має тісні зв'язки з режимом, яким підготовлено та ведеться всупереч статутним документам Організації Об'єднаних Націй загарбницька війна проти України, в рамках якої скоюються злочини проти людяності та відбувається геноцид українського народу. Таким чином, суб'єкт несе відповідальність за матеріальну чи фінансову підтримку дій, що підривають або загрожують територіальній цілісності, суверенітету та незалежності України. Находится на первом месте в рейтинге российских миллиардеров по версии Форбс. Совершает коммерческую деятельность в секторах экономики, обеспечивающих высокую доходную часть бюджета Российской Федерации, которая несет ответственность за войну в Украине, то есть является значительным источником доходов для ведения войны. 
Имеет тесные связи с режимом, которым подготовлена и ведется вопреки уставным документам Организации Объединенных Наций захватническая война против Украины, в рамках которой совершаются преступления против человечности и происходит геноцид украинского народа. Таким образом, субъект несет ответственность за материальную или финансовую поддержку действий, которые подрывают или угрожают территориальной целостности, суверенитету и независимости Украины. Russian oligarch. Amid the war, he remains involved in the system of Russian political corruption, in which capital is an instrument for acquiring political influence, and political influence is a condition for preserving capital, at the highest level. Chairman and majority shareholder of Novolipetsk Steel (NLMK Group). This person has held these positions: president 2018-, president , Oligarchs, Председатель совета директоров Новолипецкого металлургического комбината предприниматель миллиардер 1 место в Форбс РФ 2022 основными активами которого являются Новолипецкий металлургический комбинат Первая грузовая компания Первая Портовая Компания Судоходная компания «Волжское пароходство», Голова ради директорів Новолипецького металургійного комбінату підприємець мільярдер 1 місце у Форбс РФ 2022 основними активами якого є Новолипецький металургійний комбінат Перша вантажна компанія Перша Портова Компанія Судноплавна компанія «Волзьке пароплавство», Chairman of the Board of Directors of Novolipetsk Iron and Steel Works entrepreneur billionaire 1st place in Forbes RF 2022 whose main assets are Novolipetsk Iron and Steel Works First Freight Company First Port Company Shipping Company ""Volga Shipping Company"", Individuals involved in corruption, chief executive officer. This person is a male. This person was born in 1956-05-07. This person belongs to these countries: Russian Federation. Associated with Oligarch. Associated with Sanctioned entity. This person was born in Ivanovo, Russia, Иваново, Ивановская обл. 
, РСФСР, СССР, Ivanovo, Іванове, Іванівська обл. , РРФСР, СРСР, Ivanovo, Ivanovo region, RSFSR, USSR. This person has these nationalities: Russian Federation.","Vladimir Lisin is a Russian businessman It is in first place in the ranking of Russian billionaires according to Forbes. Performs commercial activities in sectors of the economy that provide a high revenue part of the budget of the Russian Federation, which is responsible for the war in Ukraine, that is, it is a significant source of income for waging war. Has close ties with the regime, which prepared and is being conducted contrary to the statutory documents of the United Nations Nations war of conquest against Ukraine, in which crimes against humanity are committed and the genocide of the Ukrainian people takes place. Thus, the subject is responsible for material or financial support for actions that undermine or threaten the territorial integrity, sovereignty and independence of Ukraine. Знаходиться на першому місці у рейтингу російських мільярдерів за версією Форбс. Здійснює комерційну діяльність у секторах економіки, що забезпечують високу доходну частину бюджету Російської Федерації, яка відповідає за війну в Україні, тобто є значним джерелом доходів для ведення війни. Має тісні зв'язки з режимом, яким підготовлено та ведеться всупереч статутним документам Організації Об'єднаних Націй загарбницька війна проти України, в рамках якої скоюються злочини проти людяності та відбувається геноцид українського народу. Таким чином, суб'єкт несе відповідальність за матеріальну чи фінансову підтримку дій, що підривають або загрожують територіальній цілісності, суверенітету та незалежності України. Находится на первом месте в рейтинге российских миллиардеров по версии Форбс. 
Совершает коммерческую деятельность в секторах экономики, обеспечивающих высокую доходную часть бюджета Российской Федерации, которая несет ответственность за войну в Украине, то есть является значительным источником доходов для ведения войны. Имеет тесные связи с режимом, которым подготовлена и ведется вопреки уставным документам Организации Объединенных Наций захватническая война против Украины, в рамках которой совершаются преступления против человечности и происходит геноцид украинского народа. Таким образом, субъект несет ответственность за материальную или финансовую поддержку действий, которые подрывают или угрожают территориальной целостности, суверенитету и независимости Украины. Russian oligarch. Amid the war, he remains involved in the system of Russian political corruption, in which capital is an instrument for acquiring political influence, and political influence is a condition for preserving capital, at the highest level. Chairman and majority shareholder of Novolipetsk Steel (NLMK Group). This person has held these positions: Individuals involved in corruption, president , Chairman of the Board of Directors of Novolipetsk Iron and Steel Works entrepreneur billionaire 1st place in Forbes RF 2022 whose main assets are Novolipetsk Iron and Steel Works First Freight Company First Port Company Shipping Company ""Volga Shipping Company"", Председатель совета директоров Новолипецкого металлургического комбината предприниматель миллиардер 1 место в Форбс РФ 2022 основными активами которого являются Новолипецкий металлургический комбинат Первая грузовая компания Первая Портовая Компания Судоходная компания «Волжское пароходство», Голова ради директорів Новолипецького металургійного комбінату підприємець мільярдер 1 місце у Форбс РФ 2022 основними активами якого є Новолипецький металургійний комбінат Перша вантажна компанія Перша Портова Компанія Судноплавна компанія «Волзьке пароплавство», Oligarchs, chief executive officer, president 2018-. 
This person is a male. This person was born in 1956-05-07. This person belongs to these countries: Russian Federation. Associated with Oligarch. Associated with Sanctioned entity. This person was born in Ivanovo, Russia, Иваново, Ивановская обл. , РСФСР, СССР, Ivanovo, Іванове, Іванівська обл. , РРФСР, СРСР, Ivanovo, Ivanovo region, RSFSR, USSR. This person has these nationalities: Russian Federation."
428489,Q14520235,"Wopke Hoekstra is a Dutch politician. This person has held these positions: Minister of Finance , member of the Senate of the Netherlands , Minister of Foreign Affairs 2022-, Deputy Prime Minister of the Netherlands 2022-, member of the House of Representatives of the Netherlands . This person is a male. This person was born in 1975-09-30. This person belongs to these countries: Netherlands. Associated with Politician. This person was born in Ede. This person has these nationalities: Netherlands.","Wopke Hoekstra is a Dutch politician. This person has held these positions: Minister of Finance , member of the Senate of the Netherlands , Minister of Foreign Affairs 2022-, member of the House of Representatives of the Netherlands , Deputy Prime Minister of the Netherlands 2022-. This person is a male. This person was born in 1975-09-30. This person belongs to these countries: Netherlands. Associated with Politician. This person was born in Ede. This person has these nationalities: Netherlands."
