In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
pd.set_option('display.max_colwidth', None)

In [4]:
# Select the intended kb dataset
dataset='lilsis'#'open_sanctions'

In [5]:
# Read csv file 
kb_entities=pd.read_csv(f'{dataset}_entities.csv',index_col=0)

In [6]:
# Select only KB entries with a person entity
person_named_entities_name='Person'
person_named_entities_col_d={'open_sanctions':'schema', 'lilsis':'primary_ext'} 
kb_entities=kb_entities[
    kb_entities[
        person_named_entities_col_d[dataset]]==person_named_entities_name
]

In [9]:
# Change column names
desc_col_d={'open_sanctions':'full_notes', 'lilsis':'context'} 
desc_col=desc_col_d[dataset]
kb_entities=kb_entities.rename(columns={desc_col:'desc'})

In [10]:
# Drop entities without a description 
kb_entities.dropna(subset=['desc'],inplace=True)

In [14]:
# Reorder columns
if dataset!='open_sanctions':
    kb_entities=kb_entities.rename(columns={'aliases':'AKA'})
kb_entities=kb_entities[['id','name','desc', 'AKA']]

In [16]:
# Remove cyrillic 
cyrillic = "вгдеёзийклмнопрстуфхъыьэАБВГДЕЁЗИЙКЛМНОПРСТУФХЪЫЬЭ"
for i,symbol in enumerate(cyrillic):
    cyrillic_condition=(kb_entities['name'].str.contains(symbol))
    if i==0:
        cyrillic_df=kb_entities[cyrillic_condition]
    else:
        cyrillic_df=cyrillic_df.append(kb_entities[cyrillic_condition]) 
        cyrillic_df=cyrillic_df.drop_duplicates()
kb_entities=kb_entities.drop(cyrillic_df.index)

In [18]:
kb_entities.to_csv(f'../kb_datasets/kb_entities_{dataset}.csv')

In [306]:
# Read both datasets
os_kb_entities=pd.read_csv(f'kb_entities_open_sanctions.csv',index_col=0)
ls_kb_entities=pd.read_csv(f'kb_entities_lilsis.csv',index_col=0)

In [25]:
# Combine datasets into one
kb_entities=pd.concat([os_kb_entities,ls_kb_entities]).reset_index().rename(columns={'index':'original_index'})

In [26]:
kb_entities.shape

(360107, 5)

In [36]:
kb_entities.head(2)

Unnamed: 0,original_index,id,name,desc,AKA
0,0,acf-00040861bc3f593000830d987d09967ef3503ef1,Kolyvanov Egor,"Russian propagandist: host of news program ""Segodnia"" (""Today"") on NTV Propagandists Federal media employees male 1980-11-15 Kolyvanov Egor Колыванов Егор","['Kolyvanov Egor', 'Колыванов Егор']"
1,1,acf-0011c68a768924609dc5da5707ac7fa4c4d645a2,Shipov Sergei Yurievich,"Russian chess player, grandmaster, chess coach, commentator. Publicly supported Russia's war against Ukraine. Warmongers Athletes male 1966-04-17 Shipov Sergei Yurievich Шипов Сергей Юрьевич","['Shipov Sergei Yurievich', 'Шипов Сергей Юрьевич']"


In [None]:
## Resolve duplicate entitiy IDs

In [60]:
# Find duplicate entries on the 'id' columns
redudant_entities_by_id=kb_entities[kb_entities['id'].duplicated(keep=False)].sort_values(['id','name'])

In [70]:
# Drop duplicate_entities
redudant_entities_indices=redudant_entities_by_id.index
kb_entities.drop(redudant_entities_indices, inplace=True)

In [53]:
# Combine desc across duplicated entities
redudant_entities_by_id_consolidated_desc=redudant_entities_by_id[['id','desc']].groupby(['id'])['desc'].apply(lambda x: ' '.join(x)).reset_index()

In [74]:
# Drop all duplicated apart from the first (needs to be sorted by [id, name])
redudant_entities_by_id.drop_duplicates(subset=['id'],keep='first',inplace=True)

In [78]:
# Concatenate back to kb entity dataframe 
kb_entities=pd.concat([kb_entities,redudant_entities_by_id])

In [117]:
# Cast names in lower case
kb_entities['name']=kb_entities['name'].str.lower()

In [246]:
# Drop duplicates on name and desc
kb_entities.drop_duplicates(['name','desc'],inplace=True)

In [316]:
kb_entities['name']=kb_entities['name_backup']

In [253]:
import spacy
nlp = spacy.load("en_core_web_lg")

In [283]:
from numpy import dot
from numpy.linalg import norm
def calculate_cosine_similarity(descriptions_vec,vector_ref_sentence):
    """
    Return a dictionary mapping the kb entity id to cosine similarity score
    between kb embedded descriptions and the reference vector.
    """

    score=np.nan_to_num(
        dot(vector_ref_sentence, descriptions_vec)/
        (norm(vector_ref_sentence)*norm(descriptions_vec))
    ,0)
    return score

In [330]:
kb_entities['desc_enc']=kb_entities['desc'].apply(lambda x: nlp(x).vector)

In [339]:
kb_entities.loc[kb_entities['name'].duplicated(keep=False),'name'].value_counts()

david wilson          13
david smith           13
mark smith            13
john williams         12
david anderson        11
                      ..
naruhito               2
karina gould           2
jonathan wilkinson     2
rudolf müller          2
mark dente             2
Name: name, Length: 11495, dtype: int64

In [434]:
for i,name in enumerate(kb_entities['name'].unique()): 
    df=kb_entities[kb_entities['name']==name]
    df=df[['name','id','desc','desc_enc']]
    df=df.merge(df, on='name', suffixes=['_1','_2'])
    df['similarity_score']=df.apply(
        lambda x: calculate_cosine_similarity(x['desc_enc_1'],x['desc_enc_2']),
            1
        )
    df=df[df['similarity_score']>0]
    if i==0:
        similarity_df=df[['id_1','id_2','similarity_score']]
    else:
        similarity_df=pd.concat([similarity_df, df[['id_1','id_2','similarity_score']]])
        
similarity_df=similarity_df[(similarity_df['id_1']!=similarity_df['id_2'])]

  # This is added back by InteractiveShellApp.init_path()


In [None]:
# The merging the ids agains themselves creates deplicates based on order 
# (id 1, id 2 or id 2, id 1 doesn't matter, just the pairing of the set)
# This code removes redudant rows
similarity_df['id_1']=similarity_df['id_1'].astype(str)
similarity_df['id_2']=similarity_df['id_2'].astype(str)
similarity_df['id_pair']=similarity_df.apply(lambda x: ' '.join(set([x['id_1'],x['id_2']])),1)
similarity_df=similarity_df.drop_duplicates('id_pair',keep='first')

In [469]:
# Remove cyrillic 
cyrillic = "вгдеёзийклмнопрстуфхъыьэАБВГДЕЁЗИЙКЛМНОПРСТУФХЪЫЬЭжцчщюяєії"
for symbol in cyrillic:
    similarity_df['desc_1']=similarity_df['desc_1'].str.replace(symbol,'')
    similarity_df['desc_2']=similarity_df['desc_2'].str.replace(symbol,'')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [442]:
# Fetch the context for each id
similarity_df=similarity_df.merge(kb_entities[['name','id','desc']], how='left',left_on='id_1', right_on='id')
similarity_df.drop('id',1,inplace=True)
similarity_df=similarity_df.rename(columns={'name':'name_1','desc':'desc_1'})
similarity_df=similarity_df.merge(kb_entities[['name','id','desc']], how='left',left_on='id_2', right_on='id')
similarity_df.drop('id',1,inplace=True)
similarity_df=similarity_df.rename(columns={'name':'name_2','desc':'desc_2'})

  
  """


In [471]:
similarity_df.to_csv('similarity_df.csv')

In [128]:
kb_entities[kb_entities.duplicated(keep=False,subset=['name'])].sort_values(by=['name']).to_csv(f'{dataset}_full_name_aliases.csv')

In [129]:
kb_entities['first_name']=kb_entities['name'].apply(lambda x: x.split(' ')[0])

In [130]:
kb_entities['last_name']=kb_entities['name'].apply(lambda x: x.split(' ')[-1])

In [131]:
kb_entities[kb_entities.duplicated(
    keep=False,subset=['first_name','last_name'])]\
    .sort_values(by=['first_name','last_name']).to_csv(f'{dataset}_first_and_last_name_aliases.csv')

In [None]:
kb_entities[kb_entities.isna().any(1)]

Unnamed: 0,id,name,desc


In [None]:
kb_entities.to_csv('kb_entities.csv')

In [139]:
data=pd.read_csv('open_sanctions_entities.csv',index_col=0)

In [140]:
data.head()

Unnamed: 0,id,schema,name,notes,AKA,context,full_notes
0,acf-00040861bc3f593000830d987d09967ef3503ef1,Person,Kolyvanov Egor,"Russian propagandist: host of news program ""Se...","['Kolyvanov Egor', 'Колыванов Егор']",Propagandists Federal media employees male 198...,"Russian propagandist: host of news program ""Se..."
1,acf-0011c68a768924609dc5da5707ac7fa4c4d645a2,Person,Shipov Sergei Yurievich,"Russian chess player, grandmaster, chess coach...","['Shipov Sergei Yurievich', 'Шипов Сергей Юрье...",Warmongers Athletes male 1966-04-17,"Russian chess player, grandmaster, chess coach..."
2,acf-001e7e4c0363f08f1e784c230457960b84a6416f,Person,Egorov Ivan Mikhailovich,Deputy of the State Council of the Republic of...,"['Egorov Ivan Mikhailovich', 'Егоров Иван Миха...",Regional investigations subjects male 1961-01-21,Deputy of the State Council of the Republic of...
3,acf-002c208139012c8d93b6298358188d7cadafe648,Person,Goreslavsky Alexey Sergeyevich,Russian journalist and media manager. Helped d...,"['Goreslavsky Alexey Sergeyevich', 'Гореславск...",Propagandists Organizers of political repressi...,Russian journalist and media manager. Helped d...
4,acf-002cc8fdf8fe41185091a7cb6c598663e7a22eb5,Person,Samoilova Natalya Vladimirovna,"Russian singer, composer. Supported the action...","['Samoilova Natalya Vladimirovna', 'Самойлова ...","Sellout opinion leaders Celebrities, influence...","Russian singer, composer. Supported the action..."


In [141]:
data.dropna(subset=['notes'])['schema'].value_counts()

Person          161532
Company           3374
Organization      2017
LegalEntity       1217
Vessel             509
Airplane           269
Name: schema, dtype: int64