# Create spaCy KB object

In [1]:
import pickle
from pathlib import Path
import datetime as dt
import logging
import numpy as np
import pandas as pd
import spacy
from spacy.kb import KnowledgeBase

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def setup_logger(name, log_file, level=logging.DEBUG):
    """
    Return the logger called ``name``, writing its records to ``log_file``.

    Parameters
    ----------
    name : str
        Name passed to ``logging.getLogger``.
    log_file : str or path-like
        File that receives the log records.
    level : int, optional
        Logging level set on the logger (default ``logging.DEBUG``).

    Returns
    -------
    logging.Logger
        The configured logger.

    Notes
    -----
    ``logging.getLogger`` returns the same object for the same name, so calling
    this function again (e.g. when a notebook cell is re-run) must not attach a
    second handler for the same file, otherwise every record is written twice.
    """
    import os

    logger = logging.getLogger(name)
    logger.setLevel(level)

    # FileHandler stores the absolute path of its target in `baseFilename`
    target = os.path.abspath(os.fspath(log_file))
    already_attached = any(
        isinstance(existing, logging.FileHandler) and existing.baseFilename == target
        for existing in logger.handlers
    )
    if not already_attached:
        logger.addHandler(logging.FileHandler(log_file))

    return logger

def days_hours_minutes(td):
    """
    Format a ``datetime.timedelta`` as ``'hours:minutes:seconds'``.

    Parameters
    ----------
    td : datetime.timedelta
        Elapsed time to format.

    Returns
    -------
    str
        Unpadded ``H:M:S`` string. Whole days are folded into the hour count,
        and the seconds field is the remainder after minutes (0-59), not the
        total number of seconds.
    """
    # `td.seconds` only covers the part below one day, so add the days back in
    hours = td.days * 24 + td.seconds // 3600
    minutes = (td.seconds // 60) % 60
    seconds = td.seconds % 60
    return f'{hours}:{minutes}:{seconds}'

## Load spaCy pipeline

In [4]:
# Load the spaCy English pipeline; `model` is the size suffix of the package name
# (en_core_web_<model>) and is also reused below in log and output file names.
model = 'lg'
nlp = spacy.load(f'en_core_web_{model}')

# Run a sample sentence through the pipeline to obtain a document vector
text = "Example text to embbed and assess vector dimensions"
doc = nlp(text)

# Length of the document vector produced by the loaded pipeline;
# used later as the entity vector length of the knowledge base
embedding_dims = len(doc.vector)

In [5]:
# Show the vector length measured on the sample document
print(embedding_dims)

300


## Load dataset

In [6]:
# KB extract to load; the original note lists 'open_sanctions' and 'lilsis' as the alternatives
dataset='full'
# Snapshot suffix interpolated into the input CSV name and into the output file names
kb_iteration='_2022_11_07'

In [7]:
# Import the KB dataset. The `id` column mixes numeric-looking IDs with string IDs
# (e.g. 'Q...' and 'acf-...'), so it is read explicitly as text: this keeps every ID
# a string, so that the same ID can never appear once as a number and once as text.
# NOTE(review): the warning emitted by the original cell is presumably pandas'
# mixed-dtype warning for this column - confirm it disappears with the explicit dtype.
data = pd.read_csv(
    f'../assets/kb_entities_{dataset}{kb_iteration}.csv',
    index_col=0,
    dtype={'id': str},
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [8]:
# (rows, columns) of the loaded KB dataset
data.shape

(428519, 13)

In [9]:
# Keep only the three columns the knowledge base is built from:
# entity identifier, entity name and entity description
required_fields = ['id', 'name', 'desc']
kb_data = data[required_fields]

In [10]:
# Preview the first two rows of the selected columns
kb_data.head(2)

Unnamed: 0,id,name,desc
0,acf-00040861bc3f593000830d987d09967ef3503ef1,Kolyvanov Egor,Kolyvanov Egor is a Russian propagandist: host...
1,acf-0011c68a768924609dc5da5707ac7fa4c4d645a2,Shipov Sergei Yurievich,Shipov Sergei Yurievich is a Russian chess pla...


In [11]:
# Abort when entity IDs cannot serve as unique keys: either a value occurs more
# than once, or an ID is missing (missing values also make the distinct count
# fall short of the row count).
entity_ids = kb_data['id']
ids_are_clean = entity_ids.is_unique and not entity_ids.isna().any()
if not ids_are_clean:
    raise Exception('There might be duplicate entitiy IDs in the KB file.')

In [12]:
# Convert the dataframe into two dictionaries keyed by entity ID:
# one mapping ID -> name and one mapping ID -> description.
# Columns are addressed by label rather than by position inside each row
# (positional integer indexing on a label-indexed Series is deprecated in pandas),
# and iterating the columns directly avoids building a Series per row.
names_dict = dict()
desc_dict = dict()

for qid, name, desc in zip(kb_data['id'], kb_data['name'], kb_data['desc']):
    # Everything is stored as text, exactly as str() renders it
    # (so a missing value becomes the string 'nan')
    qid = str(qid)
    names_dict[qid] = str(name)
    desc_dict[qid] = str(desc)

## Map aliases shared by several entities to their IDs (one alias → many IDs)

In [13]:
# Rows whose name is shared with at least one other row, ordered by name.
# `assign` builds a new frame, so the string cast does not write into a slice
# of kb_data (which triggers pandas' SettingWithCopyWarning).
aliases_data = (
    kb_data[kb_data['name'].duplicated(keep=False)]
    .sort_values(['name'])
    .assign(id=lambda frame: frame['id'].astype(str))
)

# alias -> list of entity IDs carrying that name, built in a single grouped pass
# instead of re-filtering the whole frame once per alias.
# Rows with a missing name are left out by groupby.
alias_dict = aliases_data.groupby('name', sort=False)['id'].agg(list).to_dict()

In [14]:
# entity IDs sharing alias David Smith
# Example lookup: all entity IDs that share the alias 'David Smith'
alias_dict['David Smith']

['280783',
 '211703',
 '53881',
 '200407',
 '377020',
 '204251',
 '184041',
 '77215',
 'Q3018800',
 '221595',
 'Q53960880',
 'Q5239878',
 '200405']

In [15]:
# Preview kb_data with the 'id' column shown as 'qid'.
# NOTE(review): rename() is called without assignment, so only the displayed copy is
# renamed and kb_data keeps its 'id' column; nothing is written to disk here.
# The 'context' -> 'desc' mapping matches none of the columns selected for kb_data
# ('id', 'name', 'desc'). Confirm whether an export step was intended.
kb_data.rename(columns={'id':'qid','context':'desc'})

Unnamed: 0,qid,name,desc
0,acf-00040861bc3f593000830d987d09967ef3503ef1,Kolyvanov Egor,Kolyvanov Egor is a Russian propagandist: host...
1,acf-0011c68a768924609dc5da5707ac7fa4c4d645a2,Shipov Sergei Yurievich,Shipov Sergei Yurievich is a Russian chess pla...
2,acf-001e7e4c0363f08f1e784c230457960b84a6416f,Egorov Ivan Mikhailovich,Egorov Ivan Mikhailovich is a Deputy of the St...
3,acf-002c208139012c8d93b6298358188d7cadafe648,Goreslavsky Alexey Sergeyevich,Goreslavsky Alexey Sergeyevich is a Russian jo...
4,acf-002cc8fdf8fe41185091a7cb6c598663e7a22eb5,Samoilova Natalya Vladimirovna,Samoilova Natalya Vladimirovna is a Russian si...
...,...,...,...
428514,413772,Cory Bernardi,Cory Bernardi is a Australian politician and r...
428515,Q47668202,Jalbasürengiin Batzandan,Jalbasürengiin Batzandan is a Mongolian politi...
428516,13488,Patrick Murphy,Patrick Murphy is a former US Representative f...
428517,Q28033808,Sharif Street,Sharif Street is a American politician from Pe...


## Embed and export KB descriptions

In [16]:
# Directory for embeddings and logs. No trailing slash: every use in this notebook
# appends '/<file name>' itself, so a trailing slash would give '../assets//...'.
embedings_path = '../assets'

In [17]:
# Progress logger: receives one record per export cycle of the embedding run
cycle_logger_name = f'{embedings_path}/{model}_model_cycle_logger'
cycle_log_file = f'../assets/{model}_model_cycle_execution_times.log'
cycle_logger = setup_logger(cycle_logger_name, cycle_log_file)

# Failure logger: receives the ID of every entity whose description could not be embedded
failed_logger_name = f'{embedings_path}/{model}_model_failed_embeddings'
failed_log_file = f'../assets/{model}_model_embeddings.log'
failed_embeddings_logger = setup_logger(failed_logger_name, failed_log_file)

In [18]:
# Embed every KB description with the spaCy pipeline and checkpoint the
# resulting {entity ID: vector} dictionary to a pickle file.
cycle_logger.debug('------')
cycle_logger.debug('Embeddings run start')
cycle_logger.debug('------')

start_runtime = dt.datetime.now()
num_embeddings = 0
runtime_log = start_runtime.strftime('%Y_%m_%d_%H:%M:%S')
log_str = f'Number of embeddings:{num_embeddings}, Runtime:{runtime_log}'
cycle_logger.debug(log_str)
descriptions_emb = dict()
export_cycle_interval = 10000  # write a checkpoint every this many successful embeddings
filename = f'{embedings_path}/{model}_model_embeddings_dict_pkl'


def export_embeddings_checkpoint(embeddings, count):
    """Overwrite the pickle file with `embeddings` and log the progress reached at `count`."""
    with open(f'{filename}{kb_iteration}.pickle', 'wb') as handle:
        pickle.dump(embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)

    current_runtime = dt.datetime.now()
    timestamp = current_runtime.strftime('%Y_%m_%d_%H:%M:%S')
    elapsed = days_hours_minutes(current_runtime - start_runtime)
    cycle_logger.debug(f'Number of embeddings:{count}, Runtime:{timestamp}, Timedelta:{elapsed}')


for qid, desc in desc_dict.items():
    try:
        descriptions_emb[qid] = nlp(desc).vector
    except Exception:
        # Best effort: record the failing entity ID and carry on with the rest.
        # `Exception` (not a bare except) so that interrupting the kernel still works.
        failed_embeddings_logger.debug(f'{qid}')
        continue
    num_embeddings += 1
    if num_embeddings % export_cycle_interval == 0:
        export_embeddings_checkpoint(descriptions_emb, num_embeddings)

# Final export of the last partial batch. Done after the loop rather than by comparing
# the counter with len(desc_dict): after a single failed embedding the counter never
# reaches that length and the tail of the run would not be written to disk.
if num_embeddings % export_cycle_interval != 0:
    export_embeddings_checkpoint(descriptions_emb, num_embeddings)

cycle_logger.debug('------')
cycle_logger.debug('Embeddings run end')
cycle_logger.debug('------')

In [19]:
# Visible marker that the embedding cell above has finished
print('Done')

Done


## Create Knowledge Base

In [20]:
# Empty knowledge base sharing the pipeline's vocab; entity vectors must have
# the same length as the document vectors measured earlier (embedding_dims).
# NOTE(review): in newer spaCy releases (3.5+) KnowledgeBase is an abstract class and
# InMemoryLookupKB is the concrete implementation - confirm the installed spaCy version.
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=embedding_dims)

In [21]:
# Read back the {entity ID: vector} dictionary written by the embedding step.
# The pickle is produced by this notebook; do not point this at untrusted files.
filename = f'{embedings_path}/{model}_model_embeddings_dict_pkl'
embeddings_file = f'{filename}{kb_iteration}.pickle'
with open(embeddings_file, 'rb') as f:
    description_emb = pickle.load(f)

In [22]:
# Number of entities for which an embedding was loaded
len(description_emb)

428519

In [23]:
# Sanity check: True when there is exactly one loaded embedding per row of the dataset
len(description_emb)==data.shape[0]

True

In [24]:
# Register every entity in the KB together with its description embedding.
# The frequency is not a real corpus count: the same placeholder value is
# given to every entity.
arbitrary_freq_value = 342
for entity_id in description_emb:
    kb.add_entity(
        entity=entity_id,
        entity_vector=description_emb[entity_id],
        freq=arbitrary_freq_value,
    )

In [25]:
# Names carried by a single entity: register the name as an alias of that entity.
# Every alias -> entity pair gets the same small placeholder prior P(entity|alias).
arbitrary_prob = 0.000001
for qid, name in names_dict.items():
    if name in alias_dict:
        # Shared names are handled separately, with all of their entities at once
        continue
    kb.add_alias(
        alias=str(name),
        entities=[str(qid)],
        probabilities=[arbitrary_prob],
    )

In [26]:
# Names shared by several entities: register the alias once with all of its candidates.
# Each candidate gets the same small placeholder prior; per the original note a prior
# of 0 is not supported by KB objects. The priors of one alias must sum to at most 1.
for alias_, qids in alias_dict.items():
    probs = [arbitrary_prob] * len(qids)
    kb.add_alias(alias=alias_, entities=qids, probabilities=probs)

In [27]:
# Sense checks for aliases/entity mappings

In [28]:
# Sense check: list the entity IDs the KB offers as candidates for this alias
candidate_1 = 'Joe Biden'
candidate_1_entities = [c.entity_ for c in kb.get_alias_candidates(candidate_1)]
print(f"Candidates for {candidate_1}: {candidate_1_entities}")

Candidates for Joe Biden: ['13047']


In [29]:
# Sense check: list the entity IDs the KB offers as candidates for this alias
candidate_2 = 'Adam Smith'
candidate_2_entities = [c.entity_ for c in kb.get_alias_candidates(candidate_2)]
print(f"Candidates for {candidate_2}: {candidate_2_entities}")

Candidates for Adam Smith: ['129552', '379819', '269916', '256328', '13596']


In [30]:
# Sense check: list the entity IDs the KB offers as candidates for this alias
candidate_3 = 'David Smith'
candidate_3_entities = [c.entity_ for c in kb.get_alias_candidates(candidate_3)]
print(f"Candidates for {candidate_3}: {candidate_3_entities}")

Candidates for David Smith: ['280783', '211703', '53881', '200407', '377020', '204251', '184041', '77215', 'Q3018800', '221595', 'Q53960880', 'Q5239878', '200405']


## Save KB object to disk

In [31]:
# Persist the populated knowledge base; the target name combines the
# pipeline size and the KB snapshot suffix
output_dir = "../assets/"
kb_output_path = f'{output_dir}kb_{model}_model{kb_iteration}'
kb.to_disk(kb_output_path)