In [1]:
import sys
sys.path.insert(0,'..')

In [2]:
import numpy as np
import pandas as pd
import spacy
import pickle
import datetime as dt
import logging
from gu_model.trf_tensor_to_vec import *

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
## Load the NER pipeline and choose which entity labels to keep
nlp = spacy.load("en_core_web_trf")
unwanted_ent_types = [
    'CARDINAL',
    'LANGUAGE',
    'ORDINAL',
    'PERCENT',
    'QUANTITY',
    'TIME',
]
# Every label of the pipeline's "ner" component except the unwanted ones,
# in the order the pipeline reports them.
ent_types = list(
    filter(lambda label: label not in unwanted_ent_types, nlp.pipe_labels["ner"])
)

In [4]:
# Knowledge-base variant to process: 'full', 'open_sanctions' or 'lilsis'.
dataset = 'full'
# Suffix of the knowledge-base iteration to process.
kb_iteration = '_2022_11_07'

In [5]:
# Read the entity table for the selected dataset / iteration; the first CSV
# column becomes the DataFrame index.
data = pd.read_csv(
    f'../kb_datasets/kb_entities_{dataset}{kb_iteration}.csv',
    index_col=0,
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [6]:
# Keep only the id, name and description columns, in exactly this order.
kb_data = data[['id', 'name', 'desc']]

In [7]:
def load_entities(kb_data):
    """
    Build id -> name and id -> description lookups from the knowledge-base table.

    The first three columns of `kb_data` are read by position as
    (id, name, description); any further columns are ignored. Every value is
    converted with `str()` (so e.g. a NaN cell is stored as the string 'nan').
    If an id occurs more than once, the last row wins.

    :param kb_data: pandas DataFrame whose first three columns are id, name, description
    :returns (names, descriptions): two dicts keyed by the stringified id
    """
    names = dict()
    descriptions = dict()

    # itertuples() hands back the cell values positionally. The previous
    # iterrows() + row[1][0] form used integer keys on a label-indexed Series
    # (a positional fallback that pandas has deprecated) and coerced every row
    # to a single dtype before the values were stringified.
    first_three = kb_data.iloc[:, :3]
    for qid, name, desc in first_three.itertuples(index=False, name=None):
        qid = str(qid)
        names[qid] = str(name)
        descriptions[qid] = str(desc)

    return names, descriptions

# Build the id -> name and id -> description lookups from the KB table
name_dict, desc_dict = load_entities(kb_data)

In [8]:
def get_ner_data(qid,doc,ent_types):
    """
    Collect the entities of a spaCy Doc whose label is listed in `ent_types`.

    Each kept entity is described by a dict holding its text, its label, its
    `start` / `end` attributes and its `start_char` / `end_char` attributes.
    :returns dict mapping `qid` to the list of entity dicts (in document order)
    """
    selected = []
    for span in doc.ents:
        # Skip entities whose label was not requested.
        if span.label_ not in ent_types:
            continue
        record = dict()
        record["text"] = span.text
        record["label"] = span.label_
        record["start"] = span.start
        record["end"] = span.end
        record["start_char"] = span.start_char
        record["end_char"] = span.end_char
        selected.append(record)
    return {qid: selected}

In [9]:
def setup_logger(name, log_file, level=logging.DEBUG):
    """
    Return the logger called `name`, writing to `log_file` at `level`.

    Can be called for as many distinct loggers as needed. Calling it again
    with the same name and file (e.g. when the notebook cell is re-run) does
    not attach a second handler, so messages are not written more than once.

    :param name: logger name passed to `logging.getLogger`
    :param log_file: path of the file the logger writes to
    :param level: logging level set on the logger (default: DEBUG)
    :returns the configured `logging.Logger`
    """
    import os  # local import: `os` is not imported at the top of this notebook

    logger = logging.getLogger(name)
    logger.setLevel(level)

    # logging.getLogger returns the same object for the same name, so only add
    # a FileHandler if none for this file is attached yet. FileHandler stores
    # the absolute path of its file in `baseFilename`.
    target = os.path.abspath(os.fspath(log_file))
    already_attached = any(
        isinstance(existing, logging.FileHandler) and existing.baseFilename == target
        for existing in logger.handlers
    )
    if not already_attached:
        logger.addHandler(logging.FileHandler(log_file))

    return logger

In [10]:
# Logger for the progress / execution times of the NER run (logs debug messages too)
cycle_logger = setup_logger(
    name='ner_cycle_logger',
    log_file='ner_cycle_execution_times.log',
)
# Logger for failed NER
failed_ner_logger = setup_logger(
    name='failed_ner',
    log_file='failed_ner.log',
)

In [11]:
def days_hours_minutes(td):
    """
    Format a timedelta as 'hours:minutes:seconds' (no zero padding).

    Whole days are folded into the hour count, so a duration of one day and two
    hours is reported as 26 hours. Microseconds are ignored.

    :param td: `datetime.timedelta` to format
    :returns str of the form 'H:M:S'
    """
    # `td.seconds` only covers the part of the duration below one day, so the
    # days have to be added back to get the full elapsed time in seconds.
    total_seconds = td.days * 86400 + td.seconds
    hours, remainder = divmod(total_seconds, 3600)
    # The remainder is split again so that `seconds` is the 0-59 part, not the
    # whole second count.
    minutes, seconds = divmod(remainder, 60)
    return f'{hours}:{minutes}:{seconds}'

In [None]:
# Run NER over every KB description and export the results periodically
cycle_logger.debug('------')
cycle_logger.debug('ner run start')
cycle_logger.debug('------')

start_runtime=dt.datetime.now()
num_ner=0
runtime_log=start_runtime.strftime('%Y_%m_%d_%H:%M:%S')
log_str=f'Number of ner:{num_ner}, Runtime:{runtime_log}'
cycle_logger.debug(log_str)
descriptions_ner = dict()
export_cycle_interval=10000
filename='ner_dict_pkl.pickle'


def _export_ner_checkpoint(ner_by_qid, count):
    """Pickle the results gathered so far to `filename` and log the progress."""
    with open(f'{filename}', 'wb') as handle:
        pickle.dump(ner_by_qid, handle, protocol=pickle.HIGHEST_PROTOCOL)

    current_runtime = dt.datetime.now()
    runtime_log = current_runtime.strftime('%Y_%m_%d_%H:%M:%S')
    time_delta = days_hours_minutes(current_runtime - start_runtime)
    cycle_logger.debug(
        f'Number of ner:{count}, Runtime:{runtime_log}, Timedelta:{time_delta}'
    )


for qid, desc in desc_dict.items():
    try:
        desc_doc = nlp(desc)
        ent_dict=get_ner_data(qid,desc_doc, ent_types)
        descriptions_ner[qid]=ent_dict[qid]
    except Exception:
        # Record the id of the failed description and move on. Only `Exception`
        # is caught so that interrupting the kernel (KeyboardInterrupt) still
        # stops this long-running loop instead of being logged as a failure.
        failed_ner_logger.debug(f'{qid}')
        continue
    num_ner+=1
    if num_ner%export_cycle_interval==0:
        _export_ner_checkpoint(descriptions_ner, num_ner)

# Final export: without it, everything processed after the last full
# `export_cycle_interval` would never reach the pickle file.
if num_ner%export_cycle_interval!=0:
    _export_ner_checkpoint(descriptions_ner, num_ner)

cycle_logger.debug('------')
cycle_logger.debug('ner run end')
cycle_logger.debug('------')

Token indices sequence length is longer than the specified maximum sequence length for this model (626 > 512). Running this sequence through the model will result in indexing errors
