# Data prepping

This notebook transforms our NER data into the format expected by K-Adapter, in particular by its factual adapter. The linguistic adapter makes much more sense in the setting of the paper, because the authors work with general-domain knowledge.

In the paper, the authors used Wikidata for the factual adapter and BookCorpus for the linguistic adapter. The Wikidata examples they use contain two (related) entities per input sentence. Unfortunately, we probably will not have such data, but we could force-inject related entities together into a sentence. Or at least try!


In [278]:
import pandas as pd 
from ast import literal_eval

# Dataset split to convert; it selects the sub-directory the CSV is read from
# and is reused later for the name of the exported JSON file.
name = "dev"
ner_data = pd.read_csv(f"data/ner_data/ser_bus_spc/en/{name}/data.csv", index_col=0)
# Both columns hold Python literals serialised as strings; parse them back
# into real objects.
for column in ("text", "tag"):
    ner_data[column] = ner_data[column].apply(literal_eval)


In [279]:
# Peek at the parsed tokens of the second row as a sanity check.
ner_data["text"].iloc[1]

['in',
 'humans',
 'methemoglobin',
 'formation',
 'is',
 'very',
 'rare',
 'at',
 'therapeutic',
 'doses',
 'and',
 'overdoses',
 'of',
 'acetaminophen']

In [280]:
def create_lookup_table(spo_path:str, predicate:bool = False):
    """Load a tab-separated subject/predicate/object file into a lookup dict.

    Args:
        spo_path: path to a UTF-8 text file with one ``subj<TAB>pred<TAB>obj``
            triple per line.
        predicate: when True each stored value is ``"<pred> <obj>"``;
            otherwise only the object is stored.

    Returns:
        dict mapping every subject to the list of its values, in file order.

    Lines that do not split into exactly three tab-separated fields are
    reported and skipped.
    """
    lookup_table = {}
    print("[KnowledgeGraph] Loading spo from {}".format(spo_path))
    with open(spo_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                subj, pred, obje = line.strip().split("\t")
            except ValueError:
                # Wrong number of fields: report the line and move on, so the
                # previous triple is not inserted a second time (and a bad
                # first line does not raise NameError).
                print("[KnowledgeGraph] Bad spo:", line)
                continue
            value = pred + " " + obje if predicate else obje
            lookup_table.setdefault(subj, []).append(value)
    return lookup_table

In [281]:
# Subject -> objects lookup for the Clinia knowledge graph (predicates are
# dropped, which is the function's default).
clinia_kg = create_lookup_table("data/custom_taxo/clinia_kg.spo")

[KnowledgeGraph] Loading spo from data/custom_taxo/clinia_kg.spo


In [282]:
import numpy as np
def get_entity_pos(tags):
    """Return the inclusive ``(start, end)`` span of every run of non-"O" tags.

    Any maximal stretch of consecutive tags different from ``"O"`` counts as
    one entity, whatever the individual tag values are. Spans are returned in
    order of appearance.
    """
    spans = []
    start = None  # index where the currently open entity began, if any
    for pos, label in enumerate(tags):
        if label != "O":
            if start is None:
                # First token of a new entity.
                start = pos
        elif start is not None:
            # An "O" tag closes the entity that ended on the previous token.
            spans.append((start, pos - 1))
            start = None
    if start is not None:
        # The sentence ended while an entity was still open.
        spans.append((start, len(tags) - 1))
    return spans



In [283]:
import re
# Build ``graphs``: subject -> {"relations": [(relation, parent), ...]} from
# the triples file. Every usable line must contain exactly three
# double-quoted fields (subject, relation, parent).
with open("data/custom_taxo/graph_triples.nt", 'r', encoding="utf-8") as g:
    graphs = {}
    for line in g:
        try:
            subj, relation, parent = re.findall(r'"(.*?)"', line)
        except ValueError:
            # Not exactly three quoted fields. Really skip the line; without
            # this the previous triple would be appended again (or a
            # NameError raised if the very first line is malformed).
            print("Bad formatting, skipping.")
            continue

        graphs.setdefault(subj, {"relations": []})["relations"].append((relation, parent))
        

In [284]:
# Derive the entity spans of every sentence from its tag sequence.
ner_data["entity_pos"] = ner_data["tag"].apply(get_entity_pos)

In [285]:
import random
def process_data(df:pd.DataFrame, taxo:dict, rng=None):
    """Build one relation example per tagged entity of every sentence.

    For each span in ``row["entity_pos"]`` a ``(relation, parent)`` pair is
    drawn at random from ``taxo[entity]["relations"]`` and the parent's tokens
    are spliced into a private copy of the sentence right after the entity.
    The entity is the subject and the injected parent is the object. We are
    not guaranteed to find a subject/object pair in a single sentence, hence
    the injection.

    Args:
        df: frame with a ``text`` column (list of tokens) and an
            ``entity_pos`` column (list of inclusive ``(start, end)`` spans).
        taxo: maps an entity surface form (its tokens joined by single
            spaces) to ``{"relations": [(relation, parent), ...]}``.
        rng: optional ``random.Random`` instance used to pick among several
            relations (pass a seeded one for reproducible output). Defaults
            to the module-level ``random`` generator.

    Returns:
        list of dicts with keys ``token``, ``subj_start``, ``subj_end``,
        ``relation``, ``obj_start`` and ``obj_end``; all indices are inclusive
        and refer to that example's own ``token`` list.
    """
    chooser = random if rng is None else rng
    examples = []
    for _, row in df.iterrows():
        for ent_pos in row["entity_pos"]:
            subj_start, subj_end = ent_pos[0], ent_pos[1]
            # Work on a fresh copy of the sentence. Sharing row["text"] would
            # let the splice below shift the offsets of the following
            # entities and leak every injected parent into all examples of
            # the row (and into the DataFrame itself).
            tokens = list(row["text"])

            entity = " ".join(tokens[subj_start:subj_end + 1])
            try:
                relation, parent = chooser.choice(taxo[entity]["relations"])
            except KeyError:
                # Entity (or its "relations" entry) is absent from the taxonomy.
                relation, parent = "no_relation", "no_parent"

            # The parent may span several words: insert all of its tokens
            # immediately after the subject.
            parent_tokens = parent.split()
            insert_at = subj_end + 1
            tokens[insert_at:insert_at] = parent_tokens

            # We could also have labels for subj and obj.
            # TODO: we probably need a mask / visibility matrix for the
            # injected tokens, like in K-BERT, so that they do not corrupt the
            # contextual representation of the original sentence.
            examples.append({
                "token": tokens,
                "subj_start": subj_start,
                "subj_end": subj_end,
                "relation": relation,
                "obj_start": insert_at,
                "obj_end": subj_end + len(parent_tokens),
            })
    return examples


In [286]:
import json
# Convert the sentences into relation examples and save them next to the
# other custom data, one JSON file per split.
# NOTE: DataFrame.copy(deep=True) duplicates the frame but not the Python
# lists stored inside its cells.
ner_data_copy = ner_data.copy(deep = True)
exs = process_data(df=ner_data_copy, taxo=graphs)
with open(f"data/ner_data/custom_data/{name}.json", "w") as examples_file:
    json.dump(exs, examples_file)

In [287]:
def get_labels(examples):
    """Number the distinct relation labels in order of first appearance.

    Returns a ``(labels2id, id2labels)`` pair: the first maps each
    ``entry["relation"]`` value to a consecutive integer starting at 0, the
    second is the inverse mapping.
    """
    labels2id = {}
    for example in examples:
        # The next free id equals the number of labels seen so far; labels
        # that are already known keep their id.
        labels2id.setdefault(example["relation"], len(labels2id))

    id2labels = {idx: label for label, idx in labels2id.items()}
    return labels2id, id2labels
# Label <-> id mappings for the relations present in the generated examples.
labels2id, id2labels = get_labels(examples=exs)

In [290]:
# Relation labels in the order their ids were assigned.
list(labels2id)

['no_relation',
 'facet',
 'counseling',
 'anesthesia',
 'treatment',
 'diagnostics',
 'profession',
 'consultation',
 'surgical procedures',
 'childbirth',
 'education',
 'technical act',
 'medical documents',
 'prescription']

In [276]:
# Persist the relation -> id mapping alongside the exported examples.
with open("data/ner_data/custom_data/relations.json", "w") as relations_file:
    json.dump(labels2id, relations_file)