# Data prepping for Relation classification and Entity Typing

This notebook transforms our NER data into the format expected by K-Adapter, in particular by its factual adapter. The linguistic adapter makes much more sense in the paper's setting, because the authors work with general-domain knowledge. 

In the paper, they use Wikidata for the factual adapter and BookCorpus for the linguistic adapter. The Wikidata examples they use contain two (related) entities per input sentence. Unfortunately, we will probably not have such data; we could, however, forcibly inject related entities together into a sentence. Or at least try!

The data for entity typing is relatively similar, but we simply add a label for each entity. Since we will be using our NER data, the labels will be BUS, SER, SPC.


In [36]:
import pandas as pd 
from ast import literal_eval

# Split to process; `name` is also used by later cells to build output paths.
name = "dev"

# The "text" and "tag" columns are read as strings and parsed back into Python
# objects with literal_eval right after loading.
ner_data = pd.read_csv(f"data/ner_data/ser_bus_spc/en/{name}/data.csv", index_col=0)
for column in ("text", "tag"):
    ner_data[column] = ner_data[column].apply(literal_eval)


In [20]:
def create_lookup_table(spo_path:str, predicate:bool = False):
    """Build a subject -> list-of-values lookup table from a triple file.

    The file is read line by line; each line is stripped and split on tabs
    and must yield exactly three fields (subject, predicate, object).

    Args:
        spo_path: Path of the tab-separated triple file (read as UTF-8).
        predicate: When True, each stored value is "<predicate> <object>";
            when False (default), only the object is stored.

    Returns:
        dict mapping each subject to the list of its values, in file order
        (duplicates are kept).

    Lines that do not split into exactly three fields are reported on stdout
    and skipped.
    """
    lookup_table = {}
    print("[KnowledgeGraph] Loading spo from {}".format(spo_path))
    with open(spo_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                subj, pred, obje = line.strip().split("\t")
            except ValueError:
                print("[KnowledgeGraph] Bad spo:", line)
                # Skip the malformed line; falling through would re-insert
                # the previous triple (or fail on the very first line).
                continue
            value = pred + " " + obje if predicate else obje
            lookup_table.setdefault(subj, []).append(value)
    return lookup_table

In [9]:
# Build the subject -> values lookup from clinia_kg.spo; with predicate=False
# only the object of each triple is stored as the value.
clinia_kg = create_lookup_table(spo_path="data/custom_taxo/clinia_kg.spo", predicate = False)

[KnowledgeGraph] Loading spo from data/custom_taxo/clinia_kg.spo


In [37]:
import numpy as np
def get_entity_pos(tags):
    """Locate entity spans in a sequence of per-token tags.

    A token tagged "O" is outside any entity; every other tag belongs to an
    entity. A span is closed when an "O" tag is met, when the sequence ends,
    when a tag starting with "B-" opens a new entity, or when the entity type
    changes. Two entities that directly follow each other are therefore
    reported as two spans instead of being merged into one.

    Args:
        tags: Sequence of tag strings, e.g. ["O", "B-BUS", "I-BUS", "O"].

    Returns:
        List of (start, end) token indices, both inclusive, in order of
        appearance. Empty when there is no entity or `tags` is empty.
    """
    def _entity_type(tag):
        # "B-BUS" / "I-BUS" -> "BUS"; a tag without a one-character scheme
        # prefix is returned unchanged.
        prefix, sep, label = tag.partition("-")
        return label if sep and len(prefix) == 1 else tag

    entities_pos = []
    start = None          # index of the first token of the open span, if any
    current_type = None   # entity type of the open span
    for i, tag in enumerate(tags):
        if tag == "O":
            if start is not None:
                entities_pos.append((start, i - 1))
                start = None
            continue

        tag_type = _entity_type(tag)
        if start is None:
            start, current_type = i, tag_type
        elif tag.startswith("B-") or tag_type != current_type:
            # A new entity begins right after the previous one: close the
            # open span before starting the next.
            entities_pos.append((start, i - 1))
            start, current_type = i, tag_type

    # Flush a span that runs up to the last token.
    if start is not None:
        entities_pos.append((start, len(tags) - 1))
    return entities_pos



In [38]:
import re
# Parse the triple file into {subject: {"relations": [(relation, parent), ...]}}.
# The three fields of a triple are the double-quoted strings found on a line.
with open("data/custom_taxo/graph_triples.nt", 'r', encoding="utf-8") as g:
    graphs = {}
    for line in g:
        try:
            subj, relation, parent = re.findall(r'"(.*?)"', line)
        except ValueError:
            # The line does not hold exactly three quoted fields. Skip it;
            # falling through would re-insert the previous triple (or fail
            # on the very first line).
            print("Bad formatting, skipping.")
            continue

        graphs.setdefault(subj, {"relations": []})["relations"].append((relation, parent))
        

In [39]:
# Store, for every row, the entity spans computed from its tag sequence.
ner_data["entity_pos"] = ner_data["tag"].apply(get_entity_pos)

In [40]:
import random
def process_data(df:pd.DataFrame, taxo:dict):
    """Build one entity-typing example per entity span found in `df`.

    Args:
        df: Frame whose rows provide "text" (list of tokens), "tag" (list of
            tags, same positions as the tokens) and "entity_pos" (list of
            (start, end) token spans).
        taxo: Not read by this variant.

    Returns:
        List of dicts with the keys:
            "sent":       the tokens joined with single spaces,
            "subj_start": first token index of the entity,
            "subj_end":   second element of the span,
            "subj_label": tag of the first entity token with every
                          "<word character>-" prefix removed.
    """
    # NOTE: start/end are token indices; a subject/object pair is not
    # guaranteed to exist within a single sentence, so only the subject
    # is described here.
    examples = []
    for _, row in df.iterrows():
        examples.extend(
            {
                "sent": " ".join(row["text"]),
                "subj_start": span[0],
                "subj_end": span[1],
                "subj_label": re.sub(r"\w\-", "", row["tag"][span[0]]),
            }
            for span in row["entity_pos"]
        )
    return examples

In [33]:
import random
def process_data(df:pd.DataFrame, taxo:dict):
    """Build one relation-labelled example per entity span found in `df`.

    Args:
        df: Frame whose rows provide "text" (list of tokens), "tag" (list of
            tags, same positions as the tokens) and "entity_pos" (list of
            (start, end) inclusive token spans).
        taxo: Mapping from entity surface form to a dict holding a
            "relations" list of (relation, parent) pairs.

    Returns:
        List of dicts with the keys "token", "subj_start", "subj_end",
        "relation" and "subj_label". "relation" is drawn at random from the
        entity's relations, or "no_relation" when the lookup raises KeyError.
    """
    examples = []
    for _, row in df.iterrows():
        for span in row["entity_pos"]:
            tokens = row["text"]
            first, last = span[0], span[1]

            # Surface form of the entity, used as the taxonomy key.
            surface = " ".join(tokens[first: last + 1])
            try:
                relation, _parent = random.choice(taxo[surface]["relations"])
            except KeyError:
                relation = "no_relation"

            # Object fields (parent tokens appended after the subject) are
            # intentionally not emitted here.
            examples.append({
                "token": tokens,
                "subj_start": first,
                "subj_end": last,
                "relation": relation,
                "subj_label": re.sub(r"\w\-", "", row["tag"][first]),
            })
    return examples

In [41]:
import json
# Generate the examples from a deep copy of the loaded frame and write them
# to the JSON file of the split selected by `name`.
ner_data_copy = ner_data.copy(deep=True)
exs = process_data(ner_data_copy, graphs)
with open(f"data/ner_data/et_data/{name}.json", "w") as out_file:
    json.dump(exs, out_file)

In [None]:
def get_labels(examples):
    """Number the relation labels of `examples` in first-seen order.

    Args:
        examples: Iterable of dicts, each with a "relation" key.

    Returns:
        (labels2id, id2labels): the relation -> id mapping, with ids
        starting at 0, and its inverse.
    """
    labels2id = {}
    for entry in examples:
        # A label seen for the first time gets the next free id, which is
        # the number of labels registered so far.
        labels2id.setdefault(entry["relation"], len(labels2id))

    id2labels = dict((idx, label) for label, idx in labels2id.items())
    return labels2id, id2labels
# NOTE(review): get_labels reads entry["relation"], so `exs` must have been
# produced by a process_data variant that sets a "relation" key on every example.
labels2id, id2labels = get_labels(exs)

In [302]:
# Persist the relation -> id mapping as JSON.
with open("data/ner_data/custom_data/relations.json", "w") as f:
    json.dump(labels2id, f)

# Entity MLM

Another pretraining task that could be interesting would be a form of masked language model that would mask entire entities instead of single tokens. This could be seen as a form of contextual pretraining specialized over our data.

Let's first explore the RoBERTa tokenizer and how we could efficiently mask entities (recall that they vary in number of words).


In [3]:
from pytorch_transformers import RobertaTokenizer
from transformers import RobertaTokenizerFast
import torch
import pandas as pd, re
from ast import literal_eval

# Split to process; `name` is also used by later cells to build output paths.
name = "dev"
ner_data = pd.read_csv(f"data/ner_data/ser_bus_spc/en/{name}/data.csv", index_col=0)
# "text" and "tag" are read as strings and parsed back with literal_eval.
ner_data["text"] = ner_data["text"].apply(literal_eval)
ner_data["tag"] = ner_data["tag"].apply(literal_eval)

# Parse the triple file into {subject: {"relations": [(relation, parent), ...]}}.
# The three fields of a triple are the double-quoted strings found on a line.
with open("data/custom_taxo/graph_triples.nt", 'r', encoding="utf-8") as g:
    graphs = {}
    for line in g:
        try:
            subj, relation, parent = re.findall(r'"(.*?)"', line)
        except ValueError:
            # The line does not hold exactly three quoted fields. Skip it;
            # falling through would re-insert the previous triple (or fail
            # on the very first line).
            print("Bad formatting, skipping.")
            continue

        graphs.setdefault(subj, {"relations": []})["relations"].append((relation, parent))

# get_entity_pos is defined in an earlier cell of this notebook.
ner_data["entity_pos"] = ner_data["tag"].apply(get_entity_pos)
    

In [705]:
def process_data(df:pd.DataFrame):
    """Build the entity-MLM examples: one {"token", "label"} dict per entity span.

    Args:
        df: Frame whose rows provide "text", "tag" and "entity_pos".

    Returns:
        List of dicts holding the row's "text" under "token" and its "tag"
        under "label". A row contributes one dict per element of its
        "entity_pos", so rows without entities contribute nothing and rows
        with several entities contribute several equal dicts.
    """
    examples = []
    for _, row in df.iterrows():
        examples.extend(
            {"token": row["text"], "label": row["tag"]}
            for _span in row["entity_pos"]
        )
    return examples

In [706]:
import json
# NOTE(review): `ner_data_copy` is not created in this section; it is the deep
# copy made by the earlier export cell, so that cell has to be run first.
exs = process_data(ner_data_copy)
with open(f"data/ner_data/mlm_data/{name}.json", "w") as out_file:
    json.dump(exs, out_file)

In [4]:
# Fast RoBERTa tokenizer. add_prefix_space=True is set because the inputs
# below are passed already split into words (is_split_into_words=True).
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-large",add_prefix_space=True)

In [5]:
# Tokenise a single sample (row 3) from its word list, padded to 32 positions
# and returned as PyTorch tensors.
sample_row = ner_data.iloc[3]
text = sample_row["text"]
tags = sample_row["tag"]
inputs = tokenizer(
    text,
    return_tensors="pt",
    is_split_into_words=True,
    padding="max_length",
    max_length=32,
    add_special_tokens=True,
)

In [594]:
# Word index of every sub-token position; positions without a source word
# (word id None) are encoded as -1 so the list can become a tensor.
word_ids = torch.tensor(
    [-1 if word_id is None else word_id for word_id in inputs.word_ids()]
)
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

# Entity spans of the same sample. Idea noted here: choose the entity to mask
# with a coin flip, with a bonus when there is only one entity.
ent_pos = ner_data.iloc[3]["entity_pos"]


In [568]:
# Display the (start, end) word spans of the sample's entities.
ent_pos

[(9, 10), (14, 14)]

In [471]:
import torch
# Token-level baseline: every position is selected independently with
# probability 0.15 (unseeded, so the draw differs between runs).
mask = torch.rand(input_ids.size()) < 0.15


In [647]:
# One 0/1 mask per entity: a position is 1 when its word id lies inside the
# entity's [start, end] word span (both ends included).
entity_masks = [
    torch.logical_and(word_ids >= ent[0], word_ids <= ent[1]).int()
    for ent in ent_pos
]

# Adding the per-entity masks element-wise gives one mask covering every entity.
entity_mask = sum(entity_masks)

In [646]:
# Display the per-entity masks (one tensor per entity span).
entity_masks

[tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0], dtype=torch.int32),
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0], dtype=torch.int32)]

In [603]:
# Keep the token ids at entity positions and zero out all the others.
new = entity_mask * input_ids

In [608]:
# Sanity check: convert the surviving (non-zero) ids back to tokens to see
# which sub-tokens the entity mask selects.
tokenizer.convert_ids_to_tokens(new[new!=0])

['Ġbreastfeeding', 'Ġsupport', 'Ġcounselling']

In [644]:
# Masked ids at columns 11 and 12, i.e. the positions set to 1 in the first
# entity mask displayed above.
(input_ids * entity_mask)[:,11:13]

tensor([[24161,   323]])

In [None]:
# Indices of the positions that belong to any entity.
entity_index = torch.nonzero(entity_mask)

In [678]:
# Independent selection of every position with probability 0.5 (unseeded);
# squeeze() removes the size-1 dimensions of the result.
chosen_tensor = (torch.rand(input_ids.size()) < 0.5).squeeze()

In [679]:
# Positions that are both randomly selected and part of the first entity.
torch.logical_and(chosen_tensor, entity_masks[0]).int()

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0], dtype=torch.int32)

In [680]:
# Display the random selection itself.
chosen_tensor

tensor([ True, False, False,  True,  True, False,  True, False, False,  True,
        False, False,  True,  True, False,  True, False,  True,  True,  True,
         True, False, False,  True,  True,  True, False,  True, False, False,
         True, False])

In [684]:
# Whole-entity selection: if at least one position of the first entity was
# randomly selected, mark every position of that entity as selected.
if torch.logical_and(chosen_tensor, entity_masks[0]).int().any():
    chosen_tensor[entity_masks[0].bool()] = True

In [685]:
# Selection flags at the positions of the first entity.
chosen_tensor[entity_masks[0].bool()]

tensor([True, True])

# Named Entity Recognition using their data pipeline



In [12]:
import pandas as pd 
from ast import literal_eval

# Split to process; `name` is also used below to build the output path.
name = "test"

# The "text" and "tag" columns are read as strings and parsed back into Python
# objects with literal_eval right after loading.
ner_data = pd.read_csv(f"data/ner_data/ser_bus_spc/en/{name}/data.csv", index_col=0)
for column in ("text", "tag"):
    ner_data[column] = ner_data[column].apply(literal_eval)

In [13]:
import random
def process_data(df:pd.DataFrame):
    """Convert every row of `df` into one NER example.

    Args:
        df: Frame whose rows provide "text" and "tag".

    Returns:
        List with one dict per row, in row order, holding the row's "text"
        under "token" and its "tag" under "labels".
    """
    return [
        {"token": row["text"], "labels": row["tag"]}
        for _, row in df.iterrows()
    ]

In [14]:
import json
# Convert the loaded split and write the examples to its JSON file.
exs = process_data(ner_data)
with open(f"data/ner_data/fine_tuning/{name}.json", "w") as out_file:
    json.dump(exs, out_file)