# Data preparation for relation classification

This script aims to transform our graph triples into training files that can be ingested by the DataProcessors required for this repo. The files created are especially meant to be used for relation classification purposes. Because we don't want to use our NER training sentences, we are going to use the object and the subject of a triple in a next sentence prediction task (<s>object</s></s>subject</s>). The goal here will be to classify the relation between the two segments rather than telling if they are simply related. We hope it's going to help clustering realted triples together in the embedding space.

In [27]:
# Load graph triples
with open("data/custom_taxo/clinia_triples.spo", "r") as g:
    graph = {}
    entities = set()
    for line in g:
        try:
            subj, relation, obj = line.strip().split("\t")
        except:
            print("Bad formatting, skipping.")
        
        entities.add(subj)
        entities.add(obj)
        if subj not in graph.keys():
            graph[subj] = {}
            graph[subj]["relations"] = [(relation, obj)]
        else:
            graph[subj]["relations"].append((relation, obj))

Bad formatting, skipping.
Bad formatting, skipping.


In [22]:
import random

# Add disjoint triples for negative sampling - FROM RANDOM WORDS

with open("data/random_words/words.txt", "r") as f:
    words = set()
    for line in f:
        w = line.strip().lower()
        if w not in entities:
            words.add(w)

words = list(words) # Sets are not subscriptable

n_triples = 1000 # since we use 2 relations for each subj and we have about 5000 triples
disjoint_triples = {}
for _ in range(n_triples):
    subj = random.choice(words)
    obj1 = random.choice(words)
    obj2 = random.choice(words)
    
    if subj != obj1 and subj != obj2 and obj1 != obj2:
        disjoint_triples[subj] = {}
        disjoint_triples[subj]["relations"] = [("disjoint with", obj1), ("disjoint with", obj2)]

    
# Add disjoint triples to graph
graph = {**graph, **disjoint_triples}

In [28]:
import random

# Add disjoint triples for negative sampling - FROM TAXONOMY INSTANCES

def verify_triple_existance(subj, obj, graph, reccursion_depth=0):
    reccursion_depth += 1
    if reccursion_depth == 3:
        return False
    elif subj not in graph.keys() and obj not in graph.keys():
        # search the whole graph if obj and subj are related by a common subject
        for s, rel in graph.items():
            rel_obj = {item[1] for item in rel["relations"] if item[0] != "disjoint with"}
            if obj in rel_obj and subj in rel_obj:
                return True
        return False
    elif subj not in graph.keys():
            # Check if obj has subj as an object
            return verify_triple_existance(obj, subj, graph, reccursion_depth)
    elif subj == obj:
        return True
    else:
        rel_obj = {item[1] for item in graph[subj]["relations"] if item[0] != "disjoint with"}
        if obj not in rel_obj and verify_triple_existance(obj, subj, graph, reccursion_depth):
            # Check wether obj is within objecs of that subject AND that the opposite triple either doesent exist OR doesnt contain the opposite triple
            return False
        else:
            return True

g = {"test": {"relations":[("rel", "bbb"),("rel", "aaa")]},"ccc": {"relations":[("rel", "ccc"),("disjoint with", "ddd")]}}

print(verify_triple_existance("test", "ccc", g, 0))

words = list(entities) # Sets are not subscriptable

n_triples = 2000 # since we use 2 relations for each subj and we have about 5000 triples
disjoint_triples = {}
for _ in range(n_triples):
    subj, obj1, obj2 = random.sample(words, 3)
    
    # Verify that the triples dont exist
    exist_1 = verify_triple_existance(subj, obj1, graph)
    exist_2 = verify_triple_existance(subj, obj2, graph)
    exist_3 = verify_triple_existance(obj1, obj2, graph)

    if not exist_1 and not exist_2 and not exist_3:
        if subj not in graph.keys():
            graph[subj] = {}
            graph[subj]["relations"] = [("disjoint with", obj1), ("disjoint with", obj2)]
        else:
            graph[subj]["relations"].append(("disjoint with", obj1))
            graph[subj]["relations"].append(("disjoint with", obj2))


    


False


## Statistics

## predicates

In [29]:
# List relations
pred_set = set()
for _, relations in graph.items():
    for predicate, obj in relations["relations"]:
        pred_set.add(predicate)

print(list(pred_set))

['range', 'alt label', 'can be service given by institution organization', 'can be person doing intervention', 'is condition treated by institution organization', 'inverse of', 'treats', 'is drug treats condition', 'is discipline treats sign symptom', 'is technology used to perform treatment', 'abbreviation', 'is anatomy affected by condition', 'is partof', 'is condition treated by discipline', 'has primary target population', 'hidden label', 'domain', 'is institution organization treating condition', 'is condition treated by intervention', 'is intervention done by institution organization', 'scope', 'is discipline treats condition', 'disjoint with', 'is sign symptom of condition', 'can be condition treated by institution organization', 'can have target population', 'comment', 'is subspecialty of discipline', 'is discipline of subspecialty', 'is service given by institution organization', 'can be institution organization treating condition', 'is intervention done by person', 'is drug p

In [30]:
from collections import Counter

# predicat distribution analysis
pred_set = list()
for _, relations in graph.items():
    for predicate, obj in relations["relations"]:
        pred_set.append(predicate)
        
results = dict(Counter(pred_set))

results = dict(sorted(results.items(), key=lambda item: item[1],reverse=True))
total = sum([n for key, n in results.items()])

print(total)
print(results)


4903
{'disjoint with': 1110, 'alt label': 613, 'is condition treated by intervention': 481, 'treats': 481, 'is condition treated by discipline': 355, 'is discipline treats condition': 355, 'is condition treated by drug': 196, 'is drug treats condition': 195, 'can be person doing intervention': 158, 'can be intervention done by person': 158, 'hidden label': 138, 'is subspecialty of discipline': 105, 'is discipline of subspecialty': 105, 'is person of discipline': 74, 'is discipline of person': 74, 'is drug prescribed by person': 60, 'abbreviation': 48, 'is condition treated by institution organization': 30, 'is institution organization treating condition': 30, 'is sign symptom of condition': 21, 'domain': 20, 'has primary target population': 17, 'range': 15, 'inverse of': 11, 'can be condition treated by institution organization': 10, 'pref label': 10, 'can be institution organization treating condition': 10, 'is anatomy affected by condition': 6, 'is discipline treats sign symptom': 2,

## objects

In [None]:
from collections import Counter

# predicat distribution analysis
obj_set = list()
for _, relations in graph.items():
    for predicate, obj in relations["relations"]:
        obj_set.append(obj)
        
results = dict(Counter(obj_set))

results = dict(sorted(results.items(), key=lambda item: item[1],reverse=True))
total = sum([n for key, n in results.items()])

print(total)
print(results)

# Exports

In [21]:
import pandas as pd

# Instances set for visualization
instance_set = set()
for subj, relations in graph.items():
    instance_set.add(subj)
    for predicate, obj in relations["relations"]:
        instance_set.add(obj)
        
df = pd.DataFrame(instance_set)

df.to_csv("tensorboard/data/facets/all/name.csv", header=None, index=False)

In [31]:
import json
import random

# Shuffle triples
keys = list(graph.keys())
random.shuffle(keys)

shuffled_graph = {k:graph[k] for k in keys}

# Split triples into train, dev and test sets
graph_splits = dict()
splits = [0.9, 0.1, 0.0]
graph_splits["train"] = dict(list(shuffled_graph.items())[: int(len(shuffled_graph) * splits[0])])
graph_splits["dev"] = dict(list(shuffled_graph.items())[int(len(shuffled_graph) * splits[0]) : int(len(shuffled_graph) * (splits[0] + splits[1]))])
graph_splits["test"] = dict(list(shuffled_graph.items())[int(len(shuffled_graph) * (splits[0] + splits[1])) :])


# Export files
task = "mse"
for name, graph in graph_splits.items():
    print(len(graph))
    with open("data/graph_data/{}_data/{}.json".format(task,name), "w") as f:
        json.dump(graph, f)

808
90
0
