# Data preparation for relation classification

This script transforms our graph triples into training files that can be ingested by the DataProcessors required by this repo. The files created are specifically meant to be used for relation classification. Because we don't want to use our NER training sentences, we use the object and the subject of a triple as the two segments of a next-sentence-prediction-style input (`<s>object</s></s>subject</s>`). The goal here is to classify the relation between the two segments rather than simply telling whether they are related. We hope this will help cluster related triples together in the embedding space.

In [1]:
# Load graph triples
# Builds `graph`, mapping each subject to {"relations": [(relation, parent), ...]},
# from a file holding one tab-separated triple (subject, relation, parent) per line.
with open("data/custom_taxo/clinia_triples.spo", "r") as g:
    graph = {}
    for line in g:
        try:
            subj, relation, parent = line.strip().split("\t")
        except ValueError:
            # The line did not split into exactly three tab-separated fields
            # (for instance a blank line). Skip it for real: without the
            # `continue`, the values unpacked from the previous line would be
            # appended a second time (or be undefined on the first line).
            print("Bad formatting, skipping.")
            continue

        # Create the subject's entry on first sight, then record the triple.
        graph.setdefault(subj, {"relations": []})["relations"].append((relation, parent))

Bad formatting, skipping.
Bad formatting, skipping.


In [2]:
# List relations
# Gather every distinct predicate found across all subjects' relation lists.
pred_set = {
    predicate
    for entry in graph.values()
    for predicate, _obj in entry["relations"]
}

# Shown as a list; a set has no defined ordering, so the order is not meaningful.
print(list(pred_set))

['is person treats condition', 'is discipline of subspecialty', 'disjoint with', 'inverse of', 'is condition treated by discipline', 'is partof', 'is person of discipline', 'feminin', 'alt label', 'type', 'sub class of', 'abbreviation', 'is discipline treats condition', 'is condition treated by person', 'is discipline of person', 'is condition treated by intervention', 'domain', 'treats', 'is sign symptom of condition', 'is discipline treats sign symptom', 'is condition treated by drug', 'scope', 'range', 'is drug treats condition', 'hidden label', 'pref label', 'studies', 'is subspecialty of discipline', 'is anatomy affected by condition']


In [3]:
import json

# Split triples into train, dev and test sets
# `splits` holds the (train, dev, test) fractions. Only the first two are used
# to compute the cut points; the test split receives whatever remains after
# train + dev. Subjects are taken in the graph's existing order (no shuffling).
graph_splits = dict()
splits = [0.8, 0.2, 0.0]

# Materialise the (subject, entry) pairs once instead of re-listing per slice.
graph_items = list(graph.items())
train_end = int(len(graph_items) * splits[0])
dev_end = int(len(graph_items) * (splits[0] + splits[1]))

graph_splits["train"] = dict(graph_items[:train_end])
graph_splits["dev"] = dict(graph_items[train_end:dev_end])
graph_splits["test"] = dict(graph_items[dev_end:])


# Export files
# The loop variable is deliberately NOT named `graph`: reusing that name would
# overwrite the full graph with the last split, so re-running this cell (or any
# later cell reading `graph`) would silently operate on the wrong data.
for split_name, split_graph in graph_splits.items():
    print(len(split_graph))
    with open("data/graph_data/rc_data/{}.json".format(split_name), "w") as f:
        # Note: the (relation, parent) tuples are written as JSON arrays.
        json.dump(split_graph, f)

1078
270
0
