# Data Preprocessing of Anatomy Track to fit GCN-Align Model


This notebook preprocesses the OAEI anatomy track dataset (mouse-human suite) into the input files expected by the GCN-Align model.

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive/; the output paths used further down
# in this notebook live below this mount point.
drive.mount('/content/gdrive/')

In [None]:
import os
import sys

In [None]:
# SPARQL query: for every resource typed owl:Class, select the class (?p)
# together with its rdfs:label (?label).
# Note: the name `q` is rebound to a different query further down in this notebook.
q = """
    SELECT ?label  ?p
    WHERE {
        ?p rdf:type owl:Class .

        ?p rdfs:label ?label .
    }
"""

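The functions below are specialised variants of the general preprocessing code: in addition to building the graph structures, they return the intermediate objects (position maps, rdflib graphs and aligned index lists) that are needed to write the GCN-Align input files.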

In [None]:
# Main
def create_pyg_data(kg_file_one, kg_file_two, alignment_file, query):
    """Parse two knowledge graphs and a reference alignment into index structures.

    Args:
        kg_file_one: Location of the first knowledge graph, passed to ``parse_kg_file``.
        kg_file_two: Location of the second knowledge graph, passed to ``parse_kg_file``.
        alignment_file: Location of the reference alignment, passed to
            ``parse_mapping_from_file``.
        query: SPARQL query forwarded to ``parse_kg_file`` for both graphs.

    Returns:
        An 8-tuple ``(resource_position_map_one, resource_position_map_two,
        rdflib_graph_one, rdflib_graph_two, resource_position_map_clone_one,
        resource_position_map_clone_two, left_indices_list, right_indices_list)``.
        The two index lists are plain Python lists holding, for every alignment
        entry, the position of its source entity in graph one and of its target
        entity in graph two.

    Raises:
        KeyError: If an aligned source/target entity is missing from the
            corresponding resource position map.
    """
    # Only the position map, the rdflib graph and the label-augmented clone are
    # needed here; the edge tensors and the property map are discarded.
    (_, resource_position_map_one, _, _,
     rdflib_graph_one, resource_position_map_clone_one) = parse_kg_file(kg_file_one, query)
    (_, resource_position_map_two, _, _,
     rdflib_graph_two, resource_position_map_clone_two) = parse_kg_file(kg_file_two, query)

    alignment, _onto1, _onto2, _extension = parse_mapping_from_file(alignment_file)

    left_indices_list = []
    right_indices_list = []
    for src, tgt, _rel, _conf in alignment:
        left_indices_list.append(resource_position_map_one[src])
        right_indices_list.append(resource_position_map_two[tgt])

    # The index lists are returned as lists; the tensor copies that used to be
    # built here were never returned or used and have been dropped.
    return (
        resource_position_map_one,
        resource_position_map_two,
        rdflib_graph_one,
        rdflib_graph_two,
        resource_position_map_clone_one,
        resource_position_map_clone_two,
        left_indices_list,
        right_indices_list,
    )


In [None]:
def parse_kg_file(kg_file, query):
    """Parse an RDF file into index maps and edge tensors.

    Args:
        kg_file: Location of the RDF file, handed to ``Graph.parse``.
        query: SPARQL query whose result rows expose the variables ``p`` and
            ``label``; it is used to collect one label per ``p``.

    Returns:
        A 6-tuple ``(edge_index, resource_position_map, edge_type,
        properties_position_map, rdflib_graph, resource_position_map_clone)``:

        * ``edge_index``: long tensor built from one ``[subject, object]`` index
          pair per URI-to-URI triple and then transposed.
        * ``resource_position_map``: resource URI string -> integer position.
        * ``edge_type``: long tensor with the property position of each edge.
        * ``properties_position_map``: property URI string -> integer position.
        * ``rdflib_graph``: the parsed graph.
        * ``resource_position_map_clone``: copy of ``resource_position_map`` in
          which every ``p`` returned by ``query`` maps to its label instead of
          its position.
    """
    rdflib_graph = Graph()
    rdflib_graph.parse(kg_file)

    resources = set()
    properties = set()
    for subj, pred, obj in rdflib_graph:
        if type(subj) is URIRef:
            resources.add(subj.toPython())
        if type(pred) is URIRef:
            properties.add(pred.toPython())
        if type(obj) is URIRef:
            resources.add(obj.toPython())

    # Use the query that was passed in rather than the notebook-global `q`,
    # which is rebound to a different query later in the notebook.
    # If several rows share the same `p`, the last label wins.
    labels_dict = {
        row["p"].toPython(): row["label"].toPython()
        for row in rdflib_graph.query(query)
    }

    # Sort before enumerating: iteration order of a set of strings changes
    # between interpreter sessions (hash randomisation), which would make ids
    # written to disk in different sessions inconsistent with each other.
    properties_position_map = {element: index for index, element in enumerate(sorted(properties))}
    resource_position_map = {element: index for index, element in enumerate(sorted(resources))}

    resource_position_map_clone = resource_position_map.copy()
    resource_position_map_clone.update(labels_dict)

    edge_index_list = []
    edge_type_list = []
    for subj, pred, obj in rdflib_graph:
        if type(subj) is URIRef and type(obj) is URIRef:
            edge_index_list.append([resource_position_map[subj.toPython()], resource_position_map[obj.toPython()]])
            edge_type_list.append(properties_position_map[pred.toPython()])

    edge_index_tensor = torch.tensor(edge_index_list, dtype=torch.long)
    edge_type = torch.tensor(edge_type_list, dtype=torch.long)

    edge_index = edge_index_tensor.t().contiguous()

    # The label-embedding step that used to follow here depended on a global
    # `model` and its result was neither returned nor used, so it was removed.
    return edge_index, resource_position_map, edge_type, properties_position_map, rdflib_graph, resource_position_map_clone

In [None]:
# Fetch the source ontology, the target ontology and the reference alignment
# of the anatomy track; each entry is (remote URL, local file name).
for remote_url, local_path in (
    ("http://oaei.webdatacommons.org/tdrs/testdata/persistent/anatomy_track/anatomy_track-default/suite/mouse-human-suite/component/source/", "./a_source.rdf"),
    ("http://oaei.webdatacommons.org/tdrs/testdata/persistent/anatomy_track/anatomy_track-default/suite/mouse-human-suite/component/target/", "./a_target.rdf"),
    ("http://oaei.webdatacommons.org/tdrs/testdata/persistent/anatomy_track/anatomy_track-default/suite/mouse-human-suite/component/reference/", "./a_ref.rdf"),
):
    download_url(remote_url, local_path)

In [None]:
# Parse both ontologies and the reference alignment downloaded above.
(
    resource_position_map_one,
    resource_position_map_two,
    graph_one,
    graph_two,
    clone_one,
    clone_two,
    left_indices,
    right_indices,
) = create_pyg_data(
    "./a_source.rdf",
    "./a_target.rdf",
    "./a_ref.rdf",
    q,
)

In [None]:
# Output directory on the mounted Google Drive; the (currently commented-out)
# export cells below write their files into it. The path is user-specific and
# must be adapted when running the notebook elsewhere.
base_path = "/content/gdrive/My Drive/1_Studium/2_Master/Master Thesis/2_Projekt/Baseline/data_anatomy/"

In [None]:
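# 1. Entity ids: write the index/URI table of the second graph to ent_ids_2.txt (disabled; uncomment to regenerate)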

#resource_position_map_one_rev = dict((element, index) for (element, index) in enumerate(resource_position_map_one))
#resource_position_map_two_rev = dict((element, index) for (element, index) in enumerate(resource_position_map_two))

#with open(base_path + 'ent_ids_2.txt', 'w') as f: 
#    for key, value in resource_position_map_two_rev.items(): 
#        f.write('%s         %s\n' % (key, value))

In [None]:
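# 2. Reference alignment: write the aligned (left index, right index) pairs to ref_ent_ids.txt (disabled; uncomment to regenerate)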

#with open(base_path + 'ref_ent_ids.txt', 'w') as f: 
#    for key, value in zip(left_indices, right_indices):
#       f.write('%s         %s\n' % (key, value))

In [None]:
# Start from an empty mapping; it is filled from graph_two in the following cell.
attrs = dict()

In [None]:
# Collect, for every subject of the second graph, the predicates it occurs
# with (both in N3 notation). This is done in a single pass: the previous
# version only worked after a manual first run with a different loop body and
# kept at most two predicates per subject.
for subj, pred, obj in graph_two:
  subject_key = subj.n3()
  predicate_key = pred.n3()

  predicates = attrs.get(subject_key)
  if predicates is None:
    predicates = attrs[subject_key] = []
  elif not isinstance(predicates, list):
    # Tolerate a bare value left over from an earlier run of the old workflow.
    predicates = attrs[subject_key] = [predicates]

  # Each predicate is listed once per subject; this also keeps the cell
  # idempotent when it is re-run without resetting `attrs`.
  if predicate_key not in predicates:
    predicates.append(predicate_key)

In [None]:
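# 3. Attributes: write each subject in `attrs` with its predicates to training_attrs_2.txt (disabled; uncomment to regenerate)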
#with open(base_path + 'training_attrs_2.txt', 'w') as f:
#  for key, value in attrs.items():
#    f.write('%s \t %s\n' % (key, "\t ".join( e for e in value )))

In [None]:
# 4. triples of connections
# SPARQL query selecting every distinct (?r, ?p, ?o) triple of a graph.
# NOTE: this rebinds `q`, which previously held the class/label query. The call
# create_pyg_data(..., q) further up expects that earlier query, so re-run the
# earlier `q` cell before executing that call again.
q = """
    SELECT distinct ?r ?p ?o
    WHERE {
        ?r ?p ?o.
    }
"""

In [None]:
# Number the keys (resource URIs) of the first position map in iteration order: {running index: URI}.
resource_position_map_one_rev = dict(enumerate(resource_position_map_one))

In [None]:
# creation of look up table for edges
distinct = []
for r in graph_two.query(q):
  distinct.append(r["p"].toPython())

In [None]:
# Give every predicate a consecutive integer id, numbered in order of first appearance.
d_graph_two = {}
for predicate in distinct:
  if predicate not in d_graph_two:
    d_graph_two[predicate] = len(d_graph_two)

In [None]:
# Translate every triple of the second graph into [subject id, predicate id,
# object id]. Rows whose subject or object has no entry in
# resource_position_map_two cannot be translated and are skipped.
triple_list_two = []
skipped_rows_two = 0
for r in graph_two.query(q):
  try:
    res = resource_position_map_two[r["r"].toPython()]
    pred = d_graph_two[r["p"].toPython()]
    obj = resource_position_map_two[r["o"].toPython()]
  except (KeyError, TypeError):
    # KeyError: value is not a known key; TypeError: value is unhashable.
    # Anything else (including KeyboardInterrupt) is no longer swallowed.
    skipped_rows_two += 1
    continue
  triple_list_two.append([res, pred, obj])

print('%d triples kept, %d rows skipped' % (len(triple_list_two), skipped_rows_two))


In [None]:
#with open(base_path + 'triples_2.txt', 'w') as f:
#  for value in triple_list_two:
#    f.write('%s \n' % ("\t ".join( str(e) for e in value )))

In [None]:
# Attribute files of both graphs; they live in the output directory defined by
# `base_path`, so the directory is configured in a single place.
att_1 = os.path.join(base_path, "training_attrs_1.txt")
att_2 = os.path.join(base_path, "training_attrs_2.txt")

In [None]:
# Load the second attribute file as a list of lines (line endings included).
with open(att_2) as attrs_file_two:
     contents2 = list(attrs_file_two)

In [None]:
# Read the first attribute file in this cell: no other cell of the notebook
# defines `contents`, so the clean-up below would fail on a fresh kernel.
with open(att_1) as f:
  contents = f.readlines()

# Strip the N3 angle brackets, tab characters and the trailing newline from every line.
attrs_1 = [el.replace("<", "").replace(">", "").replace("\t", "").replace('\n', '') for el in contents]

In [None]:
# Remove angle brackets, tab characters and newlines from every line of the second attribute file.
unwanted_chars = str.maketrans("", "", "<>\t\n")
attrs_2 = [line.translate(unwanted_chars) for line in contents2]

In [None]:
# Overwrite the first attribute file with the cleaned lines, one per line.
with open(att_1, 'w') as out_file:
  out_file.writelines(line + '\n' for line in attrs_1)

In [None]:
# Overwrite the second attribute file with the cleaned lines, one per line.
with open(att_2, 'w') as out_file:
  out_file.writelines(line + '\n' for line in attrs_2)