# Postprocessing

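In this notebook, the matches produced by the model are post-processed: additional matches are generated through string similarity (Levenshtein distance between entity labels) and merged with the model's matches.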

## Imports & Data

In [None]:
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [None]:
!pip install sentence-transformers
!pip install rdflib
import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

! pip install torchmetrics

In [None]:
!pip install jellyfish

In [None]:
import torch.nn.functional as F
import os
import torch
import numpy as np
import random


from typing import List, Callable

from torch import Tensor, device
import pandas as pd
import pickle
from rdflib import Graph, Literal, URIRef


In [None]:
# Pick the dataset variant, switch to the data directory and load the stored data object.
use_case = "anatomy"
a_or_b = "case_a"
os.chdir("/content/gdrive/My Drive/1_Studium/2_Master/Master Thesis/2_Projekt/Scratch/all_in/data")
data = torch.load(f"{use_case}/{a_or_b}.pt")

In [None]:
PATH = "/content/gdrive/My Drive/1_Studium/2_Master/Master Thesis/2_Projekt/Scratch/all_in/MAGNET/results/trainings/GCN/"

In [None]:
# Read the pickled model matches back from disk.
# NOTE: pickle.load can execute arbitrary code - only load files you created yourself.
with open(PATH + "matchesV13_case_a", "rb") as match_file:
    matches_model = pickle.load(match_file)

In [None]:
final_matches = matches_model[998]

## Functions

In [None]:
def _create_graph(file):
    """Parse `file` into a new rdflib Graph and return the result of the parse call."""
    parsed = Graph().parse(file)
    return parsed

## String Equivalence Finder

In [None]:
# Data 
data_path = "/content/gdrive/My Drive/1_Studium/2_Master/Master Thesis/2_Projekt/Data/knowledge_graphs/"

In [None]:
res_map_two = data.res_map_two.copy()

In [None]:
# Keep only the entries of res_map_two whose key contains 'NCI'.
res_map_two_to_keep = {
    uri: index
    for uri, index in res_map_two.items()
    if 'NCI' in uri
}

#### Plan:

##### 1. Get the label of every element in res_map and store it under that element's index

In [None]:
# Parse the source and target knowledge graphs of the anatomy use case.
anatomy_dir = data_path + "anatomy/"
g_source = _create_graph(anatomy_dir + "source.xml")
g_target = _create_graph(anatomy_dir + "target.xml")

In [None]:
labels = []

In [None]:
# Exploratory lookup: print every subject that has "areola" (in any casing) as an object.
for subj, pred, obj in g_target:
  if obj.toPython().lower() != "areola":
    continue
  print(subj)

http://human.owl#genid6441
http://human.owl#NCI_C12368
http://human.owl#genid6440


In [None]:
attribute_map_source = {}
attribute_map_target = {}

In [None]:
preds = set()

In [None]:
# In case of running again - check at what index the label is
# (a set does not guarantee the same iteration order on every run).
preds.update(predicate for _subject, predicate, _object in g_source)

In [None]:
list(preds)

[rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#label'),
 rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
 rdflib.term.URIRef('http://www.w3.org/2002/07/owl#someValuesFrom'),
 rdflib.term.URIRef('http://www.geneontology.org/formats/oboInOwl#savedBy'),
 rdflib.term.URIRef('http://www.geneontology.org/formats/oboInOwl#hasDate'),
 rdflib.term.URIRef('http://www.w3.org/2002/07/owl#onProperty'),
 rdflib.term.URIRef('http://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym'),
 rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf'),
 rdflib.term.URIRef('http://www.geneontology.org/formats/oboInOwl#hasAlternativeId'),
 rdflib.term.URIRef('http://www.geneontology.org/formats/oboInOwl#hasDbXref'),
 rdflib.term.URIRef('http://www.geneontology.org/formats/oboInOwl#hasDefaultNamespace')]

In [None]:
# Map each source index (looked up in data.res_map_one) to the rdfs:label of its entity.
# The label predicate is named explicitly instead of being read from list(preds)[0]:
# preds is a set, so the position of the label predicate can change between runs.
label_predicate = URIRef('http://www.w3.org/2000/01/rdf-schema#label')
for subj, pred, obj in g_source:
  if pred != label_predicate:
    continue
  try:
    attribute_map_source[data.res_map_one[subj.toPython()]] = obj.toPython()
  except KeyError:
    # The subject has no entry in res_map_one, so there is no index to attach the label to.
    pass

In [None]:
# Map each target index (looked up in data.res_map_two) to the rdfs:label of its entity.
# The label predicate is named explicitly instead of being read from list(preds)[0]:
# preds is a set, so the position of the label predicate can change between runs.
label_predicate = URIRef('http://www.w3.org/2000/01/rdf-schema#label')
for subj, pred, obj in g_target:
  if pred != label_predicate:
    continue
  try:
    attribute_map_target[data.res_map_two[subj.toPython()]] = obj.toPython()
  except KeyError:
    # The subject has no entry in res_map_two, so there is no index to attach the label to.
    pass

In [None]:
# Restrict the target labels to the indices kept in res_map_two_to_keep.
attribute_map_target_processed = {
    index: attribute_map_target[index]
    for index in res_map_two_to_keep.values()
}

In [None]:
len(attribute_map_target_processed)

3298

##### 2. Clean the text (lower-casing) to make it easier to find equivalences

In [None]:
import re

In [None]:
# source
#attribute_map_source_clean = {key: re.sub(r"[^a-zA-Z0-9 ]", " ", str(item)) for key, item in attribute_map_source.items()}
#attribute_map_source_clean_lower = {key: item.lower() for key, item in attribute_map_source_clean.items()}

In [None]:
# Lower-case the source labels so that casing does not affect the comparison.
attribute_map_source_lower = {
    index: label.lower()
    for index, label in attribute_map_source.items()
}

In [None]:
# target
#attribute_map_target_clean = {key: re.sub(r"[^a-zA-Z0-9 ]", " ", str(item)) for key, item in attribute_map_target.items()}
#attribute_map_target_clean_lower = {key: item.lower() for key, item in attribute_map_target_clean.items()}

In [None]:
# Lower-case the filtered target labels so that casing does not affect the comparison.
attribute_map_target_lower = {
    index: label.lower()
    for index, label in attribute_map_target_processed.items()
}

#### 3. Matchmaker

In [None]:
# Function to get string similarity

In [None]:
import jellyfish

In [None]:
jellyfish.levenshtein_distance('foot', 'f')

3

Check how it works

In [None]:
# Compare every source label with every target label and keep the index pairs
# whose labels are at most MAX_EDIT_DISTANCE edits apart.
MAX_EDIT_DISTANCE = 1

matches = []

for key_source, value_source in attribute_map_source_lower.items():
  len_source = len(value_source)
  for key_target, value_target in attribute_map_target_lower.items():
    # The Levenshtein distance is never smaller than the difference in length,
    # so these pairs can be rejected without running the expensive comparison.
    if abs(len_source - len(value_target)) > MAX_EDIT_DISTANCE:
      continue
    if jellyfish.levenshtein_distance(value_source, value_target) <= MAX_EDIT_DISTANCE:
      matches.append([key_source, key_target])


In [None]:
len(matches)

748

In [None]:
# Keep only the first two entries of every model match.
final_matches_reduced = [entry[:2] for entry in final_matches]

In [None]:
# Count how many of the model's matches were also found by the string matcher.
match_ticker = sum(1 for pair in final_matches_reduced if pair in matches)

In [None]:
match_ticker

573

In [None]:
len(final_matches_reduced)

1768

In [None]:
merged = final_matches_reduced.copy()

In [None]:
# Add the string-based matches that are not in the merged list yet.
# Checking against `merged` (which starts out as a copy of final_matches_reduced)
# rather than final_matches_reduced keeps this cell idempotent: running it a
# second time does not append the same pairs again.
for m in matches:
  if m not in merged:
    merged.append(m)

In [None]:
len(merged)

1943

In [None]:
len(matches)

748

In [None]:
# list.append returns None and would insert the whole `matches` list as one nested
# element of final_matches_reduced; build a new concatenated list instead.
all_in_all = final_matches_reduced + matches

In [None]:
final_matches_reduced

In [None]:
len(merged)

1943

In [None]:
# Pair data.val_set_left and data.val_set_right element-wise as [left, right] lists.
gold_standard_val = [
    [left, right]
    for left, right in zip(data.val_set_left.tolist(), data.val_set_right.tolist())
]

In [None]:
# Pair data.test_set_left and data.test_set_right element-wise as [left, right] lists.
gold_standard_test = [
    [left, right]
    for left, right in zip(data.test_set_left.tolist(), data.test_set_right.tolist())
]

In [None]:
# Pair data.left_indices and data.right_indices element-wise as [left, right] lists.
gold_standard = [
    [left, right]
    for left, right in zip(data.left_indices.tolist(), data.right_indices.tolist())
]

In [None]:
gold_standard

In [None]:
def calc_prec_rec(matches_all, gold_standard):
    """Compute precision and recall of predicted matches against a gold standard.

    Only predictions that share at least one member with some gold-standard
    pair are scored; every other prediction is ignored.

    Args:
        matches_all: iterable of predicted pairs, e.g. ``[left, right]`` lists.
        gold_standard: list of reference pairs in the same format.

    Returns:
        Tuple ``(precision, recall)``. A value is 0 when its denominator is 0
        (no prediction was scored, or the gold standard is empty).

    Side effects:
        Prints the number of false positives.
    """
    match_count = 0
    false_positives = 0

    for match in matches_all:
        # A match is only scored if it contains a member of the gold standard.
        touches_gold = (
            any(match[0] in sublist for sublist in gold_standard)
            or any(match[1] in sublist for sublist in gold_standard)
        )
        if not touches_gold:
            continue
        if match in gold_standard:
            match_count += 1  # prediction is correct
        else:
            false_positives += 1  # a prediction was made, but it is wrong

    # All gold-standard pairs that were not found.
    false_negatives = len(gold_standard) - match_count
    print(false_positives)

    # A low precision means too many pairs are selected.
    scored = match_count + false_positives
    prec = match_count / scored if scored else 0

    # A low recall means too few of the actual pairs are selected.
    relevant = match_count + false_negatives
    recall = match_count / relevant if relevant else 0
    return prec, recall

In [None]:
prec, recall = calc_prec_rec(final_matches_reduced, gold_standard_val)

88


In [None]:
prec

0.453416149068323

In [None]:
recall

0.48026315789473684

In [None]:
prec, recall = calc_prec_rec(final_matches_reduced, gold_standard)

576


In [None]:
prec

0.6579572446555819

In [None]:
recall

0.7308707124010554

In [None]:
prec, recall = calc_prec_rec(final_matches_reduced, gold_standard_val)

In [None]:
# with 0 diff and no preprocessing
# NOTE(review): the matcher cell visible in this notebook accepts a distance of <= 1 - confirm this note is still current.
prec, recall = calc_prec_rec(merged, gold_standard_val)

99


In [None]:
# NOTE(review): the prec/recall values displayed after this cell (0.519 / 0.704) differ from the
# ones shown earlier for this same call (0.453 / 0.480); they look like results of the `merged`
# evaluation, i.e. the cells were probably run out of order - re-run top to bottom to confirm.
prec, recall = calc_prec_rec(final_matches_reduced, gold_standard_val)

In [None]:
prec

0.5194174757281553

In [None]:
recall

0.7039473684210527

In [None]:
prec, recall = calc_prec_rec(merged, gold_standard)

609


In [None]:
prec

0.6711663066954644

In [None]:
recall

0.8199208443271768

## LookUp for Match Up (Evaluation)


In [None]:
# Print every object of the target-graph triples whose subject is contained in `partner`.
# NOTE(review): `partner` is not defined in any cell visible here - it must be defined
# (presumably as a collection of target URIs to inspect) before this cell can run.
for subj, pred, obj in g_target:
  if subj.toPython() in partner:
    print(obj)

http://human.owl#genid6995
A primitive, undifferentiated blood cell which can undergo division and will give rise to a cell in the basophil lineage.
Receptor Cell
Body of Penis
http://www.w3.org/2002/07/owl#Class
Stapedius Muscle
Left_Ovary
http://human.owl#NCI_C12404
