In [1]:
import json
from sklearn.model_selection import train_test_split
from helper_classes.experiment import Experiment
from helper_classes.dataloader import CLPDataLoader
import torch
import pandas as pd

In [2]:
def show_tbox_and_abox_size(kb):
    """
    Print one example statement and the number of statements of the ABox and of the TBox of a KB.

    The triples are read from ./Datasets/<kb>/Triples/train.txt (one tab-separated triple per line,
    predicate in the second column) and classified by their predicate:
    - TBox: the predicate contains 'subClassOf' or 'equivalentClass'
    - ABox: the predicate contains 'rdf-syntax-ns#type', or the text after its last '#' is the name
      of an object or data property of the ontology ./Datasets/<kb>/<kb>.owl
    Lines without a second column are ignored.

    args:
    ------
    -kb: name of the KB folder under ./Datasets (e.g. 'carcinogenesis')
    """
    from ontolearn import KnowledgeBase
    import random
    print()
    print('#'*50)
    print(kb.upper())
    print('#'*50)
    with open('./Datasets/'+kb+'/Triples/train.txt') as file:
        triples = file.readlines()

    # Separate name for the loaded KB so that the string parameter `kb` is not overwritten
    knowledge_base = KnowledgeBase(path='./Datasets/'+kb+'/'+kb+'.owl')
    ontology = knowledge_base.ontology()
    # A set gives constant-time membership tests in the loop below
    rels = {rel.get_iri().get_remainder() for rel in ontology.object_properties_in_signature()} |\
    {rel.get_iri().get_remainder() for rel in ontology.data_properties_in_signature()}
    abox, tbox = [], []
    for line in triples:
        fields = line.split('\t')
        if len(fields) < 2:
            # No predicate column (e.g. blank line)
            continue
        predicate = fields[1]
        if 'subClassOf' in predicate or 'equivalentClass' in predicate:
            tbox.append(line)
        # A single combined test so that a triple is never counted twice in the ABox
        if predicate.split('#')[-1] in rels or 'rdf-syntax-ns#type' in predicate:
            abox.append(line)
    print()
    # random.choice raises IndexError on an empty list, so fall back to a placeholder
    print('Example ABox statement: {}'.format(random.choice(abox) if abox else 'n/a'))
    print('Example TBox statement: {}'.format(random.choice(tbox) if tbox else 'n/a'))
    print('\nCardinality of ABox: {}, TBox: {}'.format(len(abox), len(tbox)))

In [3]:
# Report the ABox/TBox sizes of each of the four knowledge bases
for kb in ('carcinogenesis', 'mutagenesis', 'semantic_bible', 'vicodi'):
    show_tbox_and_abox_size(kb)


##################################################
CARCINOGENESIS
##################################################





Example ABox statement: carcinogenesis#d280_22	22-rdf-syntax-ns#type	carcinogenesis#Hydrogen-3

Example TBox statement: carcinogenesis#Calcium-84	rdf-schema#subClassOf	carcinogenesis#Calcium


Cardinality of ABox: 96757, TBox: 138

##################################################
MUTAGENESIS
##################################################

Example ABox statement: mutagenesis#bond5465	22-rdf-syntax-ns#type	mutagenesis#Bond-1

Example TBox statement: mutagenesis#Methyl	rdf-schema#subClassOf	mutagenesis#RingStructure


Cardinality of ABox: 61965, TBox: 82

##################################################
SEMANTIC_BIBLE
##################################################

Example ABox statement: NTNames#Tryphosa	NTNames#religiousBelief	NTNames#Christianity

Example TBox statement: NTNames#Region	rdf-schema#subClassOf	NTNames#Object


Cardinality of ABox: 3211, TBox: 51

##################################################
VICODI
##################################################

Exam

In [20]:
def load_and_show_stats(kb):
    """
    Load the learning problems of a KB and print the split sizes per target concept length.

    The data are read from ./Datasets/<kb>/Train_data/Data.json, turned into a list of
    (key, value) pairs and passed through Experiment.remove_minority_problem_types.
    For every value of "target concept length" the printed dictionary holds the rounded
    72% (train), 8% (valid) and 20% (test) shares of the number of remaining problems.

    args:
    ------
    -kb: name of the KB folder under ./Datasets

    returns:
    ------
    -the list of (key, value) pairs returned by Experiment.remove_minority_problem_types
    """
    from collections import Counter
    with open("./Datasets/"+kb+"/Train_data/Data.json", "r") as file:
        problems = json.load(file)
    problems = Experiment.remove_minority_problem_types(list(problems.items()))
    length_counts = Counter(value["target concept length"] for _, value in problems)

    split_sizes = {
        length: {
            "train": int(round(0.9*0.8*total, 0)),
            "valid": int(round(0.1*0.8*total, 0)),
            "test": int(round(0.2*total, 0)),
        }
        for length, total in length_counts.items()
    }
    print(split_sizes)
    return problems

### Example of how to use CLPDataLoader

In [2]:
# Folder with the triples of the Carcinogenesis KB, handed to the data loader below
triples_path = "./Datasets/carcinogenesis/Triples/"

data_c = load_and_show_stats("carcinogenesis")

# The same random state was used so that we have the same test data as the ones in the paper
_, test_data_c = train_test_split(data_c, test_size=0.2, random_state=123)

path_to_embeddings = "./Embeddings/carcinogenesis/ConEx_entity_embeddings.csv"

# The column "Unnamed: 0" becomes the index (presumably the entity names -- confirm against the CSV)
entity_embeddings = pd.read_csv(path_to_embeddings).set_index("Unnamed: 0")

dataloader = CLPDataLoader({'random_seed': 1, 'path_to_triples': triples_path})

# Stored as data_numeric_c because the next cell reads data_numeric_c[0][0];
# with only data_numeric defined that cell fails on a fresh kernel
data_numeric_c = dataloader.load(entity_embeddings, data=test_data_c, shuffle=False)
data_numeric = data_numeric_c  # previous name, kept as an alias

Loading data...: 100%|██████████| 1548/1548 [03:55<00:00,  6.57it/s]


In [36]:
# Display the first element of the first item of the loaded data (shown below as a 2-D tensor)
data_numeric_c[0][0]

tensor([[ 0.7452,  1.1436,  0.9549,  ...,  1.0308,  1.1996,  1.0000],
        [ 0.5873,  0.4626,  1.0056,  ...,  1.2301,  1.3779,  1.0000],
        [ 0.1044, -0.1323,  0.9416,  ...,  1.5694, -0.4399,  1.0000],
        ...,
        [ 0.7259, -0.1510,  0.2183,  ...,  1.1348, -0.2394, -1.0000],
        [ 0.1590,  0.6640,  0.3740,  ...,  1.4481,  0.1498, -1.0000],
        [ 0.0201, -0.3654,  0.2360,  ...,  1.2629,  0.1151, -1.0000]])

In [4]:
import matplotlib.pyplot as plt
import json

In [27]:
def plot_acc_curves(plt_data1, plt_data2, plt_data3, plt_data4, name1, name2, name3, name4, key, mode, out_file_name, y_axis):
    """
    Plot the curves of four KBs in one row of panels with a shared y-axis and save the figure as PDF.

    args:
    ------
    -plt_data1..plt_data4: plot data for each KB; the curves of a panel are read from plt_data[mode][key]
    -name1..name4: name of KBs (Vicodi, Mutagenesis, Carcinogenesis or Semantic Bible), used as panel titles
    -key: acc or loss
    -mode: train or val
    -out_file_name: file name for saving figures (without extension)
    -y_axis: label of the y-axis (set on the first panel)
    """
    Markers = ['--', ':', '2', '-']
    Colors = ['g', 'b', 'm', 'c']
    # NOTE(review): the legend assumes the curves are stored in this order -- confirm against the plot data
    legend_labels = ('GRU', 'LSTM', 'CNN', 'MLP')
    # Marker size of each panel, unchanged from the previous version of this function
    marker_sizes = (6, 6, 10, 10)

    fig, axes = plt.subplots(1, 4, figsize=(20,5), sharey=True)
    all_plt_data = (plt_data1, plt_data2, plt_data3, plt_data4)
    names = (name1, name2, name3, name4)

    for ax, plt_data, name, marker_size in zip(axes, all_plt_data, names, marker_sizes):
        # The style index restarts in every panel, so the n-th curve always gets the same
        # format and colour even if a panel does not contain exactly four curves
        for i, crv in enumerate(plt_data[mode][key]):
            ax.plot(crv, Markers[i % len(Markers)], markersize=marker_size, color=Colors[i % len(Colors)])
        leg = ax.legend(legend_labels, prop={'size': 20})
        for line in leg.get_lines():
            line.set_linewidth(4.0)
        ax.set_title(name, fontsize=30, fontweight="bold")
        ax.set_xlabel('Epochs', fontsize=25)
        ax.tick_params(axis='both', which='major', labelsize=20)
    # The y-axis is shared, so only the first panel gets the label
    axes[0].set_ylabel(y_axis, fontsize=25)

    for ax in fig.get_axes():
        ax.label_outer()
    fig.savefig(out_file_name+".pdf", bbox_inches='tight')
    fig.show()

In [22]:
# Read the stored plot data of every KB, in the order of the names unpacked below
loaded_plot_data = []
for kb_name in ("carcinogenesis", "mutagenesis", "semantic_bible", "vicodi"):
    with open("./Datasets/" + kb_name + "/Plot_data/plot_data_with_val.json") as file:
        loaded_plot_data.append(json.load(file))

plot_data_carci, plot_data_mut, plot_data_sem, plot_data_vic = loaded_plot_data

In [28]:
# Training accuracy curves of the four KBs, saved as train-acc.pdf
plot_acc_curves(
    plot_data_carci, plot_data_mut, plot_data_sem, plot_data_vic,
    "Carcinogenesis", "Mutagenesis", "Semantic Bible", "Vicodi",
    "acc", "train", "train-acc", "Accuracy",
)

In [29]:
# Validation accuracy curves of the four KBs, saved as val-acc.pdf
plot_acc_curves(
    plot_data_carci, plot_data_mut, plot_data_sem, plot_data_vic,
    "Carcinogenesis", "Mutagenesis", "Semantic Bible", "Vicodi",
    "acc", "val", "val-acc", "Accuracy",
)

In [30]:
# Training loss curves of the four KBs, saved as train-loss.pdf
plot_acc_curves(
    plot_data_carci, plot_data_mut, plot_data_sem, plot_data_vic,
    "Carcinogenesis", "Mutagenesis", "Semantic Bible", "Vicodi",
    "loss", "train", "train-loss", "Loss",
)

In [31]:
# Validation loss curves of the four KBs, saved as val-loss.pdf
plot_acc_curves(
    plot_data_carci, plot_data_mut, plot_data_sem, plot_data_vic,
    "Carcinogenesis", "Mutagenesis", "Semantic Bible", "Vicodi",
    "loss", "val", "val-loss", "Loss",
)

### Vicodi, Semantic Bible and Mutagenesis KBs

In [21]:
# Load the Vicodi learning problems and print their train/valid/test sizes per target concept length
data_v = load_and_show_stats("vicodi")

{3: {'train': 3952, 'valid': 439, 'test': 1098}, 5: {'train': 2498, 'valid': 278, 'test': 694}, 6: {'train': 335, 'valid': 37, 'test': 93}, 7: {'train': 3597, 'valid': 400, 'test': 999}, 8: {'train': 747, 'valid': 83, 'test': 207}}


In [24]:
# Load the Semantic Bible learning problems and print their train/valid/test sizes per target concept length
data_s = load_and_show_stats("semantic_bible")

{1: {'train': 33, 'valid': 4, 'test': 9}, 3: {'train': 487, 'valid': 54, 'test': 135}, 5: {'train': 546, 'valid': 61, 'test': 152}, 7: {'train': 162, 'valid': 18, 'test': 45}, 9: {'train': 73, 'valid': 8, 'test': 20}, 11: {'train': 41, 'valid': 5, 'test': 11}}


In [25]:
# Load the Mutagenesis learning problems and print their train/valid/test sizes per target concept length
data_m = load_and_show_stats("mutagenesis")

{3: {'train': 1038, 'valid': 115, 'test': 288}, 5: {'train': 1156, 'valid': 128, 'test': 321}, 7: {'train': 1310, 'valid': 146, 'test': 364}}


### Statistics on the predicted lengths per dataset 

In [11]:
import json

# (folder name under Datasets/, name printed in the report) of every KB, in report order
KB_RESULT_FOLDERS = [
    ('carcinogenesis', 'Carcinogenesis'),
    ('mutagenesis', 'Mutagenesis'),
    ('semantic_bible', 'Semantic Bible'),
    ('vicodi', 'Vicodi'),
]

# The same report is produced for every KB; the loop runs at top level so that the
# variables below stay available to later cells (holding the values of the last KB)
for kb_folder, kb_title in KB_RESULT_FOLDERS:
    results_path = 'Datasets/' + kb_folder + '/Results/'

    # Results stored in *_celoe_clp.json (presumably CELOE combined with the length predictor -- confirm)
    with open(results_path + 'concept_learning_results_celoe_clp.json') as file:
        clip_data = json.load(file)

    # Results stored in *_celoe.json
    with open(results_path + 'concept_learning_results_celoe.json') as file:
        celoe_data = json.load(file)

    F1_clip = clip_data['F-measure']
    F1_celoe = celoe_data['F-measure']
    Lengths_clip = clip_data['Pred-Length']
    Lengths_celoe = celoe_data['Length']

    print('\n######### ' + kb_title + ' ########')
    print('Range: ', (min(Lengths_clip), max(Lengths_clip)))
    longer_is_better = 0
    pruned = 0
    for i in range(len(F1_celoe)):
        # Lower F-measure than CELOE, and the CELOE concept is longer than the predicted length
        if F1_clip[i] < F1_celoe[i] and Lengths_celoe[i] > Lengths_clip[i]:
            longer_is_better += 1
        # Higher F-measure than CELOE
        elif F1_clip[i] > F1_celoe[i]:
            pruned += 1
    print('Longer concept would have been better: ', longer_is_better)
    print('Successfully pruned: ', pruned)


######### Carcinogenesis ########
Range:  (3, 5)
Longer concept would have been better:  0
Successfully pruned:  41

######### Mutagenesis ########
Range:  (3, 7)
Longer concept would have been better:  2
Successfully pruned:  23

######### Semantic Bible ########
Range:  (1, 11)
Longer concept would have been better:  14
Successfully pruned:  20

######### Vicodi ########
Range:  (3, 7)
Longer concept would have been better:  0
Successfully pruned:  75
