In [4]:
from sklearn.model_selection import train_test_split

In [13]:
import json
from tqdm import tqdm
from collections import defaultdict

In [7]:
def Degree(file_name):
    with open("Evaluation/"+file_name+".txt", 'r') as file:
        data = file.readlines()
    degree = defaultdict(lambda: 0)
    for triple in data:
        e1, r, e2 = triple.strip("\n").strip(" .").split("\t")
        degree[e1] += 1
        degree[e2] += 1
    return degree

In [16]:
def get_entities(triples):
    entities = set()
    for l in triples:
        e1, _, e2 = l.split("\t")
        entities.update({e1, e2})
    return entities

def get_relations(triples):
    relations = set()
    for l in triples:
        relations.add(l.split('\t')[1])
    return relations

def filter_triples(triples, train, train_entities, train_relations):
    remaining, removed = [], []
    for l in triples: # Ony keep triples whose entities and relations are in the training split
        e1, r, e2 = l.split('\t')
        if e1 in train_entities and e2 in train_entities and r in train_relations:
            remaining.append(l)
        else:
            removed.append(l)
    return train+removed, remaining

def write_to_file(storage_path, data):
    with open(storage_path, 'w') as file:
        file.writelines(data)

def split_data(file_name, low_degree=3):
    with open("Evaluation/"+file_name+".txt", 'r') as file:
        data = file.readlines()
        print(f"\nDataset size: {len(data)} triples")
    
    ## Remove low degree entities
    degrees = Degree(file_name)
    new_data = []
    for triple in tqdm(data, desc="Removing entities with degree less than 3..."):
        e1, _, e2 = triple.strip("\n").strip(" .").split("\t")
        if degrees[e1] >= low_degree and degrees[e2] >= low_degree:
            new_data.append(triple)
    del data
    data = new_data
    print(f"\nDataset size without low degree entities: {len(data)} triples\n")
    train, temp_test = train_test_split(data, test_size=0.2, random_state=42)
    train_entities = get_entities(train)
    train_relations = get_relations(train)
    
    train, test = filter_triples(temp_test, train, train_entities, train_relations)
    
    print(f"\nStatistics train: {100*float(len(train))/len(data)}%,  test: {100*float(len(test))/len(data)}%")
    storage_paths = [f'Evaluation/splits/{file_name}_train.txt', f'Evaluation/splits/{file_name}_test.txt']
    for path, d in zip(storage_paths, [train, test]):
        write_to_file(path, d)
    print('\nData split completed!')
    

def split_merge(Wiki_triples_train, DBpedia_triples_train):
    with open(f"Evaluation/splits/{Wiki_triples_train}.txt") as file2:
        data_Wiki_train = file2.readlines()
    with open(f"Evaluation/splits/{DBpedia_triples_train}.txt") as file4:
        data_DBp_train = file4.readlines()
    with open("Evaluation/splits/Merged_train.txt", "w") as file_merge_train:
        file_merge_train.writelines(data_Wiki_train+data_DBp_train)
    print("\n Merge split completed!\n")
    print(f"Train: {len(data_Wiki_train)+len(data_DBp_train)}")

In [14]:
#split_data("Wikidata")


Dataset size: 1466131 triples


Removing entities with degree less than 3...: 100%|██████████| 1466131/1466131 [00:01<00:00, 849151.53it/s]



Dataset size without low degree entities: 289575 triples


Statistics train: 81.43451610118277%,  test: 18.56548389881723%

Data split completed!


In [15]:
#split_data("DBpedia")


Dataset size: 275170 triples


Removing entities with degree less than 3...: 100%|██████████| 275170/275170 [00:00<00:00, 606946.30it/s]



Dataset size without low degree entities: 85041 triples


Statistics train: 81.92166131630626%,  test: 18.078338683693747%

Data split completed!


In [27]:
#split_merge("Wikidata_train","DBpedia_train")


 Merge split completed!

Train: 305481


In [17]:
with open("Evaluation/splits/DBpedia_train.txt") as file:
    db_train = file.readlines()
Ents = set()
for triple in db_train:
    e1, r, e2 = triple.strip("\n").strip(" .").split("\t")
    Ents.update({e1, e2})


In [21]:
sameAs = list(filter(lambda x: '/entity/' in x, Ents))

In [23]:
len(sameAs)

22102

In [24]:
len(Ents)

31116

In [25]:
with open("Evaluation/splits/Wikidata_train.txt") as file:
    db_train = file.readlines()
Ents = set()
for triple in db_train:
    e1, r, e2 = triple.strip("\n").strip(" .").split("\t")
    Ents.update({e1, e2})


In [26]:
len(Ents)

72058

In [36]:
import numpy as np

In [28]:
from collections import defaultdict

In [39]:
def degree(kg):
    with open(f"Evaluation/splits/{kg}/train.txt") as file:
        data_train = file.readlines()
    print("***Train*** Number of triples: ", len(data_train))
    degree_train = defaultdict(lambda: 0)
    for triple in data_train:
        e1, r, e2 = triple.strip("\n").strip(" .").split("\t")
        degree_train[e1] += 1
        degree_train[e2] += 1
    
    with open(f"Evaluation/splits/{kg}/test.txt") as file:
        data_test = file.readlines()
    if not "Merge" in kg:
        print("***Test*** Number of triples: ", len(data_test))
    degree_test = defaultdict(lambda: 0)
    for triple in data_test:
        e1, r, e2 = triple.strip("\n").strip(" .").split("\t")
        degree_test[e1] += 1
        degree_test[e2] += 1
    return degree_train, degree_test

In [42]:
def kg_size(kg):
    with open(f"Evaluation/splits/{kg}/train.txt") as file:
        data_train = file.readlines()
    with open(f"Evaluation/splits/{kg}/test.txt") as file:
        data_test = file.readlines()
    E = set()
    R = set()
    for triple in data_train:
        e1, r, e2 = triple.strip("\n").strip(" .").split("\t")
        E.update({e1, e2})
        R.add(r)
    print(f"***Train*** #Entities: {len(E)}, #Relations: {len(R)}")
    
    E = set()
    R = set()
    for triple in data_test:
        e1, r, e2 = triple.strip("\n").strip(" .").split("\t")
        E.update({e1, e2})
        R.add(r)
    if not "Merge" in kg:
        print(f"***Test*** #Entities: {len(E)}, #Relations: {len(R)}")

In [38]:
degrees_train, degrees_test = degree("DBpedia")
kg_size("DBpedia")
train_avg_dg = np.array(list(degrees_train.values())).mean()
test_avg_dg = np.array(list(degrees_test.values())).mean()
print("Train avg. degree: ", train_avg_dg)
print("Test avg. degree: ", test_avg_dg)

***Train*** Number of triples:  69667
***Test*** Number of triples:  15374
***Train*** #Entities: 31116, #Relations: 392
***Test*** #Entities: 15602, #Relations: 279
Train avg. degree:  4.477889188841753
Test avg. degree:  1.970772977823356


In [40]:
degrees_train, degrees_test = degree("Wikidata")
kg_size("Wikidata")
train_avg_dg = np.array(list(degrees_train.values())).mean()
test_avg_dg = np.array(list(degrees_test.values())).mean()
print("Train avg. degree: ", train_avg_dg)
print("Test avg. degree: ", test_avg_dg)

***Train*** Number of triples:  235814
***Test*** Number of triples:  53761
***Train*** #Entities: 72058, #Relations: 707
***Test*** #Entities: 41137, #Relations: 465
Train avg. degree:  6.545116433983735
Test avg. degree:  2.6137540413739457


In [43]:
degrees_train, _ = degree("Merge_test_DBpedia")
kg_size("Merge_test_DBpedia")
train_avg_dg = np.array(list(degrees_train.values())).mean()
print("Train avg. degree: ", train_avg_dg)

***Train*** Number of triples:  305481
***Train*** #Entities: 81836, #Relations: 1099
Train avg. degree:  7.465687472505987


In [25]:
def sameAs(kg, split_type="train"):
    with open(f"Evaluation/splits/{kg}/{split_type}.txt") as file:
        data = file.readlines()
    sameAs_ents = set()
    for line in data:
        e1, r, e2 = line.strip("\n").strip(" .").split("\t")
        if "/entity/" in e1:
            sameAs_ents.add(e1)
        if "/entity/" in e2:
            sameAs_ents.add(e2)
    print("Number of entities with sameAs links:", len(sameAs_ents))

In [26]:
sameAs("DBpedia")

Number of entities with sameAs links: 22102


In [27]:
sameAs("DBpedia", "test")

Number of entities with sameAs links: 10471
