# Generating splits for BioKG

In [None]:
import pykeen
import pandas as pd
import numpy as np
import torch

In [None]:
# Load the BioKG triples from a tab-separated file. Later cells read the
# columns 'left', 'property' and 'right' from this frame.
biokg_data = pd.read_csv('data/biokg_no_benchmark.tsv', sep='\t')

In [None]:
# Collect every distinct entity that occurs as either the left or the right
# side of a triple. The position in this list becomes the entity's integer id,
# so the order must be reproducible: use first-appearance order (left column
# first, then right) rather than iterating a set, whose order for string
# labels can change from one interpreter session to the next.
unique_entities = list(
    pd.unique(pd.concat([biokg_data['left'], biokg_data['right']], ignore_index=True))
)
len(unique_entities)

In [None]:
# Distinct relation labels taken from the 'property' column; the position in
# this array is used as the relation's integer id in later cells.
unique_relations = biokg_data['property'].unique()
unique_relations

In [None]:
# Number the entities by list position: integer id -> entity label.
unique_entities_mapping = dict(enumerate(unique_entities))

In [None]:
# Swap keys and values of the mapping (id -> label becomes label -> id).
# NOTE: running this cell a second time swaps the mapping back again.
unique_entities_mapping = dict(
    zip(unique_entities_mapping.values(), unique_entities_mapping.keys())
)

In [None]:
# Display the entity mapping for a visual sanity check.
unique_entities_mapping

In [None]:
# Number the relations by array position: integer id -> relation label.
unique_relations_mapping = dict(enumerate(unique_relations))

In [None]:
# Swap keys and values of the mapping (id -> label becomes label -> id).
# NOTE: running this cell a second time swaps the mapping back again.
unique_relations_mapping = dict(
    zip(unique_relations_mapping.values(), unique_relations_mapping.keys())
)

In [None]:
# Display the relation mapping for a visual sanity check.
unique_relations_mapping

In [None]:
# Encode the three triple columns in place, replacing each label with its
# integer id. A label missing from the corresponding mapping raises KeyError.
# NOTE: the columns are overwritten, so re-running this cell on the already
# encoded frame will look up ids instead of labels and typically fail.
biokg_data['left'] = biokg_data['left'].apply(unique_entities_mapping.__getitem__)

biokg_data['property'] = biokg_data['property'].apply(unique_relations_mapping.__getitem__)

biokg_data['right'] = biokg_data['right'].apply(unique_entities_mapping.__getitem__)

In [None]:
# Display the frame after the label -> id replacement above.
biokg_data

In [None]:
# To numpy
biokg_data_for_pykeen = biokg_data.to_numpy()

# To Tensor
biokg_data_for_pykeen = torch.from_numpy(biokg_data_for_pykeen)

In [None]:
# Inspect the tensor shape: one row per row of biokg_data, one column per
# DataFrame column.
biokg_data_for_pykeen.shape

In [None]:
# Display the id tensor for a visual sanity check.
biokg_data_for_pykeen

In [None]:
# Peek at 20 consecutive rows (positions 3540 to 3559) of the encoded frame.
# NOTE: despite its name this is only a preview slice; it is not the test
# split produced further down and is not used by the cells visible here.
test_data = biokg_data.iloc[3540:3560]
test_data

In [None]:
from pykeen.triples.triples_factory import CoreTriplesFactory

In [None]:
# Build an id-based triples factory from the integer tensor; the entity and
# relation counts come from the label collections built earlier.
biokg_factory = CoreTriplesFactory(mapped_triples=biokg_data_for_pykeen,
                                   num_entities=len(unique_entities),
                                   num_relations=len(unique_relations))

In [None]:
# Attach the label -> id dictionaries to the factory; the labelled factory
# returned by with_labels replaces the id-only one.
biokg_factory = biokg_factory.with_labels(entity_to_id=unique_entities_mapping,
                         relation_to_id=unique_relations_mapping)

In [None]:
# Check that the factory exposes the relation mapping that was passed in.
biokg_factory.relation_to_id

In [None]:
# Set the ratios for the splits.
split_ratio = [0.7, 0.15, 0.15] # 70% train, 15% each for val/test

In [None]:
# Split the triples into train / validation / test factories. A fixed
# random_state is passed so the same splits are produced on every run;
# without it a fresh seed is drawn each time and the saved splits cannot be
# regenerated.
training_factory, validation_factory, testing_factory = biokg_factory.split(
    ratios=split_ratio,
    random_state=42,
)

In [None]:
# Display the three factories to compare the resulting split sizes.
training_factory, validation_factory, testing_factory

### Save the splits 

In [None]:
# Output location for the saved splits. The trailing slash is required:
# the save cell appends the file names by plain string concatenation.
path = 'data/splits/'

In [None]:
# Persist each split under the output location, one target per split.
training_factory.to_path_binary(f'{path}train.pt')

validation_factory.to_path_binary(f'{path}validation.pt')

testing_factory.to_path_binary(f'{path}test.pt')