In [1]:
from helper_classes import PYKE
from helper_classes import Parser
from helper_classes import DataAnalyser
from helper_classes import PPMI

import util as ut
import numpy as np

In [2]:
# Fix the seed of NumPy's legacy global random generator so that code
# drawing from np.random behaves the same way on every run.
random_state = 1
np.random.seed(seed=random_state)

# Learning DBpedia Embeddings with PYKE

1. Download the following files: 
+ http://downloads.dbpedia.org/2016-10/core/skos_categories_en.ttl.bz2
+ http://downloads.dbpedia.org/2016-10/core/instance_types_en.ttl.bz2
+ http://downloads.dbpedia.org/2016-10/core/mappingbased_objects_en.ttl.bz2  

2. Place the downloaded files under KGs/DBpedia.

In [3]:
# MODEL PARAMETERS
# K             -> handed to Parser(k=K) when the parser is built
# num_of_dims   -> number of dimensions of the randomly initialised embedding space
# bound_on_iter -> passed to the learner as max_iteration
K, num_of_dims, bound_on_iter = 45, 50, 30

# omega     -> passed to the learner as omega
# e_release -> passed to the learner as energy_release_at_epoch
omega, e_release = 0.45557, 0.0414

In [4]:
# Folder that holds the downloaded DBpedia dump files.
kg_root = 'KGs/DBpedia'
# Same folder with a trailing slash; this is the value handed to the parser.
kg_path = '{}/'.format(kg_root)

In [5]:
# Create the folder for this experiment; storage_path is handed to the parser
# and the analyser as their working folder (p_folder).
storage_path, experiment_folder = ut.create_experiment_folder()

# The parser is configured with K and uses PPMI as its similarity measure.
parser = Parser(k=K, p_folder=storage_path)
parser.set_similarity_measure(PPMI)

model = PYKE()
analyser = DataAnalyser(p_folder=storage_path)

# For illustration purposes, only the first 5000 n-triples of each input file
# are processed. To reproduce the reported results, run without the bound:
#     parser.pipeline_of_preprocessing(kg_path)
triples_per_file = 5000
holder = parser.pipeline_of_preprocessing(kg_path, bound=triples_per_file)



###### Preprocessing  starts ######


###### Constructing Inverted Index  starts ######
Number of RDF triples: 15000
Number of vocabulary terms:  11884
Number of subjects:  5620
Constructing Inverted Index  took  0.3225545883178711  seconds



###### Calculation of PPMIs  starts ######
Calculation of PPMIs  took  0.9045906066894531  seconds

Preprocessing  took  1.2310194969177246  seconds



In [6]:
# One embedding row per entry of the preprocessing result.
vocab_size = len(holder)

# Random starting point with num_of_dims dimensions per vocabulary entry.
embeddings = ut.randomly_initialize_embedding_space(vocab_size, num_of_dims)

learned_embeddings = model.pipeline_of_learning_embeddings(e=embeddings,
                                                           max_iteration=bound_on_iter,
                                                           energy_release_at_epoch=e_release,
                                                           holder=holder, omega=omega)

# Derive the file name from num_of_dims instead of hard-coding "50", so the
# name always matches the configured dimensionality. With num_of_dims = 50
# this is still '<storage_path>/PYKE_50_embd.csv'.
learned_embeddings.to_csv(storage_path + '/PYKE_{}_embd.csv'.format(num_of_dims))

# To use memory efficiently.
# NOTE: after these deletions this cell cannot be re-run on its own; the
# preprocessing cell that creates `holder` has to be executed again first.
del holder
del embeddings



###### Generating Embeddings:  starts ######
EPOCH:  0
EPOCH:  1
EPOCH:  2
EPOCH:  3
EPOCH:  4
EPOCH:  5
EPOCH:  6
EPOCH:  7
EPOCH:  8
EPOCH:  9
EPOCH:  10
EPOCH:  11
EPOCH:  12
EPOCH:  13
EPOCH:  14
EPOCH:  15
EPOCH:  16
EPOCH:  17
EPOCH:  18
EPOCH:  19
EPOCH:  20
EPOCH:  21
EPOCH:  22
EPOCH:  23
EPOCH:  24

 Epoch:  24
System energy: -0.03499999999999983
Generating Embeddings:  took  26.945087432861328  seconds



In [7]:
# Load the 'type_info' object that was serialized into the experiment folder.
type_info = ut.deserializer(serialized_name='type_info', path=storage_path)
# Number of entries; denoted as \mathcal{S} in the paper.
len(type_info)

5620

In [8]:
# get the index of objects / get type information =>>> s #type o
all_types = sorted(set.union(*list(type_info.values())))
len(all_types)# denoted as C in the paper

185

In [9]:
# Load the serialized vocabulary and print the entry stored for every type
# index, one per line.
vocabulary = ut.deserializer(serialized_name='vocabulary', path=storage_path)
for type_index in all_types:
    print(vocabulary[type_index])

http://www.w3.org/2002/07/owl#Thing
http://dbpedia.org/ontology/Disease
http://dbpedia.org/ontology/AdministrativeRegion
http://dbpedia.org/ontology/OfficeHolder
http://dbpedia.org/ontology/TimePeriod
http://dbpedia.org/ontology/Book
http://dbpedia.org/ontology/Award
http://dbpedia.org/ontology/Film
http://dbpedia.org/ontology/Person
http://dbpedia.org/ontology/PersonFunction
http://dbpedia.org/ontology/Scientist
http://dbpedia.org/ontology/Philosopher
http://dbpedia.org/ontology/Writer
http://dbpedia.org/ontology/Country
http://dbpedia.org/ontology/TennisPlayer
http://dbpedia.org/ontology/Song
http://dbpedia.org/ontology/Sound
http://dbpedia.org/ontology/Organisation
http://dbpedia.org/ontology/ArtificialSatellite
http://dbpedia.org/ontology/WrittenWork
http://dbpedia.org/ontology/Continent
http://dbpedia.org/ontology/BodyOfWater
http://dbpedia.org/ontology/MilitaryUnit
http://dbpedia.org/ontology/MilitaryPerson
http://dbpedia.org/ontology/AcademicJournal
http://dbpedia.org/ontology/M

# Type predictions

In [12]:
# Evaluate cluster quality of the learned embeddings. According to the
# recorded output, this pseudo-labels the data via HDBSCAN and reports the
# mean cluster purity. It is by far the slowest cell of the notebook
# (about 940 seconds in the recorded run).
analyser.perform_clustering_quality(learned_embeddings)



###### Cluster Quality  starts ######


###### Pseudo labeling via HDBSCAN  starts ######
Pseudo labeling via HDBSCAN  took  0.4727797508239746  seconds

##### CLUSTER 0  #####
##### CLUSTER 5  #####
##### CLUSTER -1  #####
##### CLUSTER 1  #####
##### CLUSTER 4  #####
##### CLUSTER 3  #####
##### CLUSTER 2  #####
Mean of cluster purity 0.6182767433469329
Cluster Quality  took  940.7444860935211  seconds



In [11]:
# Evaluate type prediction on the learned embeddings. According to the
# recorded output, a mean type prediction score is printed for each of the
# K values [1, 3, 5, 10, 15, 30, 50, 100].
analyser.perform_type_prediction(learned_embeddings)



###### Type Prediction  starts ######
K values: [1, 3, 5, 10, 15, 30, 50, 100]
##### 1 ####
Mean type prediction [0.97811388]
##### 3 ####
Mean type prediction [0.70523623]
##### 5 ####
Mean type prediction [0.58358647]
##### 10 ####
Mean type prediction [0.58121887]
##### 15 ####
Mean type prediction [0.42255307]
##### 30 ####
Mean type prediction [0.32960913]
##### 50 ####
Mean type prediction [0.23986849]
##### 100 ####
Mean type prediction [0.19941738]
Type Prediction  took  8.651156663894653  seconds

