In [1]:
from helper_classes import PYKE
from helper_classes import Parser
from helper_classes import DataAnalyser
from helper_classes import PPMI

import util as ut
import numpy as np

In [2]:
# Seed NumPy's global RNG so that repeated runs draw the same random numbers.
# NOTE(review): presumably this is what makes ut.randomly_initialize_embedding_space
# reproducible - confirm that it draws from np.random.
random_state = 1
np.random.seed(random_state)

# Learning Drugbank Embeddings with PYKE

1. Download drugbank.nq.gz from http://download.bio2rdf.org/#/release/4/drugbank/
2. Extract drugbank.nq and place the file under KGs/Drugbank.

In [3]:
# DEFINE MODEL PARAMS
K = 45  # passed to Parser as its `k` argument during preprocessing
num_of_dims = 50  # second argument of ut.randomly_initialize_embedding_space
bound_on_iter = 30  # passed as `max_iteration` to the PYKE learning pipeline
omega = 0.45557  # passed as `omega` to the PYKE learning pipeline
e_release = 0.0414  # passed as `energy_release_at_epoch` to the PYKE learning pipeline

In [4]:
# Folder that holds the extracted Drugbank dump (see the download steps above).
kg_root = 'KGs/Drugbank'
# Same folder with a trailing slash; this is the path handed to the parser.
kg_path = kg_root + '/'

In [6]:
# Drugbank is serialized in N-Quads format, so override util's `triple` setting with 4.
# NOTE(review): presumably 4 is the number of terms per statement (subject, predicate,
# object, graph) that the parser should expect - confirm in util.
ut.triple=4

In [7]:
# Create the folder for this experiment; storage_path is handed to Parser and
# DataAnalyser below as their working folder (p_folder).
storage_path, experiment_folder = ut.create_experiment_folder()

# Parser that preprocesses the KG; K is passed as its `k` parameter.
parser = Parser(p_folder=storage_path, k=K)

# Use PPMI as the parser's similarity measure.
parser.set_similarity_measure(PPMI)

model = PYKE()

analyser = DataAnalyser(p_folder=storage_path)


# For illustration purposes, one can restrict preprocessing with `bound`
# (here 10000) instead of reading the whole file:
#holder = parser.pipeline_of_preprocessing(kg_path,bound=10000)
# To reproduce the reported results, preprocess the complete KG
# (about 154 seconds in the recorded run).
holder = parser.pipeline_of_preprocessing(kg_path)




###### Preprocessing  starts ######


###### Constructing Inverted Index  starts ######
Number of RDF triples: 3146309
Number of vocabulary terms:  521363
Number of subjects:  421121
Constructing Inverted Index  took  41.95017957687378  seconds



###### Calculation of PPMIs  starts ######
Calculation of PPMIs  took  112.18523621559143  seconds

Preprocessing  took  154.30482602119446  seconds



In [8]:
# Size of the embedding space = number of entries in `holder`.
vocab_size = len(holder)

# Random starting embeddings, parameterised by vocab_size and num_of_dims.
embeddings = ut.randomly_initialize_embedding_space(vocab_size, num_of_dims)

# Run PYKE with at most bound_on_iter iterations (the recorded run stopped after epoch 24).
learned_embeddings = model.pipeline_of_learning_embeddings(e=embeddings,
                                                           max_iteration=bound_on_iter,
                                                           energy_release_at_epoch=e_release,
                                                           holder=holder, omega=omega)


# To use memory efficiently: neither object is used again below.
# NOTE: because of these deletions, re-running this cell on its own raises a NameError;
# re-run the preprocessing cell first.
del holder
del embeddings

# To save learned embeddings
#learned_embeddings.to_csv(storage_path + '/PYKE_50_embd.csv')



###### Generating Embeddings:  starts ######
EPOCH:  0
EPOCH:  1
EPOCH:  2
EPOCH:  3
EPOCH:  4
EPOCH:  5
EPOCH:  6
EPOCH:  7
EPOCH:  8
EPOCH:  9
EPOCH:  10
EPOCH:  11
EPOCH:  12
EPOCH:  13
EPOCH:  14
EPOCH:  15
EPOCH:  16
EPOCH:  17
EPOCH:  18
EPOCH:  19
EPOCH:  20
EPOCH:  21
EPOCH:  22
EPOCH:  23
EPOCH:  24

 Epoch:  24
System energy: -0.03499999999999983
Generating Embeddings:  took  691.3569419384003  seconds



### Number of resources with Types

In [9]:
# Load `type_info` from the experiment folder; its length is the number of resources
# that have type information.
type_info = ut.deserializer(path=storage_path, serialized_name='type_info')
len(type_info)  # denoted as \mathcal{S} in the paper

421121

### Number of Types

In [10]:
# Collect every distinct type index that occurs as the object of a "s #type o" statement.
# The values of type_info are sets of type indexes; set().union(...) also copes with an
# empty mapping, where set.union(*[]) would raise a TypeError, and needs no temporary list.
all_types = sorted(set().union(*type_info.values()))
len(all_types)  # denoted as C in the paper.

102

### Types

In [11]:
# Load the vocabulary so that type indexes can be translated back into their IRIs.
vocabulary = ut.deserializer(path=storage_path, serialized_name='vocabulary')
# Print the vocabulary entry of every type index.
for i in all_types:
    print(vocabulary[i])

http://www.w3.org/2002/07/owl#DatatypeProperty
http://www.w3.org/2002/07/owl#ObjectProperty
http://bio2rdf.org/drugbank_vocabulary:Resource
http://www.w3.org/2000/01/rdf-schema#Resource
http://bio2rdf.org/drugbank_vocabulary:Drug
http://www.w3.org/2002/07/owl#Class
http://bio2rdf.org/drugbank_vocabulary:Biotech
http://bio2rdf.org/cas_vocabulary:Resource
http://bio2rdf.org/drugbank_vocabulary:Indication
http://bio2rdf.org/drugbank_vocabulary:Pharmacodynamics
http://bio2rdf.org/drugbank_vocabulary:Mechanism-of-action
http://bio2rdf.org/drugbank_vocabulary:Toxicity
http://bio2rdf.org/drugbank_vocabulary:Absorption
http://bio2rdf.org/drugbank_vocabulary:Half-life
http://bio2rdf.org/drugbank_vocabulary:Route-of-elimination
http://bio2rdf.org/drugbank_vocabulary:Volume-of-distribution
http://bio2rdf.org/drugbank_vocabulary:Clearance
http://bio2rdf.org/drugbank_vocabulary:Group
http://bio2rdf.org/drugbank_vocabulary:Category
http://bio2rdf.org/drugbank_vocabulary:Drug-Classification-Category


# Apply Type Prediction Evaluation

In [12]:
# Type prediction over all resources with type information; the recorded run below
# took about 7670 seconds, so expect this cell to be slow on the full KG.
analyser.perform_type_prediction(learned_embeddings)



###### Type Prediction  starts ######
K values: [1, 3, 5, 10, 15, 30, 50, 100]
##### 1 ####
Mean type prediction [0.60998426]
##### 3 ####
Mean type prediction [0.59381187]
##### 5 ####
Mean type prediction [0.56270888]
##### 10 ####
Mean type prediction [0.51363177]
##### 15 ####
Mean type prediction [0.48338622]
##### 30 ####
Mean type prediction [0.43191687]
##### 50 ####
Mean type prediction [0.3957968]
##### 100 ####
Mean type prediction [0.35052167]
Type Prediction  took  7669.856120109558  seconds



#  Implementation of the type prediction scenario
```python
@performance_debugger('Type Prediction')
def perform_type_prediction(self, df):
    """
    Evaluate embeddings by predicting the types of a resource from its nearest neighbours.

    For every resource that has type information, the union of the types of its k
    nearest neighbours (Euclidean distance, searched only among typed resources) is
    compared with the resource's own types via the cosine similarity of two binary
    type vectors. The mean similarity is printed for each k in
    [1, 3, 5, 10, 15, 30, 50, 100].

    :param df: DataFrame of embeddings whose index contains the keys of the
               serialized `type_info` mapping (rows are selected with `df.loc`).
    :return: None; the results are printed.
    """
    # get the types. Mapping from the index of subject to the index of object
    type_info = ut.deserializer(path=self.p_folder, serialized_name='type_info')

    # get the index of objects / get type information =>>> s #type o
    all_types = sorted(set().union(*type_info.values()))
    # Position of every type in all_types; replaces a linear list.index() per lookup.
    type_position = {t: position for position, t in enumerate(all_types)}

    def create_binary_type_vector(t_types):
        # 1 at the position of every type in t_types, 0 elsewhere.
        vector = np.zeros(len(all_types))
        vector[[type_position[t] for t in t_types]] = 1
        return vector

    def create_binary_type_prediction_vector(t_types):
        # t_types is an iterable of type sets (one per neighbour). The vector is binary:
        # a type predicted by several neighbours still yields 1. The previous
        # `vector[i] += 1` behaved the same way, because NumPy does not accumulate
        # repeated positions in a fancy-indexed in-place addition.
        vector = np.zeros(len(all_types))
        vector[[type_position[t] for t in itertools.chain.from_iterable(t_types)]] = 1
        return vector

    # Consider only points with type infos.
    e_w_types = df.loc[list(type_info.keys())]
    entity_ids = e_w_types.index.values

    k_values = [1, 3, 5, 10, 15, 30, 50, 100]

    # One neighbour more than the largest k, since each point is returned as its own
    # nearest neighbour; capped so that small inputs do not make kneighbors fail.
    n_neighbors = min(max(k_values) + 1, len(e_w_types))
    neigh = NearestNeighbors(n_neighbors=n_neighbors, algorithm='kd_tree', metric='euclidean',
                             n_jobs=-1).fit(e_w_types)

    # Get similarity results for selected entities (row positions within e_w_types).
    neighbour_positions = neigh.kneighbors(e_w_types, return_distance=False)

    # As sklearn implementation of kneighbors returns the point itself as most similar
    # point, drop the first column, and map the remaining row positions back to the
    # original vocabulary indexes in a single vectorised lookup.
    # NOTE(review): with exactly duplicated embeddings the first hit is presumably not
    # guaranteed to be the query point itself - confirm if duplicates can occur.
    neighbour_ids = entity_ids[neighbour_positions[:, 1:]]

    print('K values:', k_values)
    for k in k_values:
        print('#####', k, '####')
        similarities = list()
        for entity, neighbours in zip(entity_ids, neighbour_ids):
            vector_true = create_binary_type_vector(type_info[entity])
            vector_prediction = create_binary_type_prediction_vector(
                type_info[neighbour] for neighbour in neighbours[:k])

            # cosine() is a distance, hence 1 - distance gives the similarity.
            sim = cosine(vector_true, vector_prediction)
            similarities.append(1 - sim)

        report = pd.DataFrame(similarities)
        print('Mean type prediction', report.mean().values)
            
```

# Apply Cluster Purity Evaluation

In [None]:

# Cluster-purity evaluation of the learned embeddings (no output was recorded for
# this cell in the saved notebook).
analyser.perform_clustering_quality(learned_embeddings)

#  Implementation of the cluster purity scenario

```python

@performance_debugger('Cluster Quality')
def perform_clustering_quality(self, df):
    """
    Evaluate embeddings by the type purity of the clusters found in them.

    Resources with type information are clustered via self.pseudo_label_HDBSCAN.
    The purity of a cluster is the mean cosine similarity between the binary type
    vectors of all ordered pairs of its members (a member paired with itself
    included). The mean purity over all clusters is printed.

    :param df: DataFrame of embeddings whose index contains the keys of the
               serialized `type_info` mapping (rows are selected with `df.loc`).
    :return: None; the results are printed.
    """
    type_info = ut.deserializer(path=self.p_folder, serialized_name='type_info')

    # get all unique types, i.e. all o : (s,#type,o) \in KG
    all_types = sorted(set().union(*type_info.values()))
    # Position of every type in all_types; replaces a linear list.index() per lookup.
    type_position = {t: position for position, t in enumerate(all_types)}

    def create_binary_type_vector(t_types):
        # 1 at the position of every type in t_types, 0 elsewhere.
        vector = np.zeros(len(all_types))
        vector[[type_position[t] for t in t_types]] = 1
        return vector

    # get only those resources that have type information
    df_only_subjects = df.loc[list(type_info.keys())]

    # Apply clustering
    # NOTE(review): every distinct value of `labels` is scored as a cluster. If
    # pseudo_label_HDBSCAN keeps HDBSCAN's -1 label for noise points, the noise group is
    # scored as well - confirm whether that is intended.
    df_only_subjects = self.pseudo_label_HDBSCAN(df_only_subjects, min_cluster_size=26, min_samples=29)

    clusters = pd.unique(df_only_subjects.labels)

    sum_purity = 0
    for c in clusters:

        valid_indexes_in_c = df_only_subjects[df_only_subjects.labels == c].index.values

        print('##### CLUSTER', c, ' #####')

        # The sum of cosine similarities over all ordered pairs (i, j) equals the squared
        # norm of the sum of the unit-length type vectors:
        #   sum_ij cos(v_i, v_j) = (sum_i v_i/|v_i|) . (sum_j v_j/|v_j|)
        # This needs one pass over the cluster instead of a quadratic pairwise loop.
        # A member with an empty type set yields NaN here.
        sum_of_unit_vectors = np.zeros(len(all_types))
        for i in valid_indexes_in_c:
            # type_info[i] returns a set of indexes
            vector_type_i = create_binary_type_vector(type_info[i])
            sum_of_unit_vectors += vector_type_i / np.linalg.norm(vector_type_i)

        sum_of_cosines = np.dot(sum_of_unit_vectors, sum_of_unit_vectors)

        purity = sum_of_cosines / (len(valid_indexes_in_c) ** 2)

        sum_purity += purity

    mean_of_scores = sum_purity / len(clusters)
    print('Mean of cluster purity', mean_of_scores)

```