# Testing with the Original Embeddings from the CSV file and the Embeddings from the Morgan Fingerprint model

## COVID-19 Drug Repurposing via disease-compounds relations
This example shows how to do drug repurposing using DRKG even with the pretrained model.

In [1]:
import csv
import numpy as np
import pandas as pd
import sys
import torch as th
import torch.nn.functional as fn

import tensorflow as tf
from tensorflow import keras
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdMolDescriptors as rd
from eosce.models import ErsiliaCompoundEmbeddings

2023-12-13 14:57:55.822677: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-13 14:57:56.149083: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-13 14:57:56.149124: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-13 14:57:56.150741: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-13 14:57:56.321514: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-13 14:57:56.322754: I tensorflow/core/platform/cpu_feature_guard.cc:182] This Tens

## Collecting COVID-19 related disease
At the very beginning we need to collect a list of disease of Corona-Virus(COV) in DRKG. We can easily use the Disease ID that DRKG uses for encoding the disease. Here we take all of the COV disease as target.

In [2]:
COV_disease_list = [
'Disease::SARS-CoV2 E',
'Disease::SARS-CoV2 M',
'Disease::SARS-CoV2 N',
'Disease::SARS-CoV2 Spike',
'Disease::SARS-CoV2 nsp1',
'Disease::SARS-CoV2 nsp10',
'Disease::SARS-CoV2 nsp11',
'Disease::SARS-CoV2 nsp12',
'Disease::SARS-CoV2 nsp13',
'Disease::SARS-CoV2 nsp14',
'Disease::SARS-CoV2 nsp15',
'Disease::SARS-CoV2 nsp2',
'Disease::SARS-CoV2 nsp4',
'Disease::SARS-CoV2 nsp5',
'Disease::SARS-CoV2 nsp5_C145A',
'Disease::SARS-CoV2 nsp6',
'Disease::SARS-CoV2 nsp7',
'Disease::SARS-CoV2 nsp8',
'Disease::SARS-CoV2 nsp9',
'Disease::SARS-CoV2 orf10',
'Disease::SARS-CoV2 orf3a',
'Disease::SARS-CoV2 orf3b',
'Disease::SARS-CoV2 orf6',
'Disease::SARS-CoV2 orf7a',
'Disease::SARS-CoV2 orf8',
'Disease::SARS-CoV2 orf9b',
'Disease::SARS-CoV2 orf9c',
'Disease::MESH:D045169',
'Disease::MESH:D045473',
'Disease::MESH:D001351',
'Disease::MESH:D065207',
'Disease::MESH:D028941',
'Disease::MESH:D058957',
'Disease::MESH:D006517'
]

## Treatment relation

Two treatment relations in this context

In [3]:
treatment = ['Hetionet::CtD::Compound:Disease','GNBR::T::Compound:Disease']

## Using Original Embeddings

In [4]:
# Read the input file with the SMILES and original embedding columns
input_df = pd.read_csv('smiles_embeddings_infer_drugs.csv')
drug_smiles = input_df['SMILES']
original_embeddings = input_df.iloc[:, 3:].values
print(original_embeddings)

[[-0.27149197 -0.5939862  -0.37011808 ... -0.50732565  0.15921181
  -0.67021894]
 [-0.4293835  -0.35515165 -0.45263517 ...  0.6304008   0.44173548
  -0.43939406]
 [-0.6724328  -0.2223129  -0.5301088  ...  0.37865484  0.36450392
  -0.3003229 ]
 ...
 [ 0.16583948  0.6799841  -0.5094551  ... -0.6383422  -0.52847636
  -0.60481673]
 [ 0.5265029   0.631999   -0.58314145 ...  0.52151537 -0.53969806
   0.5051691 ]
 [-0.5288267  -0.43796182 -0.6211387  ... -0.58782583 -0.57391274
  -0.54813313]]


In [5]:
gamma = 12.0

def transE_l2(head, rel, tail):
    score = head + rel - tail
    return gamma - th.norm(score, p=2, dim=-1)


def edge_score(embeddings):
    '''Function to calculate the edge scores.

    Argument
    ---------
    embeddings (array). Array of size 400 containing 
            the embeddings of the SMILES molecule.

    Returns
    --------
    scores (tensor). Tensor showing the edge score for 
            each disease based on the drug_embeddings, relation_embeddings,
            and COVID_disease embeddings.
    '''
    
    # Load entity and relation mapping files
    entity_idmap_file = '../data/embed/entities.tsv'
    relation_idmap_file = '../data/embed/relations.tsv'

    # Get drugname/disease name to entity ID mappings
    entity_map = {}
    entity_id_map = {}
    relation_map = {}
    
    with open(entity_idmap_file, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t', fieldnames=['name', 'id'])
        for row_val in reader:
            entity_map[row_val['name']] = int(row_val['id'])
            entity_id_map[int(row_val['id'])] = row_val['name']

    with open(relation_idmap_file, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t', fieldnames=['name', 'id'])
        for row_val in reader:
            relation_map[row_val['name']] = int(row_val['id'])

    # Handle the ID mapping
    # drug_ids = [entity_map[smi] for smi in input_df['SMILES']]
    disease_ids = [entity_map[disease] for disease in COV_disease_list]
    treatment_rid = [relation_map[treat] for treat in treatment]

    # Load embeddings
    entity_emb = np.load('../data/embed/DRKG_TransE_l2_entity.npy')
    rel_emb = np.load('../data/embed/DRKG_TransE_l2_relation.npy')

    disease_ids = th.tensor(disease_ids).long()
    treatment_rid = th.tensor(treatment_rid)

    drug_emb = th.tensor(embeddings)
    treatment_embs = [th.tensor(rel_emb[rid]) for rid in treatment_rid]

    scores_per_disease = []
    for rid in range(len(treatment_embs)):
        treatment_emb = treatment_embs[rid]
        for disease_id in disease_ids:
            disease_emb = entity_emb[disease_id]
            score = fn.logsigmoid(transE_l2(drug_emb, treatment_emb, disease_emb))
            scores_per_disease.append(score)
        
    scores = th.cat(scores_per_disease)
    return scores

In [10]:
# Check edge score
scores = edge_score(original_embeddings)
print(scores)

tensor([-4.3944, -4.2768, -4.0842,  ..., -6.8248, -5.9643, -6.4361],
       dtype=torch.float64)


The output at this point matches that in the original code.

In [11]:
# print(len(scores))

## Using Morgan Fingerprint model predicted embeddings

In [12]:
def calculate_morgan_fingerprint(smiles_series, radius=2, n_bits=2048):
    '''Function to convert SMILES to fingerprint using the Morgan Fingerprint for a Pandas Series.

    Parameters
    -----------
    smiles_series (Pandas Series): Series containing SMILES of the compounds.
    radius (int): controls the radius of the fingerprint.
    n_bits (int): controls the length of the fingerprint bit vector.

    Returns
    -------
    fingerprints (NumPy Array): fingerprints of SMILES in the input series
    '''
    fingerprints = []

    for smiles in smiles_series:
        # Convert the input SMILES string into an RDKit molecule object.
        mol = Chem.MolFromSmiles(smiles)
        # If the molecule conversion is successful, then generate the fingerprint
        if mol is not None:
            fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
            arr = np.zeros((1,))
            AllChem.DataStructs.ConvertToNumpyArray(fingerprint, arr)
            fingerprints.append(arr)

    return np.array(fingerprints)


morgan_fingerprints = calculate_morgan_fingerprint(drug_smiles)
print("The length of drug smiles is:", len(drug_smiles))
print("The length of morgan_fingerprints is:", len(morgan_fingerprints))
print(morgan_fingerprints)

[15:11:30] Unusual charge on atom 0 number of radical electrons set to zero


The length of drug smiles is: 6521
The length of morgan_fingerprints is: 6521
[[0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [13]:
# load the saved model
model = keras.models.load_model("mf_model")

# predict the embeddings of the drug Morgan fingerprints
mf_embeddings = model.predict(morgan_fingerprints)
print(mf_embeddings)

  1/204 [..............................] - ETA: 25s

2023-12-13 15:11:33.026778: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 53420032 exceeds 10% of free system memory.


[[-0.14539145 -0.07305996 -0.28591534 ... -0.17516898  0.1930192
  -0.24724808]
 [-0.19003028 -0.01532836 -0.34017324 ... -0.03350564  0.27475265
  -0.4107109 ]
 [-0.18157184 -0.18642853 -0.53062004 ... -0.19943923  0.17700678
  -0.42576954]
 ...
 [ 0.15702343  0.5273648  -0.18834566 ...  0.09023038 -0.00148031
  -0.07461459]
 [ 0.26712334  0.35497493 -0.05726355 ...  0.06099455  0.00921259
   0.13521963]
 [-0.14691627  0.14291099 -0.08404915 ... -0.0940645   0.15892325
  -0.3037098 ]]


In [14]:
# Check edge score
scores = edge_score(mf_embeddings)
print(scores)

tensor([-2.9399, -3.2392, -3.1096,  ..., -4.1541, -3.6757, -2.6378])


## Using Morgan Fingerprint Count Embeddings

In [15]:
def calculate_morgan_fingerprint_count(smiles_series, radius=2, n_bits=2048):
    '''Function to convert SMILES to fingerprint using the Morgan Fingerprint Count for a Pandas Series.

    Parameters
    -----------
    smiles_series (Pandas Series): Series containing SMILES of the compounds.
    radius (int): controls the radius of the fingerprint.
    n_bits (int): controls the length of the fingerprint bit vector.

    Returns
    -------
    fingerprints (NumPy Array): fingerprints of SMILES in the input series
    '''
    fingerprints = []

    for smiles in smiles_series:
        # Convert the input SMILES string into an RDKit molecule object.
        mol = Chem.MolFromSmiles(smiles)
        # If the molecule conversion is successful, then generate the fingerprint
        if mol is not None:
            fingerprint = rd.GetHashedMorganFingerprint(mol, radius=radius, nBits=n_bits)
            arr = np.zeros((n_bits,), dtype=np.uint8)
            for idx, count in fingerprint.GetNonzeroElements().items():
                arr[idx] = count if count < 255 else 255
            fingerprints.append(arr)

    return np.array(fingerprints)


morgan_fingerprint_counts = calculate_morgan_fingerprint_count(drug_smiles)
print("The length of drug smiles is:", len(drug_smiles))
print("The length of morgan_fingerprint_counts is:", len(morgan_fingerprint_counts))
print(morgan_fingerprint_counts)

[15:11:37] Unusual charge on atom 0 number of radical electrons set to zero


The length of drug smiles is: 6521
The length of morgan_fingerprint_counts is: 6521
[[0 0 0 ... 0 0 0]
 [0 2 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 1 0]
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [16]:
# load the saved model
model = keras.models.load_model("mfcount_model")

# predict the embeddings of the drug Morgan fingerprints
mfcount_embeddings = model.predict(morgan_fingerprints)
print(mfcount_embeddings)

 27/204 [==>...........................] - ETA: 0s

2023-12-13 15:11:38.760866: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 53420032 exceeds 10% of free system memory.


[[-0.06825772  0.30465627 -0.12158845 ... -0.12277345  0.2011324
  -0.23716313]
 [-0.20824523 -0.01959234 -0.48053306 ...  0.12968855  0.3793604
  -0.48335794]
 [-0.08968617  0.09335519 -0.5797926  ... -0.05975341  0.31130734
  -0.5547119 ]
 ...
 [ 0.20823444  0.6298635  -0.24004918 ...  0.13009495  0.05724981
   0.09389218]
 [ 0.15205142  0.5382722  -0.35446736 ...  0.25447556 -0.01474598
   0.1489524 ]
 [ 0.04268427  0.48600382 -0.48005092 ...  0.08301169  0.08417349
   0.06148876]]


In [17]:
# Check edge score
scores = edge_score(mfcount_embeddings)
print(scores)

tensor([-3.1572, -3.8949, -3.5096,  ..., -4.8646, -3.9417, -4.0792])


## Using Ersilia Embeddings

In [18]:
def calculate_ersilia_fingerprint(smiles_series):
    '''Function to convert SMILES to embedding using the Ersilia Compound Embeddings

    Parameters
    -----------
    smiles_series (Pandas Series): Series containing SMILES of the compounds.
    
    Returns
    -------
    embeddings (NumPy Array): embeddings of SMILES in the input series
    '''
    fingerprints = []

    for smiles in smiles_series:
        model = ErsiliaCompoundEmbeddings()
        embeddings = model.transform([smiles])
        fingerprints.append(embeddings)

    return np.array(fingerprints)


ersilia_descriptor = calculate_ersilia_fingerprint(drug_smiles)
print("The length of drug smiles is:", len(drug_smiles))
print("The length of ersilia descriptor is:", len(ersilia_descriptor))
print(ersilia_descriptor)

[15:15:43] Unusual charge on atom 0 number of radical electrons set to zero


The length of drug smiles is: 6521
The length of ersilia descriptor is: 6521
[[[ 0.0257058  -0.04700303  0.0038386  ... -0.04954001  0.00230189
    0.06074152]]

 [[-0.01488564  0.00200031  0.0076263  ... -0.02522343  0.00974543
   -0.05920968]]

 [[ 0.03319988  0.00829009  0.11105975 ... -0.06268101 -0.05922099
   -0.13830455]]

 ...

 [[ 0.14135769  0.04299419 -0.05294156 ... -0.11926071 -0.04383421
   -0.07094503]]

 [[-0.00320739 -0.07315785  0.02023898 ... -0.00566167 -0.04526901
    0.02813001]]

 [[ 0.05420799  0.01565388 -0.06837106 ... -0.04647554 -0.05509707
    0.00550069]]]


In [21]:
# print(len(ersilia_descriptor[0][0]))

1024


In [22]:
# load the saved model
model = keras.models.load_model("ersilia_model")

# predict the embeddings of the Ersilia descriptor
ersilia_embeddings = model.predict(ersilia_descriptor[0])
print(ersilia_embeddings)

# Check edge score
scores = edge_score(ersilia_embeddings)
print("Edge Scores")
print(scores)

[[ 1.23500451e-02  1.34906679e-01 -4.87938166e-01 -1.68315038e-01
   4.05608594e-01 -8.20174217e-02 -4.66677874e-01 -2.94600576e-01
  -1.19865827e-01  3.35172147e-01  3.57053995e-01  8.99758376e-03
  -3.56139764e-02 -1.46388426e-01  3.75670344e-01  1.07170455e-03
   5.01852930e-01 -2.06135258e-01  2.57657111e-01 -3.19362700e-01
   4.43136811e-01  4.05331671e-01  5.49726188e-01 -4.46288198e-01
   1.07390694e-01 -1.38347358e-01 -3.24173160e-02  5.01109898e-01
  -6.82170168e-02  6.12666197e-02  5.21888509e-02  1.56537339e-01
  -3.29866827e-01 -1.00155145e-01 -2.22854614e-01  2.49571279e-01
   1.44717276e-01 -1.87215786e-02  2.50452906e-01 -4.91102636e-02
  -3.36343706e-01 -2.48707116e-01 -3.81335378e-01 -3.28976691e-01
  -2.20031410e-01 -6.57672733e-02  1.88272700e-01 -1.48133442e-01
  -3.16198796e-01  6.11623488e-02 -3.51283252e-01  1.35894135e-01
  -1.28927175e-02  4.43492413e-01 -1.28066875e-02 -4.86826092e-01
  -3.97345662e-01  2.53090143e-01 -3.20880115e-03 -4.62222248e-01
  -2.87712