# Using all the drugs in the `drugbank_smiles.txt` file, ignoring the `infer_drug.tsv` file

### Get the mapping of SMILES and DrugBankID

In [113]:
# read drugbank_smiles.txt
import numpy as np
import pandas as pd
import csv
from io import StringIO 
import torch as th

# Tab-separated file without a header row; the two columns are named below.
file_path = "../drugbank_info/drugbank_smiles.txt"

# Let pandas read the file directly instead of loading it into a string
# and wrapping that string in StringIO first.
df = pd.read_csv(
    file_path,
    sep='\t',
    header=None,
    names=["DrugBankID", "SMILES"],
)

# Add the 'Compound::' prefix to 'DrugBankID' values in the DataFrame
# to match the keys of the drug_entity dictionary built later
df['DrugBankID'] = 'Compound::' + df['DrugBankID']

df.head()

Unnamed: 0,DrugBankID,SMILES
0,Compound::DB00006,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...
1,Compound::DB00007,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...
2,Compound::DB00014,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...
3,Compound::DB00027,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...
4,Compound::DB00035,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...


In [114]:
# Dimensions of the SMILES table as (rows, columns); the row count is the number of drugs read
df.shape

(8807, 2)

There are 8807 compounds in the `drugbank_smiles.txt` file.

NOTE: Not all of the drugs here are in the `infer_drug.tsv` file, e.g. `DB14708, DB00114, DB00117, DB00119, DB00120, DB00121, ...`.
The `infer_drug.tsv` file only contains drugs with a molecular weight >= 250.

## Get the embeddings of each SMILES

In [115]:
# Location of the entity-name -> entity-ID table
entity_idmap_file = '../data/embed/entities.tsv'

# Collect every prefixed DrugBank ID, keeping the DataFrame row order
drug_list = df['DrugBankID'].tolist()

In [116]:
# Build the entity-name -> integer entity-ID lookup from the two-column TSV
with open(entity_idmap_file, newline='', encoding='utf-8') as tsv_handle:
    entity_rows = csv.DictReader(tsv_handle, delimiter='\t', fieldnames=['name', 'id'])
    entity_map = {record['name']: int(record['id']) for record in entity_rows}

# Resolve each drug to its entity ID, in drug_list order.
# A drug that is absent from entity_map raises KeyError here.
drug_ids = [entity_map[drug_name] for drug_name in drug_list]

# Drug name -> entity ID mapping for the drugs listed in drug_list
drug_entity = dict(zip(drug_list, drug_ids))

In [117]:
# Show the first five drug -> entity ID pairs
print(dict(list(drug_entity.items())[:5]))

# Total number of entries in the mapping
print(len(drug_entity))

{'Compound::DB00006': 5209, 'Compound::DB00007': 4880, 'Compound::DB00014': 5751, 'Compound::DB00027': 5992, 'Compound::DB00035': 3767}
8807


There are 8807 drugs in the `drug_entity` mapping. All the drugs have been mapped.

In [118]:
# Add an 'entity_id' column by looking up each 'DrugBankID' value in the 'drug_entity' dictionary
df['entity_id'] = df['DrugBankID'].map(drug_entity)

# Preview the frame with the new column
df.head()

Unnamed: 0,DrugBankID,SMILES,entity_id
0,Compound::DB00006,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,5209
1,Compound::DB00007,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,4880
2,Compound::DB00014,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,5751
3,Compound::DB00027,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...,5992
4,Compound::DB00035,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...,3767


In [120]:
# List the rows whose 'entity_id' is NaN; an empty result means none are missing
df.loc[df['entity_id'].isna()]

Unnamed: 0,DrugBankID,SMILES,entity_id


There are no missing values.

In [121]:
# Number of rows in df
len(df)

8807

In [122]:
# First five entity IDs collected in drug_ids
drug_ids[:5]

[5209, 4880, 5751, 5992, 3767]

The values in the `drug_ids` list are in the same order as the `entity_id` column in our dataframe.
It will be easy to concatenate the embedding of each drug to the dataframe based on the `entity_id`.

In [123]:
# Read the entity embedding matrix from disk
entity_emb = np.load('../data/embed/DRKG_TransE_l2_entity.npy')

# Turn the entity IDs into an int64 index tensor
drug_ids = th.tensor(drug_ids, dtype=th.int64)

# Pick out the embedding rows for the drugs and wrap them in a tensor
drug_rows = entity_emb[drug_ids]
drug_emb = th.tensor(drug_rows)

In [124]:
# Display the entity embedding array loaded from the .npy file
entity_emb

array([[-0.5888741 , -0.0872229 , -0.45159575, ...,  0.6187399 ,
        -0.51274306,  0.43644685],
       [ 0.03043327, -0.9927482 ,  0.31472597, ..., -0.03531377,
        -0.7016256 , -0.07200253],
       [-0.15260178, -0.31713626, -0.3052066 , ..., -0.5843045 ,
        -0.5971722 ,  0.4716604 ],
       ...,
       [ 0.58301383,  0.69486225,  0.5986209 , ...,  0.58480304,
         0.5156303 ,  0.27735808],
       [ 0.7441235 , -0.58081806,  0.3950408 , ...,  0.56884754,
        -0.6283229 ,  0.5594293 ],
       [-0.6101573 , -0.4708205 , -0.4374642 , ...,  0.42074913,
        -0.56538516,  0.621319  ]], dtype=float32)

In [125]:
# Display only the drug embeddings: the rows of entity_emb selected by drug_ids
drug_emb

tensor([[-0.6593, -0.3442, -0.5783,  ..., -0.3604, -0.3599, -0.8203],
        [-0.6070, -0.7813, -0.7301,  ..., -0.8533,  0.5457, -0.8361],
        [-0.4703, -0.8852, -0.6240,  ..., -0.8166,  0.4163, -0.7209],
        ...,
        [-0.5615, -0.5053,  0.5269,  ...,  0.4496,  0.6366,  0.4223],
        [-0.4920,  0.4862,  0.4493,  ..., -0.2605, -0.3269,  0.4799],
        [-0.6385, -0.7726,  0.4875,  ..., -0.1991, -0.8575, -0.0739]])

In [126]:
# Convert the drug-embedding tensor to a DataFrame, one column per embedding dimension.
# The column count comes from the tensor itself rather than a hard-coded width.
emb_dim = drug_emb.shape[1]
emb_df = pd.DataFrame(drug_emb.numpy(), columns=[f'embedding_{i}' for i in range(emb_dim)])

# The concat below pairs rows by position, so both frames must have the same length;
# otherwise pandas would silently pad the shorter one with NaN.
if len(emb_df) != len(df):
    raise ValueError(
        f"Row count mismatch: {len(emb_df)} embeddings vs {len(df)} drugs"
    )

# Concatenate the new DataFrame with the existing DataFrame
result_df = pd.concat([df.reset_index(drop=True), emb_df], axis=1)

# Display the updated DataFrame
result_df.head(10)

Unnamed: 0,DrugBankID,SMILES,entity_id,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,...,embedding_390,embedding_391,embedding_392,embedding_393,embedding_394,embedding_395,embedding_396,embedding_397,embedding_398,embedding_399
0,Compound::DB00006,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,5209,-0.659347,-0.34423,-0.578348,-0.670077,0.073021,-0.844866,-0.529099,...,0.72216,-0.361777,-0.069529,0.719951,0.765324,-0.828312,-0.750704,-0.360401,-0.359896,-0.820253
1,Compound::DB00007,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,4880,-0.606968,-0.781302,-0.730112,-0.868258,0.363371,-0.38829,0.217138,...,-0.355017,0.657896,0.039386,-0.395858,-0.218164,-0.540272,-0.603087,-0.853275,0.545669,-0.836144
2,Compound::DB00014,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,5751,-0.470256,-0.885203,-0.623956,-0.507102,0.395201,-0.298908,-0.156826,...,-0.430036,0.563329,0.419391,-0.074537,0.266462,-0.631508,-0.637233,-0.816603,0.416263,-0.720862
3,Compound::DB00027,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...,5992,-0.761337,-0.709398,0.665223,-0.388625,0.218644,-0.416196,0.594226,...,-0.721291,0.630432,0.817134,0.515158,-0.247925,-0.523338,-0.827212,-0.751571,-0.640211,-0.756264
4,Compound::DB00035,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...,3767,-0.764784,-0.956786,-0.445434,-0.611626,0.335249,0.044306,-0.63537,...,-0.204678,-0.404126,-0.018485,-0.401169,0.025806,-0.664728,-0.856459,-0.417736,-0.386456,-0.626406
5,Compound::DB00050,CC(C)C[C@H](NC(=O)[C@@H](CCCNC(N)=O)NC(=O)[C@H...,4859,-0.206116,-0.367292,-0.480315,0.382492,0.504192,0.708871,0.240045,...,-0.511589,0.747597,0.400254,0.194416,0.203462,0.487021,-0.775396,-0.353806,0.344386,0.167657
6,Compound::DB00067,NCCCCC(NC(=O)C1CCCN1C(=O)C1CSSCC(N)C(=O)NC(CC2...,10105,-0.275259,-0.808304,-0.382585,-0.565152,0.82827,0.004429,-0.826167,...,-0.413575,0.973626,0.424665,0.37542,0.316805,-0.501161,-0.641856,-0.065996,-0.023391,-0.530696
7,Compound::DB00080,CCCCCCCCCC(=O)N[C@@H](CC1=CNC2=C1C=CC=C2)C(=O)...,16021,-0.678526,-0.618681,-0.737121,-0.78562,0.611996,-0.747987,-0.398941,...,0.437294,0.716189,0.261704,-0.385356,0.431755,-0.745035,-0.770744,-0.5866,0.510544,-0.602282
8,Compound::DB00091,CC[C@@H]1NC(=O)[C@H]([C@H](O)[C@H](C)C\C=C\C)N...,2731,-0.653151,-0.427437,-0.10987,-0.486755,0.364613,-0.239797,-0.405046,...,0.358028,0.648655,0.44127,0.355643,0.707476,-0.580351,-0.714352,-0.63775,-0.176923,-0.722639
9,Compound::DB00093,NCCCC[C@H](NC(=O)[C@@H]1CCCN1C(=O)[C@@H]1CSSC[...,6827,-0.581973,-0.943471,-0.79321,-0.217045,0.467487,0.776753,-0.572274,...,-0.950761,0.272478,0.236321,0.546112,-0.49018,-0.131556,-0.983097,0.131967,0.284314,-0.562612


In [127]:
# Remove the helper 'entity_id' column. Rebinding the name (instead of inplace=True)
# together with errors='ignore' keeps this cell safe to re-run once the column is gone.
result_df = result_df.drop(columns='entity_id', errors='ignore')
result_df.head()

Unnamed: 0,DrugBankID,SMILES,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,...,embedding_390,embedding_391,embedding_392,embedding_393,embedding_394,embedding_395,embedding_396,embedding_397,embedding_398,embedding_399
0,Compound::DB00006,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,-0.659347,-0.34423,-0.578348,-0.670077,0.073021,-0.844866,-0.529099,-0.797582,...,0.72216,-0.361777,-0.069529,0.719951,0.765324,-0.828312,-0.750704,-0.360401,-0.359896,-0.820253
1,Compound::DB00007,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,-0.606968,-0.781302,-0.730112,-0.868258,0.363371,-0.38829,0.217138,-0.640213,...,-0.355017,0.657896,0.039386,-0.395858,-0.218164,-0.540272,-0.603087,-0.853275,0.545669,-0.836144
2,Compound::DB00014,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,-0.470256,-0.885203,-0.623956,-0.507102,0.395201,-0.298908,-0.156826,-0.697836,...,-0.430036,0.563329,0.419391,-0.074537,0.266462,-0.631508,-0.637233,-0.816603,0.416263,-0.720862
3,Compound::DB00027,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...,-0.761337,-0.709398,0.665223,-0.388625,0.218644,-0.416196,0.594226,-0.497919,...,-0.721291,0.630432,0.817134,0.515158,-0.247925,-0.523338,-0.827212,-0.751571,-0.640211,-0.756264
4,Compound::DB00035,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...,-0.764784,-0.956786,-0.445434,-0.611626,0.335249,0.044306,-0.63537,-0.706563,...,-0.204678,-0.404126,-0.018485,-0.401169,0.025806,-0.664728,-0.856459,-0.417736,-0.386456,-0.626406


In [128]:
# Dimensions of the final table as (rows, columns)
result_df.shape

(8807, 402)

In [129]:
# Total number of missing (NaN) cells across all columns of result_df
result_df.isna().sum().sum()

0

In [92]:
# save to a CSV file
# result_df.to_csv("smiles_embeddings_no_infer.csv")