# Focusing on the drugs in the infer_drug.tsv file

### Get the mapping of SMILES and DrugBankID

In [2]:
# read drugbank_smiles.txt
import numpy as np
import pandas as pd
import csv
from io import StringIO 
import torch as th

# Tab-separated text file: each non-empty line is "<DrugBankID>\t<SMILES>"
file_path = "../drugbank_info/drugbank_smiles.txt"

# DrugBankID -> SMILES string
smiles_mapping = dict()

# Read the text file line by line and populate the dictionary.
# The encoding is given explicitly, like the other open() calls in this notebook.
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()

        # Skip blank lines (e.g. an empty line at the end of the file);
        # unpacking the split below would otherwise raise ValueError on them.
        if not line:
            continue

        # Split the line into DrugBankID and SMILES using tab as the delimiter
        drugbank_id, smiles = line.split('\t')
        smiles_mapping[drugbank_id] = smiles

In [3]:
# Number of DrugBankID -> SMILES entries loaded into smiles_mapping
len(smiles_mapping)

8807

There are 8807 compounds in the `drugbank_smiles.txt` file.

NOTE: Not all the drugs here are in the `infer_drug.tsv` file, e.g. `DB14708`, `DB00114`, `DB00117`, `DB00119`, `DB00120`, `DB00121`, ...
The `infer_drug.tsv` file only contains drugs with a molecular weight >= 250.

## Get the embeddings of each SMILES

In [4]:
# Path of the entity name -> entity ID table (opened when the entity map is built)
entity_idmap_file = '../data/embed/entities.tsv'

# Read the drug names from infer_drug.tsv, in file order.
# fieldnames are supplied explicitly, so the first line is read as data, not as a header.
with open("./infer_drug.tsv", newline='', encoding='utf-8') as drug_file:
    drug_rows = csv.DictReader(drug_file, delimiter='\t', fieldnames=['drug','ids'])
    drug_list = [drug_row['drug'] for drug_row in drug_rows]

print("There are", len(drug_list), "drugs in the infer_drug file")

There are 8104 drugs in the infer_drug file


In [5]:
# Build the entity name -> integer entity ID lookup from the entities file.
# Every row is read as a (name, id) pair and the id is converted to int.
with open(entity_idmap_file, newline='', encoding='utf-8') as entity_file:
    entity_rows = csv.DictReader(entity_file, delimiter='\t', fieldnames=['name','id'])
    entity_map = {entity_row['name']: int(entity_row['id']) for entity_row in entity_rows}

# Resolve every drug in drug_list to its entity ID.
#   drug_ids    - one entity ID per entry of drug_list, in the same order
#   drug_entity - drug name -> entity ID
drug_ids = [entity_map[drug_name] for drug_name in drug_list]
drug_entity = {drug_name: entity_map[drug_name] for drug_name in drug_list}

In [6]:
# Show the first five (drug name, entity ID) pairs of the mapping
first_five = dict(list(drug_entity.items())[:5])
print(first_five)

# Total number of drugs in the mapping
print(len(drug_entity))

{'Compound::DB00605': 9475, 'Compound::DB00983': 11010, 'Compound::DB01240': 7486, 'Compound::DB11755': 16376, 'Compound::DB12184': 4505}
8104


There are 8104 drugs in the `drug_entity` mapping. All the drugs have been mapped.

In [7]:
# One row per drug: its name (key of drug_entity) and its entity ID (value)
df = pd.DataFrame({
    'drug_bank_id': list(drug_entity.keys()),
    'entity_id': list(drug_entity.values()),
})

# Remove the 'Compound::' text so that only the bare DrugBank ID remains
bare_ids = df['drug_bank_id'].str.replace('Compound::', '')
df['drug_bank_id'] = bare_ids
df.head()

Unnamed: 0,drug_bank_id,entity_id
0,DB00605,9475
1,DB00983,11010
2,DB01240,7486
3,DB11755,16376
4,DB12184,4505


In [8]:
# Look up each drug's SMILES string by its DrugBank ID.
# IDs that are not keys of smiles_mapping end up as NaN.
smiles_column = df['drug_bank_id'].map(smiles_mapping)
df['SMILES'] = smiles_column

df.head()

Unnamed: 0,drug_bank_id,entity_id,SMILES
0,DB00605,9475,CC1=C(CC(O)=O)C2=CC(F)=CC=C2\C1=C/C1=CC=C(C=C1...
1,DB00983,11010,COC1=CC=C(CC(C)NCC(O)C2=CC(NC=O)=C(O)C=C2)C=C1
2,DB01240,7486,[H][C@]12C[C@@H](O)[C@H](\C=C\[C@@H](O)CCCCC)[...
3,DB11755,16376,CCCC1=CC(O)=C2[C@@H]3C=C(C)CC[C@H]3C(C)(C)OC2=C1
4,DB12184,4505,CC1(C)CC(=O)N(CCCCN2CCN(CC2)C2=NC=CC=N2)C(=O)C1


In [9]:
# Rows for which no SMILES string was found (NaN in the 'SMILES' column)
missing_smiles = df['SMILES'].isna()
df[missing_smiles]

Unnamed: 0,drug_bank_id,entity_id,SMILES
14,DB12274,15567,
15,DB13375,15614,
27,DB05325,15694,
38,DB00026,10157,
51,DB01277,10394,
...,...,...,...
8077,DB14793,5019,
8078,DB09108,24669,
8082,DB12009,14702,
8084,DB13900,25019,


There are 1583 missing values of SMILES. We will filter them out.

In [10]:
# Keep only the rows that have a SMILES value, then report the new shape
has_smiles = df['SMILES'].notna()
df = df.loc[has_smiles]
print(df.shape)
df.head()

(6521, 3)


Unnamed: 0,drug_bank_id,entity_id,SMILES
0,DB00605,9475,CC1=C(CC(O)=O)C2=CC(F)=CC=C2\C1=C/C1=CC=C(C=C1...
1,DB00983,11010,COC1=CC=C(CC(C)NCC(O)C2=CC(NC=O)=C(O)C=C2)C=C1
2,DB01240,7486,[H][C@]12C[C@@H](O)[C@H](\C=C\[C@@H](O)CCCCC)[...
3,DB11755,16376,CCCC1=CC(O)=C2[C@@H]3C=C(C)CC[C@H]3C(C)(C)OC2=C1
4,DB12184,4505,CC1(C)CC(=O)N(CCCCN2CCN(CC2)C2=NC=CC=N2)C(=O)C1


In [11]:
# Last rows of the filtered frame; the index still carries the pre-filter row labels
df.tail()

Unnamed: 0,drug_bank_id,entity_id,SMILES
8099,DB03445,22548,[H]\C(N[C@@]([H])(C(O)=O)[C@](C)(CN1C=CN=N1)S(...
8100,DB11200,24747,O.O.[Al+3].[Cl-].[Zr+4].NCC([O-])=O
8101,DB01897,21144,O=C1NNC(=O)C2=CC(=CC=C12)[N+]1=NC(\C=C\C2=CC=C...
8102,DB08404,24473,[H][C@](O)(C(=O)NCCC(=O)NCCSC(=O)CCCCC)C(C)(C)...
8103,DB05455,23655,CCOC1=CC2=C(C=C1)N(C(=O)C23CCC(CC3)OCCN4CCOCC4...


There are `6521` drugs from the `infer_drug.tsv` file that have a SMILES value in the `drugbank_smiles.txt` file.

In [12]:
# First five entity IDs in drug_ids, for comparison with the entity_id column of df
drug_ids[:5]

[9475, 11010, 7486, 16376, 4505]

The values in the `drug_ids` list are in the same order as the `entity_id` column in our dataframe.
This makes it easy to attach the embedding of each drug to the dataframe based on `entity_id`.

In [13]:
# Number of entity IDs currently held in drug_ids
len(drug_ids)

8104

The length of `drug_ids` is still `8104`. We need to remove the IDs in `drug_ids` that are not in our dataframe.

In [14]:
# Keep only the entity IDs that are still present in the dataframe, preserving order.
# The membership set is built once up front instead of being rebuilt for every element.
kept_entity_ids = set(df['entity_id'])
drug_ids = [k for k in drug_ids if k in kept_entity_ids]

# The embeddings looked up from drug_ids are joined to df by row position,
# so the two must agree element for element, not just in length.
assert drug_ids == df['entity_id'].tolist(), "drug_ids is not aligned with df['entity_id']"

# check
print(len(drug_ids))

6521


In [15]:
# First ten entity IDs remaining after the filter
drug_ids[:10]

[9475, 11010, 7486, 16376, 4505, 7417, 6457, 7161, 2415, 4943]

Now the length of `drug_ids` matches the length of the dataframe.

In [16]:
# Load the entity embedding matrix from disk
# (presumably row i is the embedding of entity ID i — confirm against the DRKG embedding files)
entity_emb = np.load('../data/embed/DRKG_TransE_l2_entity.npy')

# as_tensor accepts both the Python list built earlier and an existing tensor,
# so re-running this cell does not wrap a tensor in th.tensor() again.
drug_ids = th.as_tensor(drug_ids, dtype=th.long)

# Select the rows of entity_emb listed in drug_ids; the index is converted to
# NumPy explicitly rather than relying on implicit tensor -> array conversion.
drug_emb = th.tensor(entity_emb[drug_ids.numpy()])

In [17]:
# Display the entity embedding matrix loaded from the .npy file
entity_emb

array([[-0.5888741 , -0.0872229 , -0.45159575, ...,  0.6187399 ,
        -0.51274306,  0.43644685],
       [ 0.03043327, -0.9927482 ,  0.31472597, ..., -0.03531377,
        -0.7016256 , -0.07200253],
       [-0.15260178, -0.31713626, -0.3052066 , ..., -0.5843045 ,
        -0.5971722 ,  0.4716604 ],
       ...,
       [ 0.58301383,  0.69486225,  0.5986209 , ...,  0.58480304,
         0.5156303 ,  0.27735808],
       [ 0.7441235 , -0.58081806,  0.3950408 , ...,  0.56884754,
        -0.6283229 ,  0.5594293 ],
       [-0.6101573 , -0.4708205 , -0.4374642 , ...,  0.42074913,
        -0.56538516,  0.621319  ]], dtype=float32)

In [18]:
# Number of drug embeddings, followed by the embeddings themselves
# (the rows of entity_emb selected by drug_ids)
n_drug_embeddings = len(drug_emb)
print(n_drug_embeddings)
drug_emb

6521


tensor([[-0.2715, -0.5940, -0.3701,  ..., -0.5073,  0.1592, -0.6702],
        [-0.4294, -0.3552, -0.4526,  ...,  0.6304,  0.4417, -0.4394],
        [-0.6724, -0.2223, -0.5301,  ...,  0.3787,  0.3645, -0.3003],
        ...,
        [ 0.1658,  0.6800, -0.5095,  ..., -0.6383, -0.5285, -0.6048],
        [ 0.5265,  0.6320, -0.5831,  ...,  0.5215, -0.5397,  0.5052],
        [-0.5288, -0.4380, -0.6211,  ..., -0.5878, -0.5739, -0.5481]])

In [19]:
# Convert the tensor to a Pandas DataFrame with one column per embedding dimension.
# The width is taken from the tensor itself instead of being hard-coded.
embedding_columns = [f'embedding_{i}' for i in range(drug_emb.shape[1])]
emb_df = pd.DataFrame(drug_emb.numpy(), columns=embedding_columns)

# The two frames are joined by row position; a length mismatch would make
# concat pad the shorter side with NaN rows instead of failing.
assert len(df) == len(emb_df), "df and emb_df have different numbers of rows"

# df still carries its pre-filter index, so reset it to 0..n-1 to line up with emb_df
result_df = pd.concat([df.reset_index(drop=True), emb_df], axis=1)

# Display the updated DataFrame
result_df.head(10)

Unnamed: 0,drug_bank_id,entity_id,SMILES,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,...,embedding_390,embedding_391,embedding_392,embedding_393,embedding_394,embedding_395,embedding_396,embedding_397,embedding_398,embedding_399
0,DB00605,9475,CC1=C(CC(O)=O)C2=CC(F)=CC=C2\C1=C/C1=CC=C(C=C1...,-0.271492,-0.593986,-0.370118,-0.132015,0.64742,0.267383,0.221824,...,-0.110515,0.747298,-0.595123,-0.354849,0.42657,-0.651937,-0.535123,-0.507326,0.159212,-0.670219
1,DB00983,11010,COC1=CC=C(CC(C)NCC(O)C2=CC(NC=O)=C(O)C=C2)C=C1,-0.429383,-0.355152,-0.452635,-0.493196,-0.350862,0.713262,0.443782,...,-0.542713,0.260568,-0.455372,-0.012449,-0.440367,-0.572628,-0.547223,0.630401,0.441735,-0.439394
2,DB01240,7486,[H][C@]12C[C@@H](O)[C@H](\C=C\[C@@H](O)CCCCC)[...,-0.672433,-0.222313,-0.530109,-0.546897,0.308746,-0.337016,0.149878,...,0.565954,-0.691679,-0.595132,-0.309585,0.08213,-0.466184,-0.515885,0.378655,0.364504,-0.300323
3,DB11755,16376,CCCC1=CC(O)=C2[C@@H]3C=C(C)CC[C@H]3C(C)(C)OC2=C1,-0.369791,0.667106,-0.406856,-0.587619,0.577712,0.643965,-0.662907,...,0.459833,-0.737107,-0.616846,-0.169433,-0.602083,-0.120658,-0.194525,-0.377465,0.231654,-0.369904
4,DB12184,4505,CC1(C)CC(=O)N(CCCCN2CCN(CC2)C2=NC=CC=N2)C(=O)C1,-0.151366,0.479758,-0.379089,-0.545514,0.743205,0.751715,-0.588066,...,0.42764,-0.665244,-0.28445,0.689696,-0.577339,-0.528556,0.713676,-0.300458,0.254337,0.02972
5,DB00404,7417,CC1=NN=C2CN=C(C3=CC=CC=C3)C3=C(C=CC(Cl)=C3)N12,-0.589351,-0.580602,-0.745123,-0.528509,0.593207,0.497097,0.048647,...,-0.678077,-0.345387,-0.128594,0.402087,0.085832,-0.420954,-0.31138,-0.376174,0.388722,-0.459413
6,DB01223,6457,NCCN.CN1C2=C(NC=N2)C(=O)N(C)C1=O.CN1C2=C(NC=N2...,-0.674336,-0.2665,-0.51234,-0.115845,-0.309642,0.626041,0.451178,...,-0.554611,0.078451,-0.180794,-0.4026,-0.749248,0.438113,-0.795475,-0.051928,0.437143,-0.286584
7,DB00572,7161,CN1[C@H]2CC[C@@H]1C[C@@H](C2)OC(=O)C(CO)C1=CC=...,-0.297685,-0.514792,-0.603773,-0.658746,0.420709,0.737039,0.000725,...,-0.306882,0.153852,-0.428181,-0.233794,-0.307042,-0.313246,-0.3099,0.096056,0.473331,-0.490021
8,DB00669,2415,CNS(=O)(=O)CC1=CC2=C(NC=C2CCN(C)C)C=C1,-0.453169,-0.235891,-0.643428,-0.610339,0.023665,-0.006115,-0.55945,...,0.119811,0.474479,0.602419,0.255796,-0.105829,-0.557198,0.094172,0.142557,0.284198,-0.518554
9,DB00494,4943,CCN(CC)C(=O)C(=C\C1=CC(=C(O)C(O)=C1)[N+]([O-])...,-0.535264,-0.502035,-0.528655,-0.10044,-0.082784,0.540049,0.041261,...,-0.705202,0.230018,-0.369495,0.308531,0.017748,-0.620017,0.475681,0.461897,0.462243,-0.646051


In [20]:
# (rows, columns) of the embedding frame
emb_df.shape

(6521, 400)

In [21]:
# Remove the entity_id column from the combined table.
# Reassigning with errors='ignore' (instead of an inplace drop) lets this cell be
# re-run without raising KeyError once the column is already gone.
result_df = result_df.drop(columns='entity_id', errors='ignore')
result_df.head()

Unnamed: 0,drug_bank_id,SMILES,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,...,embedding_390,embedding_391,embedding_392,embedding_393,embedding_394,embedding_395,embedding_396,embedding_397,embedding_398,embedding_399
0,DB00605,CC1=C(CC(O)=O)C2=CC(F)=CC=C2\C1=C/C1=CC=C(C=C1...,-0.271492,-0.593986,-0.370118,-0.132015,0.64742,0.267383,0.221824,-0.669338,...,-0.110515,0.747298,-0.595123,-0.354849,0.42657,-0.651937,-0.535123,-0.507326,0.159212,-0.670219
1,DB00983,COC1=CC=C(CC(C)NCC(O)C2=CC(NC=O)=C(O)C=C2)C=C1,-0.429383,-0.355152,-0.452635,-0.493196,-0.350862,0.713262,0.443782,-0.526725,...,-0.542713,0.260568,-0.455372,-0.012449,-0.440367,-0.572628,-0.547223,0.630401,0.441735,-0.439394
2,DB01240,[H][C@]12C[C@@H](O)[C@H](\C=C\[C@@H](O)CCCCC)[...,-0.672433,-0.222313,-0.530109,-0.546897,0.308746,-0.337016,0.149878,-0.787017,...,0.565954,-0.691679,-0.595132,-0.309585,0.08213,-0.466184,-0.515885,0.378655,0.364504,-0.300323
3,DB11755,CCCC1=CC(O)=C2[C@@H]3C=C(C)CC[C@H]3C(C)(C)OC2=C1,-0.369791,0.667106,-0.406856,-0.587619,0.577712,0.643965,-0.662907,-0.538321,...,0.459833,-0.737107,-0.616846,-0.169433,-0.602083,-0.120658,-0.194525,-0.377465,0.231654,-0.369904
4,DB12184,CC1(C)CC(=O)N(CCCCN2CCN(CC2)C2=NC=CC=N2)C(=O)C1,-0.151366,0.479758,-0.379089,-0.545514,0.743205,0.751715,-0.588066,-0.573652,...,0.42764,-0.665244,-0.28445,0.689696,-0.577339,-0.528556,0.713676,-0.300458,0.254337,0.02972


In [22]:
# Inspect the last 50 rows of the combined frame
result_df.tail(50)

Unnamed: 0,drug_bank_id,SMILES,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,...,embedding_390,embedding_391,embedding_392,embedding_393,embedding_394,embedding_395,embedding_396,embedding_397,embedding_398,embedding_399
6471,DB03064,CCCCCCCCCCC1=C(O)C(=O)NC1=O,-0.47231,0.647877,-0.089998,0.637965,0.650257,-0.514491,-0.495476,0.519425,...,0.480409,-0.593166,0.57326,0.626603,0.655367,0.580677,-0.380533,-0.587798,-0.533786,0.508143
6472,DB03495,[H][C@]1(C)O[C@]([H])(O[C@]2([H])[C@@]([H])(CO...,0.526832,0.587521,-0.611328,0.652814,0.554662,-0.5443,-0.587808,0.627838,...,-0.538768,-0.396637,0.62875,-0.631339,-0.385175,-0.637099,-0.533471,0.631073,0.569362,0.443134
6473,DB02449,[H][C@@](CC1=CNC2=CC=CC=C12)(NS(=O)(=O)C1=CC=C...,0.638514,0.666598,-0.514444,0.56846,0.529452,-0.673682,-0.620635,-0.498363,...,-0.497213,-0.570388,0.548918,-0.56468,0.56935,0.624825,-0.660542,-0.536366,-0.585697,-0.612271
6474,DB04000,N[C@@H](CN1C=C(Br)C(=O)NC1=O)C(O)=O,0.583121,0.659789,-0.581984,0.583822,-0.450307,-0.615717,0.451752,0.556506,...,-0.619083,-0.683417,0.464978,0.548842,-0.465284,0.577385,0.471111,0.343399,0.543876,-0.448048
6475,DB06988,[H][C@](N)(CC1=CC(\N=C\CC2=CC=CC=C2)=C(O)C=C1O...,0.541772,0.641629,-0.622913,0.551972,0.653459,-0.582977,-0.640539,0.592686,...,-0.503776,0.492067,0.59165,0.580336,0.61485,0.51183,-0.442861,-0.248784,0.428004,0.449482
6476,DB06873,OCCOCN1C=C(CC2=CC=CC(OCC3=CC=CC=C3)=C2)C(=O)NC1=O,0.562265,0.578854,-0.610164,0.554512,-0.433615,-0.601337,-0.637492,0.488312,...,-0.477708,-0.561762,-0.259173,0.553508,0.611597,0.580664,-0.603137,-0.363213,-0.507402,0.493747
6477,DB02790,[H]N1C(=O)C=CN([C@@H]2O[C@H](COP(O)(=O)OP(O)(=...,0.529424,0.653089,0.457046,-0.590788,0.656969,-0.574253,-0.636904,-0.5053,...,-0.623755,0.519024,0.553192,-0.606682,0.470672,0.507077,-0.610315,-0.38007,0.407472,0.462418
6478,DB03558,OC(=O)CCNC(=O)[C@H]1CCCN1S(=O)(=O)C1=CC=C(C=C1...,-0.393805,0.548372,-0.611566,0.530904,0.550117,-0.57444,-0.545503,0.447524,...,-0.57184,0.526975,0.544634,-0.581113,0.613318,0.615725,-0.598594,0.572767,0.441535,-0.438014
6479,DB02133,[Mg++].CCC1=C(C)C2=N\C\1=C/C1=C(C)C3=C([N-]1)\...,-0.410484,0.558169,-0.58063,-0.541515,-0.461676,-0.397205,-0.603115,-0.486119,...,-0.592593,0.402756,0.579265,-0.538386,0.606561,0.503238,0.561109,0.535494,-0.535713,-0.623237
6480,DB04662,[H][C@@](CC)(CO)NC1=NC2=C(N=CN2C(C)C)C(NCC2=CC...,-0.602227,0.560188,-0.525971,0.565391,0.609303,-0.568859,-0.543499,0.623162,...,-0.502594,-0.562198,0.140094,0.346223,0.536255,-0.56941,0.579854,-0.643193,0.619739,0.578092


In [23]:
# (rows, columns) of the combined frame
result_df.shape

(6521, 402)

The shape shows that there are 6521 rows. There should be no missing values, but let's be sure.

In [24]:
# Total number of missing cells in the whole frame:
# count NaNs per column first, then add the per-column counts together
missing_per_column = result_df.isna().sum()
missing_per_column.sum()

0

In [25]:
# Save the combined table (drug_bank_id, SMILES, embedding columns) to a CSV file
# in the current working directory.
# NOTE(review): to_csv is called without index=False, so the row index is written as an
# extra unnamed leading column — confirm that downstream readers expect it.
result_df.to_csv("smiles_embeddings_infer_drugs.csv")