# **Here, we check which drugs are common to the RNA-Seq dataset and the cleaned training dataset**

## Read the training dataset and standardize the SMILES using RDKit

In [2]:
import pandas as pd
import numpy as np
from rdkit import Chem

In [9]:
# Read the training set.
# The path is relative, so it is resolved against the current working directory.
df_train_set = pd.read_csv('../../data/training_dataset_clean.csv')
# Bare expression as the last statement of the cell: the notebook renders the frame.
df_train_set

Unnamed: 0,DRUG NAME,MDA7+D1+1,MDA7+D1+2,MDA7+D1+3,MDA7+D2+1,MDA7+D2+2,MDA7+D2+3,CatalogNumber,Item Name,M.w.,Formula,SMILES,URL
0,Angiotensin II,0.467198,0.492122,0.500198,0.651233,0.762780,0.484070,A1042,Angiotensin II,1046.2,C50H71N13O12,CCC(C)C(C(=O)NC(CC1=CN=CN1)C(=O)N2CCCC2C(=O)NC...,http://www.apexbt.com/search.php?catalog=A1042
1,Levetiracetam,0.424844,0.503705,0.433523,0.517083,0.498133,0.485443,A1198,Levetiracetam,170.21,C8H14N2O2,CCC(C(=O)N)N1CCCC1=O,http://www.apexbt.com/search.php?catalog=A1198
2,Daptomycin,0.462072,0.509482,0.424083,0.451883,0.495565,0.472871,A1206,Daptomycin,1620.67,C72H101N17O26,CCCCCCCCCC(=O)NC(CC1=CNC2=CC=CC=C21)C(=O)NC(CC...,http://www.apexbt.com/search.php?catalog=A1206
3,Lansoprazole,0.464806,0.494925,0.502533,0.491361,0.479975,0.443513,A1229,Lansoprazole,369.36,C16H14F3N3O2S,CC1=C(C=CN=C1CS(=O)C2=NC3=CC=CC=C3N2)OCC(F)(F)F,http://www.apexbt.com/search.php?catalog=A1229
4,Adapalene,0.407362,0.425062,0.490854,0.489590,0.513733,0.395002,A1267,Adapalene,412.52,C28H28O3,COC1=C(C=C(C=C1)C2=CC3=C(C=C2)C=C(C=C3)C(=O)O)...,http://www.apexbt.com/search.php?catalog=A1267
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1960,Chondroitin sulfate,0.750820,0.741770,0.826674,0.835227,0.807444,0.802552,N2730,Chondroitin sulfate,489.45,(C14H21NO14S)n,CC(=O)NC1C(C(C(OC1O)CO)O)OC2C(C(C=C(O2)C(=O)O)O)O,http://www.apexbt.com/search.php?catalog=N2730
1961,Aucubin,0.714064,0.849337,0.563837,0.869225,0.700734,0.823960,N2758,Aucubin,346.33,C15H22O9,C1=COC(C2C1C(C=C2CO)O)OC3C(C(C(C(O3)CO)O)O)O,http://www.apexbt.com/search.php?catalog=N2758
1962,Cepharanthine,0.592418,0.708048,0.883598,0.811905,0.849278,0.827013,N2771,Cepharanthine,606.71,C37H38N2O6,CN1CCC2=CC3=C(C4=C2C1CC5=CC=C(C=C5)OC6=C(C=CC(...,http://www.apexbt.com/search.php?catalog=N2771
1963,4-Aminobutyric acid,0.705559,0.941525,0.717699,0.887134,0.716770,0.617353,P10002,4-Aminobutyric acid,103.12,C4H9NO2,C(CC(=O)O)CN,http://www.apexbt.com/search.php?catalog=P10002


In [10]:
# Standardize the SMILES using the RDKit
def standardize_smiles(smiles):
    """Re-write a SMILES string through RDKit so equal molecules compare equal.

    The input is parsed with ``Chem.MolFromSmiles`` and written back out with
    ``Chem.MolToSmiles``.

    Args:
        smiles: The SMILES string to standardize.

    Returns:
        The SMILES string produced by RDKit, or ``smiles`` itself, unchanged,
        when RDKit cannot parse the input or raises while processing it.
        Failures are therefore never reported as ``None``/NaN; callers that
        need to detect them must compare against the input or re-validate.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles reports an unparsable string by returning None rather
        # than raising; handle that explicitly instead of relying on the
        # exception MolToSmiles(None) would throw.
        if mol is None:
            return smiles
        return Chem.MolToSmiles(mol)
    except Exception:
        # Best-effort fallback (e.g. a non-string cell value). Catching
        # Exception instead of using a bare except lets KeyboardInterrupt and
        # SystemExit propagate, so a long .apply() can still be interrupted.
        return smiles

# Add a standardized_SMILES column by running every SMILES value through
# standardize_smiles (values it cannot process are carried over unchanged).
df_train_set['standardized_SMILES'] = df_train_set['SMILES'].apply(standardize_smiles)
# Display the updated frame.
df_train_set

Unnamed: 0,DRUG NAME,MDA7+D1+1,MDA7+D1+2,MDA7+D1+3,MDA7+D2+1,MDA7+D2+2,MDA7+D2+3,CatalogNumber,Item Name,M.w.,Formula,SMILES,URL,standardized_SMILES
0,Angiotensin II,0.467198,0.492122,0.500198,0.651233,0.762780,0.484070,A1042,Angiotensin II,1046.2,C50H71N13O12,CCC(C)C(C(=O)NC(CC1=CN=CN1)C(=O)N2CCCC2C(=O)NC...,http://www.apexbt.com/search.php?catalog=A1042,CC(=O)O.CCC(C)C(NC(=O)C(Cc1ccc(O)cc1)NC(=O)C(N...
1,Levetiracetam,0.424844,0.503705,0.433523,0.517083,0.498133,0.485443,A1198,Levetiracetam,170.21,C8H14N2O2,CCC(C(=O)N)N1CCCC1=O,http://www.apexbt.com/search.php?catalog=A1198,CCC(C(N)=O)N1CCCC1=O
2,Daptomycin,0.462072,0.509482,0.424083,0.451883,0.495565,0.472871,A1206,Daptomycin,1620.67,C72H101N17O26,CCCCCCCCCC(=O)NC(CC1=CNC2=CC=CC=C21)C(=O)NC(CC...,http://www.apexbt.com/search.php?catalog=A1206,CCCCCCCCCC(=O)NC(Cc1c[nH]c2ccccc12)C(=O)NC(CC(...
3,Lansoprazole,0.464806,0.494925,0.502533,0.491361,0.479975,0.443513,A1229,Lansoprazole,369.36,C16H14F3N3O2S,CC1=C(C=CN=C1CS(=O)C2=NC3=CC=CC=C3N2)OCC(F)(F)F,http://www.apexbt.com/search.php?catalog=A1229,Cc1c(OCC(F)(F)F)ccnc1CS(=O)c1nc2ccccc2[nH]1
4,Adapalene,0.407362,0.425062,0.490854,0.489590,0.513733,0.395002,A1267,Adapalene,412.52,C28H28O3,COC1=C(C=C(C=C1)C2=CC3=C(C=C2)C=C(C=C3)C(=O)O)...,http://www.apexbt.com/search.php?catalog=A1267,COc1ccc(-c2ccc3cc(C(=O)O)ccc3c2)cc1C12CC3CC(CC...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1960,Chondroitin sulfate,0.750820,0.741770,0.826674,0.835227,0.807444,0.802552,N2730,Chondroitin sulfate,489.45,(C14H21NO14S)n,CC(=O)NC1C(C(C(OC1O)CO)O)OC2C(C(C=C(O2)C(=O)O)O)O,http://www.apexbt.com/search.php?catalog=N2730,CC(=O)NC1C(O)OC(CO)C(O)C1OC1OC(C(=O)O)=CC(O)C1O
1961,Aucubin,0.714064,0.849337,0.563837,0.869225,0.700734,0.823960,N2758,Aucubin,346.33,C15H22O9,C1=COC(C2C1C(C=C2CO)O)OC3C(C(C(C(O3)CO)O)O)O,http://www.apexbt.com/search.php?catalog=N2758,OCC1=CC(O)C2C=COC(OC3OC(CO)C(O)C(O)C3O)C12
1962,Cepharanthine,0.592418,0.708048,0.883598,0.811905,0.849278,0.827013,N2771,Cepharanthine,606.71,C37H38N2O6,CN1CCC2=CC3=C(C4=C2C1CC5=CC=C(C=C5)OC6=C(C=CC(...,http://www.apexbt.com/search.php?catalog=N2771,COc1ccc2cc1Oc1ccc(cc1)CC1c3c(cc4c(c3Oc3cc5c(cc...
1963,4-Aminobutyric acid,0.705559,0.941525,0.717699,0.887134,0.716770,0.617353,P10002,4-Aminobutyric acid,103.12,C4H9NO2,C(CC(=O)O)CN,http://www.apexbt.com/search.php?catalog=P10002,NCCCC(=O)O


In [30]:
# Write the frame back to the same CSV path it was read from, overwriting that
# file in place. index=False keeps the row index out of the file.
df_train_set.to_csv('../../data/training_dataset_clean.csv', index=False)

## Collect the SMILES of the RNA-Seq drugs from PubChem and standardize them

In [14]:
# Read the drug names in the RNA-seq dataset.
# The DRUG_NAME column of this frame is what the PubChem lookup below iterates over.
df_drug_names = pd.read_csv('../../data/gene_expression_drug_names.csv')
# Display the frame.
df_drug_names

Unnamed: 0,DRUG_NAME
0,1-EBIO
1,1-HYDROXYANTHRAQUINONE
2,1-HYDROXYPHENAZINE
3,1-NAPHTHYLAMINE
4,1-NITRONAPHTHALENE
...,...
33600,zolpidem
33601,zonisamide
33602,zopiclone
33603,zosuquidar


In [5]:
# Extract the SMILES for the drugs in the rna-seq dataset from pubchem
import requests
import time

def get_smiles(drug_name):
    """Look up a compound by name on PubChem and return its SMILES string.

    Sends one PUG REST request for the ``CanonicalSMILES`` property of the
    compound named ``drug_name`` and reads the first property record of the
    JSON response.

    Args:
        drug_name: Compound name to search for. It is converted to ``str`` and
            percent-encoded before being placed in the URL path.

    Returns:
        The SMILES string from the first property record, or the string
        ``'NA'`` when the lookup fails: HTTP error status, response without
        the expected fields, network error/timeout, or a body that is not
        valid JSON. A message is printed for every failure.
    """
    from urllib.parse import quote

    # safe='' also escapes '/', so characters such as '/', '?', '#' or '%'
    # inside a drug name cannot change the structure of the request URL.
    encoded_name = quote(str(drug_name), safe='')
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{encoded_name}/property/CanonicalSMILES/JSON"
    try:
        # The timeout keeps one stalled connection from hanging a long batch.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raises a HTTPError if the status is 4xx, 5xx
        record = response.json()['PropertyTable']['Properties'][0]
        if 'CanonicalSMILES' in record:
            return record['CanonicalSMILES']
        # NOTE(review): newer PubChem responses reportedly label this property
        # 'ConnectivitySMILES'; accept it as a fallback. Confirm against the
        # current PUG REST documentation. A KeyError here still yields 'NA'.
        return record['ConnectivitySMILES']
    except (requests.exceptions.HTTPError, IndexError, KeyError):
        print(f"Could not find SMILES for {drug_name}")
        return 'NA'
    except (requests.exceptions.RequestException, ValueError) as err:
        # Connection errors, timeouts and non-JSON bodies: record the drug as
        # missing instead of aborting the whole batch, but say why so that
        # transient failures can be told apart from unknown names and retried.
        print(f"Request failed for {drug_name}: {err}")
        return 'NA'

In [6]:
# Row count of the drug-name table; only used to label the output file.
count = len(df_drug_names)

# Query PubChem for each drug name, remembering the row position of every
# name that was queried. Names beginning with 'BRD' are skipped without a
# request (presumably internal compound IDs that cannot be resolved by name;
# confirm with the data owner).
idx = []
smiles = []
for i, drug in enumerate(df_drug_names['DRUG_NAME']):
    if not drug.startswith('BRD'):
        idx.append(i)
        smiles.append(get_smiles(drug))

# Save the (row position, SMILES) pairs; failed lookups are stored as 'NA'.
smiles = pd.DataFrame({'Index': idx, 'SMILES': smiles})
smiles.to_csv(f'../../data/rna_seq_drugs_smiles_0_{count}.csv', index=False)

Could not find SMILES for 1B
Could not find SMILES for 6-nitrodopamine
Could not find SMILES for 614
Could not find SMILES for 620
Could not find SMILES for 642
Could not find SMILES for 646
Could not find SMILES for 650
Could not find SMILES for 7b-cis
Could not find SMILES for 943
Could not find SMILES for 949
Could not find SMILES for ACDPP
Could not find SMILES for AG-592
Could not find SMILES for ALW-II-38-3
Could not find SMILES for AMZ-30
Could not find SMILES for APEC
Could not find SMILES for ARC-239
Could not find SMILES for ARG-A1-2
Could not find SMILES for ARG-CSC-18
Could not find SMILES for ARG-CSC-22
Could not find SMILES for ARG-CSC-26
Could not find SMILES for ARG-CSC-81
Could not find SMILES for ARG-CSC-91
Could not find SMILES for ARG-TC-41
Could not find SMILES for ASN-05257430
Could not find SMILES for ASSINEX-D328
Could not find SMILES for AT-SUMO-1
Could not find SMILES for AT1-SHH-09
Could not find SMILES for AT1-SHH-29
Could not find SMILES for BAS-00535043
Co

In [6]:
# Reload the saved PubChem lookup results and keep only rows with a SMILES.
# Failed lookups were stored as the string 'NA', which read_csv parses as a
# missing value by default, so dropna discards them here.
# NOTE(review): the row count in the file name is hard-coded; it must match
# the name the lookup cell wrote (len(df_drug_names) at that time).
df_rna_seq_drug_smiles = (
    pd.read_csv('../../data/rna_seq_drugs_smiles_0_33605.csv')
    .dropna(subset=['SMILES'])
)
df_rna_seq_drug_smiles

Unnamed: 0,Index,SMILES
0,0,CCN1C2=CC=CC=C2NC1=O
1,1,C1=CC=C2C(=C1)C(=O)C3=C(C2=O)C(=CC=C3)O
2,2,C1=CC=C2C(=C1)N=C3C=CC=C(C3=N2)O
3,3,C1=CC=C2C(=C1)C=CC=C2N
4,4,C1=CC=C2C(=C1)C=CC=C2[N+](=O)[O-]
...,...,...
5403,33597,C1CCN(CC1)CC2=CC(=CC=C2)OCCCNC3=NC4=CC=CC=C4S3
5405,33599,CN(C)CCC1=CNC2=C1C=C(C=C2)CC3COC(=O)N3
5407,33601,C1=CC=C2C(=C1)C(=NO2)CS(=O)(=O)N
5408,33602,CN1CCN(CC1)C(=O)OC2C3=NC=CN=C3C(=O)N2C4=NC=C(C...


In [13]:
# Standardize the PubChem SMILES with RDKit, then keep one row per distinct
# standardized SMILES (the first occurrence wins).
# NOTE(review): standardize_smiles returns its input unchanged on failure and
# never None, so the dropna below likely removes nothing and unparsable SMILES
# stay in the table. Confirm whether they should be filtered out instead.
df_rna_seq_drug_smiles['standardized_SMILES'] = (
    df_rna_seq_drug_smiles['SMILES'].apply(standardize_smiles)
)
df_rna_seq_drug_smiles = (
    df_rna_seq_drug_smiles
    .dropna(subset=['standardized_SMILES'])
    .drop_duplicates(subset=['standardized_SMILES'])
)
df_rna_seq_drug_smiles

Unnamed: 0,Index,SMILES,standardized_SMILES
0,0,CCN1C2=CC=CC=C2NC1=O,CCn1c(=O)[nH]c2ccccc21
1,1,C1=CC=C2C(=C1)C(=O)C3=C(C2=O)C(=CC=C3)O,O=C1c2ccccc2C(=O)c2c(O)cccc21
2,2,C1=CC=C2C(=C1)N=C3C=CC=C(C3=N2)O,Oc1cccc2nc3ccccc3nc12
3,3,C1=CC=C2C(=C1)C=CC=C2N,Nc1cccc2ccccc12
4,4,C1=CC=C2C(=C1)C=CC=C2[N+](=O)[O-],O=[N+]([O-])c1cccc2ccccc12
...,...,...,...
5403,33597,C1CCN(CC1)CC2=CC(=CC=C2)OCCCNC3=NC4=CC=CC=C4S3,c1cc(CN2CCCCC2)cc(OCCCNc2nc3ccccc3s2)c1
5405,33599,CN(C)CCC1=CNC2=C1C=C(C=C2)CC3COC(=O)N3,CN(C)CCc1c[nH]c2ccc(CC3COC(=O)N3)cc12
5407,33601,C1=CC=C2C(=C1)C(=NO2)CS(=O)(=O)N,NS(=O)(=O)Cc1noc2ccccc12
5408,33602,CN1CCN(CC1)C(=O)OC2C3=NC=CN=C3C(=O)N2C4=NC=C(C...,CN1CCN(C(=O)OC2c3nccnc3C(=O)N2c2ccc(Cl)cn2)CC1


In [27]:
# Add drug names to the dataset.
# 'Index' holds the row position each drug had in df_drug_names when it was
# queried, so the names are looked up positionally with .iloc.
indices = df_rna_seq_drug_smiles['Index'].to_list()
# Assign as a plain list so values are placed in row order rather than being
# aligned on the (non-contiguous) index of df_rna_seq_drug_smiles.
df_rna_seq_drug_smiles['DRUG_NAME'] = df_drug_names['DRUG_NAME'].iloc[indices].to_list()
# Display the frame.
df_rna_seq_drug_smiles

Unnamed: 0,Index,SMILES,standardized_SMILES,DRUG_NAME
0,0,CCN1C2=CC=CC=C2NC1=O,CCn1c(=O)[nH]c2ccccc21,1-EBIO
1,1,C1=CC=C2C(=C1)C(=O)C3=C(C2=O)C(=CC=C3)O,O=C1c2ccccc2C(=O)c2c(O)cccc21,1-HYDROXYANTHRAQUINONE
2,2,C1=CC=C2C(=C1)N=C3C=CC=C(C3=N2)O,Oc1cccc2nc3ccccc3nc12,1-HYDROXYPHENAZINE
3,3,C1=CC=C2C(=C1)C=CC=C2N,Nc1cccc2ccccc12,1-NAPHTHYLAMINE
4,4,C1=CC=C2C(=C1)C=CC=C2[N+](=O)[O-],O=[N+]([O-])c1cccc2ccccc12,1-NITRONAPHTHALENE
...,...,...,...,...
5403,33597,C1CCN(CC1)CC2=CC(=CC=C2)OCCCNC3=NC4=CC=CC=C4S3,c1cc(CN2CCCCC2)cc(OCCCNc2nc3ccccc3s2)c1,zolantidine
5405,33599,CN(C)CCC1=CNC2=C1C=C(C=C2)CC3COC(=O)N3,CN(C)CCc1c[nH]c2ccc(CC3COC(=O)N3)cc12,zolmitriptan
5407,33601,C1=CC=C2C(=C1)C(=NO2)CS(=O)(=O)N,NS(=O)(=O)Cc1noc2ccccc12,zonisamide
5408,33602,CN1CCN(CC1)C(=O)OC2C3=NC=CN=C3C(=O)N2C4=NC=C(C...,CN1CCN(C(=O)OC2c3nccnc3C(=O)N2c2ccc(Cl)cn2)CC1,zopiclone


## Select the common standardized SMILES in both datasets

In [29]:
# Join the training set with the RNA-seq drugs on identical standardized
# SMILES. pd.merge defaults to an inner join, so only drugs present in both
# tables remain.
# NOTE(review): the rendered output shows standardized_SMILES_x/_y columns,
# which a join on standardized_SMILES would not produce; the output may be
# from an earlier version of this cell. Re-run to confirm.
df_merged = pd.merge(
    df_train_set,
    df_rna_seq_drug_smiles,
    on='standardized_SMILES',
)
df_merged

Unnamed: 0,DRUG NAME,MDA7+D1+1,MDA7+D1+2,MDA7+D1+3,MDA7+D2+1,MDA7+D2+2,MDA7+D2+3,CatalogNumber,Item Name,M.w.,Formula,SMILES,URL,standardized_SMILES_x,Index,standardized_SMILES_y,DRUG_NAME
0,Levetiracetam,0.424844,0.503705,0.433523,0.517083,0.498133,0.485443,A1198,Levetiracetam,170.21,C8H14N2O2,CCC(C(=O)N)N1CCCC1=O,http://www.apexbt.com/search.php?catalog=A1198,CCC(C(N)=O)N1CCCC1=O,32094,CCC(C(N)=O)N1CCCC1=O,levetiracetam
1,Adapalene,0.407362,0.425062,0.490854,0.489590,0.513733,0.395002,A1267,Adapalene,412.52,C28H28O3,COC1=C(C=C(C=C1)C2=CC3=C(C=C2)C=C(C=C3)C(=O)O)...,http://www.apexbt.com/search.php?catalog=A1267,COc1ccc(-c2ccc3cc(C(=O)O)ccc3c2)cc1C12CC3CC(CC...,30519,COc1ccc(-c2ccc3cc(C(=O)O)ccc3c2)cc1C12CC3CC(CC...,adapalene
2,Exemestane,0.488426,0.523998,0.525636,0.518446,0.528243,0.515495,A1296,Exemestane,296.4,C20H24O2,CC12CCC3C(C1CCC2=O)CC(=C)C4=CC(=O)C=CC34C,http://www.apexbt.com/search.php?catalog=A1296,C=C1CC2C(CCC3(C)C(=O)CCC23)C2(C)C=CC(=O)C=C12,31594,C=C1CC2C(CCC3(C)C(=O)CCC23)C2(C)C=CC(=O)C=C12,exemestane
3,Gatifloxacin,0.474342,0.493097,0.476658,0.447818,0.486796,0.518984,A1313,Gatifloxacin,375.39,C19H22FN3O4,CC1CN(CCN1)C2=C(C=C3C(=C2OC)N(C=C(C3=O)C(=O)O)...,http://www.apexbt.com/search.php?catalog=A1313,COc1c(N2CCNC(C)C2)c(F)cc2c(=O)c(C(=O)O)cn(C3CC...,31750,COc1c(N2CCNC(C)C2)c(F)cc2c(=O)c(C(=O)O)cn(C3CC...,gatifloxacin
4,Nelarabine,0.361652,0.485283,0.485534,0.483784,0.495411,0.507489,A1379,Nelarabine,297.27,C11H15N5O5,COC1=NC(=NC2=C1N=CN2C3C(C(C(O3)CO)O)O)N,http://www.apexbt.com/search.php?catalog=A1379,COc1nc(N)nc2c1ncn2C1OC(CO)C(O)C1O,32471,COc1nc(N)nc2c1ncn2C1OC(CO)C(O)C1O,nelarabine
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
341,Umbelliferone,0.577231,0.676448,0.829431,0.747115,0.750893,0.663639,N2282,Umbelliferone,162.14,C9H6O3,C1=CC(=CC2=C1C=CC(=O)O2)O,http://www.apexbt.com/search.php?catalog=N2282,O=c1ccc2ccc(O)cc2o1,33491,O=c1ccc2ccc(O)cc2o1,umbelliferone
342,Allantoin,0.523387,0.686583,0.769501,0.600037,0.617730,0.717783,N2301,Allantoin,158.12,C4H6O3N4,C1(C(=O)NC(=O)N1)NC(=O)N,http://www.apexbt.com/search.php?catalog=N2301,NC(=O)NC1NC(=O)NC1=O,30558,NC(=O)NC1NC(=O)NC1=O,allantoin
343,Diosimin,0.592127,0.677675,0.636568,0.864714,0.674697,0.775937,N2535,Diosimin,608.17,C28H32O15,CC1C(C(C(C(O1)OCC2C(C(C(C(O2)OC3=CC(=C4C(=C3)O...,http://www.apexbt.com/search.php?catalog=N2535,COc1ccc(-c2cc(=O)c3c(O)cc(OC4OC(COC5OC(C)C(O)C...,31372,COc1ccc(-c2cc(=O)c3c(O)cc(OC4OC(COC5OC(C)C(O)C...,diosmin
344,Nicotine,0.669430,0.910738,0.474867,0.610019,0.592478,0.578326,N2703,Nicotine,162.23,C10H14N2,CN1CCCC1C2=CN=CC=C2,http://www.apexbt.com/search.php?catalog=N2703,CN1CCCC1c1cccnc1,32491,CN1CCCC1c1cccnc1,nicotine


## Conclusion: Extracting and comparing SMILES is not effective; the drugs will have to be compared manually by name instead.