In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from MSI.load_msi_data import LoadData
import sqlite3
import ast
import pickle

## Process database (DCDB, C_DCDB, TWOSIDES)

### DCDB: Drug Combination Database

In [2]:
dcdb = pd.read_csv('database/dcdb/dcdb.txt', sep='\t')
dcdb_id = pd.read_csv('database/dcdb/dcdb_components_identifier.txt', sep='\t')
dc2dcu = pd.read_csv('database/dcdb/DC_TO_DCU.txt', sep='\t')
dcu_usage = pd.read_csv('database/dcdb/DC_USAGE.txt', sep='\t')

In [3]:
print(len(dcdb))
dcdb.head()

1363


Unnamed: 0,DrugCombination_ID,Components_Name,Componets_ID
0,DC000348,Bismuth Subsalicylate; Metronidazole; Tetracyc...,DCC0187/DCC0235/DCC0338
1,DC000349,Brimonidine; Timolol,DCC0072/DCC0106
2,DC000350,Betamethasone; Calcipotriol,DCC0095/DCC0358
3,DC000351,Betamethasone; Clotrimazole,DCC0033/DCC0095
4,DC000352,Cerulenin; Levodopa,DCC0274/DCC0326


In [4]:
# leave only "EFFICACIOUS" combinations
dc2dcu_dict = {}
for i in tqdm(range(len(dc2dcu))):
    dc = dc2dcu['DC_ID'][i]
    dcu = dc2dcu['DCU_ID'][i]
    if dc not in dc2dcu_dict:
        dc2dcu_dict[dc] = set([dcu])
    else:
        dc2dcu_dict[dc].add(dcu)

efficacious_dcu = set()
for i in tqdm(range(len(dcu_usage))):
    dcu = dcu_usage['DCU_ID'][i]
    if dcu_usage['EFFICACY'][i] == 'Efficacious':
        efficacious_dcu.add(dcu)

efficacious_dc = set()
for key in dc2dcu_dict.keys():
    if dc2dcu_dict[key].issubset(efficacious_dcu):
        efficacious_dc.add(key)

100%|██████████| 1793/1793 [00:00<00:00, 67960.63it/s]
100%|██████████| 1813/1813 [00:00<00:00, 99721.63it/s]


In [5]:
dcdb = dcdb.loc[dcdb['DrugCombination_ID'].isin(efficacious_dc)]
print(len(dcdb))

1037


In [6]:
dcdb_id.head()

Unnamed: 0,DCC_ID,Name,CAS_Number,BindingDB,ChEBI,DrugBank,KEGG Compound,KEGG Drug,PDB,PharmGKB,PubChem Compound,PubChem Substance
0,DCC1838,Methyclothiazide,CAS:135-07-9,,,DB00232,,D00656,,,,7847722.0
1,DCC0413,Garenoxacin,CAS:194804-75-6,,,,,D02540,,,124093.0,
2,DCC0520,Betaine,CAS:107-43-7,,,DB01494,,D07523,,,247.0,
3,DCC0639,Atrasentan,CAS:173937-91-2,,,,,D03009,,,17397165.0,
4,DCC0029,Lovastatin,CAS:75330-75-5,,40303.0,DB00227,C07074,D00359,803.0,PA450272,53232.0,46508223.0


In [7]:
dcdb_dict = dict(zip(dcdb_id['DCC_ID'], dcdb_id['DrugBank']))
bad_keys = []
for key, value in dcdb_dict.items():
    if type(value) != str:
        bad_keys.append(key)
    if type(value) == str and not value.startswith('DB'):
        bad_keys.append(key)
print(len(bad_keys))

for key in bad_keys:
    del dcdb_dict[key]

171


In [8]:
dcdb_lst = []
for x in tqdm(dcdb['Componets_ID']):
    dc_ids = x.split('/')
    db_ids = []
    for id_ in dc_ids:
        db_id = dcdb_dict.get(id_) # return None if not exist in the dictionary
        db_ids.append(db_id)
    if (np.nan not in db_ids) and (None not in db_ids):
        dcdb_lst.append(set(db_ids))
print(len(dcdb_lst))

100%|██████████| 1037/1037 [00:00<00:00, 174861.03it/s]

825





In [9]:
dcdb_lst_final = []
for x in dcdb_lst:
    if x not in dcdb_lst_final:
        dcdb_lst_final.append(x)
print(len(dcdb_lst_final))

825


In [10]:
# Leave drug combinations with only 2 drugs
dcdb_lst_dual = []
for x in dcdb_lst_final:
    if len(x) == 2:
        dcdb_lst_dual.append(list(x))
print(len(dcdb_lst_dual))

598


In [11]:
pd.DataFrame(dcdb_lst_dual, columns=['drug_1', 'drug_2']).to_csv('database/processed/DCDB_dual.tsv', sep='\t', index=None)

In [12]:
dcdb_dual = pd.read_csv('database/processed/DCDB_dual.tsv', sep='\t')
dcdb_dual.head()

Unnamed: 0,drug_1,drug_2
0,DB00373,DB00484
1,DB00443,DB02300
2,DB00257,DB00443
3,DB01235,DB01034
4,DB01222,DB00983


### C-DCDB: Continuous Drug Combination Database

"c_dcdb.sqlite" file can be downloaded from https://icc.ise.bgu.ac.il/medical_ai/CDCDB/

In [13]:
con = sqlite3.connect('database/c_dcdb/c_dcdb.sqlite')
cursor = con.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cursor.fetchall())

[('aact_combs',), ('aact_combs_with_identifiers',), ('all_combs_unormalized',), ('conditions',), ('design_group',), ('mesh_terms',), ('orangebook_combs',), ('patents_ipc',), ('patents_metadata',), ('references',), ('transformed_patents_drug',), ('trials',), ('web_preview',)]


In [14]:
df = pd.read_sql_query("SELECT * from all_combs_unormalized", con)
df.head()

Unnamed: 0,index,drugs,drugbank_identifiers,pubchem_identifiers,source_id,source
0,0,"[[""foscarnet"", ""Foscavir""], [""ganciclovir"", ""C...","[""DB00529"", ""DB01004""]","[""-1"", ""-1""]",NCT00000134,clinicaltrials.gov
1,1,"[[""ganciclovir""], [""foscarnet"", ""Foscavir""]]","[""DB01004"", ""DB00529""]","[""-1"", ""-1""]",NCT00000136,clinicaltrials.gov
2,2,"[[""ganciclovir""], [""foscarnet"", ""Foscavir""]]","[""DB01004"", ""DB00529""]","[""-1"", ""-1""]",NCT00000136,clinicaltrials.gov
3,3,"[[""cycloserine""], [""clozapine""]]","[""DB00260"", ""DB00363""]","[""CID6234"", ""CID2818""]",NCT00000372,clinicaltrials.gov
4,4,"[[""PTH protein, human"", ""teriparatide""], [""ale...","[""DB06285"", ""DB00630""]","[""CID16133850"", ""CID2088""]",NCT00000400,clinicaltrials.gov


In [15]:
c_dcdb_lst = []
for x in df['drugbank_identifiers']:
    x = ast.literal_eval(x)
    c_dcdb_lst.append(set(x))
print(len(c_dcdb_lst))

43865


In [16]:
c_dcdb_lst_final = []
for x in c_dcdb_lst:
    flag = 0
    for y in x:
        if not y.startswith('DB'):
            flag += 1
    if flag != 0:
        continue

    if x not in c_dcdb_lst_final:
        c_dcdb_lst_final.append(x)

print(len(c_dcdb_lst_final))

15336


In [17]:
# Leave drug combinations with only 2 drugs
c_dcdb_lst_dual = []
for x in c_dcdb_lst_final:
    if len(x) == 2:
        c_dcdb_lst_dual.append(list(x))
print(len(c_dcdb_lst_dual))

9092


In [18]:
pd.DataFrame(c_dcdb_lst_dual, columns=['drug_1', 'drug_2']).to_csv('database/processed/C_DCDB_dual.tsv', sep='\t', index=None)
c_dcdb_dual = pd.read_csv('database/processed/C_DCDB_dual.tsv', sep='\t')
c_dcdb_dual.head()

Unnamed: 0,drug_1,drug_2
0,DB01004,DB00529
1,DB00260,DB00363
2,DB06285,DB00630
3,DB01104,DB00704
4,DB00375,DB00227


### TWOSIDES

In [19]:
from tdc.multi_pred import DDI

In [20]:
data = DDI(name='TWOSIDES')
twosides = data.get_data()[['Drug1_ID', 'Drug2_ID']]
twosides.drop_duplicates(inplace=True)
print(len(twosides))
twosides.head()

Found local copy...
Loading...
Done!


63473


Unnamed: 0,Drug1_ID,Drug2_ID
0,CID000002173,CID000003345
56,CID000005206,CID000009433
68,CID000003929,CID000150610
92,CID000001302,CID000005064
226,CID000005267,CID000010631


In [21]:
db_table = pd.read_csv('database/drugbank_all_drug_links.csv')[['DrugBank ID', 'PubChem Compound ID']]
db_table.dropna(inplace=True)
db_table['PubChem Compound ID'] = db_table['PubChem Compound ID'].apply(lambda x: str(int(x)))
db_table.head()

Unnamed: 0,DrugBank ID,PubChem Compound ID
5,DB00006,16129704
13,DB00014,5311128
25,DB00027,45267103
47,DB00050,25074887
74,DB00080,16134395


In [22]:
cid2dbid = dict(zip(db_table['PubChem Compound ID'], db_table['DrugBank ID']))

In [23]:
def convert_cid2dbid(cid):
    cid = str(int(cid[3:]))
    if cid in cid2dbid.keys():
        return cid2dbid[cid]
    else:
        return np.nan

In [24]:
twosides['drug_1'] = twosides['Drug1_ID'].apply(convert_cid2dbid)
twosides['drug_2'] = twosides['Drug2_ID'].apply(convert_cid2dbid)
twosides = twosides[['drug_1', 'drug_2']]
twosides.dropna(inplace=True)
twosides.drop_duplicates(inplace=True)
twosides.reset_index(drop=True, inplace=True)

In [25]:
twosides

Unnamed: 0,drug_1,drug_2
0,DB01236,DB01223
1,DB00533,DB00897
2,DB00231,DB00726
3,DB01068,DB01045
4,DB01204,DB01101
...,...,...
19317,DB00996,DB00331
19318,DB01143,DB00958
19319,DB00321,DB00264
19320,DB00678,DB00203


In [26]:
twosides.to_csv('database/processed/TWOSIDES_dual.tsv', sep='\t', index=None)
twosides_dual = pd.read_csv('database/processed/TWOSIDES_dual.tsv', sep='\t')
twosides_dual.head()

Unnamed: 0,drug_1,drug_2
0,DB01236,DB01223
1,DB00533,DB00897
2,DB00231,DB00726
3,DB01068,DB01045
4,DB01204,DB01101


## Create labels (Drug Combination, Drug-Drug Interaction)

In [27]:
'''DataLoader'''
dataloader = LoadData()

# get_dict
drug_id2name, drug_name2id = dataloader.get_dict(type='drug')
# protein_id2name, protein_name2id = dataloader.get_dict(type='protein')
# indication_id2name, indication_name2id = dataloader.get_dict(type='indication')
# biof_id2name, biof_name2id = dataloader.get_dict(type='biological_function')

Check Common Drugs Between MSI and DCDB / C_DCDB / TWOSIDES_DDI

In [28]:
msi_drug_list = list(drug_id2name.keys())

dcdb_dual = pd.read_csv('database/processed/DCDB_dual.tsv', sep='\t')
c_dcdb_dual = pd.read_csv('database/processed/C_DCDB_dual.tsv', sep='\t')
twosides_ddi = pd.read_csv('database/processed/TWOSIDES_dual.tsv', sep='\t')

In [29]:
def leave_common_drugs(df):
    count = 0
    for row in df.iterrows():
        a, b = row[1][0], row[1][1]
        if a in msi_drug_list and b in msi_drug_list:
            count += 1
    print(f'Original number of rows: {len(df)}')
    print(f'Filtered number of rows: {count}')
    return df[df['drug_1'].isin(msi_drug_list) & df['drug_2'].isin(msi_drug_list)]

In [30]:
dcdb_dual = leave_common_drugs(dcdb_dual)
c_dcdb_dual = leave_common_drugs(c_dcdb_dual)
twosides_ddi = leave_common_drugs(twosides_ddi)

Original number of rows: 598
Filtered number of rows: 455
Original number of rows: 9092
Filtered number of rows: 4221
Original number of rows: 19322
Filtered number of rows: 16589


Save DCDB, C_DCDB

In [31]:
dcdb_dual.to_csv('data/labels/DCDB_msi.tsv', sep='\t', index=None)
c_dcdb_dual.to_csv('data/labels/C_DCDB_msi.tsv', sep='\t', index=None)

Combine DCDB, C_DCDB

In [32]:
dcdb_dual = pd.read_csv('data/labels/DCDB_msi.tsv', sep='\t')
c_dcdb_dual = pd.read_csv('data/labels/C_DCDB_msi.tsv', sep='\t')

dcdb_drug_set = []
for row in dcdb_dual.iterrows():
    dcdb_drug_set.append(set([row[1][0], row[1][1]]))
c_dcdb_drug_set = []
for row in c_dcdb_dual.iterrows():
    c_dcdb_drug_set.append(set([row[1][0], row[1][1]]))

not_common_idx = []
for i in range(len(dcdb_drug_set)):
    if dcdb_drug_set[i] not in c_dcdb_drug_set:
        not_common_idx.append(i)

dc = pd.concat([c_dcdb_dual, dcdb_dual.iloc[not_common_idx]], axis=0)
dc.reset_index(drop=True, inplace=True)

dc.to_csv('data/labels/DC_combined_msi.tsv', sep='\t', index=None)

Leave unique Drug-Drug Interaction pair (DDI - DC)

In [33]:
dc = pd.read_csv('data/labels/DC_combined_msi.tsv', sep='\t')

twosides_ddi = leave_common_drugs(twosides_ddi)

Original number of rows: 16589
Filtered number of rows: 16589


In [34]:
def leave_unique_pair(dc, ddi):
    dc_set_list = []
    for row in dc.iterrows():
        dc_set_list.append(set([row[1][0], row[1][1]]))
    ddi_set_list = []
    for row in ddi.iterrows():
        ddi_set_list.append(set([row[1][0], row[1][1]]))
    not_common_idx = []
    for i in range(len(ddi_set_list)):
        if ddi_set_list[i] not in dc_set_list:
            not_common_idx.append(i)
    temp = ddi.iloc[not_common_idx]
    return temp.reset_index(drop=True)

In [35]:
twosides_ddi = leave_unique_pair(dc, twosides_ddi)
print(f"Number of TWOSIDES pairs: {len(twosides_ddi)}")

Number of TWOSIDES pairs: 16157


In [36]:
twosides_ddi.to_csv('data/labels/TWOSIDES_msi.tsv', sep='\t', index=None)

## DB ID to SMILES dictionary

In [3]:
db2smiles_df = pd.read_csv('database/structure_external_links_all.csv', sep=',')

In [4]:
db2smiles_df

Unnamed: 0,DrugBank ID,Name,CAS Number,Drug Groups,InChIKey,InChI,SMILES,Formula,KEGG Compound ID,KEGG Drug ID,PubChem Compound ID,PubChem Substance ID,ChEBI ID,ChEMBL ID,HET ID,ChemSpider ID,BindingDB ID
0,DB00006,Bivalirudin,128270-60-0,approved; investigational,OIRCOABEOLEUMC-GEJPAHFPSA-N,InChI=1S/C98H138N24O33/c1-5-52(4)82(96(153)122...,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,C98H138N24O33,,D03136,16129704.0,46507415.0,59173.0,CHEMBL2103749,,10482069.0,50248103.0
1,DB00007,Leuprolide,53714-56-0,approved; investigational,GFIJNRVAKGFPGQ-LIJARHBVSA-N,InChI=1S/C59H84N16O12/c1-6-63-57(86)48-14-10-2...,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,C59H84N16O12,C07612,D08113,,46507635.0,6427.0,CHEMBL1201199,,571356.0,50369395.0
2,DB00014,Goserelin,65807-02-5,approved,BLCLNMBMMGCOAS-URPVMXJPSA-N,InChI=1S/C59H84N18O14/c1-31(2)22-40(49(82)68-3...,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,C59H84N18O14,,D00573,5311128.0,46507336.0,5523.0,CHEMBL1201247,,4470656.0,
3,DB00027,Gramicidin D,1405-97-6,approved,NDAYQJDHGXTBJL-MWWSRJDJSA-N,InChI=1S/C96H135N19O16/c1-50(2)36-71(105-79(11...,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...,C96H135N19O16,,D04369,45267103.0,46507412.0,,CHEMBL557217,,24623445.0,
4,DB00035,Desmopressin,16679-58-6,approved,NFLWUMRGJYTJIN-PNIOQBSNSA-N,InChI=1S/C46H64N14O12S2/c47-35(62)15-14-29-40(...,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...,C46H64N14O12S2,C06944,D00291,,,4450.0,CHEMBL1429,,4470602.0,50205308.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11400,DB15671,Besifovir,441785-25-7,investigational,KDNSSKPZBDNJDF-UHFFFAOYSA-N,InChI=1S/C10H14N5O4P/c11-9-12-3-7-8(14-9)15(5-...,NC1=NC=C2N=CN(CC3(CC3)OCP(O)(O)=O)C2=N1,C10H14N5O4P,,,,,,CHEMBL329091,,4435660.0,
11401,DB15672,Rilematovir,1383450-81-4,investigational,GTQTUABHRCWVLL-UHFFFAOYSA-N,"InChI=1S/C21H20ClF3N4O3S/c1-33(31,32)8-2-7-27-...",CS(=O)(=O)CCCN1C(CN2C(=O)N(CC(F)(F)F)C3=C2C=NC...,C21H20ClF3N4O3S,,,,,,,6YA,58810205.0,
11402,DB15673,Lenacapavir,2189684-44-2,investigational,BRYXUCLEHAUSDY-WEWMWRJBSA-N,"InChI=1S/C39H32ClF10N7O5S2/c1-36(2,63(3,59)60)...",[H][C@]12C[C@@]1([H])C(F)(F)C1=C2C(=NN1CC(=O)N...,C39H32ClF10N7O5S2,,,,,,,,81367881.0,
11403,DB15674,Sisunatovir,1903763-82-5,investigational,JOPCJJSYRPUEDS-UHFFFAOYSA-N,InChI=1S/C23H22F4N4O/c24-15-3-4-16-19(11-15)31...,NCC1=CC2=C(C=C1)N(CCCC(F)(F)F)C(CN1C(=O)C3(CC3...,C23H22F4N4O,,,,,,,,64835198.0,


In [5]:
db2smiles_df = db2smiles_df[['DrugBank ID', 'SMILES']]

In [6]:
db2smiles_dict = dict(zip(db2smiles_df['DrugBank ID'], db2smiles_df['SMILES']))

In [8]:
with open('database/processed/db2smiles_dict.pickle', 'wb') as f:
    pickle.dump(db2smiles_dict, f)

## Create label tables with SMILES

In [9]:
dc = pd.read_csv('data/labels/DC_combined_msi.tsv', sep='\t')
ddi = pd.read_csv('data/labels/TWOSIDES_msi.tsv', sep='\t')

In [10]:
dc_drug_1 = dc['drug_1'].tolist()
dc_drug_2 = dc['drug_2'].tolist()
dc_drug_1_smiles = [db2smiles_dict[drug] if drug in db2smiles_dict.keys() else None for drug in dc_drug_1]
dc_drug_2_smiles = [db2smiles_dict[drug] if drug in db2smiles_dict.keys() else None for drug in dc_drug_2]

dc['drug_1_smiles'] = dc_drug_1_smiles
dc['drug_2_smiles'] = dc_drug_2_smiles

dc.dropna(inplace=True)

In [11]:
dc

Unnamed: 0,drug_1,drug_2,drug_1_smiles,drug_2_smiles
1,DB01104,DB00704,CN[C@H]1CC[C@@H](C2=CC(Cl)=C(Cl)C=C2)C2=CC=CC=C12,[H][C@@]12OC3=C(O)C=CC4=C3[C@@]11CCN(CC3CC3)[C...
2,DB00396,DB00783,[H][C@@]12CC[C@H](C(C)=O)[C@@]1(C)CC[C@@]1([H]...,[H][C@@]12CC[C@H](O)[C@@]1(C)CC[C@]1([H])C3=C(...
3,DB00531,DB00091,ClCCN(CCCl)P1(=O)NCCCO1,CC[C@@H]1NC(=O)[C@H]([C@H](O)[C@H](C)C\C=C\C)N...
4,DB00531,DB01073,ClCCN(CCCl)P1(=O)NCCCO1,NC1=NC(F)=NC2=C1N=CN2[C@@H]1O[C@H](CO)[C@@H](O...
5,DB00091,DB00563,CC[C@@H]1NC(=O)[C@H]([C@H](O)[C@H](C)C\C=C\C)N...,CN(CC1=CN=C2N=C(N)N=C(N)C2=N1)C1=CC=C(C=C1)C(=...
...,...,...,...,...
4338,DB00894,DB00783,[H][C@@]12CCC3=CC(=O)C=C[C@]3(C)[C@@]1([H])CC[...,[H][C@@]12CC[C@H](O)[C@@]1(C)CC[C@]1([H])C3=C(...
4339,DB00956,DB00397,[H][C@@]12OC3=C(OC)C=CC4=C3[C@@]11CCN(C)[C@]([...,[H][C@@](C)(N)[C@]([H])(O)C1=CC=CC=C1
4340,DB00852,DB00956,CN[C@@H](C)[C@@H](O)C1=CC=CC=C1,[H][C@@]12OC3=C(OC)C=CC4=C3[C@@]11CCN(C)[C@]([...
4341,DB01050,DB00388,CC(C)CC1=CC=C(C=C1)C(C)C(O)=O,CNC[C@H](O)C1=CC(O)=CC=C1


In [12]:
dc.drop(dc[dc['drug_1_smiles'] == '[H][N]([H])([H])[Pt](Cl)(Cl)[N]([H])([H])[H]'].index, inplace=True)
dc.drop(dc[dc['drug_2_smiles'] == '[H][N]([H])([H])[Pt](Cl)(Cl)[N]([H])([H])[H]'].index, inplace=True)

In [13]:
dc.to_csv('data/labels/DC_combined_msi_small.tsv', sep='\t', index=None)

In [14]:
ddi_drug_1 = ddi['drug_1'].tolist()
ddi_drug_2 = ddi['drug_2'].tolist()
ddi_drug_1_smiles = [db2smiles_dict[drug] if drug in db2smiles_dict.keys() else None for drug in ddi_drug_1]
ddi_drug_2_smiles = [db2smiles_dict[drug] if drug in db2smiles_dict.keys() else None for drug in ddi_drug_2]

ddi['drug_1_smiles'] = ddi_drug_1_smiles
ddi['drug_2_smiles'] = ddi_drug_2_smiles

ddi.dropna(inplace=True)

In [15]:
ddi # no change

Unnamed: 0,drug_1,drug_2,drug_1_smiles,drug_2_smiles
0,DB00533,DB00897,CS(=O)(=O)C1=CC=C(C=C1)C1=C(C(=O)OC1)C1=CC=CC=C1,CC1=NN=C2CN=C(C3=CC=CC=C3Cl)C3=C(C=CC(Cl)=C3)N12
1,DB00231,DB00726,CN1C2=C(C=C(Cl)C=C2)C(=NC(O)C1=O)C1=CC=CC=C1,CC(CN(C)C)CN1C2=CC=CC=C2CCC2=CC=CC=C12
2,DB01068,DB01045,[O-][N+](=O)C1=CC2=C(NC(=O)CN=C2C2=CC=CC=C2Cl)...,CO[C@H]1\C=C\O[C@@]2(C)OC3=C(C2=O)C2=C(O)C(\C=...
3,DB01204,DB01101,OCCNCCNC1=CC=C(NCCNCCO)C2=C1C(=O)C1=C(C(O)=CC=...,CCCCCOC(=O)NC1=NC(=O)N(C=C1F)[C@@H]1O[C@H](C)[...
4,DB00423,DB00433,COC1=C(OCC(O)COC(N)=O)C=CC=C1,CN1CCN(CCCN2C3=CC=CC=C3SC3=C2C=C(Cl)C=C3)CC1
...,...,...,...,...
16152,DB00575,DB00363,ClC1=CC=CC(Cl)=C1NC1=NCCN1,CN1CCN(CC1)C1=NC2=CC(Cl)=CC=C2NC2=CC=CC=C12
16153,DB00341,DB00973,OC(=O)COCCN1CCN(CC1)C(C1=CC=CC=C1)C1=CC=C(Cl)C=C1,[H][C@]1(CC[C@H](O)C2=CC=C(F)C=C2)C(=O)N(C2=CC...
16154,DB00996,DB00331,NCC1(CC(O)=O)CCCCC1,CN(C)C(=N)NC(N)=N
16155,DB00321,DB00264,CN(C)CCC=C1C2=CC=CC=C2CCC2=CC=CC=C12,COCCC1=CC=C(OCC(O)CNC(C)C)C=C1


In [16]:
ddi.to_csv('data/labels/TWOSIDES_msi_small.tsv', sep='\t', index=None)

## Pickle chemical features

In [2]:
from rdkit import Chem
from rdkit.Chem import AllChem, MACCSkeys
from mordred import Calculator, descriptors

In [3]:
with open('database/processed/db2smiles_dict.pickle', 'rb') as f:
    db2smiles_dict = pickle.load(f)

In [4]:
dc = pd.read_csv('data/labels/DC_combined_msi_small.tsv', sep='\t')
ddi = pd.read_csv('data/labels/TWOSIDES_msi_small.tsv', sep='\t')

total_drugs = dc['drug_1'].tolist() + dc['drug_2'].tolist() + ddi['drug_1'].tolist() + ddi['drug_2'].tolist()

In [5]:
total_drugs = list(set(total_drugs))

In [6]:
len(total_drugs)

973

In [16]:
# Fingerprint - ECFP4
ecfp_dict = {}
for i, db_id in enumerate(list(db2smiles_dict.keys())): ###
    smiles = db2smiles_dict[db_id]
    try:
        mol = Chem.MolFromSmiles(smiles)
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=128)
        ecfp_dict[db_id] = np.array(fp)
    except:
        print(i)

[15:28:28] Explicit valence for atom # 2 O, 3, is greater than permitted


121
182
270
301
407
408
418


[15:28:29] Explicit valence for atom # 0 N, 4, is greater than permitted
[15:28:29] Explicit valence for atom # 0 N, 4, is greater than permitted


593
598
674
729
782
784
819
847
854
883
936


[15:28:29] Explicit valence for atom # 0 N, 4, is greater than permitted


995
1111
1150
1176
1192
1194
1234
1261
1277
1333


[15:28:30] Explicit valence for atom # 13 Cl, 5, is greater than permitted
[15:28:31] SMILES Parse Error: syntax error while parsing: OS(O)(O)C1=CC=C(C=C1)C-1=C2\C=CC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC=C(C=C1)S(O)(O)O)C1=CC=C(C=C1)S([O-])([O-])[O-])\C1=CC=C(C=C1)S(O)(O)[O-]
[15:28:31] SMILES Parse Error: Failed parsing SMILES 'OS(O)(O)C1=CC=C(C=C1)C-1=C2\C=CC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC=C(C=C1)S(O)(O)O)C1=CC=C(C=C1)S([O-])([O-])[O-])\C1=CC=C(C=C1)S(O)(O)[O-]' for input: 'OS(O)(O)C1=CC=C(C=C1)C-1=C2\C=CC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC=C(C=C1)S(O)(O)O)C1=CC=C(C=C1)S([O-])([O-])[O-])\C1=CC=C(C=C1)S(O)(O)[O-]'


1718
1787


[15:28:31] Explicit valence for atom # 14 N, 5, is greater than permitted


1965


[15:28:31] Explicit valence for atom # 19 O, 3, is greater than permitted


2410
3073
3128
3181


[15:28:32] Explicit valence for atom # 2 O, 3, is greater than permitted
[15:28:32] Explicit valence for atom # 6 N, 4, is greater than permitted
[15:28:32] Explicit valence for atom # 11 N, 4, is greater than permitted
[15:28:33] Explicit valence for atom # 0 O, 3, is greater than permitted


3590
3747


[15:28:33] Explicit valence for atom # 6 Be, 4, is greater than permitted
[15:28:33] Explicit valence for atom # 3 N, 4, is greater than permitted


3801
3873


[15:28:33] Explicit valence for atom # 4 F, 2, is greater than permitted


4064
4453
4498
4515
4516
4523
4525
4534
4539
4546
4547
4550
4551
4552
4554
4556
4557
4559
4562
4564
4567
4570
4572
4573
4575
4576
4577
4578
4581
4584
4585
4586
4588
4591
4592
4593
4594
4596
4597
4598
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4612
4613
4614
4615
4616
4617
4618
4621
4622
4623
4624
4626
4627
4634
4636
4637
4638
4639
4641
4643
4644
4646
4647
4648
4649
4652
4653
4654
4655
4657
4659
4660
4662
4663
4664
4665
4666
4667
4671
4672
4673
4674
4675
4676
4677
4678
4679
4681
4682
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4699
4700
4701
4702
4703
4704
4705
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4718
4719
4723
4724
4725
4726
4727
4729
4731
4733
4735
4736
4737
4740
4747
4750
4753
4755
4758
4759
4760
4766
4767
4768
4769
4770
4771
4772
4773
4774
4776
4777
4778
4780
4781
4785
4786
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4803
4804
4805
4807
4808
4809
4810
4815
4816
4817
4824
4826
4827
4828
4836
4838
4839
4841
4843
4846
4848
4850
4851
4853
4855


[15:28:36] Explicit valence for atom # 13 Be, 3, is greater than permitted


6967
7474
7505
7508
7512
7553
7558
7568
7643
7711
7713
7757
7845
7848
7849
7850
7851
7853
7855
7876
7889
7893
7896
7901
7902
7915
7916
7919
7922
7933
7936
7940
7942
7947
7950
7952
7968
7994
7995
7997
8005
8007
8008
8015
8016
8020
8021
8022
8023
8024
8026
8028
8033
8034
8035
8039
8046
8052
8054
8057
8058
8060
8062
8064
8067
8073
8075
8083
8085
8087
8090
8096
8099
8102
8107
8108
8116
8117
8122
8127
8129
8133
8134
8141
8142
8144
8145
8146
8148
8150
8152
8153
8154
8156
8157
8162
8163
8164


[15:28:38] Explicit valence for atom # 84 N, 4, is greater than permitted


8171
8175
8179
8180
8185
8194
8197
8198
8201
8203
8204
8205
8206
8208
8211
8217
8220
8225
8233
8234
8235
8236
8237
8239
8240
8339
8393
8394
8402
8410
8412
8428
8431
8432


[15:28:38] SMILES Parse Error: syntax error while parsing: OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1
[15:28:38] SMILES Parse Error: Failed parsing SMILES 'OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1' for input: 'OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1'


8495
8532
8726
8770
8809
8918
8966
8976
9023
9036


[15:28:40] Explicit valence for atom # 1 Cl, 4, is greater than permitted


9124
9155
9229
9234
9371
9373
9438
9497
9565
9614
9646
9687
9699


[15:28:41] Explicit valence for atom # 0 N, 4, is greater than permitted


9713
9717
9795
9815
9817
9820
9821
9823
9880
9886
9893
9894
9905
9963
9964
9993
9996
10008
10018
10039
10073
10082
10143
10156
10167
10197
10209
10213
10243
10244
10251
10317
10319
10386
10428
10440
10460
10461
10489
10527
10531
10539
10540
10551
10556
10557
10570
10571
10573
10581
10583
10585
10586
10589
10597
10606
10608
10635
10636
10642
10643
10662
10668
10669
10713
10723
10739


[15:28:43] Explicit valence for atom # 5 K, 2, is greater than permitted


10807
10823
10824
10829
10841
10854
10877
10891
10893
10949
10971
10972
10976
11003
11011
11024
11075
11077
11119
11133
11191
11196
11237
11256
11294
11295
11332
11333
11335
11358
11363
11376
11384


In [17]:
len(ecfp_dict)

10737

In [18]:
with open('data/embedding/ecfp_dict.pickle', 'wb') as f:
    pickle.dump(ecfp_dict, f)

In [21]:
# Fingerprints - MACCS
maccs_dict = {}
for i, db_id in enumerate(list(db2smiles_dict.keys())):
    smiles = db2smiles_dict[db_id]
    try:
        mol = Chem.MolFromSmiles(smiles)
        maccs = MACCSkeys.GenMACCSKeys(mol)
        maccs_dict[db_id] = np.array(maccs)
    except:
        print(smiles)

[15:29:12] Explicit valence for atom # 2 O, 3, is greater than permitted


O=C1[O-][Gd+3]234567[O]=C(C[N]2(CC[N]3(CC([O-]4)=O)CC[N]5(CC(=[O]6)NC)CC(=O)[O-]7)C1)NC
nan
nan
nan


[15:29:14] Explicit valence for atom # 0 N, 4, is greater than permitted
[15:29:14] Explicit valence for atom # 0 N, 4, is greater than permitted


[H][N]([H])([H])[Pt](Cl)(Cl)[N]([H])([H])[H]
nan
[H][N]1([H])[C@@H]2CCCC[C@H]2[N]([H])([H])[Pt]11OC(=O)C(=O)O1
nan
nan
nan
nan
nan
nan
nan


[15:29:17] Explicit valence for atom # 0 N, 4, is greater than permitted


[H][N]([H])([H])[Pt]1(OC(=O)C2(CCC2)C(=O)O1)[N]([H])([H])[H]
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


[15:29:20] Explicit valence for atom # 13 Cl, 5, is greater than permitted


NC1=C(C2=C(N)N=C(N)N=C2C=C1)[Cl](=O)=O


[15:29:21] SMILES Parse Error: syntax error while parsing: OS(O)(O)C1=CC=C(C=C1)C-1=C2\C=CC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC=C(C=C1)S(O)(O)O)C1=CC=C(C=C1)S([O-])([O-])[O-])\C1=CC=C(C=C1)S(O)(O)[O-]
[15:29:21] SMILES Parse Error: Failed parsing SMILES 'OS(O)(O)C1=CC=C(C=C1)C-1=C2\C=CC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC=C(C=C1)S(O)(O)O)C1=CC=C(C=C1)S([O-])([O-])[O-])\C1=CC=C(C=C1)S(O)(O)[O-]' for input: 'OS(O)(O)C1=CC=C(C=C1)C-1=C2\C=CC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC=C(C=C1)S(O)(O)O)C1=CC=C(C=C1)S([O-])([O-])[O-])\C1=CC=C(C=C1)S(O)(O)[O-]'


OS(O)(O)C1=CC=C(C=C1)C-1=C2\C=CC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC=C(C=C1)S(O)(O)O)C1=CC=C(C=C1)S([O-])([O-])[O-])\C1=CC=C(C=C1)S(O)(O)[O-]


[15:29:21] Explicit valence for atom # 14 N, 5, is greater than permitted


CCC1=C2C=C3N4C(=CC5=[N+]6C(=CC7=[N]8(C)C(=CC(N2[Cu-]468)=C1C)C(CC)=C7C)C(C)=C5CCC(O)=O)C(CCC(O)=O)=C3C


[15:29:23] Explicit valence for atom # 19 O, 3, is greater than permitted


COC1=CC=C2N(C=NC2=C1)[C@H]1O[C@H](CO)[C@@H](O[P@]([OH-])(=O)O[C@H](C)CNC(=O)CC[C@]2(C)[C@@H](CC(N)=O)C3=[N+]4C2=C(C)C2=[N+]5C(=CC6=[N+]7C(=C(C)C8=[N+]([C@]3(C)[C@@](C)(CC(N)=O)[C@@H]8CCC(N)=O)[Co]457)[C@@](C)(CC(N)=O)[C@@H]6CCC(N)=O)C(C)(C)[C@@H]2CCC(N)=O)[C@H]1O
nan
CC1=[O][Ga]2345ON1CCC[C@H]1NC(=O)CNC(=O)[C@H](CO)NC(=O)CNC(=O)[C@H](CCCN(O2)C(C)=[O]3)NC(=O)C(CCCN(O4)C(C)=[O]5)NC1=O


[15:29:25] Explicit valence for atom # 2 O, 3, is greater than permitted
[15:29:25] Explicit valence for atom # 6 N, 4, is greater than permitted


C1=CN(C=N1)[Os++]123[N]4=CC=CC=C4C4=[N]1C=CC=C4.C1=CC(C4=CC=CC=[N]24)=[N]3C=C1


[15:29:27] Explicit valence for atom # 11 N, 4, is greater than permitted


CC1=C(CCC(O)=O)C2=CC3=[N]4C(=CC5=C(C)C(C=C)=C6C=C7C(C)=C(C=C)C8=[N]7[Zn]4(N2C1=C8)N56)C(C)=C3CCC(O)=O


[15:29:27] Explicit valence for atom # 0 O, 3, is greater than permitted
[15:29:27] Explicit valence for atom # 6 Be, 4, is greater than permitted


[O]#C[Re+]1(C#[O])(C#[O])[N]2=CC=CC3=C2C2=C(C=CC=[N]12)C=C3
N[C@@H](C[C@H](O)O[Be](F)(F)F)C(O)=O


[15:29:28] Explicit valence for atom # 3 N, 4, is greater than permitted


[OH2+][Cu-4]([OH2+])([N]1=CNC=C1)([N+]1=CNC=C1)([N+]1=CNC=C1)[N+]1=CNC=C1


[15:29:28] Explicit valence for atom # 4 F, 2, is greater than permitted


F[Al](F)(F)[F-]
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


[15:29:39] Explicit valence for atom # 13 Be, 3, is greater than permitted


CN(CCO[P@](O)(=O)O[P@@](O)(=O)O[Be-](F)(F)F)C1=CC=CC=C1[N+]([O-])=O
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
[57Co+3].[C-]#N.C[C@H](CNC(=O)CC[C@]1(C)[C@@H](CC(N)=O)C2[N-]\C1=C(C)/C1=N/C(=C\C3=N\C(=C(C)/C4=N[C@]2(C)[C@@](C)(CC(N)=O)[C@@H]4CCC(N)=O)\[C@@](C)(CC(N)=O)[C@@H]3CCC(N)=O)/C(C)(C)[C@@H]1CCC(N)=O)OP([O-])(=O)O[C@@H]1[C@@H](CO)O[C@@H]([C@@H]1O)N1C=[NH]C2=C1C=C(C)C(C)=C2
nan
nan


[15:29:43] Explicit valence for atom # 84 N, 4, is greater than permitted


nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1
nan
nan


[15:29:45] SMILES Parse Error: syntax error while parsing: OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1
[15:29:45] SMILES Parse Error: Failed parsing SMILES 'OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1' for input: 'OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1'


nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


[15:29:48] Explicit valence for atom # 1 Cl, 4, is greater than permitted


O=[Cl]=O
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
[H][N]([H])([H])[Pt]1(OCC(=O)O1)[N]([H])([H])[H]
nan
nan


[15:29:51] Explicit valence for atom # 0 N, 4, is greater than permitted


nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
[OH-].[OH-].[OH-].[Mg++].[Cl-].[K++].[K++].[Ca++].[O-]C([O-])=O
nan
nan


[15:29:55] Explicit valence for atom # 5 K, 2, is greater than permitted


nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


In [22]:
len(maccs_dict)

10737

In [23]:
with open('data/embedding/maccs_dict.pickle', 'wb') as f:
    pickle.dump(maccs_dict, f)

In [7]:
# Descriptor - Mordred
calc = Calculator(descriptors, ignore_3D=True)
mols = [Chem.MolFromSmiles(db2smiles_dict[db_id]) for db_id in total_drugs]
mordred_df = calc.pandas(mols)

  3%|▎         | 30/973 [00:12<03:48,  4.14it/s]  

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  9%|▊         | 84/973 [00:25<04:26,  3.33it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 14%|█▍        | 135/973 [00:37<05:17,  2.64it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 19%|█▊        | 181/973 [00:38<00:51, 15.42it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 20%|██        | 197/973 [00:38<00:32, 24.19it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 22%|██▏       | 214/973 [00:38<00:21, 35.85it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 26%|██▌       | 253/973 [00:56<02:56,  4.08it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 33%|███▎      | 319/973 [01:06<02:02,  5.34it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 39%|███▊      | 376/973 [02:39<15:11,  1.53s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 42%|████▏     | 409/973 [02:39<05:39,  1.66it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 44%|████▍     | 430/973 [02:39<02:48,  3.23it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 49%|████▉     | 475/973 [03:10<06:24,  1.29it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 52%|█████▏    | 509/973 [03:10<01:52,  4.14it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 57%|█████▋    | 556/973 [03:11<00:15, 26.62it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 82%|████████▏ | 801/973 [04:54<02:04,  1.38it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 973/973 [05:57<00:00,  2.72it/s]


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


In [8]:
mordred_df

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,28.388090,23.006121,0,0,43.455183,2.763283,5.446877,43.455183,1.241577,4.527779,...,11.200800,90.052612,494.211610,7.376293,3112,82,212.0,274.0,14.8125,7.055556
1,16.591625,12.700815,0,1,28.036837,2.259061,4.502814,28.036837,1.274402,4.009685,...,9.495294,73.078088,307.214744,6.023819,1427,24,104.0,113.0,7.055556,5.000000
2,9.549067,8.934557,0,1,15.265782,2.302776,4.605551,15.265782,1.174291,3.450865,...,9.149209,43.553997,179.131014,5.971034,256,16,60.0,66.0,5.694444,2.972222
3,24.192311,18.309250,1,1,40.283742,2.522087,5.044174,40.283742,1.299476,4.371883,...,10.619105,67.514774,420.221306,7.003688,2806,59,168.0,205.0,10.152778,6.861111
4,10.675551,9.345840,0,1,17.996296,2.395293,4.66565,17.996296,1.28545,3.567232,...,9.356257,59.342918,211.114319,6.810139,318,17,70.0,80.0,4.444444,3.194444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
968,8.134854,7.770338,1,0,12.675204,2.302776,4.605551,12.675204,1.152291,3.294669,...,9.094144,41.023148,153.042593,8.502366,150,14,52.0,58.0,5.194444,2.472222
969,7.806684,7.343579,0,0,13.098358,2.369838,4.63395,13.098358,1.309836,3.261311,...,9.161465,53.745115,136.038511,9.717036,105,12,52.0,61.0,2.833333,2.222222
970,21.408122,16.810648,0,0,35.997462,2.451538,4.903076,35.997462,1.285624,4.250032,...,10.257344,63.351289,408.133966,7.700641,2174,47,144.0,170.0,10.111111,6.333333
971,30.873016,25.672379,0,1,49.129425,2.644217,5.096767,49.129425,1.259729,4.610203,...,10.820118,94.115817,545.262482,6.990545,4573,72,218.0,265.0,13.798611,8.347222


In [9]:
# change data type to float32
mordred_df = mordred_df.astype(np.float32)

In [10]:
mordred_df.dropna(axis='columns', inplace=True)

In [11]:
mordred_df.dropna(axis='rows', inplace=True)

In [12]:
mordred_df

Unnamed: 0,ABC,ABCGG,nAcid,nBase,nAromAtom,nAromBond,nAtom,nHeavyAtom,nSpiro,nBridgehead,...,SRW09,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb2
0,28.388090,23.006121,0.0,0.0,0.0,0.0,67.0,35.0,0.0,0.0,...,8.032360,11.200800,90.052612,494.211609,7.376293,3.112000e+03,82.0,212.0,274.0,7.055555
1,16.591625,12.700815,0.0,1.0,6.0,6.0,51.0,22.0,0.0,0.0,...,7.085901,9.495294,73.078087,307.214752,6.023818,1.427000e+03,24.0,104.0,113.0,5.000000
2,9.549067,8.934557,0.0,1.0,6.0,6.0,30.0,13.0,0.0,0.0,...,0.000000,9.149209,43.553997,179.131012,5.971034,2.560000e+02,16.0,60.0,66.0,2.972222
3,24.192312,18.309250,1.0,1.0,12.0,12.0,60.0,31.0,0.0,0.0,...,0.000000,10.619105,67.514778,420.221313,7.003688,2.806000e+03,59.0,168.0,205.0,6.861111
4,10.675551,9.345840,0.0,1.0,5.0,5.0,31.0,14.0,0.0,0.0,...,6.803505,9.356257,59.342918,211.114319,6.810139,3.180000e+02,17.0,70.0,80.0,3.194444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
968,8.134853,7.770338,1.0,0.0,6.0,6.0,18.0,11.0,0.0,0.0,...,0.000000,9.094144,41.023148,153.042587,8.502366,1.500000e+02,14.0,52.0,58.0,2.472222
969,7.806684,7.343579,0.0,0.0,9.0,10.0,14.0,10.0,0.0,0.0,...,6.605298,9.161466,53.745113,136.038513,9.717036,1.050000e+02,12.0,52.0,61.0,2.222222
970,21.408121,16.810648,0.0,0.0,12.0,12.0,53.0,28.0,0.0,0.0,...,0.000000,10.257343,63.351288,408.133972,7.700641,2.174000e+03,47.0,144.0,170.0,6.333333
971,30.873016,25.672379,0.0,1.0,6.0,6.0,78.0,39.0,1.0,0.0,...,8.100162,10.820118,94.115814,545.262512,6.990545,4.573000e+03,72.0,218.0,265.0,8.347222


In [29]:
mordred_df.dropna(axis='columns', inplace=True)

In [31]:
mordred_df.dropna(axis='rows', inplace=True)

In [32]:
mordred_df

Unnamed: 0,ABC,ABCGG,nAcid,nBase,nAromAtom,nAromBond,nAtom,nHeavyAtom,nSpiro,nBridgehead,...,SRW09,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb2
0,-0.781189,-0.755633,-1.0,-1.0,-1.000000,-1.000000,-0.814346,-0.806268,-1.0,-1.0,...,0.706468,0.806997,-0.561583,-0.804953,-0.864509,-1.000000,-0.685221,-0.742718,-0.700219,-0.824464
1,-0.872114,-0.865094,-1.0,-0.9,-0.647059,-0.657143,-0.859353,-0.880342,-1.0,-1.0,...,0.505394,0.531852,-0.645150,-0.879816,-0.942849,-1.000000,-0.907869,-0.873786,-0.876368,-0.875605
2,-0.926397,-0.905098,-1.0,-0.9,-0.647059,-0.657143,-0.918425,-0.931624,-1.0,-1.0,...,-1.000000,0.476019,-0.790501,-0.931094,-0.945906,-1.000000,-0.938580,-0.927184,-0.927790,-0.926054
3,-0.813529,-0.805522,-0.8,-0.9,-0.294118,-0.314286,-0.834037,-0.829060,-1.0,-1.0,...,-1.000000,0.713154,-0.672539,-0.834575,-0.886091,-1.000000,-0.773512,-0.796116,-0.775711,-0.829302
4,-0.917714,-0.900730,-1.0,-0.9,-0.705882,-0.714286,-0.915612,-0.925926,-1.0,-1.0,...,0.445399,0.509421,-0.712770,-0.918290,-0.897302,-1.000000,-0.934741,-0.915049,-0.912473,-0.920525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
968,-0.937298,-0.917465,-0.8,-1.0,-0.647059,-0.657143,-0.952180,-0.943020,-1.0,-1.0,...,-1.000000,0.467135,-0.802961,-0.941539,-0.799283,-1.000000,-0.946257,-0.936893,-0.936543,-0.938493
969,-0.939827,-0.921998,-1.0,-1.0,-0.470588,-0.428571,-0.963432,-0.948718,-1.0,-1.0,...,0.403290,0.477996,-0.740329,-0.948346,-0.728925,-1.000000,-0.953935,-0.936893,-0.933260,-0.944713
970,-0.834989,-0.821440,-1.0,-1.0,-0.294118,-0.314286,-0.853727,-0.846154,-1.0,-1.0,...,-1.000000,0.654791,-0.693037,-0.839414,-0.845721,-1.000000,-0.819578,-0.825243,-0.814004,-0.842433
971,-0.762035,-0.727312,-1.0,-0.9,-0.647059,-0.657143,-0.783404,-0.783476,0.0,-1.0,...,0.720873,0.745582,-0.541579,-0.784515,-0.886853,-1.000000,-0.723608,-0.735437,-0.710066,-0.792329


In [34]:
mordred_df.to_csv('mordred_df.csv', sep='\t', index=None)

In [35]:
# normalize between -1 and 1
def normalize_df(df):
    min_vals = df.min()
    max_vals = df.max()
    normalized_df = 2 * (df - min_vals) / (max_vals - min_vals) - 1
    return normalized_df

In [36]:
mordred_df = normalize_df(mordred_df)

In [37]:
mordred_dict = {}
for i, db_id in enumerate(total_drugs):
    mordred_dict[db_id] = mordred_df.iloc[i].values

In [38]:
with open('data/embedding/mordred_dict.pickle', 'wb') as f:
    pickle.dump(mordred_dict, f)

In [2]:
with open('data/embedding/mordred_dict.pickle', 'rb') as f:
    mordred_dict = pickle.load(f)