In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from MSI.load_msi_data import LoadData
import sqlite3
import ast

TWOSIDES processing code 추가해야 함.

## Process database (DCDB, C_DCDB, TWOSIDES)

### DCDB: Drug Combination Database

In [2]:
dcdb = pd.read_csv('database/dcdb/dcdb.txt', sep='\t')
dcdb_id = pd.read_csv('database/dcdb/dcdb_components_identifier.txt', sep='\t')
dc2dcu = pd.read_csv('database/dcdb/DC_TO_DCU.txt', sep='\t')
dcu_usage = pd.read_csv('database/dcdb/DC_USAGE.txt', sep='\t')

In [3]:
print(len(dcdb))
dcdb.head()

1363


Unnamed: 0,DrugCombination_ID,Components_Name,Componets_ID
0,DC000348,Bismuth Subsalicylate; Metronidazole; Tetracyc...,DCC0187/DCC0235/DCC0338
1,DC000349,Brimonidine; Timolol,DCC0072/DCC0106
2,DC000350,Betamethasone; Calcipotriol,DCC0095/DCC0358
3,DC000351,Betamethasone; Clotrimazole,DCC0033/DCC0095
4,DC000352,Cerulenin; Levodopa,DCC0274/DCC0326


In [4]:
# leave only "EFFICACIOUS" combinations
dc2dcu_dict = {}
for i in tqdm(range(len(dc2dcu))):
    dc = dc2dcu['DC_ID'][i]
    dcu = dc2dcu['DCU_ID'][i]
    if dc not in dc2dcu_dict:
        dc2dcu_dict[dc] = set([dcu])
    else:
        dc2dcu_dict[dc].add(dcu)

efficacious_dcu = set()
for i in tqdm(range(len(dcu_usage))):
    dcu = dcu_usage['DCU_ID'][i]
    if dcu_usage['EFFICACY'][i] == 'Efficacious':
        efficacious_dcu.add(dcu)

efficacious_dc = set()
for key in dc2dcu_dict.keys():
    if dc2dcu_dict[key].issubset(efficacious_dcu):
        efficacious_dc.add(key)

100%|██████████| 1793/1793 [00:00<00:00, 65327.64it/s]
100%|██████████| 1813/1813 [00:00<00:00, 96621.09it/s]


In [5]:
dcdb = dcdb.loc[dcdb['DrugCombination_ID'].isin(efficacious_dc)]
print(len(dcdb))

1037


In [6]:
dcdb_id.head()

Unnamed: 0,DCC_ID,Name,CAS_Number,BindingDB,ChEBI,DrugBank,KEGG Compound,KEGG Drug,PDB,PharmGKB,PubChem Compound,PubChem Substance
0,DCC1838,Methyclothiazide,CAS:135-07-9,,,DB00232,,D00656,,,,7847722.0
1,DCC0413,Garenoxacin,CAS:194804-75-6,,,,,D02540,,,124093.0,
2,DCC0520,Betaine,CAS:107-43-7,,,DB01494,,D07523,,,247.0,
3,DCC0639,Atrasentan,CAS:173937-91-2,,,,,D03009,,,17397165.0,
4,DCC0029,Lovastatin,CAS:75330-75-5,,40303.0,DB00227,C07074,D00359,803.0,PA450272,53232.0,46508223.0


In [7]:
dcdb_dict = dict(zip(dcdb_id['DCC_ID'], dcdb_id['DrugBank']))
bad_keys = []
for key, value in dcdb_dict.items():
    if type(value) != str:
        bad_keys.append(key)
    if type(value) == str and not value.startswith('DB'):
        bad_keys.append(key)
print(len(bad_keys))

for key in bad_keys:
    del dcdb_dict[key]

171


In [8]:
dcdb_lst = []
for x in tqdm(dcdb['Componets_ID']):
    dc_ids = x.split('/')
    db_ids = []
    for id_ in dc_ids:
        db_id = dcdb_dict.get(id_) # return None if not exist in the dictionary
        db_ids.append(db_id)
    if (np.nan not in db_ids) and (None not in db_ids):
        dcdb_lst.append(set(db_ids))
print(len(dcdb_lst))

100%|██████████| 1037/1037 [00:00<00:00, 184825.28it/s]

825





In [9]:
dcdb_lst_final = []
for x in dcdb_lst:
    if x not in dcdb_lst_final:
        dcdb_lst_final.append(x)
print(len(dcdb_lst_final))

825


In [10]:
# Leave drug combinations with only 2 drugs
dcdb_lst_dual = []
for x in dcdb_lst_final:
    if len(x) == 2:
        dcdb_lst_dual.append(list(x))
print(len(dcdb_lst_dual))

598


In [11]:
pd.DataFrame(dcdb_lst_dual, columns=['drug_1', 'drug_2']).to_csv('database/processed/DCDB_dual.tsv', sep='\t', index=None)

In [12]:
dcdb_dual = pd.read_csv('database/processed/DCDB_dual.tsv', sep='\t')
dcdb_dual.head()

Unnamed: 0,drug_1,drug_2
0,DB00373,DB00484
1,DB02300,DB00443
2,DB00257,DB00443
3,DB01034,DB01235
4,DB00983,DB01222


### C-DCDB: Continuous Drug Combination Database

In [13]:
con = sqlite3.connect('database/c_dcdb/c_dcdb.sqlite')
cursor = con.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cursor.fetchall())

[('aact_combs',), ('aact_combs_with_identifiers',), ('all_combs_unormalized',), ('conditions',), ('design_group',), ('mesh_terms',), ('orangebook_combs',), ('patents_ipc',), ('patents_metadata',), ('references',), ('transformed_patents_drug',), ('trials',), ('web_preview',)]


In [14]:
df = pd.read_sql_query("SELECT * from all_combs_unormalized", con)
df.head()

Unnamed: 0,index,drugs,drugbank_identifiers,pubchem_identifiers,source_id,source
0,0,"[[""foscarnet"", ""Foscavir""], [""ganciclovir"", ""C...","[""DB00529"", ""DB01004""]","[""-1"", ""-1""]",NCT00000134,clinicaltrials.gov
1,1,"[[""ganciclovir""], [""foscarnet"", ""Foscavir""]]","[""DB01004"", ""DB00529""]","[""-1"", ""-1""]",NCT00000136,clinicaltrials.gov
2,2,"[[""ganciclovir""], [""foscarnet"", ""Foscavir""]]","[""DB01004"", ""DB00529""]","[""-1"", ""-1""]",NCT00000136,clinicaltrials.gov
3,3,"[[""cycloserine""], [""clozapine""]]","[""DB00260"", ""DB00363""]","[""CID6234"", ""CID2818""]",NCT00000372,clinicaltrials.gov
4,4,"[[""PTH protein, human"", ""teriparatide""], [""ale...","[""DB06285"", ""DB00630""]","[""CID16133850"", ""CID2088""]",NCT00000400,clinicaltrials.gov


In [15]:
c_dcdb_lst = []
for x in df['drugbank_identifiers']:
    x = ast.literal_eval(x)
    c_dcdb_lst.append(set(x))
print(len(c_dcdb_lst))

43865


In [16]:
c_dcdb_lst_final = []
for x in c_dcdb_lst:
    flag = 0
    for y in x:
        if not y.startswith('DB'):
            flag += 1
    if flag != 0:
        continue

    if x not in c_dcdb_lst_final:
        c_dcdb_lst_final.append(x)

print(len(c_dcdb_lst_final))

15336


In [17]:
# Leave drug combinations with only 2 drugs
c_dcdb_lst_dual = []
for x in c_dcdb_lst_final:
    if len(x) == 2:
        c_dcdb_lst_dual.append(list(x))
print(len(c_dcdb_lst_dual))

9092


In [18]:
pd.DataFrame(c_dcdb_lst_dual, columns=['drug_1', 'drug_2']).to_csv('database/processed/C_DCDB_dual.tsv', sep='\t', index=None)
c_dcdb_dual = pd.read_csv('database/processed/C_DCDB_dual.tsv', sep='\t')
c_dcdb_dual.head()

Unnamed: 0,drug_1,drug_2
0,DB00529,DB01004
1,DB00363,DB00260
2,DB06285,DB00630
3,DB01104,DB00704
4,DB00375,DB00227


### TWOSIDES

In [2]:
from tdc.multi_pred import DDI

In [3]:
data = DDI(name='TWOSIDES')

Found local copy...
Loading...
Done!


In [4]:
data.get_data()

Unnamed: 0,Drug1_ID,Drug1,Drug2_ID,Drug2,Y
0,CID000002173,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...,CID000003345,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3,1024
1,CID000002173,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...,CID000003345,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3,767
2,CID000002173,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...,CID000003345,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3,79
3,CID000002173,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...,CID000003345,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3,25
4,CID000002173,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...,CID000003345,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3,85
...,...,...,...,...,...
4649436,CID000003461,C1=CN(C(=O)N=C1N)C2C(C(C(O2)CO)O)(F)F,CID000003954,CN(C)C(=O)C(CC[NH+]1CCC(CC1)(C2=CC=C(C=C2)Cl)O...,1008
4649437,CID000003461,C1=CN(C(=O)N=C1N)C2C(C(C(O2)CO)O)(F)F,CID000003954,CN(C)C(=O)C(CC[NH+]1CCC(CC1)(C2=CC=C(C=C2)Cl)O...,769
4649438,CID000003461,C1=CN(C(=O)N=C1N)C2C(C(C(O2)CO)O)(F)F,CID000003954,CN(C)C(=O)C(CC[NH+]1CCC(CC1)(C2=CC=C(C=C2)Cl)O...,930
4649439,CID000003461,C1=CN(C(=O)N=C1N)C2C(C(C(O2)CO)O)(F)F,CID000003954,CN(C)C(=O)C(CC[NH+]1CCC(CC1)(C2=CC=C(C=C2)Cl)O...,72


In [5]:
twosides = data.get_data()[['Drug1_ID', 'Drug2_ID']]
twosides.drop_duplicates(inplace=True)

In [6]:
len(twosides)

63473

In [7]:
twosides.head()

Unnamed: 0,Drug1_ID,Drug2_ID
0,CID000002173,CID000003345
56,CID000005206,CID000009433
68,CID000003929,CID000150610
92,CID000001302,CID000005064
226,CID000005267,CID000010631


In [8]:
db_table = pd.read_csv('database/drugbank_all_drug_links.csv')

In [9]:
db_table = db_table[['DrugBank ID', 'PubChem Compound ID']]

In [10]:
db_table.dropna(inplace=True)

In [11]:
db_table['PubChem Compound ID'] = db_table['PubChem Compound ID'].apply(lambda x: str(int(x)))

In [12]:
db_table.head()

Unnamed: 0,DrugBank ID,PubChem Compound ID
5,DB00006,16129704
13,DB00014,5311128
25,DB00027,45267103
47,DB00050,25074887
74,DB00080,16134395


In [40]:
db_table[db_table['PubChem Compound ID'] == '4601']

Unnamed: 0,DrugBank ID,PubChem Compound ID
1152,DB01173,4601


<span style="color:red">TODO - fill in TWOSIDES processing code</span>

## Create labels (Drug Combination, Drug-Drug Interaction)

In [5]:
'''DataLoader'''
dataloader = LoadData()

# get_dict
drug_id2name, drug_name2id = dataloader.get_dict(type='drug')
# protein_id2name, protein_name2id = dataloader.get_dict(type='protein')
# indication_id2name, indication_name2id = dataloader.get_dict(type='indication')
# biof_id2name, biof_name2id = dataloader.get_dict(type='biological_function')

Check Common Drugs Between MSI and DCDB / C_DCDB / TWOSIDES_DDI

In [6]:
msi_drug_list = list(drug_id2name.keys())

dcdb_dual = pd.read_csv('database/DCDB_dual.tsv', sep='\t')
c_dcdb_dual = pd.read_csv('database/C_DCDB_dual.tsv', sep='\t')
twosides_ddi = pd.read_csv('database/TWOSIDES_dual.tsv', sep='\t')

In [7]:
def leave_common_drugs(df):
    count = 0
    for row in df.iterrows():
        a, b = row[1][0], row[1][1]
        if a in msi_drug_list and b in msi_drug_list:
            count += 1
    print(f'Original number of rows: {len(df)}')
    print(f'Filtered number of rows: {count}')
    return df[df['drug_1'].isin(msi_drug_list) & df['drug_2'].isin(msi_drug_list)]

In [8]:
dcdb_dual = leave_common_drugs(dcdb_dual)
c_dcdb_dual = leave_common_drugs(c_dcdb_dual)
twosides_ddi = leave_common_drugs(twosides_ddi)

Original number of rows: 598
Filtered number of rows: 455
Original number of rows: 9092
Filtered number of rows: 4221
Original number of rows: 18530
Filtered number of rows: 15710


Save DCDB, C_DCDB

In [9]:
dcdb_dual.to_csv('data/labels/DCDB_msi.tsv', sep='\t', index=None)
c_dcdb_dual.to_csv('data/labels/C_DCDB_msi.tsv', sep='\t', index=None)

Combine DCDB, C_DCDB

In [10]:
dcdb_dual = pd.read_csv('data/labels/DCDB_msi.tsv', sep='\t')
c_dcdb_dual = pd.read_csv('data/labels/C_DCDB_msi.tsv', sep='\t')

dcdb_drug_set = []
for row in dcdb_dual.iterrows():
    dcdb_drug_set.append(set([row[1][0], row[1][1]]))
c_dcdb_drug_set = []
for row in c_dcdb_dual.iterrows():
    c_dcdb_drug_set.append(set([row[1][0], row[1][1]]))

not_common_idx = []
for i in range(len(dcdb_drug_set)):
    if dcdb_drug_set[i] not in c_dcdb_drug_set:
        not_common_idx.append(i)

dc = pd.concat([c_dcdb_dual, dcdb_dual.iloc[not_common_idx]], axis=0)
dc.reset_index(drop=True, inplace=True)

dc.to_csv('data/labels/DC_combined_msi.tsv', sep='\t', index=None)

Leave unique Drug-Drug Interaction pair (DDI - DC)

In [11]:
dc = pd.read_csv('data/labels/DC_combined_msi.tsv', sep='\t')

twosides_ddi = leave_common_drugs(twosides_ddi)

Original number of rows: 15710
Filtered number of rows: 15710


In [12]:
def leave_unique_pair(dc, ddi):
    dc_set_list = []
    for row in dc.iterrows():
        dc_set_list.append(set([row[1][0], row[1][1]]))
    ddi_set_list = []
    for row in ddi.iterrows():
        ddi_set_list.append(set([row[1][0], row[1][1]]))
    not_common_idx = []
    for i in range(len(ddi_set_list)):
        if ddi_set_list[i] not in dc_set_list:
            not_common_idx.append(i)
    temp = ddi.iloc[not_common_idx]
    return temp.reset_index(drop=True)

In [13]:
twosides_ddi = leave_unique_pair(dc, twosides_ddi)
print(f"Number of TWOSIDES pairs: {len(twosides_ddi)}")

Number of TWOSIDES pairs: 15300


In [14]:
twosides_ddi.to_csv('data/labels/TWOSIDES_msi.tsv', sep='\t', index=None)