In [1]:
import numpy as np
import deepchem as dc
import pandas as pd
from rdkit import Chem



In [2]:
# Load the ClinTox table; later cells read its 'FDA_APPROVED' and 'smiles' columns.
clintox_data = pd.read_csv(filepath_or_buffer='data/clintox/clintox.csv')

In [3]:
# Boolean mask over the ClinTox rows whose FDA_APPROVED value equals 0,
# followed by the number of rows the mask selects.
not_approved = clintox_data['FDA_APPROVED'].eq(0)
print(not_approved.sum())

94


In [4]:
from utils import validate_smiles

# SMILES strings of the ClinTox rows selected by `not_approved`.
sel_clintox_smiles = clintox_data.loc[not_approved, 'smiles']
# validate_smiles is a project helper; only its second return value is kept.
# Presumably that is the de-duplicated valid SMILES (the helper prints
# valid/discarded/unique counts) -- confirm against utils.validate_smiles.
_, sel_clintox_smiles_unique = validate_smiles(sel_clintox_smiles)

Number of valid mols: 94, Number of discarded mols: 0
Number of valid mols: 94, Number of unique mols: 94


In [5]:
# Load the Tox21 table; later cells read its task columns and 'smiles'.
tox21_data = pd.read_csv(filepath_or_buffer='data/tox21/tox21.csv')

In [6]:
# The twelve Tox21 task columns that are summed per row below.
tox21_tasks = [
    'NR-AR',
    'NR-AR-LBD',
    'NR-AhR',
    'NR-Aromatase',
    'NR-ER',
    'NR-ER-LBD',
    'NR-PPAR-gamma',
    'SR-ARE',
    'SR-ATAD5',
    'SR-HSE',
    'SR-MMP',
    'SR-p53',
]
# Per-row sum over the task columns; nan_to_num replaces missing entries
# with 0 first, so an unmeasured task contributes nothing to the total.
num_tox = np.nan_to_num(tox21_data[tox21_tasks].to_numpy()).sum(axis=1)

In [7]:
# Keep the Tox21 rows whose summed task value is at least 4, then validate.
# NOTE(review): the threshold 4 is hard-coded here; its rationale is not
# visible in this notebook.
sel_tox21_smiles = tox21_data.loc[num_tox >= 4, 'smiles']
_, sel_tox21_smiles_unique = validate_smiles(sel_tox21_smiles)

Number of valid mols: 400, Number of discarded mols: 0
Number of valid mols: 400, Number of unique mols: 400


In [8]:
# Pool the Tox21 and ClinTox selections into one list (Tox21 entries first)
# and validate again so SMILES present in both sources are collapsed.
sel_smiles = [*sel_tox21_smiles_unique, *sel_clintox_smiles_unique]
_, sel_smiles_unique = validate_smiles(sel_smiles)

Number of valid mols: 494, Number of discarded mols: 0
Number of valid mols: 494, Number of unique mols: 490


In [9]:
# Load the AMG table and keep the second value returned by validate_smiles
# for its 'smiles' column.
amg_data = pd.read_csv(filepath_or_buffer='data/tl/amg_data-210723.csv')
_, amg_smiles_unique = validate_smiles(amg_data.loc[:, 'smiles'])

Number of valid mols: 70, Number of discarded mols: 0
Number of valid mols: 70, Number of unique mols: 69


In [10]:
# Load the cisplatin table and keep the second value returned by
# validate_smiles for its 'smiles' column.
cisplatin_data = pd.read_csv(filepath_or_buffer='data/tl/cisplatin_data-210723.csv')
_, cisplatin_smiles_unique = validate_smiles(cisplatin_data.loc[:, 'smiles'])

Number of valid mols: 90, Number of discarded mols: 0
Number of valid mols: 90, Number of unique mols: 79


In [11]:
# Load the ototoxicity table and keep the second value returned by
# validate_smiles for its 'smiles' column.
ototoxicity_data = pd.read_csv(filepath_or_buffer='data/tl/ototoxicity_data-210723.csv')
_, ototoxicity_smiles_unique = validate_smiles(ototoxicity_data.loc[:, 'smiles'])

Number of valid mols: 33, Number of discarded mols: 0
Number of valid mols: 33, Number of unique mols: 29


In [12]:
# Load the experiment-negative table and keep the second value returned by
# validate_smiles for its 'smiles' column.
exp_neg_data = pd.read_csv(filepath_or_buffer='data/tl/experiment_negative_data.csv')
_, exp_neg_smiles_unique = validate_smiles(exp_neg_data.loc[:, 'smiles'])

Number of valid mols: 21, Number of discarded mols: 0
Number of valid mols: 21, Number of unique mols: 20


In [13]:
# Join the five per-source SMILES collections end to end and validate once
# more, so a SMILES that appears in several sources is kept only once.
_, all_smiles_unique = validate_smiles(
    np.concatenate(
        (
            sel_smiles_unique,
            amg_smiles_unique,
            cisplatin_smiles_unique,
            ototoxicity_smiles_unique,
            exp_neg_smiles_unique,
        ),
        axis=0,
    )
)

Number of valid mols: 687, Number of discarded mols: 0
Number of valid mols: 687, Number of unique mols: 667


In [14]:
# Turn the SMILES collection into a single column (shape (n, 1)) so it can be
# concatenated side by side with other columns.
all_smiles_unique = np.reshape(all_smiles_unique, (-1, 1))

In [15]:
# Assemble the label table: one row per unique SMILES plus two label columns
# ('amg', 'cisplatin') that start out missing and are filled in afterwards.
#
# np.empty_like does not initialise the array it returns, so the placeholder
# columns are built explicitly as NaN rather than relying on whatever the
# freshly allocated buffer happens to contain.
# dtype=object keeps the concatenated 2-D array (and therefore every column of
# the frame) object-typed, so code that later writes float labels into these
# columns keeps working on a single homogeneous table.
unlabelled = np.full(all_smiles_unique.shape, np.nan, dtype=object)
tl_data = pd.DataFrame(
    np.concatenate([all_smiles_unique, unlabelled, unlabelled], axis=-1),
    columns=['smiles', 'amg', 'cisplatin'],
)
tl_data

Unnamed: 0,smiles,amg,cisplatin
0,Br.Cc1ccc(C(=O)Cn2c3c(sc2=N)CCCC3)cc1,,
1,Brc1c2ccccc2c(Br)c2ccccc12,,
2,C#CC1(O)CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@@H]4[C@...,,
3,C#C[C@@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@...,,
4,C#C[C@]1(O)C=C[C@H]2[C@@H]3CCC4=CC(=O)CC[C@@H]...,,
...,...,...,...
662,c1ccc2cc3c(ccc4ccccc43)cc2c1,,
663,c1ccc2nccnc2c1,,
664,c1ccc2sc(SNC3CCCCC3)nc2c1,,
665,c1ccc2sc(SSN3CCOCC3)nc2c1,,


In [16]:
# Fill in the 'amg' and 'cisplatin' labels from source membership.
#
# The original loop wrote through `tl_data.iloc[i][col] = value`. That is
# chained assignment: whenever `iloc[i]` hands back a copy (mixed column
# dtypes, or pandas copy-on-write) the write lands on a temporary and the
# label is silently lost. Boolean masks with a single `.loc` assignment write
# to the frame itself and avoid the per-row Python loop.
smiles_col = tl_data['smiles']
is_amg = smiles_col.isin(amg_smiles_unique)
is_cisplatin = smiles_col.isin(cisplatin_smiles_unique)
is_negative = (
    smiles_col.isin(sel_smiles_unique)
    | smiles_col.isin(ototoxicity_smiles_unique)
    | smiles_col.isin(exp_neg_smiles_unique)
)

# Positive labels first ...
tl_data.loc[is_amg, 'amg'] = 1.
tl_data.loc[is_cisplatin, 'cisplatin'] = 1.
# ... then negatives, which take precedence exactly as in the original loop:
# a SMILES found in any negative source ends up 0 in BOTH columns even if it
# was also in a positive list.
# NOTE(review): confirm that this override of positives is intended.
tl_data.loc[is_negative, ['amg', 'cisplatin']] = 0.
tl_data

Unnamed: 0,smiles,amg,cisplatin
0,Br.Cc1ccc(C(=O)Cn2c3c(sc2=N)CCCC3)cc1,,1.0
1,Brc1c2ccccc2c(Br)c2ccccc12,0.0,0.0
2,C#CC1(O)CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@@H]4[C@...,0.0,0.0
3,C#C[C@@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@...,0.0,0.0
4,C#C[C@]1(O)C=C[C@H]2[C@@H]3CCC4=CC(=O)CC[C@@H]...,0.0,0.0
...,...,...,...
662,c1ccc2cc3c(ccc4ccccc43)cc2c1,0.0,0.0
663,c1ccc2nccnc2c1,1.0,1.0
664,c1ccc2sc(SNC3CCCCC3)nc2c1,0.0,0.0
665,c1ccc2sc(SSN3CCOCC3)nc2c1,0.0,0.0


In [17]:
# Persist the combined label table without the row index.
tl_data.to_csv(path_or_buf='data/tl/combined_data.csv', index=False)

In [18]:
import pandas as pd
import numpy as np

# Read the combined table back from disk and display it.
data = pd.read_csv(filepath_or_buffer='data/tl/combined_data.csv')
data

Unnamed: 0,smiles,amg,cisplatin
0,Br.Cc1ccc(C(=O)Cn2c3c(sc2=N)CCCC3)cc1,,1.0
1,Brc1c2ccccc2c(Br)c2ccccc12,0.0,0.0
2,C#CC1(O)CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@@H]4[C@...,0.0,0.0
3,C#C[C@@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@...,0.0,0.0
4,C#C[C@]1(O)C=C[C@H]2[C@@H]3CCC4=CC(=O)CC[C@@H]...,0.0,0.0
...,...,...,...
662,c1ccc2cc3c(ccc4ccccc43)cc2c1,0.0,0.0
663,c1ccc2nccnc2c1,1.0,1.0
664,c1ccc2sc(SNC3CCCCC3)nc2c1,0.0,0.0
665,c1ccc2sc(SSN3CCOCC3)nc2c1,0.0,0.0


In [19]:
# AMG-only view: keep the two relevant columns and drop every row in which
# either of them is missing (i.e. molecules without an 'amg' label).
data_amg = data.loc[:, ['smiles', 'amg']].dropna(axis='index', how='any')
data_amg

Unnamed: 0,smiles,amg
1,Brc1c2ccccc2c(Br)c2ccccc12,0.0
2,C#CC1(O)CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@@H]4[C@...,0.0
3,C#C[C@@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@...,0.0
4,C#C[C@]1(O)C=C[C@H]2[C@@H]3CCC4=CC(=O)CC[C@@H]...,0.0
5,C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=C(CCC(=O)C4)[C@...,0.0
...,...,...
662,c1ccc2cc3c(ccc4ccccc43)cc2c1,0.0
663,c1ccc2nccnc2c1,1.0
664,c1ccc2sc(SNC3CCCCC3)nc2c1,0.0
665,c1ccc2sc(SSN3CCOCC3)nc2c1,0.0


In [20]:
# Persist the AMG subset without the row index.
data_amg.to_csv(path_or_buf='data/tl/combined_amg_data.csv', index=False)

In [21]:
# Cisplatin-only view: keep the two relevant columns and drop every row in
# which either of them is missing (i.e. molecules without a 'cisplatin' label).
data_cisplatin = data.loc[:, ['smiles', 'cisplatin']].dropna(axis='index', how='any')
data_cisplatin

Unnamed: 0,smiles,cisplatin
0,Br.Cc1ccc(C(=O)Cn2c3c(sc2=N)CCCC3)cc1,1.0
1,Brc1c2ccccc2c(Br)c2ccccc12,0.0
2,C#CC1(O)CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@@H]4[C@...,0.0
3,C#C[C@@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@...,0.0
4,C#C[C@]1(O)C=C[C@H]2[C@@H]3CCC4=CC(=O)CC[C@@H]...,0.0
...,...,...
662,c1ccc2cc3c(ccc4ccccc43)cc2c1,0.0
663,c1ccc2nccnc2c1,1.0
664,c1ccc2sc(SNC3CCCCC3)nc2c1,0.0
665,c1ccc2sc(SSN3CCOCC3)nc2c1,0.0


In [22]:
# Persist the cisplatin subset without the row index.
data_cisplatin.to_csv(path_or_buf='data/tl/combined_cisplatin_data.csv', index=False)

In [23]:
from pathlib import Path

# Write gzip-compressed copies of both per-label tables.
# The target directories are only ever written to by this cell, and to_csv
# does not create missing parent directories, so make sure they exist first;
# otherwise a fresh checkout fails here.
for frame, target in (
    (data_amg, 'data/combined/amg/combined_amg_data.csv.gz'),
    (data_cisplatin, 'data/combined/cisplatin/combined_cisplatin_data.csv.gz'),
):
    Path(target).parent.mkdir(parents=True, exist_ok=True)
    frame.to_csv(target, index=False, compression='gzip')