In [1]:
import numpy as np
import torch

# Check GPU availability and choose the device used by the rest of the notebook.
print('CUDA available: {}'.format(torch.cuda.is_available()))
if torch.cuda.is_available():
    # Only query the device name when CUDA is present: current_device() raises
    # on CPU-only machines, which would stop the cell before the CPU fallback
    # below is ever reached.
    print('Current GPU: {}'.format(torch.cuda.get_device_name(torch.cuda.current_device())))
else:
    print('Current GPU: none (running on CPU)')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

CUDA available: True
Current GPU: GeForce RTX 2080 SUPER


In [2]:
import deepchem as dc

# Load the pre-split, pre-featurized cisplatin dataset from disk.
loaded, datasets, transformers = dc.utils.load_dataset_from_disk('data/combined/cisplatin')
# load_dataset_from_disk signals a missing/incomplete directory through the
# `loaded` flag rather than by raising; fail here with a clear message instead
# of an unpacking error on the next line.
if not loaded:
    raise FileNotFoundError(
        "No saved train/valid/test dataset found under 'data/combined/cisplatin'")
train_dataset, valid_dataset, test_dataset = datasets
print('train/val/test split: {}/{}/{}'.format(
    len(train_dataset), len(valid_dataset), len(test_dataset)))

# Feature dimensions are read from the first training graph; the number of
# output tasks is the last dimension of the first label row.
num_node_features = train_dataset.X[0].num_node_features
num_edge_features = train_dataset.X[0].num_edge_features
num_classes = train_dataset.y[0].shape[-1]
print(num_node_features, num_classes)

train/val/test split: 490/61/61
79 1


In [3]:
from torch_geometric.data import Data, DataLoader

def get_data_loader(dc_dataset, batch_size=64, shuffle=True):
    """Wrap a DeepChem dataset in a PyTorch Geometric ``DataLoader``.

    Every entry of ``dc_dataset.X`` is converted with ``to_pyg_graph()`` and
    gets its label and weight attached as ``y`` and ``w`` tensors, each
    reshaped to a single row (``reshape(1, -1)``).

    Args:
        dc_dataset: object exposing ``X``, ``y`` and ``w`` indexable in the
            same order (presumably a DeepChem dataset of graph features).
        batch_size: number of graphs per batch.
        shuffle: whether the loader shuffles the graphs.

    Returns:
        A ``DataLoader`` over the converted graphs.
    """
    graphs = []
    for idx, graph_data in enumerate(dc_dataset.X):
        graph = graph_data.to_pyg_graph()
        # One row per graph so that batching stacks labels/weights along dim 0.
        graph.y = torch.from_numpy(dc_dataset.y[idx].reshape(1, -1))
        graph.w = torch.from_numpy(dc_dataset.w[idx].reshape(1, -1))
        graphs.append(graph)
    return DataLoader(graphs, batch_size=batch_size, shuffle=shuffle)

# Evaluation only: keep the batch order fixed so repeated runs iterate the
# test set identically (the helper's default is shuffle=True).
test_loader = get_data_loader(test_dataset, batch_size=64, shuffle=False)



In [4]:
from sklearn.metrics import roc_auc_score

def test(model, loader):
    """Evaluate ``model`` on every batch of ``loader`` and return the ROC-AUC.

    The model is switched to eval mode and called as
    ``model(x, edge_index, edge_attr, batch)``. Its raw outputs (no sigmoid is
    applied here) and the batch labels ``data.y`` are concatenated along axis 0
    and passed to ``roc_auc_score``.

    Args:
        model: callable torch module taking node features, edge index, edge
            attributes and the batch vector.
        loader: iterable of batches exposing ``x``, ``edge_index``,
            ``edge_attr``, ``batch`` and ``y``; batches are moved to the
            notebook-level ``device``.

    Returns:
        The score returned by ``roc_auc_score(y, pred)``.
    """
    model.eval()

    outs = []
    ys = []
    # Inference only: skip autograd bookkeeping so no graph is kept per batch.
    with torch.no_grad():
        for data in loader:  # Iterate in batches over the evaluation dataset.
            data = data.to(device)
            out = model(data.x, data.edge_index, data.edge_attr, data.batch)

            outs.append(out.detach().cpu().numpy())
            ys.append(data.y.detach().cpu().numpy())

    pred = np.concatenate(outs, axis=0)
    y = np.concatenate(ys, axis=0)

    score = roc_auc_score(y, pred)
    return score

# SECURITY NOTE: torch.load unpickles arbitrary Python objects; only load
# checkpoints that come from a trusted source.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects a fully pickled model like this one -- confirm against the
# installed version and pass weights_only=False there if needed.
# Alternative checkpoint: 'trained_models/AFP_tl_cisplatin_0.9082'
# map_location lets a checkpoint saved on a GPU be restored on whatever device
# was selected above (including CPU-only machines).
model = torch.load('trained_models/AFP_tl_cisplatin_test', map_location=device)
te_score = test(model, test_loader)
print('Test: {:.5f}'.format(te_score))

Test: 0.92217


In [5]:
from featurizer import MolGraphConvFeaturizer

# Graph featurizer used for the new molecules. Edge features, chirality and
# partial charges are all switched on (presumably matching the featurization
# of the saved training data -- TODO confirm).
featurizer = MolGraphConvFeaturizer(
    use_partial_charge=True,
    use_chirality=True,
    use_edges=True,
)

In [6]:
from rdkit import Chem
from utils import validate_mols

def get_valid_smiles(data):
    """Return RDKit-regenerated SMILES for the molecules that pass validation.

    Args:
        data: mapping/DataFrame with a ``'smiles'`` column.

    Returns:
        ``(valid_smiles, valid_index)``: SMILES written by
        ``Chem.MolToSmiles`` for each molecule kept by ``validate_mols``, and
        the index object returned by ``validate_mols`` (presumably positions
        into the input order -- verify against ``utils.validate_mols``).
    """
    parsed = list(map(Chem.MolFromSmiles, list(data['smiles'])))
    kept_mols, kept_index = validate_mols(parsed)
    return [Chem.MolToSmiles(kept) for kept in kept_mols], kept_index

def get_valid_mols(data):
    """Parse the ``'smiles'`` column and keep the molecules that validate.

    Args:
        data: mapping/DataFrame with a ``'smiles'`` column.

    Returns:
        ``(valid_mols, valid_index)`` as produced by ``validate_mols`` on the
        parsed molecules (index presumably refers to positions in the input
        order -- verify against ``utils.validate_mols``).
    """
    candidates = []
    for smi in list(data['smiles']):
        candidates.append(Chem.MolFromSmiles(smi))
    kept_mols, kept_index = validate_mols(candidates)
    return kept_mols, kept_index

def get_valid_features(molecules, featurizer):
    """Featurize molecules one by one, skipping those that fail.

    Args:
        molecules: iterable of molecule objects accepted by
            ``featurizer._featurize``.
        featurizer: object exposing ``_featurize(mol)``.

    Returns:
        ``(features, index)``: ``np.asarray`` of the successfully computed
        features and an integer array with the positions (within
        ``molecules``) they came from. Failures are reported on stdout and
        omitted from both arrays.
    """
    valid_features = []
    valid_index = []
    for i, mol in enumerate(molecules):
        try:
            features = featurizer._featurize(mol)
        except Exception as e:
            # Describing the molecule must never abort the loop itself, so
            # fall back to repr() if the SMILES cannot be produced.
            try:
                description = Chem.MolToSmiles(mol)
            except Exception:
                description = repr(mol)
            # The placeholders are now actually interpolated; previously the
            # format string and its arguments were printed side by side.
            print("Failed to featurize datapoint %d, %s." % (i, description))
            print("Exception message: {}".format(e))
            continue
        valid_features.append(features)
        valid_index.append(i)

    # dtype=int keeps the index usable for positional indexing even when no
    # molecule succeeded (an empty list would otherwise become a float array).
    return np.asarray(valid_features), np.array(valid_index, dtype=int)

def featurize_data(data, featurizer):
    """Featurize the molecules in ``data`` and return the matching rows.

    Rows are filtered twice: first to the molecules kept by
    ``get_valid_mols``, then to those that ``get_valid_features`` could
    featurize.

    Args:
        data: DataFrame with a ``'smiles'`` column.
        featurizer: object exposing ``_featurize(mol)``.

    Returns:
        ``(valid_features, valid_data)``: the feature array and the rows of
        ``data`` it corresponds to, in the same order.
    """
    # The previous extra get_valid_smiles() call parsed every SMILES a second
    # time and its results were discarded, so it is dropped.
    valid_mols, mol_index = get_valid_mols(data)
    valid_data = data.iloc[mol_index]
    # feature_index is relative to valid_mols, hence the second positional
    # selection is applied to the already-filtered frame.
    valid_features, feature_index = get_valid_features(valid_mols, featurizer)
    valid_data = valid_data.iloc[feature_index]
    return valid_features, valid_data

In [7]:
import pandas as pd

# Read the table of approved drugs to screen; the displayed frame has
# chembl_id, pref_name and smiles columns.
fda_approved_data = pd.read_csv('data/tl/approved_drugs_valid_parent.csv')
fda_approved_data

Unnamed: 0,chembl_id,pref_name,smiles
0,CHEMBL2,PRAZOSIN,COc1cc2nc(nc(N)c2cc1OC)N3CCN(CC3)C(=O)c4occc4
1,CHEMBL3,NICOTINE,CN1CCC[C@H]1c2cccnc2
2,CHEMBL4,OFLOXACIN,CC1COc2c(N3CCN(C)CC3)c(F)cc4C(=O)C(=CN1c24)C(=O)O
3,CHEMBL5,NALIDIXIC ACID,CCN1C=C(C(=O)O)C(=O)c2ccc(C)nc12
4,CHEMBL6,INDOMETHACIN,COc1ccc2c(c1)c(CC(=O)O)c(C)n2C(=O)c3ccc(Cl)cc3
...,...,...,...
2294,CHEMBL3833405,INOSINE PRANOBEX,CC(O)CN(C)C.CC(O)CN(C)C.CC(O)CN(C)C.OC[C@H]1O[...
2295,CHEMBL3833406,PAPAVERETUM,COc1ccc(Cc2nccc3cc(OC)c(OC)cc23)cc1OC.COc4ccc5...
2296,CHEMBL3833408,MAGALDRATE,[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-...
2297,CHEMBL3833409,HYDROTALCITE,[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-].[OH-...


In [8]:
# Featurize every approved drug; molecules that cannot be featurized are
# reported on stdout and dropped from both the features and the returned rows.
valid_fda_approved_features, valid_fda_approved_data = featurize_data(
    fda_approved_data, featurizer)

Failed to featurize datapoint %d, %s. 607 [I-].[K+]
Exception message: zero-size array to reduction operation maximum which has no identity
Failed to featurize datapoint %d, %s. 815 [F-].[Na+]
Exception message: zero-size array to reduction operation maximum which has no identity
Failed to featurize datapoint %d, %s. 1458 O
Exception message: zero-size array to reduction operation maximum which has no identity
Failed to featurize datapoint %d, %s. 1462 N
Exception message: zero-size array to reduction operation maximum which has no identity
Failed to featurize datapoint %d, %s. 1541 [Cl-].[Na+]
Exception message: zero-size array to reduction operation maximum which has no identity
Failed to featurize datapoint %d, %s. 1565 [Ca+2].[Cl-].[Cl-]
Exception message: zero-size array to reduction operation maximum which has no identity
Failed to featurize datapoint %d, %s. 1566 [Cl-].[Cl-].[Zn+2]
Exception message: zero-size array to reduction operation maximum which has no identity
Failed to 

In [9]:
def predict(model, loader):
    """Run ``model`` over ``loader`` and return sigmoid-transformed outputs.

    Args:
        model: callable torch module invoked as
            ``model(x, edge_index, edge_attr, batch)``.
        loader: iterable of batches exposing those four attributes; batches
            are moved to the notebook-level ``device``.

    Returns:
        NumPy array with the per-batch ``torch.sigmoid(out)`` values
        concatenated along axis 0, in loader order.
    """
    model.eval()

    outs = []
    # Inference only: skip autograd bookkeeping so no graph is kept per batch.
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            out = model(data.x, data.edge_index, data.edge_attr, data.batch)
            outs.append(torch.sigmoid(out).detach().cpu().numpy())

    pred = np.concatenate(outs, axis=0)
    return pred

# Unshuffled loader so that predictions stay aligned row-for-row with
# valid_fda_approved_data.
loader = DataLoader(
    [graph_data.to_pyg_graph() for graph_data in valid_fda_approved_features],
    shuffle=False,
    batch_size=16,
)
pred = predict(model, loader)

In [10]:
# Attach the predicted probability to a copy of the featurized rows (the
# original frame is left untouched) as column 'p' at position 3, then rank the
# drugs from highest to lowest prediction.
res_fda_approved_data = valid_fda_approved_data.copy()
res_fda_approved_data.insert(3, 'p', pred)
res_fda_approved_data_sorted = res_fda_approved_data.sort_values(by=['p'], ascending=False)
res_fda_approved_data_sorted

Unnamed: 0,chembl_id,pref_name,smiles,p
945,CHEMBL14687,PROPANOL,CCCO,0.920851
227,CHEMBL545,ALCOHOL,CCO,0.917556
255,CHEMBL582,ISOPROPYL ALCOHOL,CC(C)O,0.917449
1926,CHEMBL1651998,TERPIN,CC(C)(O)C1CCC(C)(O)CC1,0.917307
74,CHEMBL109,VALPROIC ACID,CCCC(CCC)C(=O)O,0.916914
...,...,...,...,...
466,CHEMBL895,NALBUPHINE,O[C@H]1CC[C@@]2(O)[C@H]3Cc4ccc(O)c5O[C@@H]1[C@...,0.045223
1454,CHEMBL1096885,VALRUBICIN,CCCCC(=O)OCC(=O)[C@@]1(O)C[C@H](O[C@H]2C[C@H](...,0.045069
1481,CHEMBL1186579,METHYLNALTREXONE,C[N+]1(CC2CC2)CC[C@]34[C@H]5Oc6c(O)ccc(C[C@@H]...,0.044497
1630,CHEMBL1200969,DUTASTERIDE,C[C@]12CC[C@H]3[C@@H](CC[C@H]4NC(=O)C=C[C@]34C...,0.044427


In [11]:
def check_data(train_dataset, valid_dataset, test_dataset, res_data):
    """Annotate predictions with the split and label of already-known molecules.

    Each row of ``res_data`` is matched by exact equality of its ``'smiles'``
    value against the ``ids`` of the three datasets. Two columns are appended
    to a copy of ``res_data``:

    * ``'dataset'``: ``'train'``, ``'validation'`` or ``'test'``; NaN when the
      SMILES occurs in none of them.
    * ``'label'``: 1 if the first entry of the matching ``y`` row equals 1,
      otherwise 0; NaN when there is no match.

    When a SMILES occurs more than once, the last occurrence wins, with the
    datasets visited in the order train, validation, test (same precedence as
    the original nested scans).

    Args:
        train_dataset, valid_dataset, test_dataset: objects exposing ``ids``
            and ``y`` in matching order.
        res_data: DataFrame with a ``'smiles'`` column; it is not modified.

    Returns:
        A new DataFrame with the two extra columns appended at the end.
    """
    # One pass over the datasets builds a SMILES -> (split, label) table,
    # replacing a full scan of all three datasets for every result row.
    known = {}
    for split_name, dataset in (('train', train_dataset),
                                ('validation', valid_dataset),
                                ('test', test_dataset)):
        for smiles, y_row in zip(dataset.ids, dataset.y):
            known[smiles] = (split_name, 1.0 if y_row[0] == 1. else 0.0)

    unknown = (np.nan, np.nan)
    matches = [known.get(smiles, unknown) for smiles in res_data['smiles'].tolist()]

    data = res_data.copy()
    # The split column is created as object dtype up front: writing strings
    # into a float NaN column relies on implicit upcasting, which newer pandas
    # versions deprecate.
    data.insert(len(data.columns), 'dataset',
                np.array([split for split, _ in matches], dtype=object))
    data.insert(len(data.columns), 'label',
                np.array([label for _, label in matches], dtype=float))
    return data

# Flag which screened drugs already occur in the train/validation/test splits
# and record their known labels next to the predictions.
res_fda_data = check_data(
    train_dataset, valid_dataset, test_dataset, res_fda_approved_data_sorted)

In [12]:
# Order by split name, then by predicted probability, both descending; in the
# displayed result the rows without a split (NaN) come last.
res_fda_data_sorted = res_fda_data.sort_values(by=['dataset', 'p'], ascending=False)
res_fda_data_sorted

Unnamed: 0,chembl_id,pref_name,smiles,p,dataset,label
175,CHEMBL467,HYDROXYUREA,NC(=O)NO,0.774460,validation,0.0
717,CHEMBL1372,OXIGLUTATIONE,N[C@@H](CCC(=O)N[C@@H](CSSC[C@H](NC(=O)CC[C@H]...,0.325363,validation,0.0
1241,CHEMBL307145,PYROGALLOL,Oc1cccc(O)c1O,0.246401,validation,0.0
1027,CHEMBL55400,PROFLAVINE,Nc1ccc2cc3ccc(N)cc3nc2c1,0.075957,validation,0.0
1536,CHEMBL1200559,LACTIC ACID,CC(O)C(=O)O,0.890544,train,0.0
...,...,...,...,...,...,...
466,CHEMBL895,NALBUPHINE,O[C@H]1CC[C@@]2(O)[C@H]3Cc4ccc(O)c5O[C@@H]1[C@...,0.045223,,
1454,CHEMBL1096885,VALRUBICIN,CCCCC(=O)OCC(=O)[C@@]1(O)C[C@H](O[C@H]2C[C@H](...,0.045069,,
1481,CHEMBL1186579,METHYLNALTREXONE,C[N+]1(CC2CC2)CC[C@]34[C@H]5Oc6c(O)ccc(C[C@@H]...,0.044497,,
1630,CHEMBL1200969,DUTASTERIDE,C[C@]12CC[C@H]3[C@@H](CC[C@H]4NC(=O)C=C[C@]34C...,0.044427,,


In [13]:
# Persist the annotated predictions; index=False leaves the DataFrame index
# out of the file.
res_fda_data_sorted.to_csv('results/cisplatin_fda_pred-210725.csv', index=False)