In [65]:
###
# File name: Preprocess.ipynb (jupyter notebook)
# Description: load raw data and preprocess
# Created on: 2024-05-01
# Modification History
#   - 2024-05-20: GLEE, Integrated preprocessing modules into a Data class
# Version: 0.1
###

## Load data

In [66]:
import pubchempy as pcp
import requests
import json
import numpy as np
import pandas as pd
pd.set_option('display.max_rows',200,'display.max_columns',50)
import csv
import time
import pickle
import os
import sys
import pubchempy as pcp
import Levenshtein as lvs

from tqdm import tqdm
# sascorer is imported from the 'SA_Score' folder under RDKit's contrib
# directory, so that folder has to be appended to sys.path first.
# RDConfig was referenced here without ever being imported, which raises
# NameError on a fresh kernel; import it explicitly.
from rdkit.Chem import RDConfig
sys.path.append(os.path.join(RDConfig.RDContribDir,'SA_Score'))
from sascorer import calculateScore
from sklearn.model_selection import train_test_split
from scipy import sparse as sps

In [67]:
# Project root directory. The default is the original hard-coded location;
# set the DRUG_DISCOVERY_ROOT environment variable to run the notebook on
# another machine without editing this cell.
root = os.environ.get("DRUG_DISCOVERY_ROOT", "/home2/glee/Drug_Discovery_Research")
# Folder that the data-loading cells read their input files from.
data_path = os.path.join(root, "data")

In [None]:
# rawdata = pd.read_csv(os.path.join(data_path, "preprocessed/golden_set_20240520.csv"))

In [68]:
# Mapping from the BindingDB TSV header of every column we keep to the short
# name used in the rest of the notebook. Insertion order matters: it is also
# the column order of the loaded frame.
dict_to_convert_colnames = dict([
    ("PubChem CID", "C_cid"),
    ("PubChem SID", "C_sid"),
    ("BindingDB Ligand Name", "C_name"),
    ("Ligand SMILES", "C_seq_lig_smiles"),
    ("Ligand InChI", "C_seq_inchi"),
    ("UniProt (SwissProt) Primary ID of Target Chain", "T_id"),
    ("Target Name", "T_name"),
    ("UniProt (SwissProt) Entry Name of Target Chain", "T_name_uniprot"),
    ("BindingDB Target Chain Sequence", "T_seq"),
    ("PubChem AID", "assay_id"),
    ("Ki (nM)", "Ki"),
    ("Kd (nM)", "Kd"),
    ("IC50 (nM)", "IC50"),
    ("EC50 (nM)", "EC50"),
])

# Original headers to read from the TSV, in the same order as the mapping.
cols_required = list(dict_to_convert_colnames)

In [None]:
# Number of data rows to read from the TSV; None reads the whole file.
nrows = None
# Load only the required columns, put them in the order of cols_required, and
# rename them to the short internal names.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines="warn"` is its replacement (malformed lines are skipped and a
# warning is emitted for each one).
rawdata = (
    pd.read_csv(
        os.path.join(data_path, "BindingDB_All_202404.tsv"),
        sep="\t",
        usecols=cols_required,
        on_bad_lines="warn",
        nrows=nrows,
    )[cols_required]
    .rename(columns=dict_to_convert_colnames)
)

In [253]:
# Keep only the rows that have a compound id, a target id and an IC50 value.
has_required_fields = (
    rawdata["C_cid"].notna() & rawdata["T_id"].notna() & rawdata["IC50"].notna()
)
valid_index = rawdata[has_required_fields].index
# Take an explicit copy: the column assignments below must not act on (or
# warn about) a slice of rawdata, and re-running this cell always starts
# from the untouched raw frame.
data_golden = rawdata.loc[valid_index].copy()
# IC50 entries may carry a "<" or ">" qualifier (e.g. ">50000"). The
# qualifier is dropped and the remaining number is parsed as a float.
# NOTE: this discards the "less than" / "greater than" information.
data_golden["IC50"] = (
    data_golden["IC50"]
    .astype(str)
    .str.replace(r"[<>]", "", regex=True)
    .astype(float)
)
# Normalise compound ids to plain integer strings (e.g. 71463198.0 -> "71463198").
data_golden["C_cid"] = data_golden["C_cid"].astype(float).astype(int).astype(str)

In [None]:
chunk_size = 1500       # number of CIDs sent to PubChem per request
max_retries = 20        # attempts per chunk before giving up
max_backoff_sec = 60    # upper bound on the wait between two attempts
# Split the row labels of data_golden into consecutive chunks of chunk_size.
chunk_index = np.split(data_golden.index, np.arange(chunk_size, len(data_golden), chunk_size))
SMILES_cols = ["C_seq_can_smiles", "C_seq_iso_smiles"]

# Reuse results left in the kernel by an earlier, interrupted run; start from
# an empty table otherwise. (Previously the initialisation was commented out
# and the loop started at a hard-coded chunk number, so the cell raised
# NameError on a fresh kernel.)
try:
    SMILES_container
except NameError:
    SMILES_container = pd.DataFrame([], columns=SMILES_cols)

for i in tqdm(range(len(chunk_index))):
    chunk = pd.Index(chunk_index[i])
    # Skip chunks whose rows are already present, which makes the cell
    # resumable without editing a start offset. Chunks are appended as a
    # whole, so this assumes chunk_size is unchanged since the earlier run.
    if chunk.isin(SMILES_container.index).all():
        continue

    cids = list(data_golden.loc[chunk]["C_cid"])
    # PubChem requests fail intermittently (truncated JSON, PUGREST.Timeout),
    # so retry with exponential backoff, but not forever.
    for attempt in range(1, max_retries + 1):
        try:
            compounds = pcp.get_compounds(cids, as_dataframe=True)
            break
        except Exception as e:
            print("ERROR occurred",e,"\nTry again")
            time.sleep(min(2 ** attempt, max_backoff_sec))
    else:
        raise RuntimeError(
            f"PubChem lookup for chunk {i} failed {max_retries} times in a row"
        )

    # Results are attached to the chunk's row labels by position.
    # NOTE(review): this assumes get_compounds returns exactly one row per
    # requested CID, in request order - confirm against pubchempy behaviour.
    chunk_smiles = pd.DataFrame(
        compounds[["canonical_smiles", "isomeric_smiles"]].values,
        columns=SMILES_cols,
        index=chunk,
    )
    # Append after every chunk so that progress survives an interruption.
    if SMILES_container.empty:
        SMILES_container = chunk_smiles
    else:
        SMILES_container = pd.concat([SMILES_container, chunk_smiles], axis=0)

ERROR occurred Expecting value: line 1269784 column 13 (char 27197011) 
Try again
ERROR occurred Invalid control character at: line 1133205 column 58 (char 24161900) 
Try again
ERROR occurred Expecting value: line 904425 column 3 (char 19332691) 
Try again
ERROR occurred Expecting ',' delimiter: line 1007661 column 14 (char 21433940) 
Try again
ERROR occurred Expecting ',' delimiter: line 914214 column 12 (char 19615316) 
Try again
ERROR occurred Expecting value: line 449915 column 5 (char 9481811) 
Try again
ERROR occurred Invalid control character at: line 201505 column 136 (char 4247148) 
Try again
ERROR occurred 'PUGREST.Timeout' 
Try again
ERROR occurred 'PUGREST.Timeout' 
Try again
ERROR occurred Expecting value: line 1250911 column 4 (char 26431060) 
Try again
ERROR occurred Expecting value: line 607542 column 11 (char 12811861) 
Try again
ERROR occurred Expecting property name enclosed in double quotes: line 368637 column 9 (char 7814740) 
Try again
ERROR occurred Invalid contr

In [269]:
# Display the SMILES table built by the PubChem fetch loop: canonical and
# isomeric SMILES columns, indexed by the row labels of data_golden.
SMILES_container

Unnamed: 0,C_seq_can_smiles,C_seq_iso_smiles
142,CC1=NN(C=C1C2=NC3=NC=C(C(=C3N2)N4CCN(CC4)CC5=N...,CC1=NN(C=C1C2=NC3=NC=C(C(=C3N2)N4CCN(CC4)CC5=N...
144,C1CC2=C(C(=O)C1)C3(CCS(=O)(=O)C3)N=C(N2)NC4=NC...,C1CC2=C(C(=O)C1)C3(CCS(=O)(=O)C3)N=C(N2)NC4=NC...
180,CC(C)C(C(=O)NC(CC1=CC=CC=C1)C(CN(CC2CCCCC2)NC(...,CC(C)[C@@H](C(=O)N[C@@H](CC1=CC=CC=C1)[C@H](CN...
181,CCOC(=O)NC(C(C)C)C(=O)NC(CC1=CC=CC=C1)C(CN(CC2...,CCOC(=O)N[C@@H](C(C)C)C(=O)N[C@@H](CC1=CC=CC=C...
183,CC(C)C(C(=O)NC(CC1=CC=CC=C1)C(CN(CC2CCCCC2)NC(...,CC(C)[C@@H](C(=O)N[C@@H](CC1=CC=CC=C1)[C@H](CN...
...,...,...
2844702,CC1=C(C=C2C(=C1O)C(=O)C3=C(C2=O)C(=C(C=C3)OC)O...,CC1=C(C=C2C(=C1O)C(=O)C3=C(C2=O)C(=C(C=C3)OC)O...
2844703,CC1=C(C=C2C(=C1O)C(=O)C3=C(C2=O)C(=C(C=C3)O)O)...,CC1=C(C=C2C(=C1O)C(=O)C3=C(C2=O)C(=C(C=C3)O)O)...
2844704,C1=CC=C2C(=C1)C(=O)C3=CC(=C(C(=C3C2=O)O)COC4C(...,C1=CC=C2C(=C1)C(=O)C3=CC(=C(C(=C3C2=O)O)CO[C@H...
2844705,COC1=CC2=C(C=C1)C(=O)C3=C(C(=C(C=C3C2=O)OC4C(C...,COC1=CC2=C(C=C1)C(=O)C3=C(C(=C(C=C3C2=O)O[C@H]...


In [222]:
# Display the filtered table: rows of rawdata that have C_cid, T_id and IC50.
data_golden

Unnamed: 0,C_cid,C_sid,C_name,C_seq_lig_smiles,C_seq_inchi,T_id,T_name,T_name_uniprot,T_seq,assay_id,Ki,Kd,IC50,EC50
142,71463198,346541913,"US9447092, 3",Cc1nc(CN2CCN(CC2)c2c(Cl)cnc3[nH]c(nc23)-c2cn(C...,InChI=1S/C19H22ClN9O/c1-11-13(9-27(3)25-11)18-...,P08684,Cytochrome P450 3A4,CP3A4_HUMAN,MALIPDLAMETWLLLAVSLVLLYLYGTHSHGLFKKLGIPGPTPLPF...,aid1803425,,,50000.0,
144,44640149,8032521,"2-(benzo[d]oxazol-2-ylamino)-4',5',7,8-tetrahy...",O=C1CCCC2=C1C1(CCS(=O)(=O)C1)N=C(Nc1nc3ccccc3o...,InChI=1S/C25H28Cl2O6/c1-6-7-8-9-10-17(15-11-18...,P51570,Galactokinase,GALK1_HUMAN,MAALRQPQVAELLAEARRAFREEFGAEPELAVSAPGRVNLIGEHTD...,aid1803473,,,6676.9,
180,65023,8030135,"(2S)-N-[(2S,3S)-4-[(2S)-N'-(cyclohexylmethyl)-...",CC(C)[C@H](NC(C)=O)C(=O)N[C@@H](Cc1ccccc1)[C@@...,InChI=1S/C31H51N5O5/c1-20(2)28(32-22(5)37)30(4...,P12497,Dimer of Gag-Pol polyprotein [489-587],POL_HV1N5,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,aid1795218,,,8.5,
181,461984,8030127,CGP 53820 analog::CHEMBL324572::ethyl N-[(1S)-...,CCOC(=O)N[C@@H](C(C)C)C(=O)N[C@@H](Cc1ccccc1)[...,InChI=1S/C33H55N5O7/c1-7-44-32(42)35-28(22(3)4...,P12497,Dimer of Gag-Pol polyprotein [489-587],POL_HV1N5,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,aid1795218,,,177.0,
183,461988,8030128,"2-methoxyethyl N-[(1S)-1-{[(2S,3S)-4-[(2S)-N'-...",COCCOC(=O)N[C@@H](C(C)C)C(=O)N[C@@H](Cc1ccccc1...,InChI=1S/C35H59N5O9/c1-24(2)30(37-34(44)48-19-...,P12497,Dimer of Gag-Pol polyprotein [489-587],POL_HV1N5,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...,aid1795218,,,164.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96416,8853383,252626811,4-keto-5-methyl-2-[(E)-2-(4-methyl-3-nitro-phe...,Cc1c(sc2nc(C=Cc3ccc(C)c(c3)[N+]([O-])=O)[nH]c(...,InChI=1S/C17H13N3O5S/c1-8-3-4-10(7-11(8)20(24)...,P51452,Dual specificity protein phosphatase 3,DUS3_HUMAN,MSGSFELSVQDLNDLLSDGSGCYSLPSQPCNEVTPRIYVGNASVAQ...,aid1878,,,1620.0,
96417,1637842,252631968,2-[(5Z)-5-[1-[2-(3-bromoanilino)-2-keto-ethyl]...,OS(=O)(=O)CCN1C(=S)S\C(C1=O)=C1/C(=O)N(CC(=O)N...,InChI=1S/C21H16BrN3O6S3/c22-12-4-3-5-13(10-12)...,P51452,Dual specificity protein phosphatase 3,DUS3_HUMAN,MSGSFELSVQDLNDLLSDGSGCYSLPSQPCNEVTPRIYVGNASVAQ...,aid1878,,,4920.0,
96418,1328767,252626777,4-[[5-(4-bromophenyl)-2-furoyl]thiocarbamoylam...,OC(=O)c1ccc(NC(=S)NC(=O)c2ccc(o2)-c2ccc(Br)cc2...,InChI=1S/C19H13BrN2O4S/c20-13-5-1-11(2-6-13)15...,P51452,Dual specificity protein phosphatase 3,DUS3_HUMAN,MSGSFELSVQDLNDLLSDGSGCYSLPSQPCNEVTPRIYVGNASVAQ...,aid1878,,,1880.0,
96419,56642873,252631969,MLS001224314::N-[[1-(3-fluorobenzyl)indol-3-yl...,Oc1cc(O)cc(c1)C(=O)NN=Cc1cn(Cc2cccc(F)c2)c2ccc...,InChI=1S/C23H18FN3O3/c24-18-5-3-4-15(8-18)13-2...,P51452,Dual specificity protein phosphatase 3,DUS3_HUMAN,MSGSFELSVQDLNDLLSDGSGCYSLPSQPCNEVTPRIYVGNASVAQ...,aid1878,,,2080.0,
