In [1]:
# reference: https://dataprofessor.github.io/ws/bioinformatics/cheminformatics/padelpy/scikit-learn/qsar/qspr/2021/07/06/padelpy.html#List-and-sort-fingerprint-XML-files

In [2]:
# Load the GDSC2 fitted dose-response table and display it for inspection.
#
# The commented-out lines are the earlier workflow that wrote 'molecule.smi'
# (SMILES and drug name, tab-separated, no header) from the prostate cancer
# drug list. They are kept for provenance because the fingerprint cell further
# down reads 'molecule.smi'.

import pandas as pd

# df = pd.read_csv('../reference_drug/prostate_cancer_drugs_fda_approved_25aug2023.csv')
# df = pd.DataFrame({'SMILES': df['canonical_SMILES'], 'name': df['drug_name']})
# df.to_csv('molecule.smi', sep='\t', index=False, header=False)
# Note: the molecule.smi file required manual tab separation between SMILES and drug name!

gdsc2_csv_path = 'GDSC2_fitted_dose_response_27Oct23.csv'
df = pd.read_csv(gdsc2_csv_path)
df

Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,PATHWAY_NAME,COMPANY_ID,WEBRELEASE,MIN_CONC,MAX_CONC,LN_IC50,AUC,RMSE,Z_SCORE
0,GDSC2,343,15946310,683667,PFSK-1,SIDM01132,MB,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.000100,0.1,-1.463887,0.930220,0.089052,0.433123
1,GDSC2,343,15946548,684052,A673,SIDM00848,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.000100,0.1,-4.869455,0.614970,0.111351,-1.421100
2,GDSC2,343,15946830,684057,ES5,SIDM00263,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.000100,0.1,-3.360586,0.791072,0.142855,-0.599569
3,GDSC2,343,15947087,684059,ES7,SIDM00269,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.000100,0.1,-5.044940,0.592660,0.135539,-1.516647
4,GDSC2,343,15947369,684062,EW-11,SIDM00203,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.000100,0.1,-3.741991,0.734047,0.128059,-0.807232
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242031,GDSC2,343,16188242,1659928,SNU-175,SIDM00216,COREAD,2499,N-acetyl cysteine,Metabolism,Metabolism,1101,Y,2.001054,2000.0,10.127082,0.976746,0.074498,0.156872
242032,GDSC2,343,16188695,1660034,SNU-407,SIDM00214,COREAD,2499,N-acetyl cysteine,Metabolism,Metabolism,1101,Y,2.001054,2000.0,8.576377,0.913378,0.057821,-1.626959
242033,GDSC2,343,16188953,1660035,SNU-61,SIDM00194,COREAD,2499,N-acetyl cysteine,Metabolism,Metabolism,1101,Y,2.001054,2000.0,10.519636,0.975001,0.058090,0.608442
242034,GDSC2,343,16189493,1674021,SNU-C5,SIDM00498,COREAD,2499,N-acetyl cysteine,Metabolism,Metabolism,1101,Y,2.001054,2000.0,10.694579,0.969969,0.101013,0.809684


## Calculate fingerprints from SMILES using PaDELPy package
Types of fingerprints (as mentioned in the RefDNN paper):
- Fingerprinter (n=1024)
- ExtendedFingerprinter (n=1024)
- GraphOnlyFingerprinter (n=1024)

In [3]:
# Download the fingerprint XML descriptor files (one-time setup; uncomment the lines below to run)

# %pip install padelpy
# %pip install wget
# import wget
# import zipfile

# wget.download('https://github.com/dataprofessor/padel/raw/main/fingerprints_xml.zip')
# with zipfile.ZipFile('fingerprints_xml.zip', 'r') as zip_ref:
#     zip_ref.extractall()

In [4]:
# Collect every *.xml file in the working directory, sorted lexicographically
import glob

xml_files = sorted(glob.glob("*.xml"))
xml_files

['AtomPairs2DFingerprintCount.xml',
 'AtomPairs2DFingerprinter.xml',
 'EStateFingerprinter.xml',
 'ExtendedFingerprinter.xml',
 'Fingerprinter.xml',
 'GraphOnlyFingerprinter.xml',
 'KlekotaRothFingerprintCount.xml',
 'KlekotaRothFingerprinter.xml',
 'MACCSFingerprinter.xml',
 'PubchemFingerprinter.xml',
 'SubstructureFingerprintCount.xml',
 'SubstructureFingerprinter.xml']

In [5]:
# Short labels for the fingerprint types, written in the same order as the
# sorted XML file names so the two lists can be paired position by position.

FP_list = [
    # atom pairs
    'AtomPairs2DCount',
    'AtomPairs2D',
    # electrotopological state
    'EState',
    # CDK family
    'CDKextended',
    'CDK',
    'CDKgraphonly',
    # Klekota-Roth
    'KlekotaRothCount',
    'KlekotaRoth',
    # MACCS / PubChem keys
    'MACCS',
    'PubChem',
    # substructure
    'SubstructureCount',
    'Substructure',
]

In [6]:
# Map each short fingerprint label to its descriptor XML file.
#
# The pairing is purely positional (FP_list[i] <-> xml_files[i]) and zip()
# silently stops at the shorter input, so a missing or extra *.xml file in the
# working directory would drop or mislabel entries without any error.
# Fail loudly when the two lists cannot line up one-to-one.
if len(FP_list) != len(xml_files):
    raise ValueError(
        f"Expected {len(FP_list)} fingerprint XML files but found "
        f"{len(xml_files)}: {xml_files}"
    )

fp = dict(zip(FP_list, xml_files))
fp

{'AtomPairs2DCount': 'AtomPairs2DFingerprintCount.xml',
 'AtomPairs2D': 'AtomPairs2DFingerprinter.xml',
 'EState': 'EStateFingerprinter.xml',
 'CDKextended': 'ExtendedFingerprinter.xml',
 'CDK': 'Fingerprinter.xml',
 'CDKgraphonly': 'GraphOnlyFingerprinter.xml',
 'KlekotaRothCount': 'KlekotaRothFingerprintCount.xml',
 'KlekotaRoth': 'KlekotaRothFingerprinter.xml',
 'MACCS': 'MACCSFingerprinter.xml',
 'PubChem': 'PubchemFingerprinter.xml',
 'SubstructureCount': 'SubstructureFingerprintCount.xml',
 'Substructure': 'SubstructureFingerprinter.xml'}

In [7]:
# Calculate the CDK, CDKextended and CDKgraphonly molecular fingerprints for
# the molecules in 'molecule.smi', writing one CSV per fingerprint type.

from padelpy import padeldescriptor

fingerprints = ['CDK', 'CDKextended', 'CDKgraphonly']

# Options passed unchanged to every padeldescriptor call
padel_options = dict(
    detectaromaticity=True,
    standardizenitro=True,
    standardizetautomers=True,
    threads=2,
    removesalt=True,
    log=True,
    fingerprints=True,
)

for fingerprint in fingerprints:
    fingerprint_output_file = f'{fingerprint}.csv'  # e.g. 'CDK.csv'
    fingerprint_descriptortypes = fp[fingerprint]   # XML file for this label
    print(fingerprint_output_file)

    padeldescriptor(
        mol_dir='molecule.smi',
        d_file=fingerprint_output_file,
        descriptortypes=fingerprint_descriptortypes,
        **padel_options,
    )

CDK.csv


CDKextended.csv
CDKgraphonly.csv


In [8]:
# Merge the three fingerprint tables into a single table with one row per
# molecule, then save it to 'molecular_fingerprints.csv'.
import pandas as pd

fp_1 = pd.read_csv('CDK.csv')
fp_2 = pd.read_csv('CDKextended.csv')
fp_3 = pd.read_csv('CDKgraphonly.csv')

# Join on the molecule name instead of concatenating by row position.
# The three CSVs come from separate PaDEL runs (executed with threads=2), so
# their rows may not be in the same order or even the same number; a positional
# concat would then attach fingerprints to the wrong molecule without any error.
# validate='one_to_one' raises if a name is duplicated in any table, because a
# duplicated key would make the join ambiguous.
fp_merged = fp_1
for fp_other in (fp_2, fp_3):
    fp_merged = fp_merged.merge(fp_other, on='Name', how='inner', validate='one_to_one')

# An inner join drops molecules missing from any table; surface that explicitly.
if not (len(fp_merged) == len(fp_1) == len(fp_2) == len(fp_3)):
    raise ValueError(
        "Fingerprint tables do not contain the same molecules: "
        f"CDK={len(fp_1)}, CDKextended={len(fp_2)}, CDKgraphonly={len(fp_3)}, "
        f"merged={len(fp_merged)}"
    )

fp_merged.to_csv('molecular_fingerprints.csv', index=False)
fp_merged

Unnamed: 0,Name,FP1,FP2,FP3,FP4,FP5,FP6,FP7,FP8,FP9,...,GraphFP1015,GraphFP1016,GraphFP1017,GraphFP1018,GraphFP1019,GraphFP1020,GraphFP1021,GraphFP1022,GraphFP1023,GraphFP1024
0,Abiraterone Acetate,0,0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,Apalutamide,0,0,1,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,Bicalutamide,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Cabazitaxel,0,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Darolutamide,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,Degarelix,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
6,Docetaxel,0,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,Enzalutamide,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,Flutamide,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9,Goserelin Acetate,0,1,0,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0


# Done!!!