In [1]:
# reference: https://dataprofessor.github.io/ws/bioinformatics/cheminformatics/padelpy/scikit-learn/qsar/qspr/2021/07/06/padelpy.html#List-and-sort-fingerprint-XML-files

In [2]:
# Convert prostate cancer drug dataset to .smi file

# import pandas as pd

# df = pd.read_csv('../reference_drug/prostate_cancer_drugs_fda_approved_25aug2023.csv')
# df = pd.DataFrame({'SMILES': df['canonical_SMILES'], 'name': df['drug_name']})
# df.to_csv('molecule.smi', sep='\t', index=False, header=False)
# df
# Note: the molecule.smi file required manual tab separation between SMILES and drug name!

## Calculate fingerprints from SMILES using PaDELPy package
Types of fingerprints (as mentioned in the RefDNN paper):
- Fingerprinter (n=1024)
- ExtendedFingerprinter (n=1024)
- GraphOnlyFingerprinter (n=1024)

In [3]:
# Downloading the XML datafiles

# %pip install padelpy
# %pip install wget
# import wget
# import zipfile

# wget.download('https://github.com/dataprofessor/padel/raw/main/fingerprints_xml.zip')
# with zipfile.ZipFile('fingerprints_xml.zip', 'r') as zip_ref:
#     zip_ref.extractall()

In [4]:
# Collect the fingerprint descriptor XML files from the working directory,
# in alphabetical order
import glob

xml_files = sorted(glob.glob("*.xml"))
xml_files

['AtomPairs2DFingerprintCount.xml',
 'AtomPairs2DFingerprinter.xml',
 'EStateFingerprinter.xml',
 'ExtendedFingerprinter.xml',
 'Fingerprinter.xml',
 'GraphOnlyFingerprinter.xml',
 'KlekotaRothFingerprintCount.xml',
 'KlekotaRothFingerprinter.xml',
 'MACCSFingerprinter.xml',
 'PubchemFingerprinter.xml',
 'SubstructureFingerprintCount.xml',
 'SubstructureFingerprinter.xml']

In [5]:
# Short names for the fingerprint types.
# The entries are listed in the same order as the alphabetically sorted XML
# files, because the two lists are later paired up position by position.
FP_list = [
    'AtomPairs2DCount',
    'AtomPairs2D',
    'EState',
    'CDKextended',
    'CDK',
    'CDKgraphonly',
    'KlekotaRothCount',
    'KlekotaRoth',
    'MACCS',
    'PubChem',
    'SubstructureCount',
    'Substructure',
]

In [6]:
# Creating a data dictionary: fingerprint short name -> descriptor XML file name.
# FP_list and xml_files are paired purely by position, and zip() silently stops
# at the shorter list. A missing or extra .xml file in the directory would
# therefore produce a truncated or shifted mapping without any error, so the
# lengths are checked before pairing.
if len(FP_list) != len(xml_files):
    raise ValueError(
        f"Expected {len(FP_list)} fingerprint XML files to match FP_list, "
        f"but found {len(xml_files)}: {xml_files}"
    )

fp = dict(zip(FP_list, xml_files))
fp

{'AtomPairs2DCount': 'AtomPairs2DFingerprintCount.xml',
 'AtomPairs2D': 'AtomPairs2DFingerprinter.xml',
 'EState': 'EStateFingerprinter.xml',
 'CDKextended': 'ExtendedFingerprinter.xml',
 'CDK': 'Fingerprinter.xml',
 'CDKgraphonly': 'GraphOnlyFingerprinter.xml',
 'KlekotaRothCount': 'KlekotaRothFingerprintCount.xml',
 'KlekotaRoth': 'KlekotaRothFingerprinter.xml',
 'MACCS': 'MACCSFingerprinter.xml',
 'PubChem': 'PubchemFingerprinter.xml',
 'SubstructureCount': 'SubstructureFingerprintCount.xml',
 'Substructure': 'SubstructureFingerprinter.xml'}

In [7]:
# Calculate the CDK, CDKextended and CDKgraphonly molecular fingerprints
# for the molecules in molecule.smi, writing one CSV per fingerprint type

from padelpy import padeldescriptor
fingerprints = ['CDK', 'CDKextended', 'CDKgraphonly']

for fingerprint in fingerprints:
    # Output file is named after the fingerprint, e.g. CDK.csv
    fingerprint_output_file = f'{fingerprint}.csv'
    # XML descriptor file registered for this fingerprint in the fp dictionary
    fingerprint_descriptortypes = fp[fingerprint]
    print(fingerprint_output_file)

    # Options that stay the same for every fingerprint type
    padel_options = dict(
        detectaromaticity=True,
        standardizenitro=True,
        standardizetautomers=True,
        threads=2,
        removesalt=True,
        log=True,
        fingerprints=True,
    )
    padeldescriptor(
        mol_dir='molecule.smi',
        d_file=fingerprint_output_file,
        descriptortypes=fingerprint_descriptortypes,
        **padel_options,
    )

CDK.csv


CDKextended.csv
CDKgraphonly.csv


In [8]:
# Merge all 3 fingerprint vectors into a single table (one row per molecule)
import pandas as pd

fp_1 = pd.read_csv('CDK.csv')
fp_2 = pd.read_csv('CDKextended.csv')
fp_3 = pd.read_csv('CDKgraphonly.csv')

# Each CSV is written by a separate descriptor run, so nothing in this notebook
# guarantees that the three files list the molecules in the same row order.
# Concatenating by position would then attach fingerprints to the wrong
# molecule without any error. The 'Name' column is therefore compared first:
# matching tables are joined positionally as before, mismatched ones are
# re-ordered to follow CDK.csv, and tables that cannot be matched
# one-to-one raise an error.
# NOTE(review): assumes every CSV has a 'Name' column (it is the first column
# in the merged output) - confirm for all three files.
reference_names = fp_1['Name']
fingerprint_blocks = [fp_1]

for csv_name, table in (('CDKextended.csv', fp_2), ('CDKgraphonly.csv', fp_3)):
    if table['Name'].equals(reference_names):
        # Same molecules in the same order: a plain positional join is safe
        block = table.drop(columns='Name')
    else:
        cannot_align = (
            reference_names.duplicated().any()
            or table['Name'].duplicated().any()
            or set(table['Name']) != set(reference_names)
        )
        if cannot_align:
            raise ValueError(
                f"{csv_name} cannot be aligned with CDK.csv by 'Name' "
                "(different molecules or duplicate names)"
            )
        # Re-order the rows to follow CDK.csv, then drop the name index
        block = (
            table.set_index('Name')
            .loc[reference_names.tolist()]
            .reset_index(drop=True)
        )
    fingerprint_blocks.append(block)

fp_merged = pd.concat(fingerprint_blocks, axis=1)
fp_merged.to_csv('molecular_fingerprints.csv', index=False)
fp_merged

Unnamed: 0,Name,FP1,FP2,FP3,FP4,FP5,FP6,FP7,FP8,FP9,...,GraphFP1015,GraphFP1016,GraphFP1017,GraphFP1018,GraphFP1019,GraphFP1020,GraphFP1021,GraphFP1022,GraphFP1023,GraphFP1024
0,Abiraterone Acetate,0,0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,Apalutamide,0,0,1,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,Bicalutamide,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Cabazitaxel,0,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Darolutamide,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,Degarelix,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
6,Docetaxel,0,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,Enzalutamide,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,Flutamide,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9,Goserelin Acetate,0,1,0,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0


# Done!!!