In [None]:
# Import the needed libraries
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdFingerprintGenerator
from MACCS import *
from rdkit.Chem import Descriptors

In [None]:
# Protein Target List (ChEMBL identifiers), grouped by the protein family
# noted for each entry. Order matters only in that it is the processing order.
target_id = [
    # --- Targets listed without a family label ---
    'CHEMBL1957',     # Insulin-like growth factor I receptor
    'CHEMBL262',      # Glycogen synthase kinase-3 beta
    'CHEMBL1871',     # Androgen Receptor
    'CHEMBL3717',     # Hepatocyte growth factor receptor
    'CHEMBL325',      # Histone deacetylase 1
    'CHEMBL3713062',  # Tissue factor pathway inhibitor

    # --- Protein Kinase ---
    'CHEMBL2971',     # Tyrosine-protein kinase JAK2
    'CHEMBL2842',     # Serine/threonine-protein kinase mTOR
    'CHEMBL1862',     # Tyrosine Protein Kinase ABL
    'CHEMBL258',      # Tyrosine-protein kinase LCK
    'CHEMBL4282',     # Serine/threonine-protein kinase AKT
    'CHEMBL3650',     # Fibroblast growth factor receptor 1

    # --- Transferase ---
    'CHEMBL4005',     # PI3-kinase p110-alpha subunit
    'CHEMBL3130',     # PI3-kinase p110-delta subunit
    'CHEMBL3105',     # Poly [ADP-ribose] polymerase-1
    'CHEMBL3267',     # PI3-kinase p110-gamma subunit
    'CHEMBL3145',     # PI3-kinase p110-beta subunit
    'CHEMBL4158',     # Fatty acid synthase

    # --- Hydrolase ---
    'CHEMBL220',      # Acetylcholinesterase
    'CHEMBL1914',     # Butyrylcholinesterase
    'CHEMBL2243',     # Anandamide amidohydrolase
    'CHEMBL4191',     # Monoglyceride lipase
    'CHEMBL3559',     # Steryl-sulfatase
    'CHEMBL5080',     # Endothelial lipase

    # --- Family A GPCR ---
    'CHEMBL217',      # Dopamine D2 receptor
    'CHEMBL218',      # Cannabinoid CB1 receptor
    'CHEMBL233',      # Mu opioid receptor
    'CHEMBL253',      # Cannabinoid CB2 receptor
    'CHEMBL224',      # Serotonin 2a (5-HT2a) receptor
    'CHEMBL210',      # Beta-2 adrenergic receptor

    # --- Oxidoreductase ---
    'CHEMBL230',      # Cyclooxygenase-2
    'CHEMBL1951',     # Monoamine oxidase A
    'CHEMBL4685',     # Indoleamine 2,3-dioxygenase
    'CHEMBL4235',     # 11-beta-hydroxysteroid dehydrogenase 1
    'CHEMBL202',      # Dihydrofolate reductase
    'CHEMBL215',      # Arachidonate 5-lipoxygenase

    # --- Protease ---
    'CHEMBL204',      # Thrombin
    'CHEMBL4822',     # Beta-secretase 1
    'CHEMBL244',      # Coagulation factor X
    'CHEMBL248',      # Leukocyte elastase
    'CHEMBL332',      # Matrix metalloproteinase-1
    'CHEMBL284',      # Dipeptidyl peptidase IV

    # --- Transporter ---
    'CHEMBL2535',     # Glucose transporter
    'CHEMBL228',      # Serotonin transporter
    'CHEMBL238',      # Dopamine transporter
    'CHEMBL3884',     # Sodium/glucose cotransporter 2
]

In [None]:
# Define helper functions and shared fingerprint parameters
def molsmile (smiles):
    """Convert SMILES strings into RDKit molecule objects.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings to parse.

    Returns
    -------
    list
        One ``Chem.MolFromSmiles`` result per input, in input order.
        Results are not filtered, so whatever RDKit returns for a string it
        cannot parse (``None``, per the RDKit docs) is kept in place.
    """
    return [Chem.MolFromSmiles(smi) for smi in smiles]
    
# Parameter grid: every radius paired with every bit-vector length,
# radius-major (all lengths for radius 1 first, then radius 2, ...).
radius = [1, 2, 3]
bit_num = [512, 1024, 2048]
combined = [
    (rad, n_bits)
    for rad in radius
    for n_bits in bit_num
]


In [None]:
# Generate Atom Pair, ECFP, MACCS, RDKit, and Topological Torsion fingerprints
def _fingerprint_frame(fpgen, mols, prefix, labels):
    """Build a one-column-per-bit fingerprint table with the labels appended.

    Parameters
    ----------
    fpgen : RDKit fingerprint generator
        Any object exposing ``GetFingerprint(mol)``.
    mols : list
        Molecules to fingerprint, in the same row order as ``labels``.
    prefix : str
        Column-name prefix; bit ``i`` is named ``f'{prefix}_bit_{i}'``.
    labels : pandas.Series
        Column appended to the right of the bit columns (aligned on index).

    Returns
    -------
    pandas.DataFrame
        Bit columns followed by the ``labels`` column.
    """
    bit_rows = [list(fpgen.GetFingerprint(mol)) for mol in mols]
    fp_df = pd.DataFrame(bit_rows)
    # Name the columns from the table's real width. The previous code sized
    # them from fingerprint [1], which raised IndexError whenever a target
    # had only a single molecule.
    fp_df.columns = [f'{prefix}_bit_{i}' for i in range(fp_df.shape[1])]
    return pd.concat([fp_df, labels], axis=1)


# The generators do not depend on the target, so build them once up front.
atom_pair_gen = rdFingerprintGenerator.GetAtomPairGenerator()
rdkit_fp_gen = rdFingerprintGenerator.GetRDKitFPGenerator()
torsion_gen = rdFingerprintGenerator.GetTopologicalTorsionGenerator()
# Morgan generators replace the deprecated AllChem.GetMorganFingerprintAsBitVect;
# one generator per (radius, bit length) combination.
morgan_gens = {
    (r, b): rdFingerprintGenerator.GetMorganGenerator(radius=r, fpSize=b)
    for r, b in combined
}

for t in target_id:
    df = pd.read_csv(f'smiles/{t}_smiles.csv')  # read smiles
    mol_list = molsmile(df.canonical_smiles.to_list())  # smiles -> mol objects

    # molsmile keeps None for SMILES that RDKit could not parse, and
    # GetFingerprint(None) would abort the whole run. Drop those rows from
    # both the molecules and the frame so pIC50 stays aligned row-for-row.
    parsed = [mol is not None for mol in mol_list]
    if not all(parsed):
        print(f'{t}: skipping {parsed.count(False)} unparsable SMILES')
        df = df.loc[parsed].reset_index(drop=True)
        mol_list = [mol for mol in mol_list if mol is not None]

    # Atom Pair Generator
    pairfps_df = _fingerprint_frame(atom_pair_gen, mol_list, 'pairfps', df.pIC50)
    pairfps_df.to_csv(f'fingerprints/{t}_pairfps.csv', index=False)

    # ECFP Generator (file name uses the diameter, i.e. 2 * radius: ECFP2/4/6)
    for (r, b), morgan_gen in morgan_gens.items():
        fingerprint_df = _fingerprint_frame(morgan_gen, mol_list, 'ecfp', df.pIC50)
        rad_name = r * 2
        fingerprint_df.to_csv(f'fingerprints/{t}_ECFP{rad_name}_{b}.csv', index=False)

    # MACCS Generator
    # NOTE(review): compute_MACCS presumably writes '{t}_MACCS.csv' into the
    # working directory (it is read back on the next line) - confirm in MACCS.py.
    maccs_descriptor = MACCS(df['canonical_smiles'])
    maccs_descriptor.compute_MACCS(f'{t}')
    maccs_raw = pd.read_csv(f'{t}_MACCS.csv')
    maccs_df = (
        df.merge(maccs_raw, how='inner', left_on='canonical_smiles', right_on='smiles')
        .drop(columns=['molecule_chembl_id', 'smiles'])
    )
    maccs_df.to_csv(f'fingerprints/{t}_MACCS.csv', index=False)

    # RDKit Fingerprint Generator
    rdk_df = _fingerprint_frame(rdkit_fp_gen, mol_list, 'rdk', df.pIC50)
    rdk_df.to_csv(f'fingerprints/{t}_rdk.csv', index=False)

    # Topological Torsion Generator
    tts_df = _fingerprint_frame(torsion_gen, mol_list, 'tts', df.pIC50)
    tts_df.to_csv(f'fingerprints/{t}_tts.csv', index=False)
