In [None]:
# Import the needed libraries
import os

import numpy as np
import pandas as pd
from PyFingerprint.fingerprint import get_fingerprint, get_fingerprints

In [None]:
# Configuration: fingerprint types to compute and ChEMBL targets to process
fingerprints = [
    'standard',
    'pubchem',
    'klekota-roth',
    'mol2vec',
    'fp3',
    'fp4',
]

# (ChEMBL target ID, annotation) pairs. The order here is the processing order.
_target_annotations = (
    ('CHEMBL1957', 'Insulin-like growth factor I receptor'),
    ('CHEMBL262', 'Glycogen synthase kinase-3 beta'),
    ('CHEMBL1871', 'Androgen Receptor'),
    ('CHEMBL3717', 'Hepatocyte growth factor receptor'),
    ('CHEMBL325', 'Histone deacetylase 1'),
    ('CHEMBL3713062', 'Tissue factor pathway inhibitor'),
    ('CHEMBL2971', 'Tyrosine-protein kinase JAK2 - Protein Kinase'),
    ('CHEMBL2842', 'Serine/threonine-protein kinase mTOR - Protein Kinase'),
    ('CHEMBL1862', 'Tyrosine Protein Kinase ABL - Protein Kinase'),
    ('CHEMBL258', 'Tyrosine-protein kinase LCK - Protein Kinase'),
    ('CHEMBL4282', 'Serine/threonine-protein kinase AKT - Protein Kinase'),
    ('CHEMBL3650', 'Fibroblast growth factor receptor 1 - Protein Kinase'),
    ('CHEMBL4005', 'PI3-kinase p110-alpha subunit - Transferase'),
    ('CHEMBL3130', 'PI3-kinase p110-delta subunit - Transferase'),
    ('CHEMBL3105', 'Poly [ADP-ribose] polymerase-1 - Transferase'),
    ('CHEMBL3267', 'PI3-kinase p110-gamma subunit - Transferase'),
    ('CHEMBL3145', 'PI3-kinase p110-beta subunit - Transferase'),
    ('CHEMBL4158', 'Fatty acid synthase - Transferase'),
    ('CHEMBL220', 'Acetylcholinesterase - Hydrolase'),
    ('CHEMBL1914', 'Butyrylcholinesterase - Hydrolase'),
    ('CHEMBL2243', 'Anandamide amidohydrolase - Hydrolase'),
    ('CHEMBL4191', 'Monoglyceride lipase - Hydrolase'),
    ('CHEMBL3559', 'Steryl-sulfatase - Hydrolase'),
    ('CHEMBL5080', 'Endothelial lipase - Hydrolase'),
    ('CHEMBL217', 'Dopamine D2 receptor - Family A GPR'),
    ('CHEMBL218', 'Cannabinoid CB1 receptor - Family A GPR'),
    ('CHEMBL233', 'Mu opioid receptor - Family A GPR'),
    ('CHEMBL253', 'Cannabinoid CB2 receptor - Family A GPR'),
    ('CHEMBL224', 'Serotonin 2a (5-HT2a) receptor - Family A GPR'),
    ('CHEMBL210', 'Beta-2 adrenergic receptor - Family A GPR'),
    ('CHEMBL230', 'Cyclooxygenase-2 - Oxidoreductase'),
    ('CHEMBL1951', 'Monoamine oxidase A - Oxidoreductase'),
    ('CHEMBL4685', 'Indoleamine 2,3-dioxygenase - Oxidoreductase'),
    ('CHEMBL4235', '11-beta-hydroxysteroid dehydrogenase 1 - Oxidoreductase'),
    ('CHEMBL202', 'Dihydrofolate reductase - Oxidoreductase'),
    ('CHEMBL215', 'Arachidonate 5-lipoxygenase - Oxidoreductase'),
    ('CHEMBL204', 'Thrombin - Protease'),
    ('CHEMBL4822', 'Beta-secretase 1 - Protease'),
    ('CHEMBL244', 'Coagulation factor X - Protease'),
    ('CHEMBL248', 'Leukocyte elastase - Protease'),
    ('CHEMBL332', 'Matrix metalloproteinase-1 - Protease'),
    ('CHEMBL284', 'Dipeptidyl peptidase IV - Protease'),
    ('CHEMBL2535', 'Glucose transporter - Transporter'),
    ('CHEMBL228', 'Serotonin transporter - Transporter'),
    ('CHEMBL238', 'Dopamine transporter - Transporter'),
    ('CHEMBL3884', 'Sodium/glucose cotransporter 2 - Transporter'),
)

# Only the IDs are used downstream (to build input and output file names).
target_list = [chembl_id for chembl_id, _annotation in _target_annotations]

In [None]:
# Generate every fingerprint type for every target and save one CSV per
# (target, fingerprint) pair under fingerprints/.
def fingerprint_frame(matrix, fp_name, activity):
    """Assemble the table that is written to disk for one fingerprint type.

    Parameters
    ----------
    matrix : array-like
        Fingerprint values, one row per molecule and one column per feature.
    fp_name : str
        Fingerprint type. Used as the column-name prefix (``<fp_name>_<i>``)
        and to choose the stored dtype.
    activity : pandas.Series
        pIC50 values, one per molecule, appended as the last column.

    Returns
    -------
    pandas.DataFrame
        The feature columns followed by the activity column.

    Raises
    ------
    ValueError
        If ``matrix`` is not two-dimensional (e.g. no molecules at all) or
        its row count differs from ``len(activity)``.
    """
    # mol2vec is stored as float32; every other fingerprint type as int.
    dtype = 'float32' if fp_name == 'mol2vec' else 'int'
    matrix = np.asarray(matrix, dtype=dtype)

    if matrix.ndim != 2:
        raise ValueError(
            f'{fp_name}: expected a 2-D fingerprint matrix, got shape {matrix.shape}'
        )
    # concat(axis=1) aligns on the index and would silently pad a length
    # mismatch with NaN, so fail loudly instead.
    if matrix.shape[0] != len(activity):
        raise ValueError(
            f'{fp_name}: {matrix.shape[0]} fingerprints for {len(activity)} activity values'
        )

    # shape[1] instead of len(matrix[1]): the latter needs at least two rows.
    columns = [f'{fp_name}_{i}' for i in range(matrix.shape[1])]
    frame = pd.DataFrame(matrix, columns=columns)
    # Drop the activity index so the two pieces are joined row by row.
    return pd.concat([frame, activity.reset_index(drop=True)], axis=1)


# to_csv does not create missing directories; create the output folder up
# front so a long run does not fail at the first save.
os.makedirs('fingerprints', exist_ok=True)

for t in target_list:
    df = pd.read_csv(f'smiles/{t}_smiles.csv')
    smiles_list = df.canonical_smiles.to_list()
    print(f'{t} is successfully loaded')

    # One list of fingerprint objects per fingerprint type.
    output = {f: get_fingerprints(smiles_list, f) for f in fingerprints}
    print(f'{t} fingerprints creation successful')

    # Convert each list of fingerprint objects into a single numpy array.
    output_np = {
        k: np.array([fp.to_numpy() for fp in fps])
        for k, fps in output.items()
    }
    print(f'{t} numpy array creation successful')

    for n in fingerprints:
        df_fp = fingerprint_frame(output_np[n], n, df.pIC50)
        df_fp.to_csv(f'fingerprints/{t}_{n}.csv', index=False)
        print(f'{t} {n} fingerprint have been saved')