In [None]:
# Import the needed libraries
import pandas as pd
from pycaret.regression import *

In [None]:
# ChEMBL target identifiers to process. Entries are grouped by the protein
# class annotated for each target; the trailing comment is the target name.
target_id = [
    # -- No class annotation --
    'CHEMBL1957',     # Insulin-like growth factor I receptor
    'CHEMBL262',      # Glycogen synthase kinase-3 beta
    'CHEMBL1871',     # Androgen Receptor
    'CHEMBL3717',     # Hepatocyte growth factor receptor
    'CHEMBL325',      # Histone deacetylase 1
    'CHEMBL3713062',  # Tissue factor pathway inhibitor

    # -- Protein Kinase --
    'CHEMBL2971',     # Tyrosine-protein kinase JAK2
    'CHEMBL2842',     # Serine/threonine-protein kinase mTOR
    'CHEMBL1862',     # Tyrosine Protein Kinase ABL
    'CHEMBL258',      # Tyrosine-protein kinase LCK
    'CHEMBL4282',     # Serine/threonine-protein kinase AKT
    'CHEMBL3650',     # Fibroblast growth factor receptor 1

    # -- Transferase --
    'CHEMBL4005',     # PI3-kinase p110-alpha subunit
    'CHEMBL3130',     # PI3-kinase p110-delta subunit
    'CHEMBL3105',     # Poly [ADP-ribose] polymerase-1
    'CHEMBL3267',     # PI3-kinase p110-gamma subunit
    'CHEMBL3145',     # PI3-kinase p110-beta subunit
    'CHEMBL4158',     # Fatty acid synthase

    # -- Hydrolase --
    'CHEMBL220',      # Acetylcholinesterase
    'CHEMBL1914',     # Butyrylcholinesterase
    'CHEMBL2243',     # Anandamide amidohydrolase
    'CHEMBL4191',     # Monoglyceride lipase
    'CHEMBL3559',     # Steryl-sulfatase
    'CHEMBL5080',     # Endothelial lipase

    # -- Family A GPR --
    'CHEMBL217',      # Dopamine D2 receptor
    'CHEMBL218',      # Cannabinoid CB1 receptor
    'CHEMBL233',      # Mu opioid receptor
    'CHEMBL253',      # Cannabinoid CB2 receptor
    'CHEMBL224',      # Serotonin 2a (5-HT2a) receptor
    'CHEMBL210',      # Beta-2 adrenergic receptor

    # -- Oxidoreductase --
    'CHEMBL230',      # Cyclooxygenase-2
    'CHEMBL1951',     # Monoamine oxidase A
    'CHEMBL4685',     # Indoleamine 2,3-dioxygenase
    'CHEMBL4235',     # 11-beta-hydroxysteroid dehydrogenase 1
    'CHEMBL202',      # Dihydrofolate reductase
    'CHEMBL215',      # Arachidonate 5-lipoxygenase

    # -- Protease --
    'CHEMBL204',      # Thrombin
    'CHEMBL4822',     # Beta-secretase 1
    'CHEMBL244',      # Coagulation factor X
    'CHEMBL248',      # Leukocyte elastase
    'CHEMBL332',      # Matrix metalloproteinase-1
    'CHEMBL284',      # Dipeptidyl peptidase IV

    # -- Transporter --
    'CHEMBL2535',     # Glucose transporter
    'CHEMBL228',      # Serotonin transporter
    'CHEMBL238',      # Dopamine transporter
    'CHEMBL3884',     # Sodium/glucose cotransporter 2
]
# Fingerprint identifiers to process, in run order. The nine ECFP names are
# generated (variants 2/4/6, each at sizes 512/1024/2048); the rest are listed.
fingerprint_list = [
    *(
        f'ECFP{variant}_{size}'
        for variant in (2, 4, 6)
        for size in (512, 1024, 2048)
    ),
    'MACCS',
    'pairfps',
    'rdk',
    'tts',
    'standard',
    'fp3',
    'fp4',
    'klekota-roth',
    'mol2vec',
    'pubchem',
]

In [None]:
# Run a PyCaret model comparison for every (target, fingerprint) pair and
# collect the resulting score grids into one dataframe, tagged with the
# fingerprint and ChEMBL ID that produced each row.
score_grids = []  # one grid per run; concatenated once after the loop
for chembl_id in target_id:
    for fingerprint in fingerprint_list:
        # Forward slash keeps the path portable. The previous 'fingerprints\{...}'
        # form only resolved as a directory path on Windows and relied on an
        # invalid escape sequence.
        features = pd.read_csv(f'fingerprints/{chembl_id}_{fingerprint}.csv')
        # NOTE(review): `silent` looks like a PyCaret 2.x-only setup argument --
        # confirm the pinned version before upgrading.
        setup(
            features,
            target='pIC50',
            session_id=42,
            use_gpu=True,
            train_size=0.7,
            normalize=True,
            normalize_method='minmax',
            silent=True,
        )
        compare_models(include=['lightgbm', 'rf', 'br', 'knn', 'xgboost'])
        # pull() takes no model argument; it returns the latest score grid.
        # .assign builds a new frame so the grid PyCaret holds is not mutated.
        scores = pull().assign(fingerprint=fingerprint, CHEMBLID=chembl_id)
        score_grids.append(scores)
        print(f'{chembl_id} {fingerprint} has been completed.')
# A single concat avoids re-copying the accumulated frame on every iteration.
df_complete = pd.concat(score_grids) if score_grids else pd.DataFrame()
df_complete.to_csv('complete_comparison.csv', index=False)