In [1]:
import os
import sys

import numpy as np
import pandas as pd
from pycaret.classification import *
from sklearn.model_selection import train_test_split

In [2]:
# Optional single-target override, left disabled: the training loop assigns
# target_id itself for every entry of target_list.
#target_id = 'CHEMBL4247'
# Estimator ID handed to create_model() and embedded in the output file names.
algorithm_id = 'rf'

In [3]:
# Read the fingerprint report and collect each distinct ChEMBL target ID
# (first-seen order) as a plain Python list for the training loop.
df_target = pd.read_csv('fingerprints/report.csv')
target_list = df_target['Chembl_ID'].unique().tolist()

In [4]:
# Disabled helper: would select the targets from position 18 onward
# (presumably to resume an interrupted run -- confirm before re-enabling).
#leftover = target_list[18:]

In [5]:
# IC50 activity cut-off. pIC50_classifier multiplies it by 10**-9 before taking
# -log10, so the value is expressed in nanomolar. Also embedded in the output
# directory and file names.
IC50_threshold = 50

In [6]:
def pIC50_classifier(df, th1):
    """Replace the continuous ``pIC50`` column with a binary activity label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric ``pIC50`` column. It is not modified.
    th1 : float
        IC50 cut-off. It is scaled by 10**-9 before the -log10 conversion,
        i.e. it is interpreted as a nanomolar concentration.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index as ``df``, without the
        ``pIC50`` column and with a trailing int64 column ``pIC50_class``:
        1 where ``pIC50 >= -log10(th1 * 1e-9)``, otherwise 0. Rows with a
        missing ``pIC50`` are labelled 0.

    Raises
    ------
    KeyError
        If ``df`` has no ``pIC50`` column.
    """
    threshold1 = -np.log10(th1*(10**-9))
    # Derive the labels from the column itself so they carry df's index.
    # Building a fresh 0..n-1 Series and joining it with pd.concat(axis=1)
    # aligns on index labels, which adds NaN-filled rows whenever df does not
    # have a default RangeIndex (e.g. after filtering or sorting).
    pIC50_class = (df['pIC50'] >= threshold1).astype('int64')
    return df.drop(columns=['pIC50']).assign(pIC50_class=pIC50_class)

In [7]:
# Train, tune, evaluate and persist one classifier per ChEMBL target.
#
# For every target this cell:
#   1. loads its fingerprint table and binarises pIC50,
#   2. runs PyCaret setup / create_model / tune_model (optimising F1),
#   3. writes diagnostic plots, the tuned model's repr, the hold-out metrics
#      and the finalized pipeline into a per-target directory,
#   4. appends the hold-out metrics to df_complete_tuned_report.
#
# The report is grown incrementally on purpose: the number of targets is small
# and an interrupted run still leaves the metrics gathered so far available.
df_complete_tuned_report = pd.DataFrame()

# Remember where the run started; all input and output paths are relative to it.
project_root = os.getcwd()

for target_id in target_list:
    df_complete = pd.read_csv(f'fingerprints/{target_id}_fingerprints.csv')
    df_complete = pIC50_classifier(df_complete, IC50_threshold)
    reg = setup(df_complete, target='pIC50_class', session_id=42, use_gpu=True, train_size=0.8, silent= True)
    model = create_model(algorithm_id)
    tuned_model = tune_model(model, n_iter = 25, search_library = 'optuna', choose_better = True, optimize='F1')

    # NOTE(review): the directory name hard-codes 'rf_classifier' although
    # algorithm_id is configurable -- kept as-is so existing outputs stay put.
    path = f'models/rf_classifier_{IC50_threshold}_high/{target_id}_{algorithm_id}'
    os.makedirs(path, exist_ok=True)

    # plot_model(save=True) writes into the current working directory, so the
    # artefacts are produced from inside the target directory. The finally
    # clause guarantees the kernel returns to project_root even if a step
    # fails; otherwise every later relative path would silently break.
    os.chdir(path)
    try:
        # Plots are best-effort: some may be unavailable for a given model or
        # data set. Each is attempted independently and failures are reported
        # instead of being swallowed.
        for plot_name in ('auc', 'threshold', 'pr', 'confusion_matrix', 'error'):
            try:
                plot_model(tuned_model, plot = plot_name, save = True)
            except Exception as exc:
                print(f'[{target_id}] could not save {plot_name!r} plot: {exc!r}')

        # Store the tuned estimator's repr (its hyperparameters) without
        # touching sys.stdout, so notebook output can never stay redirected.
        with open(f'hyperparameters_{algorithm_id}_{target_id}.txt', 'w') as hyperparameter_file:
            print(tuned_model, file=hyperparameter_file)

        # predict_model without data scores the hold-out split; pull() then
        # returns the metrics grid that call displayed.
        predicted = predict_model(tuned_model)
        predicted_metrics = pull()
        predicted_metrics['ChemblID'] = target_id
        predicted_metrics.to_csv(f'predicted_metrics_{IC50_threshold}_{algorithm_id}_{target_id}.csv', index = False)
        df_complete_tuned_report = pd.concat([df_complete_tuned_report, predicted_metrics])

        final_rf = finalize_model(tuned_model)
        save_model (final_rf, f'{algorithm_id}_pipeline_{target_id}')
    finally:
        os.chdir(project_root)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.7609,0.8154,0.725,0.725,0.725,0.5135,0.5135


Transformation Pipeline and Model Successfully Saved


In [8]:
# Write the metrics gathered for all targets to a single CSV in the working directory.
complete_report_csv = f'{algorithm_id}_classifier_{IC50_threshold}_high_complete_report.csv'
df_complete_tuned_report.to_csv(complete_report_csv, index=False)