In [10]:
import sys
from pathlib import Path

package_path = str(Path.cwd().parent)
if package_path not in sys.path:
    sys.path.append(package_path)

import pickle
import warnings
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score

from tbprop.metrics import max_f1_score
from tbprop.tree_based.utils import threshold_analysis, heatmap
from tbprop.tree_based.file_handling import read_pk_dataset, read_pk_dataset_with_split
from tbprop.tree_based.preprocessing import calc_high_corr_feats, calc_low_variance_feats, scale_columns
from tbprop.tree_based.features import morgan_fingerprint
from tbprop.tree_based.model_selection import randomized_search, CrossValidator
from tbprop.tree_based.search_spaces import hyperopt_space, random_search_space
from tbprop.tree_based.optimizers import HyperoptOptimizer, OptunaOptimizer, optimize_models
from tbprop.model_comparison import compare_models_optimizers_on_split, hypothesis_test_multiple_comparison

from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from rdkit.DataStructs.cDataStructs import ConvertToNumpyArray

tqdm.pandas()

In [11]:
class Args:
    def __init__(self):
        self.data_dir = "../data"
        self.dataset = "pk"
        
        prefix = f"{self.data_dir}/{self.dataset}/{self.dataset}"
        self.trn_path = prefix + "_trn.csv"
        self.val_path = prefix + "_val.csv"
        self.tst_path = prefix + "_tst.csv"
        self.full_path = prefix + "_full.csv"

        self.mode = "bin_class"

        self.test_size = 0.2
        self.cross_val_k = 5
        self.mfp_nbits = 64
        self.corr_thresh = 0.95
        self.var_thresh = 0.1
        
        self.random_state = 42
        self.random_seed = 0

args = Args()

### Read Data

In [12]:
df_base = pd.read_csv(args.full_path)
df_trn = pd.read_csv(args.trn_path)
df_val = pd.read_csv(args.val_path)
df_tst = pd.read_csv(args.tst_path)
print(df_base.shape, df_trn.shape, df_val.shape, df_tst.shape)

moe_feats = df_base.drop(['Cmpd Name'], axis=1).rename({'mol': 'smiles'}, axis=1)

df_train = moe_feats[moe_feats['smiles'].isin(df_trn['smiles'].values)]
df_valid = moe_feats[moe_feats['smiles'].isin(df_val['smiles'].values)]
df_test = moe_feats[moe_feats['smiles'].isin(df_tst['smiles'].values)]

y_trn, y_val, y_tst = df_trn['auc_bin'], df_val['auc_bin'], df_tst['auc_bin']

X_trn_p1 = df_trn.drop(['Unnamed: 0', 'auc_bin', 'smiles'], axis=1)
X_val_p1 = df_val.drop(['Unnamed: 0', 'auc_bin', 'smiles'], axis=1)
X_tst_p1 = df_tst.drop(['Unnamed: 0', 'auc_bin', 'smiles'], axis=1)

(190, 209) (114, 4178) (38, 4178) (38, 4178)


In [13]:
df_train['auc_bin'] = df_train['AUC'].apply(lambda x: x > 1000.).astype(int)
df_valid['auc_bin'] = df_valid['AUC'].apply(lambda x: x > 1000.).astype(int)
df_test['auc_bin'] = df_test['AUC'].apply(lambda x: x > 1000.).astype(int)


df_train_valid = pd.concat([df_train, df_valid]).reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['auc_bin'] = df_train['AUC'].apply(lambda x: x > 1000.).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_valid['auc_bin'] = df_valid['AUC'].apply(lambda x: x > 1000.).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['auc_bin'] = df_test['AUC'].apply(lambda x:

In [14]:
model_metadata = {}

for model in ['rf', 'xgb', 'lgbm', 'cb']:
    metadata = pickle.load(open(f"{args.data_dir}/configs/best_pk_model_metadata_{model}.pkl", "rb"))
    model_metadata[model] = metadata

In [15]:
tst_smiles = df_tst['smiles'].values

In [16]:
def evaluate_and_predict(args, model_name, model_metadata, X_trn, X_val, X_tst, y_trn, y_val, y_tst, tst_smiles):
    if model_name == 'xgb':
        model_class = XGBClassifier
    elif model_name == 'rf':
        model_class = RandomForestClassifier
    elif model_name == 'lgbm':
        model_class = LGBMClassifier
    elif model_name == 'cb':
        model_class = CatBoostClassifier
    else:
        raise ValueError(f"Model name '{model_name}' not recognized!")

    model = model_class(**model_metadata[model_name]['config'])
    model.fit(pd.concat([X_trn, X_val])[model_metadata[model_name]['features']], np.concatenate([y_trn, y_val]))
    y_score = model.predict_proba(X_tst)[:,1]

    print(f"Model name = {model_metadata[model_name]['name']}")
    print(f"AUROC = {round(roc_auc_score(y_tst, y_score), 5)*100}")
    print(f"AP = {round(average_precision_score(y_tst, y_score), 5)*100}")
    print(f"Max F1 = {round(max_f1_score(y_tst, y_score), 5)*100}")

    df_preds = pd.DataFrame({'smiles': tst_smiles, 'scores': y_score})
    df_preds.to_csv(f"{args.data_dir}/predictions/pk_{model_name}_tst_preds.csv")

In [28]:
evaluate_and_predict(args, 'xgb', model_metadata, X_trn_p1, X_val_p1, X_tst_p1, y_trn, y_val, y_tst, tst_smiles)

Model name = XGBClassifier/hyperopt/P1
AUROC = 90.166
AP = 82.946
Max F1 = 88.889


In [29]:
model_metadata['lgbm']['features']

['BCUT_PEOE_1',
 'GCUT_PEOE_0',
 'GCUT_PEOE_1',
 'GCUT_SLOGP_0',
 'GCUT_SLOGP_1',
 'GCUT_SMR_0',
 'PEOE_RPC+',
 'PEOE_VSA_FHYD',
 'PEOE_VSA_FNEG',
 'PEOE_VSA_FPNEG',
 'PEOE_VSA_FPOL',
 'PEOE_VSA_FPOS',
 'petitjean',
 'petitjeanSC',
 'rsynth',
 'mfp_2',
 'mfp_3',
 'mfp_4',
 'mfp_5',
 'mfp_6',
 'mfp_9',
 'mfp_10',
 'mfp_12',
 'mfp_13',
 'mfp_14',
 'mfp_15',
 'mfp_19',
 'mfp_20',
 'mfp_21',
 'mfp_22',
 'mfp_24',
 'mfp_25',
 'mfp_26',
 'mfp_28',
 'mfp_29',
 'mfp_30',
 'mfp_31',
 'mfp_32',
 'mfp_33',
 'mfp_34',
 'mfp_35',
 'mfp_36',
 'mfp_38',
 'mfp_39',
 'mfp_41',
 'mfp_42',
 'mfp_43',
 'mfp_45',
 'mfp_46',
 'mfp_47',
 'mfp_48',
 'mfp_49',
 'mfp_50',
 'mfp_51',
 'mfp_52',
 'mfp_53',
 'mfp_54',
 'mfp_56',
 'mfp_57',
 'mfp_60',
 'mfp_61',
 'mfp_62',
 'mfp_63']

In [30]:
df_train

Unnamed: 0,smiles,AUC,apol,ast_fraglike,ast_fraglike_ext,ast_violation,ast_violation_ext,a_acc,a_acid,a_aro,...,vsa_base,vsa_don,vsa_hyd,vsa_other,vsa_pol,Weight,weinerPath,weinerPol,zagreb,auc_bin
0,Clc1c(C(=O)NNC(=O)c2oc([N+](=O)[O-])cc2)sc2c1c...,15052.0,42.364346,0,0,2,3,2,0,14,...,0.0,18.842079,163.31906,83.741920,45.975922,365.75299,1489,36,126,1
2,Clc1c(C(=O)NNC(=O)c2oc([N+](=O)[O-])cc2)sc2c1c...,2592.0,46.259930,0,0,2,3,3,0,14,...,0.0,18.842079,191.62750,83.741920,48.479675,395.77899,1872,40,136,1
3,Clc1c(C(=O)NNC(=O)c2oc([N+](=O)[O-])cc2)sc2c1c...,75175.0,43.877552,0,0,2,3,2,0,14,...,0.0,18.842079,181.61226,83.741920,45.975922,400.19800,1668,38,132,1
5,O=[N+]([O-])c1oc(C(=O)NNC(=O)c2[nH]c3c(c2)cc(-...,6.0,52.945103,0,0,2,3,2,0,20,...,0.0,24.524654,197.94620,83.741920,51.658497,390.35498,2632,43,154,0
6,Clc1ccc(Oc2cc3c([nH]c(C(=O)NNC(=O)c4oc([N+](=O...,12810.0,55.260307,0,0,2,3,2,0,20,...,0.0,24.524654,228.73935,86.245674,51.658497,440.79898,3266,45,164,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,S(=O)(=O)(Nc1cc2c(C)c(C(=O)N3CC4(C3)CCSCC4)[nH...,102.0,67.802994,0,0,2,3,3,0,9,...,0.0,13.757783,307.70538,23.055140,59.341228,435.61301,2478,45,160,0
186,S(=O)(=O)(Nc1nc2[nH]ncc2cn1)CCCCF,168.0,34.402515,0,0,2,4,6,0,9,...,0.0,8.075207,126.73853,36.977745,70.298958,273.29199,669,22,90,0
187,Clc1c(C(=O)Nc2cc(c(OC)cc2)C2=CCC(C(=O)NCCC)CC2...,4559.0,66.919617,0,0,2,3,3,0,12,...,0.0,11.365152,338.28323,25.899061,41.002750,444.93399,3026,49,156,1
188,Clc1c(C(=O)Nc2cc(c(OC)cc2)C2=CCC(C(=O)NCCOCc3c...,128.0,80.948792,0,0,3,4,4,0,18,...,0.0,11.365152,418.43423,25.899061,43.506508,537.03101,5892,58,192,0


In [43]:
class FeatureEngineeringPipelinePost:

    def __init__(self, feature_set, fp_n_bits, smiles_col='SMILES', label_col='auc_bin'):
        self.feature_set = feature_set
        self.fp_n_bits = fp_n_bits
        self.smiles_col = smiles_col
        self.label_col = label_col

        self.morgan_features = [col for col in self.feature_set if col.startswith("mfp_")]
        self.moe_features = [col for col in self.feature_set if not col.startswith("mfp_")]

    def fit_transform(self, df_train, df_test):
        df_train_, df_test_ = df_train.copy().reset_index(drop=True), df_test.copy().reset_index(drop=True)
        df_train_ = morgan_fingerprint(df_train_, smiles_col=self.smiles_col, n_bits=self.fp_n_bits)
        df_test_ = morgan_fingerprint(df_test_, smiles_col=self.smiles_col, n_bits=self.fp_n_bits)

        df_train_moe, df_test_moe = df_train_[self.moe_features], df_test_[self.moe_features]
        df_train_morgan, df_test_morgan = df_train_[self.morgan_features], df_test_[self.morgan_features]

        df_train_moe_scaled, df_test_moe_scaled, scaler = scale_columns(df_train_moe, 
                                                                        df_test_moe, remove_cols=[])

        X_train = pd.concat([df_train_moe_scaled, df_train_morgan], axis=1)
        X_test = pd.concat([df_test_moe_scaled, df_test_morgan], axis=1)

        y_train = df_train_[self.label_col]
        y_test = df_test_[self.label_col]

        return X_train, X_test, y_train, y_test

In [44]:
model_metadata['rf']['config']['max_features'] = 'sqrt'

64

In [51]:
fepp_rf = FeatureEngineeringPipelinePost(model_metadata['rf']['features'], 
                                         fp_n_bits=args.mfp_nbits, 
                                         smiles_col='smiles', 
                                         label_col='auc_bin')

X_train_rf, X_test_rf, y_train_rf, y_test_rf = fepp_rf.fit_transform(df_train, df_test)

X_valid_rf, y_valid_rf = X_train_rf[-10:], y_train_rf[-10:]
X_train_rf, y_train_rf = X_train_rf[:-10], y_train_rf[:-10]


In [56]:
evaluate_and_predict(args, 'rf', model_metadata, 
                     X_train_rf, X_valid_rf, X_test_rf, 
                     y_train_rf, y_valid_rf, y_test_rf, tst_smiles)

Model name = RandomForestClassifier/optuna/P2
AUROC = 91.413
AP = 87.748
Max F1 = 90.0


In [57]:
fepp_lgbm = FeatureEngineeringPipelinePost(model_metadata['lgbm']['features'], 
                                           fp_n_bits=args.mfp_nbits, 
                                           smiles_col='smiles', 
                                           label_col='auc_bin')

X_train_lgbm, X_test_lgbm, y_train_lgbm, y_test_lgbm = fepp_lgbm.fit_transform(df_train, df_test)

X_valid_lgbm, y_valid_lgbm = X_train_lgbm[-10:], y_train_lgbm[-10:]
X_train_lgbm, y_train_lgbm = X_train_lgbm[:-10], y_train_lgbm[:-10]

evaluate_and_predict(args, 'lgbm', model_metadata, 
                     X_train_lgbm, X_valid_lgbm, X_test_lgbm, 
                     y_train_lgbm, y_valid_lgbm, y_test_lgbm, 
                     tst_smiles)

Model name = LGBMClassifier/optuna/P2
AUROC = 88.089
AP = 83.333
Max F1 = 84.444


In [63]:
skip_params = ['bayesian_matrix_reg', 'force_unit_auto_pair_weights', 'pool_metainfo_options']

model_metadata['cb']['config'] = \
    {k: v for k, v in model_metadata['cb']['config'].items() if k not in skip_params}

In [64]:
fepp_cb = FeatureEngineeringPipelinePost(model_metadata['cb']['features'], 
                                         fp_n_bits=args.mfp_nbits, 
                                         smiles_col='smiles', 
                                         label_col='auc_bin')

X_train_cb, X_test_cb, y_train_cb, y_test_cb = fepp_cb.fit_transform(df_train, df_test)

X_valid_cb, y_valid_cb = X_train_cb[-10:], y_train_cb[-10:]
X_train_cb, y_train_cb = X_train_cb[:-10], y_train_cb[:-10]

evaluate_and_predict(args, 'cb', model_metadata, 
                     X_train_cb, X_valid_cb, X_test_cb, 
                     y_train_cb, y_valid_cb, y_test_cb, 
                     tst_smiles)

0:	learn: 0.5888274	total: 117ms	remaining: 21.9s
1:	learn: 0.4913323	total: 150ms	remaining: 13.9s
2:	learn: 0.4785843	total: 150ms	remaining: 9.28s
3:	learn: 0.4104663	total: 180ms	remaining: 8.26s
4:	learn: 0.3502735	total: 210ms	remaining: 7.67s
5:	learn: 0.2993867	total: 247ms	remaining: 7.5s
6:	learn: 0.2526188	total: 281ms	remaining: 7.26s
7:	learn: 0.2096463	total: 312ms	remaining: 7.03s
8:	learn: 0.1796658	total: 351ms	remaining: 6.97s
9:	learn: 0.1554888	total: 382ms	remaining: 6.79s
10:	learn: 0.1321195	total: 413ms	remaining: 6.65s
11:	learn: 0.1115795	total: 444ms	remaining: 6.51s
12:	learn: 0.0974563	total: 476ms	remaining: 6.41s
13:	learn: 0.0886634	total: 484ms	remaining: 6.01s
14:	learn: 0.0761657	total: 511ms	remaining: 5.89s
15:	learn: 0.0693763	total: 549ms	remaining: 5.9s
16:	learn: 0.0622117	total: 562ms	remaining: 5.65s
17:	learn: 0.0549441	total: 604ms	remaining: 5.7s
18:	learn: 0.0478549	total: 633ms	remaining: 5.63s
19:	learn: 0.0420782	total: 664ms	remaining: