In [1]:
import os
import mlflow
import matplotlib.pyplot as plt
import altair as alt
import numpy as np
import pandas as pd

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Optional Altair renderer override, intentionally left disabled.
#alt.renderers.enable("html")
# Lift Altair's row limit on chart data so larger frames can be plotted.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [2]:
# The MLflow server address comes from the TRACKING_URI environment variable;
# when it is unset, a server on localhost:5000 is used instead.
tracking_uri = os.environ.get("TRACKING_URI", "http://localhost:5000")
mlflow.set_tracking_uri(tracking_uri)

## Experiment Parameter-Search Gradient Boosting Decision Tree (XGBoost)

- Single random seed for model training for all models
- Single random seed for random split for all models
- Atomic attribution is calculated per run/model using various methods (e.g. feature importances, SHAP)
- Single target objective (first)
- Hyperparameter search using the Optuna TPE sampler (Tree-structured Parzen Estimator - https://optuna.readthedocs.io/en/stable/reference/generated/optuna.samplers.TPESampler.html)

In [3]:
# Load every run of the hyperparameter-search experiment into a DataFrame.
experiment_name = "herg_gbdt_opttpe1"
exp = mlflow.get_experiment_by_name(experiment_name)
# get_experiment_by_name returns None for an unknown name; fail with a clear
# message instead of an AttributeError on `exp.experiment_id` below.
if exp is None:
    raise ValueError(
        f"MLflow experiment {experiment_name!r} was not found on the configured tracking server"
    )

# search_runs documents `experiment_ids` as a list of IDs, so wrap the single ID.
runs = mlflow.search_runs(experiment_ids=[exp.experiment_id])

In [None]:
#runs.iloc[1:].describe().T

In [None]:
#runs.iloc[1:].corr()

### Search space

In [4]:
# Columns whose name contains "search_space/" hold the tuned hyperparameter definitions.
search_space = [name for name in runs.columns if "search_space/" in name]
# Values are taken from the first run only, capped at 99 entries.
first_run_space = runs[search_space].iloc[0].head(n=99)
for r in first_run_space:
    print(r)

{'name': 'max_depth', 'type': 'choice', 'values': [None, 4, 8, 12, 16, 20, 24, 28, 32, 36]}
{'name': 'subsample', 'type': 'choice', 'values': [0.6, 0.8, 1.0]}
{'name': 'min_child_weight', 'type': 'choice', 'values': [1, 5, 10]}
{'name': 'gamma', 'type': 'choice', 'values': [0.5, 1, 1.5, 2, 5]}
{'name': 'learning_rate', 'type': 'choice', 'values': [0.3, 0.1, 0.05, 0.01]}
{'name': 'max_delta_step', 'type': 'choice', 'values': [0, 0.1, 1, 10]}
{'name': 'n_estimators', 'type': 'choice', 'values': [10, 50, 100, 200, 500]}
{'name': 'colsample_bytree', 'type': 'choice', 'values': [0.6, 0.8, 1.0]}


### Other + default parameters

In [5]:
# Logged run arguments, leaving out the search-space definitions.
params = [
    name
    for name in runs.columns
    if "params.args" in name and "search_space" not in name
]
# Values are taken from the first run only, capped at 99 entries.
first_run_args = runs[params].iloc[0].head(n=99)
for p, v in zip(params, first_run_args):
    label = p.replace("params.args/", "")
    print(f"{label}: {v}")

batch_size: 9999
standardize: False
track_metrics: ...value too long for mlflow - not inserted
pruner_name: None
tracking_uri: http://localhost:5000
sampler_name: tpe
gbdt_kwargs: {'n_estimators': 100, 'learning_rate': 0.3, 'gamma': 0.0, 'max_depth': 6, 'min_child_weight': 1, 'max_delta_step': 0, 'colsample_bytree': 0.3}
patience: 20
cache_dir: ../../../data/herg/
run_name: tpe
split_size: (0.6, 0.2, 0.2)
attribution_kwargs: ...value too long for mlflow - not inserted
split_type: random
trials: 30
seed: 23801851
num_workers: 0
featurizer_kwargs: {'fold': 1024, 'radius': 3, 'return_count': True, 'use_chirality': True, 'use_features': True}
experiment_name: herg_gbdt_opttpe1
featurizer_mp_context: fork
use_labels: ['active_g10']
split_seed: 2208715393
featurizer_name: combined
minimize: False
objective_name: val/AUROC
featurizer_chunksize: 100


### Best parameters
- Found within this scenario

In [6]:
# Columns holding the best hyperparameters found by the search.
params = [c for c in runs.columns if "params.best" in c]
# Values are taken from the first run only, capped at 99 entries.
for p, v in zip(params, runs[params].iloc[0].head(n=99)):
    # Strip the "params.best/" prefix so only the hyperparameter name is shown
    # (these columns never contain "params.args/", so that prefix would be a no-op).
    print(f"{p.replace('params.best/', '')}: {v}")

params.best/max_depth: 24
params.best/n_estimators: 200
params.best/learning_rate: 0.1
params.best/subsample: 1.0
params.best/min_child_weight: 1
params.best/colsample_bytree: 0.6
params.best/max_delta_step: 10
params.best/gamma: 2


### Metrics
- Include metrics for atomic attribution/weights
    - Values were calculated using active hergophores on all rows for which hERG activity was predicted as negative
    - The threshold was calculated per individual model using threshold moving

In [7]:
# Metrics to summarise: test performance plus the atomic-attribution scores
# computed on rows predicted as inactive.
# Every entry needs its own trailing comma: adjacent string literals without a
# comma are silently concatenated into a single string.
metrics_names = [
    "metrics.test/AUROC",
    "metrics.test/loss",
    "metrics.test/mean/avg_score_pred_inactive/feature_importances",
    "metrics.test/mean/avg_score_pred_inactive/shap",
]
# A column is kept when its name occurs inside one of the requested names
# (substring test); requested names with no matching column are ignored.
metrics_columns = [c for c in runs.columns if any(c in m for m in metrics_names)]
# The first row is skipped here; presumably it is the parent/summary run of the
# search rather than an individual trial — TODO confirm.
metrics = runs[metrics_columns].iloc[1:]

# Keep only the last path component of each column name as its label.
metrics.columns = [c.split("/")[-1] for c in metrics.columns]
# Summary statistics, one row per metric, highest mean first.
metrics.describe().T.sort_values(by="mean", ascending=False)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AUROC,30.0,0.876159,0.032728,0.795935,0.853728,0.892334,0.899286,0.906218
shap,30.0,0.601373,0.018245,0.562026,0.589346,0.597426,0.616004,0.637615
feature_importances,30.0,0.463824,0.061023,0.399148,0.419927,0.43649,0.481979,0.598467


#### Correlation

- Correlation between various metrics

In [8]:
# Pairwise Pearson correlation between the selected metric columns.
pearson_corr = metrics.corr(method="pearson")
pearson_corr

Unnamed: 0,feature_importances,AUROC,shap
feature_importances,1.0,-0.903591,0.667743
AUROC,-0.903591,1.0,-0.732635
shap,0.667743,-0.732635,1.0


In [9]:
# Scatter plot of AUROC against the feature-importance attribution score,
# overlaid with a regression line of y on x.
x = "AUROC"
y = "feature_importances"

points = (
    alt.Chart(metrics)
    .mark_point()
    .encode(
        # Do not force the axes to start at zero.
        x=alt.X(x, scale=alt.Scale(zero=False)),
        y=alt.Y(y, scale=alt.Scale(zero=False)),
    )
)
trend_line = points.transform_regression(x, y).mark_line()

chart = points + trend_line
chart


In [10]:

# Scatter plot of AUROC against the SHAP attribution score,
# overlaid with a regression line of y on x.
x = "AUROC"
y = "shap"

points = (
    alt.Chart(metrics)
    .mark_point()
    .encode(
        # Do not force the axes to start at zero.
        x=alt.X(x, scale=alt.Scale(zero=False)),
        y=alt.Y(y, scale=alt.Scale(zero=False)),
    )
)
trend_line = points.transform_regression(x, y).mark_line()

chart = points + trend_line
chart

## Experiment Best GBDT (kfold cross validated)

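- Using best parameters found (see above)
    - Run 5-fold cross validation (was run 4 times)
    - Results/metrics were calculated using the mean of all individual runs (4 * 5 = 20 runs)
    - For each repetition of the 5-fold cross validation (4 times), the seed for model training and splitting was chosen randomly
    - **Note:** the outputs shown below contain only 10 runs (2 distinct seeds * 5 folds); verify whether all 4 repetitions were logged before relying on the "20 runs" figure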

In [14]:
# Load the individual runs of the k-fold experiment into a DataFrame.
experiment_name = "herg_gbdt_best_kfold"
exp = mlflow.get_experiment_by_name(experiment_name)
# get_experiment_by_name returns None for an unknown name; fail with a clear
# message instead of an AttributeError on `exp.experiment_id` below.
if exp is None:
    raise ValueError(
        f"MLflow experiment {experiment_name!r} was not found on the configured tracking server"
    )

# Leave out runs named "summary" so only the remaining runs are returned.
query = "tags.mlflow.runName != 'summary'"
# search_runs documents `experiment_ids` as a list of IDs, so wrap the single ID.
runs = mlflow.search_runs(experiment_ids=[exp.experiment_id], filter_string=query)

### Seeds


In [15]:
# Show the training and split seed logged for each run (first 99 rows at most).
seed_columns = ["run_id", "params.seed", "params.split_seed"]
runs[seed_columns].head(n=99)

Unnamed: 0,run_id,params.seed,params.split_seed
0,7279eab9ddf1475982ff06882ed42268,3016751081,3016751081
1,7517a8a5577c48aca7a7307b5f62b6fa,3016751081,3016751081
2,c1fe5e6124ec4e36b609c2fd6ffa3b74,3016751081,3016751081
3,e6a53bdced454bfcbde1d333d0ef8728,3016751081,3016751081
4,1560758e84f145acaaec129ef0882929,3016751081,3016751081
5,c7300947941f4d8491c7cfb14b8c85f6,310272878,310272878
6,49b048599bd047898de8728461b93f59,310272878,310272878
7,79738e65d0724fa78213715389ad0fd8,310272878,310272878
8,ae6480667ace4684beee3d894d7b6769,310272878,310272878
9,f9272437aa214f958fceda8f83558120,310272878,310272878


### Parameters

In [17]:
# All logged parameter columns, leaving out the search-space definitions.
params = [
    name
    for name in runs.columns
    if "params" in name and "search_space" not in name
]
# Values are taken from the first run only, capped at 99 entries.
first_run_params = runs[params].iloc[0].head(n=99)
for p, v in zip(params, first_run_params):
    label = p.replace("params.", "")
    print(f"{label}: {v}")

pos_label: 1
train/threshold-t0: 0.5387276411056519
gbdt_kwargs/verbosity: 1
colsample_bylevel: None
gbdt_fit_kwargs/early_stopping_rounds: 10
gbdt_kwargs/gpu_id: None
random_state: 3016751081
scale_pos_weight: None
use_label_encoder: True
gpu_id: None
reg_alpha: None
learning_rate: 0.1
gbdt_kwargs/n_jobs: 16
missing: nan
base_score: None
min_child_weight: 1
max_depth: 24
smile1: c1ccccc1CNCC - active
smile4: c1ccccc1Cc1ccccc1 - active
seed: 3016751081
featurizer_kwargs/radius: 3
booster: None
use_cache: True
gbdt_fit_kwargs/eval_metric: logloss
gamma: 2.0
monotone_constraints: None
standardize: True
featurizer_kwargs/use_features: True
n_estimators: 200
smile2: c1ccccc1CCNC - active
split_type: random_kfold
gbdt_kwargs/max_delta_step: 10
featurizer_n_jobs: 0
split_size: (5, 4, 0)
num_workers: 0
reg_lambda: None
importance_type: gain
interaction_constraints: None
gbdt_kwargs/random_state: 3016751081
gbdt_kwargs/colsample_bytree: 0.6
gbdt_kwargs/min_child_weight: 1
featurizer_kwargs/use

### Metrics

#### Performance on hERG dataset

In [18]:
# Test-set performance metric to summarise.
metrics_names = ["metrics.test/AUROC"]
# A column is kept when its name occurs inside one of the requested names.
metrics_columns = [
    col for col in runs.columns if any(col in wanted for wanted in metrics_names)
]
metrics = runs[metrics_columns]

# Label each column with the last path component of its name.
metrics.columns = [col.rsplit("/", 1)[-1] for col in metrics.columns]
# Summary statistics, one row per metric, highest mean first.
summary = metrics.describe().T
summary.sort_values(by="mean", ascending=False)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AUROC,10.0,0.890406,0.009438,0.876794,0.885907,0.889606,0.898113,0.903912


#### Active Hergophores on Inactive Model Predictions

- Atomic attribution using various methods

In [20]:
# Attribution scores computed on rows the model predicted as inactive.
inactive_prefix = "metrics.test/mean/avg_score_pred_inactive/"
metrics_names = [inactive_prefix + "shap", inactive_prefix + "feature_importances"]
# A column is kept when its name occurs inside one of the requested names.
metrics_columns = [
    col for col in runs.columns if any(col in wanted for wanted in metrics_names)
]
metrics = runs[metrics_columns]

# Label each column with the last path component of its name.
metrics.columns = [col.rsplit("/", 1)[-1] for col in metrics.columns]
# Summary statistics, one row per metric, highest mean first.
summary = metrics.describe().T
summary.sort_values(by="mean", ascending=False)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
shap,10.0,0.56757,0.010141,0.553274,0.559803,0.567131,0.573977,0.584852
feature_importances,9.0,0.426215,0.017189,0.398139,0.41213,0.43184,0.439016,0.44762


#### Active Hergophores on Active Model Predictions

- Atomic attribution using various methods

In [22]:
# Attribution scores computed on rows the model predicted as active.
active_prefix = "metrics.test/mean/avg_score_pred_active/"
metrics_names = [active_prefix + "shap", active_prefix + "feature_importances"]
# A column is kept when its name occurs inside one of the requested names.
metrics_columns = [
    col for col in runs.columns if any(col in wanted for wanted in metrics_names)
]
metrics = runs[metrics_columns]

# Label each column with the last path component of its name.
metrics.columns = [col.rsplit("/", 1)[-1] for col in metrics.columns]
# Summary statistics, one row per metric, highest mean first.
summary = metrics.describe().T
summary.sort_values(by="mean", ascending=False)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
shap,10.0,0.614614,0.012686,0.599142,0.602941,0.612316,0.624706,0.632554
feature_importances,9.0,0.472875,0.017103,0.431461,0.472509,0.477114,0.480586,0.48999


#### Correlations

In [23]:
# Performance metric together with the inactive-prediction attribution scores.
metrics_names = [
    "metrics.test/AUROC",
    "metrics.test/mean/avg_score_pred_inactive/shap",
    "metrics.test/mean/avg_score_pred_inactive/feature_importances",
]
# A column is kept when its name occurs inside one of the requested names.
metrics_columns = [
    col for col in runs.columns if any(col in wanted for wanted in metrics_names)
]
metrics = runs[metrics_columns]

# Label each column with the last path component of its name.
metrics.columns = [col.rsplit("/", 1)[-1] for col in metrics.columns]
# Pairwise Pearson correlation between the selected metrics.
pearson_corr = metrics.corr(method="pearson")
pearson_corr

Unnamed: 0,AUROC,shap,feature_importances
AUROC,1.0,-0.203103,0.316412
shap,-0.203103,1.0,0.25932
feature_importances,0.316412,0.25932,1.0
