In [1]:
import os
import mlflow
import matplotlib.pyplot as plt
import altair as alt
import numpy as np
import pandas as pd

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Alternative Altair renderer, intentionally left disabled.
#alt.renderers.enable("html")
# Remove Altair's row limit so larger DataFrames can be passed to charts.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [2]:
# Resolve the MLflow tracking server from the TRACKING_URI environment
# variable, falling back to a server on localhost:5000 when it is unset.
tracking_uri = os.environ.get("TRACKING_URI", "http://localhost:5000")
mlflow.set_tracking_uri(tracking_uri)

## Experiment Parameter-Search Random Forest

- Single random seed for model training for all models
- Single random seed for random split for all models
- For each run/model, atomic attributions are calculated using various methods (e.g. treeinterpreter, global impurity feature importance for RF, etc.)
- Single target objective (first)
- Hyperparameter search using the Optuna TPE sampler ([Tree-structured Parzen Estimator](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.samplers.TPESampler.html))

In [3]:
# Look up the hyperparameter-search experiment by name and load its runs
# into a DataFrame.
experiment_name = "herg_rf_opttpe1"
exp = mlflow.get_experiment_by_name(experiment_name)
if exp is None:
    # get_experiment_by_name returns None for an unknown name; fail here with
    # a clear message instead of an AttributeError on `exp.experiment_id`.
    raise ValueError(
        f"MLflow experiment {experiment_name!r} not found at {mlflow.get_tracking_uri()}"
    )

# search_runs documents `experiment_ids` as a list of IDs, so wrap the single ID.
runs = mlflow.search_runs(experiment_ids=[exp.experiment_id])

In [None]:
#runs.iloc[1:].describe().T

In [None]:
#runs.iloc[1:].corr()

### Search space

In [4]:
# Columns holding the search-space definitions that were logged with each run.
search_space = [col for col in runs.columns if "search_space/" in col]

# Print the definitions recorded on the first run, one per line
# (capped at the first 99 entries).
first_run_space = runs[search_space].iloc[0].head(n=99)
for definition in first_run_space:
    print(definition)

{'name': 'n_estimators', 'type': 'choice', 'values': [10, 50, 100, 200, 500]}
{'name': 'min_samples_leaf', 'type': 'choice', 'values': [1, 2, 4]}
{'name': 'bootstrap', 'type': 'choice', 'values': [True, False]}
{'name': 'max_features', 'type': 'choice', 'values': ['auto', 'sqrt']}
{'name': 'max_depth', 'type': 'choice', 'values': [None, 10, 20, 30, 40, 50, 60, 70]}
{'name': 'criterion', 'type': 'choice', 'values': ['gini', 'entropy']}
{'name': 'min_samples_split', 'type': 'choice', 'values': [2, 5, 10]}


### Other + default parameters

In [22]:
# Logged `params.args` columns, excluding the search-space definitions.
params = [
    col
    for col in runs.columns
    if "params.args" in col and "search_space" not in col
]

# Print "<name>: <value>" for the first run, with the `params.args/` prefix
# stripped from the name (capped at the first 99 entries).
first_run_args = runs[params].iloc[0].head(n=99)
for name, value in first_run_args.items():
    short_name = name.replace('params.args/', '')
    print(f"{short_name}: {value}")

### Best parameters
- Found within this scenario

In [6]:
# Columns holding the best hyperparameters found by the search.
params = [c for c in runs.columns if "params.best" in c]

# Print "<name>: <value>" for the first run. The prefix to strip here is
# `params.best/` (these columns never contain `params.args/`), so that the
# printed names are the bare hyperparameter names.
for p, v in zip(params, runs[params].iloc[0].head(n=99)):
    print(f"{p.replace('params.best/', '')}: {v}")

params.best/bootstrap: False
params.best/n_estimators: 200
params.best/criterion: entropy
params.best/min_samples_split: 5
params.best/min_samples_leaf: 2
params.best/max_features: sqrt
params.best/max_depth: 40


### Metrics
- Includes metrics for atomic attribution/weights
    - Values were calculated using active hergophores on all rows for which hERG activity was predicted as negative
    - The decision threshold was calculated per individual model using threshold moving

In [17]:
# Test-set metrics to summarise across the hyperparameter-search runs.
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated into a single (non-existent) metric name otherwise.
metrics_names = [
    "metrics.test/AUROC",
    "metrics.test/loss",
    "metrics.test/mean/avg_score_pred_inactive/impurity",
    "metrics.test/mean/avg_score_pred_inactive/treeinterpreter",
]
# Keep the requested metrics that were actually logged, in the frame's column
# order. Exact-name membership is used on purpose; a substring test would also
# pick up any column whose name merely appears inside a requested name.
metrics_columns = [c for c in runs.columns if c in metrics_names]

# NOTE(review): the first row is skipped here — presumably it is a summary or
# parent run rather than an individual trial; confirm against the logging code.
# Column names are shortened to their last path component for display.
metrics = (
    runs[metrics_columns]
    .iloc[1:]
    .rename(columns=lambda c: c.split("/")[-1])
)

metrics.describe().T.sort_values(by="mean", ascending=False)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AUROC,19.0,0.895086,0.008642,0.879861,0.891096,0.894983,0.898915,0.915833
impurity,19.0,0.73392,0.008957,0.717656,0.729059,0.735102,0.736916,0.75073
treeinterpreter,19.0,0.382229,0.010059,0.361666,0.377456,0.382633,0.386903,0.405098


#### Correlation

- Correlation between various metrics

In [8]:
# Pairwise Pearson correlation between the summarised metrics.
pearson_corr = metrics.corr(method="pearson")
pearson_corr

Unnamed: 0,impurity,AUROC,treeinterpreter
impurity,1.0,0.683556,-0.151315
AUROC,0.683556,1.0,-0.223517
treeinterpreter,-0.151315,-0.223517,1.0


In [9]:
# Scatter plot of AUROC against the impurity attribution score, overlaid with
# a fitted regression line. Axes do not start at zero so that the narrow value
# range stays readable.
x = "AUROC"
y = "impurity"

scatter = (
    alt.Chart(metrics)
    .mark_point()
    .encode(
        x=alt.X(x, scale=alt.Scale(zero=False)),
        y=alt.Y(y, scale=alt.Scale(zero=False)),
    )
)
trend_line = scatter.transform_regression(x, y).mark_line()

chart = scatter + trend_line
chart


## Experiment Best RF (kfold cross validated)

- Uses the best parameters found (see above)
    - 5-fold cross-validation, repeated 4 times
    - Results/metrics were calculated as the mean over all individual runs (4 * 5 = 20 runs)
    - For each of the 4 repetitions, the seed for model training and splitting was chosen randomly

In [10]:
# Look up the k-fold experiment by name and load its individual runs.
experiment_name = "herg_rf_best_kfold"
exp = mlflow.get_experiment_by_name(experiment_name)
if exp is None:
    # get_experiment_by_name returns None for an unknown name; fail here with
    # a clear message instead of an AttributeError on `exp.experiment_id`.
    raise ValueError(
        f"MLflow experiment {experiment_name!r} not found at {mlflow.get_tracking_uri()}"
    )

# Exclude runs named 'summary' so that only the remaining runs are analysed.
query = "tags.mlflow.runName != 'summary'"
# search_runs documents `experiment_ids` as a list of IDs, so wrap the single ID.
runs = mlflow.search_runs(experiment_ids=[exp.experiment_id], filter_string=query)

### Seeds


In [11]:
# Show which model/split seed each run used (capped at the first 99 runs).
seed_columns = ["run_id", "params.seed", "params.split_seed"]
runs.loc[:, seed_columns].head(n=99)

Unnamed: 0,run_id,params.seed,params.split_seed
0,83db45405b704c49b0b4a0d12bc10994,166806334,166806334
1,46d70c678c0049a084daa24f014862ec,166806334,166806334
2,c895e7c45d49417bae414cfc004a9f9c,166806334,166806334
3,6fef517d4ec443fe8f7d1adf775aca40,166806334,166806334
4,9a85521df4c34bc0a691b83f8877869d,166806334,166806334
5,b45a06c046a64c9591be78f5c498a53c,3369474970,3369474970
6,5c80aafa8c134b1f8d937a5e6041bb1b,3369474970,3369474970
7,37dccf6ad8824c5982e4c32b69b2427d,3369474970,3369474970
8,319f95721a1b40ab99d305db2e8560ec,3369474970,3369474970
9,73a8a3c35f21424d85101d02aab49cd3,3369474970,3369474970


### Parameters

In [12]:
# All logged parameter columns except the search-space definitions.
params = [
    col
    for col in runs.columns
    if "params" in col and "search_space" not in col
]

# Print "<name>: <value>" for the first run, with `params.` removed from the
# name (capped at the first 99 entries).
first_run_params = runs[params].iloc[0].head(n=99)
for name, value in first_run_params.items():
    short_name = name.replace('params.', '')
    print(f"{short_name}: {value}")

split_type: random_kfold
min_weight_fraction_leaf: 0.0
verbose: 1
train/threshold-t0: 0.5100021958351135
featurizer_kwargs/use_features: True
criterion: entropy
max_samples: None
random_state: 166806334
input_size: 2017
ignore_index: -100
bootstrap: False
num_classes: 2
rf_kwargs/max_features: sqrt
featurizer_kwargs/fold: 1024
class_weight: None
smile2: c1ccccc1CCNC - active
rf_kwargs/n_estimators: 200
featurizer_kwargs/radius: 3
smile4: c1ccccc1Cc1ccccc1 - active
ccp_alpha: 0.0
featurizer_name: combined
n_estimators: 200
smile1: c1ccccc1CNCC - active
standardize: True
use_cache: True
split_size: (5, 4, 0)
num_workers: 0
min_impurity_decrease: 0.0
featurizer_kwargs/return_count: True
featurizer_chunksize: 100
smile0: CCOc1ccccc1 - active
featurizer_mp_context: fork
seed: 166806334
min_samples_split: 5
objective: binary
split_seed: 166806334
smile3: c1ccccc1CN2CCCCC2 - active
min_impurity_split: None
rf_kwargs/bootstrap: False
cache_dir: ../../../data/herg/
num_targets: 1
rf_kwargs/max_

### Metrics

#### Performance on hERG dataset

In [18]:
# Test-set AUROC of the cross-validation runs.
metrics_names = [
    "metrics.test/AUROC",
]
# Select by exact column name. A substring test (`column in name`) would also
# match any column whose name merely appears inside a requested name.
metrics_columns = [c for c in runs.columns if c in metrics_names]
# Shorten column names to their last path component for display.
metrics = runs[metrics_columns].rename(columns=lambda c: c.split("/")[-1])

metrics.describe().T.sort_values(by="mean", ascending=False)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AUROC,20.0,0.894575,0.008716,0.879861,0.890425,0.894838,0.897595,0.915833


#### Active Hergophores on Inactive Model Predictions

- Atomic attribution using various methods

In [19]:
# Attribution scores of the individual methods on rows predicted as inactive.
metrics_names = [
    "metrics.test/mean/avg_score_pred_inactive/impurity",
    "metrics.test/mean/avg_score_pred_inactive/input_x_impurity",
    "metrics.test/mean/avg_score_pred_inactive/occlusion",
    "metrics.test/mean/avg_score_pred_inactive/shapley_value_sampling",
    "metrics.test/mean/avg_score_pred_inactive/treeinterpreter",
]
# Select by exact column name. A substring test (`column in name`) would also
# match any column whose name merely appears inside a requested name.
metrics_columns = [c for c in runs.columns if c in metrics_names]
# Shorten column names to their last path component (the method name).
metrics = runs[metrics_columns].rename(columns=lambda c: c.split("/")[-1])

metrics.describe().T.sort_values(by="mean", ascending=False)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
impurity,20.0,0.733484,0.008934,0.717656,0.727513,0.734032,0.736737,0.75073
shapley_value_sampling,5.0,0.584729,0.013353,0.564401,0.577842,0.591943,0.594215,0.595243
occlusion,20.0,0.558192,0.016911,0.514387,0.549372,0.562734,0.570205,0.580939
input_x_impurity,20.0,0.39679,0.00801,0.37961,0.393722,0.397703,0.399177,0.415246
treeinterpreter,20.0,0.382882,0.010217,0.361666,0.378061,0.383465,0.388164,0.405098


#### Active Hergophores on Active Model Predictions

- Atomic attribution using various methods

In [20]:
# Attribution scores of the individual methods on rows predicted as active.
metrics_names = [
    "metrics.test/mean/avg_score_pred_active/impurity",
    "metrics.test/mean/avg_score_pred_active/input_x_impurity",
    "metrics.test/mean/avg_score_pred_active/occlusion",
    "metrics.test/mean/avg_score_pred_active/shapley_value_sampling",
    "metrics.test/mean/avg_score_pred_active/treeinterpreter",
]
# Select by exact column name. A substring test (`column in name`) would also
# match any column whose name merely appears inside a requested name.
metrics_columns = [c for c in runs.columns if c in metrics_names]
# Shorten column names to their last path component (the method name).
metrics = runs[metrics_columns].rename(columns=lambda c: c.split("/")[-1])

metrics.describe().T.sort_values(by="mean", ascending=False)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
impurity,20.0,0.714395,0.006987,0.699044,0.711387,0.715798,0.719995,0.723036
treeinterpreter,20.0,0.62582,0.008989,0.608478,0.621473,0.625606,0.631745,0.641714
shapley_value_sampling,5.0,0.597837,0.020107,0.568877,0.588296,0.59988,0.613007,0.619126
occlusion,20.0,0.552449,0.012218,0.529969,0.543285,0.553669,0.564015,0.572107
input_x_impurity,20.0,0.457708,0.010017,0.437521,0.453137,0.460364,0.464311,0.472306


#### Correlations

In [16]:
# Model performance plus the attribution scores on rows predicted as inactive.
metrics_names = [
    "metrics.test/AUROC",
]

metrics_names += [
    "metrics.test/mean/avg_score_pred_inactive/impurity",
    "metrics.test/mean/avg_score_pred_inactive/input_x_impurity",
    "metrics.test/mean/avg_score_pred_inactive/occlusion",
    "metrics.test/mean/avg_score_pred_inactive/shapley_value_sampling",
    "metrics.test/mean/avg_score_pred_inactive/treeinterpreter",
]
# Select by exact column name. A substring test (`column in name`) would also
# match any column whose name merely appears inside a requested name.
metrics_columns = [c for c in runs.columns if c in metrics_names]
# Shorten column names to their last path component for display.
metrics = runs[metrics_columns].rename(columns=lambda c: c.split("/")[-1])

# Pairwise Pearson correlation. Missing values are excluded pair by pair, so a
# metric that was logged for fewer runs is correlated on fewer data points.
metrics.corr(method="pearson")

Unnamed: 0,input_x_impurity,AUROC,impurity,treeinterpreter,occlusion,shapley_value_sampling
input_x_impurity,1.0,-0.17963,-0.194658,0.063378,-0.001497,-0.604746
AUROC,-0.17963,1.0,0.196513,0.221176,-0.332502,-0.525221
impurity,-0.194658,0.196513,1.0,-0.377372,0.033097,0.128604
treeinterpreter,0.063378,0.221176,-0.377372,1.0,-0.565178,-0.774499
occlusion,-0.001497,-0.332502,0.033097,-0.565178,1.0,0.754203
shapley_value_sampling,-0.604746,-0.525221,0.128604,-0.774499,0.754203,1.0
