In [1]:
import os
import mlflow
import matplotlib.pyplot as plt
import altair as alt
import numpy as np
import pandas as pd

%matplotlib inline

#alt.renderers.enable("html")
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [2]:
tracking_uri = os.getenv("TRACKING_URI", default="http://localhost:5000")
mlflow.set_tracking_uri(tracking_uri)

## Experiment Parameter-Search TabNet

- Single random seed for model training for all models
- Single random seed for random split for all models
- Per run/model calculate atomic attribution using various methods (e.g. tabnet, integrated gradients, saliency, etc.)
- Single target objective (first) using BCE-loss
- Hyperparameter search using optuna tpe sampler (Tree-structured Parzen Estimator - https://optuna.readthedocs.io/en/stable/reference/generated/optuna.samplers.TPESampler.html)

In [3]:
experiment_name = "herg_tn_opttpe1"
exp = mlflow.get_experiment_by_name(experiment_name)

runs = mlflow.search_runs(experiment_ids=exp.experiment_id)

In [4]:
#runs.iloc[1:].describe().T

In [5]:
#runs.iloc[1:].corr()

### Search space

In [6]:
search_space = [c for c in runs.columns if "search_space/" in c]
for r in runs[search_space].iloc[0].head(n=99):
    print(r)

{'name': 'lambda_sparse', 'type': 'choice', 'values': [0.0, 1e-06, 0.0001, 0.01]}
{'name': 'decay_rate', 'type': 'choice', 'values': [0.8, 0.9, 0.95]}
{'name': 'virtual_batch_size', 'type': 'choice', 'values': [16, 32, 64]}
{'name': 'decision_size', 'type': 'choice', 'values': [16, 24, 32, 64]}
{'name': 'decay_step', 'type': 'choice', 'values': [50, 200, 800]}
{'name': 'momentum', 'type': 'choice', 'values': [0.1, 0.05, 0.02]}
{'name': 'gamma', 'type': 'choice', 'values': [1.0, 1.2, 1.5]}
{'name': 'nr_steps', 'type': 'choice', 'values': [3, 4, 5, 7]}
{'name': 'lr', 'type': 'choice', 'values': [0.04, 0.02, 0.01]}


### Other + default parameters

(parameter in the search space are overwritten)

In [7]:
params = [c for c in runs.columns if "search_space" not in c and "params.args" in c]
for p, v in zip(params, runs[params].iloc[0].head(n=99)):
    print(f"{p.replace('params.args/', '')}: {v}")

feature_size: 128
scheduler: exponential_decay
relaxation_type: gamma_fixed
minimize: False
virtual_batch_size: 256
optimizer: adamw
batch_size: 256
lr: 0.01
patience: 10
featurizer_mp_context: fork
experiment_name: herg_tn_opttpe1
decision_size: 64
split_seed: 1362625464
stochastic_weight_avg: False
run_name: tpe
cache_dir: ../../../data/herg/
checkpoint_objective: val/loss
nr_layers: 4
nr_shared_layers: 2
featurizer_name: combined
standardize: False
normalize_input: True
scheduler_params: {'decay_step': 10, 'decay_rate': 0.95}
featurizer_kwargs: {'fold': 1024, 'radius': 3, 'return_count': True, 'use_chirality': True, 'use_features': True}
split_size: (0.6, 0.2, 0.2)
featurizer_chunksize: 100
pruner_name: None
tracking_uri: http://localhost:5000
alpha: 2.0
use_labels: ['active_g10']
seed: 1073978726
objective_name: val/AUROC
track_metrics: ...value too long for mlflow - not inserted
attribution_kwargs: ...value too long for mlflow - not inserted
gamma: 1.5
lambda_sparse: 0.0
optimizer

### Best parameters
- Best parameters found within this scenario for each component in the search space

In [8]:
params = [c for c in runs.columns if c and "params.best" in c]
for p, v in zip(params, runs[params].iloc[0].head(n=99)):
    print(f"{p.replace('params.args/', '')}: {v}")

params.best/lr: 0.01
params.best/decay_step: 800
params.best/nr_steps: 3
params.best/momentum: 0.05
params.best/virtual_batch_size: 64
params.best/decay_rate: 0.9
params.best/gamma: 1.5
params.best/decision_size: 16
params.best/lambda_sparse: 0.0001


### Metrics
- Include metrics for atomic attribution/weights
    - Values were calculated using active hergophores on all rows for which herg activity was predicted as negative
    - Threshold was calculated per individual model using threshold moving
    
- Attribution Methods:
    - Tabnet - using the aggregated masks of the architecture itself (self explaining)

In [9]:
metrics_names = [
    "metrics.test/sparsity_mask",
    "metrics.test/AUROC",
    "metrics.test/loss",
    "metrics.test/mean/avg_score_pred_inactive/tabnet"
    "metrics.test/mean/avg_score_pred_inactive/integrated_gradients",
    "metrics.test/mean/avg_score_pred_inactive/saliency",
]
metrics_columns = [c for c in runs.columns if any(c in m for m in metrics_names)]
metrics = runs[metrics_columns].iloc[1:]

metrics.columns = [c.split("/")[-1] for c in metrics.columns]
metrics.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
tabnet,30.0,0.533026,0.05802,0.448488,0.48306,0.517186,0.579153,0.689873
sparsity_mask,30.0,0.996008,0.001894,0.990547,0.996139,0.996411,0.996635,0.998967
saliency,30.0,0.537266,0.028899,0.510379,0.522063,0.528773,0.537348,0.661744
AUROC,30.0,0.761266,0.105468,0.525076,0.688073,0.822147,0.833865,0.837871
loss,30.0,0.573525,0.06086,0.507248,0.53556,0.537086,0.63271,0.691465
integrated_gradients,30.0,0.621072,0.054105,0.518021,0.606275,0.622756,0.632444,0.726453


#### Correlation

- Correlation between various metrics

In [10]:
metrics.corr(method="pearson")

Unnamed: 0,tabnet,sparsity_mask,saliency,AUROC,loss,integrated_gradients
tabnet,1.0,-0.178403,0.33669,0.299701,-0.239761,0.296681
sparsity_mask,-0.178403,1.0,-0.426184,0.482734,-0.498536,0.134538
saliency,0.33669,-0.426184,1.0,-0.096785,0.124812,0.564019
AUROC,0.299701,0.482734,-0.096785,1.0,-0.98497,0.350012
loss,-0.239761,-0.498536,0.124812,-0.98497,1.0,-0.350423
integrated_gradients,0.296681,0.134538,0.564019,0.350012,-0.350423,1.0


In [30]:
x = "AUROC"
y = "integrated_gradients"

chart = alt.Chart(metrics).mark_point().encode(
    x=alt.X(x, scale=alt.Scale(zero=False)),
    y=alt.Y(y, scale=alt.Scale(zero=False))
)

chart = chart + chart.transform_regression(x, y).mark_line()
#chart.show()
chart

In [29]:
x = "AUROC"
y = "tabnet"

chart = alt.Chart(metrics).mark_point().encode(
    x=alt.X(x, scale=alt.Scale(zero=False)),
    y=alt.Y(y, scale=alt.Scale(zero=False))
)

chart = chart + chart.transform_regression(x, y).mark_line()
#chart.show()
chart

In [28]:
x = "sparsity_mask"
y = "tabnet"

chart = alt.Chart(metrics).mark_point().encode(
    x=alt.X(x, scale=alt.Scale(zero=False)),
    y=alt.Y(y, scale=alt.Scale(zero=False))
)

chart = chart + chart.transform_regression(x, y).mark_line()
#chart.show()
chart

In [32]:
x = "sparsity_mask"
y = "AUROC"

chart = alt.Chart(metrics).mark_point().encode(
    x=alt.X(x, scale=alt.Scale(zero=False)),
    y=alt.Y(y, scale=alt.Scale(zero=False))
)

chart = chart + chart.transform_regression(x, y).mark_line()
#chart.show()
chart


## Experiment Best MLP (kfold cross validated)

- Using best parameters found (see above)
    - Run 5-fold cross validation (was run 4 times)
    - Results/metrics were calculated using the mean of all individual runs (4 * 5 = 20 runs)
    - Per 5-fold cross validation (4 times) the seed for model training and splitting was chosen randomly

In [15]:
experiment_name = "herg_tn_best_kfold"
exp = mlflow.get_experiment_by_name(experiment_name)

query = "tags.mlflow.runName != 'summary'"
runs = mlflow.search_runs(experiment_ids=exp.experiment_id, filter_string=query)

### Seeds


In [16]:
runs[["run_id", "params.seed", "params.split_seed"]].head(n=99)

Unnamed: 0,run_id,params.seed,params.split_seed
0,51a039063ad14c0185c672417902e820,921898046,921898046
1,41c6cf1276a0423495040508f76c7435,921898046,921898046
2,03a38ebff501405fa0af1dac06404c9c,921898046,921898046
3,6d5c932f8ade44358c85d823c1376b27,921898046,921898046
4,0cabbea6c8484bbfba22d81d181ddd34,921898046,921898046
5,de797c4f36364d0eb690644878e3fa13,4119919135,4119919135
6,14617c84fefe47d6b67f3d5b22cfe1cf,4119919135,4119919135
7,bf14a218eb664ff9a4534a64b3407e93,4119919135,4119919135
8,377cfd34c33744d99ad52050a5ba3679,4119919135,4119919135
9,426142d03db44bffa3e80ccdb4f2fa98,4119919135,4119919135


### Parameters

In [17]:
params = [c for c in runs.columns if "search_space" not in c and "params" in c]
for p, v in zip(params, runs[params].iloc[0].head(n=99)):
    print(f"{p.replace('params.', '')}: {v}")

prepare_data_per_node: True
track_grad_norm: -1.0
featurizer_name: combined
checkpoint_minimize: True
standardize: False
check_val_every_n_epoch: 1
optimizer_params/weight_decay: 0.0001
split_seed: 921898046
run_name: tn
categorical_indices: None
weights_summary: top
num_sanity_val_steps: inf
verbose_evaluate: True
patience_objective: val/loss
split_type: random_kfold
patience_minimize: True
relaxation_type: gamma_fixed
cache_dir: ../../../data/herg/
objective_name: val/loss
stochastic_weight_avg: False
log_sparsity: True
experiment_name: herg_tn_best_kfold
feature_size: 32
scheduler_params/decay_rate: 0.9
smile3: c1ccccc1CN2CCCCC2 - active
virtual_batch_size: 64
minimize: True
alpha: 2.0
precision: 32
patience: 10
decision_size: 16
seed: 921898046
reload_dataloaders_every_n_epochs: 0
smile2: c1ccccc1CCNC - active
max_steps: 1000
num_classes: 2
attribution_kwargs/data_types: ['test']
num_training_batches: 0
split_size: (5, 4, 0)
featurizer_kwargs/fold: 1024
limit_predict_batches: 1.0
s

### Metrics

#### Performance on hERG dataset

In [23]:
metrics_names = [
    "metrics.test/AUROC",
    "metrics.test/loss",
    "metrics.test/sparsity_mask"
]
metrics_columns = [c for c in runs.columns if any(c in m for m in metrics_names)]
metrics = runs[metrics_columns]

metrics.columns = [c.split("/")[-1] for c in metrics.columns]
metrics.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sparsity_mask,20.0,0.996377,0.000364,0.995665,0.99612,0.996388,0.996595,0.997087
AUROC,20.0,0.82581,0.017265,0.793626,0.816415,0.825899,0.831454,0.85935
loss,20.0,0.531627,0.026284,0.495905,0.514613,0.528472,0.546957,0.59345


##### Discussion

- The average sparsity (over the test set) for all trained model is very high. From 2017 features only about 7-8 features are used per average!
- AUROC is lower compared to MLP or RF

#### Active Hergophores on Inactive Model Predictions

- Atomic attribution using various methods

In [19]:
metrics_names = [
    "metrics.test/mean/avg_score_pred_inactive/tabnet",
    "metrics.test/mean/avg_score_pred_inactive/integrated_gradients",
    "metrics.test/mean/avg_score_pred_inactive/saliency",
    "metrics.test/mean/avg_score_pred_inactive/saliency-absolute",
    "metrics.test/mean/avg_score_pred_inactive/input_x_gradient",
    "metrics.test/mean/avg_score_pred_inactive/occlusion",
    "metrics.test/mean/avg_score_pred_inactive/deeplift",
    "metrics.test/mean/avg_score_pred_inactive/shapley_value_sampling",
    "metrics.test/mean/avg_score_pred_inactive/noise_tunnel_ig",
]
metrics_columns = [c for c in runs.columns if any(c in m for m in metrics_names)]
metrics = runs[metrics_columns]

metrics.columns = [c.split("/")[-1] for c in metrics.columns]
metrics.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
tabnet,20.0,0.485733,0.061296,0.429961,0.450613,0.464963,0.508357,0.685107
input_x_gradient,20.0,0.611016,0.04137,0.523053,0.596375,0.60946,0.636457,0.683403
saliency,20.0,0.53431,0.02369,0.493869,0.515819,0.534421,0.548638,0.58804
noise_tunnel_ig,20.0,0.520508,0.018934,0.488566,0.509283,0.516547,0.53143,0.567675
saliency-absolute,20.0,0.468421,0.024849,0.430491,0.455058,0.461126,0.475507,0.526042
integrated_gradients,20.0,0.63742,0.069589,0.465502,0.593297,0.649511,0.686302,0.721648
occlusion,20.0,0.610158,0.04565,0.480663,0.597129,0.611852,0.636196,0.671489
shapley_value_sampling,20.0,0.649415,0.069913,0.451552,0.626492,0.663796,0.692717,0.727581


##### Discussion

- Integrated gradients and Shapley Value Sampling performs best
- Variance is high among all methods 
- Tabnet Variance is high - including extrem performance/values almost similar to that of integrated gradients

#### Active Hergophores on Active Model Predictions

- Atomic attribution using various methods

In [25]:
metrics_names = [
    "metrics.test/mean/avg_score_pred_active/tabnet",
    "metrics.test/mean/avg_score_pred_active",
    "metrics.test/mean/avg_score_pred_active/integrated_gradients",
    "metrics.test/mean/avg_score_pred_active/saliency",
    "metrics.test/mean/avg_score_pred_active/saliency-absolute",
    "metrics.test/mean/avg_score_pred_active/input_x_gradient",
    "metrics.test/mean/avg_score_pred_active/occlusion",
    "metrics.test/mean/avg_score_pred_active/deeplift",
    "metrics.test/mean/avg_score_pred_active/shapley_value_sampling",
    "metrics.test/mean/avg_score_pred_active/noise_tunnel_ig",
]
metrics_columns = [c for c in runs.columns if any(c in m for m in metrics_names)]
metrics = runs[metrics_columns]

metrics.columns = [c.split("/")[-1] for c in metrics.columns]
#metrics.describe().T

#### Correlations

In [34]:
metrics_names = [
    "metrics.test/AUROC",
    "metrics.test/loss",
    "metrics.test/sparsity_mask"
]

metrics_names += [
    "metrics.test/mean/avg_score_pred_inactive/tabnet",
    "metrics.test/mean/avg_score_pred_inactive/integrated_gradients",
    "metrics.test/mean/avg_score_pred_inactive/saliency",
    "metrics.test/mean/avg_score_pred_inactive/saliency-absolute",
    "metrics.test/mean/avg_score_pred_inactive/input_x_gradient",
    "metrics.test/mean/avg_score_pred_inactive/occlusion",
    "metrics.test/mean/avg_score_pred_inactive/deeplift",
    "metrics.test/mean/avg_score_pred_inactive/shapley_value_sampling",
    "metrics.test/mean/avg_score_pred_inactive/noise_tunnel_ig",
]
metrics_columns = [c for c in runs.columns if any(c in m for m in metrics_names)]
metrics = runs[metrics_columns]

metrics.columns = sort([c.split("/")[-1] for c in metrics.columns])
metrics.corr(method="kendall")

NameError: name 'sort' is not defined