<a href="https://colab.research.google.com/github/evillag/uncertainty_gan/blob/main/test_bench/TestBench.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
IN_COLAB = True

try:
  import google.colab
  # Using Google Drive
  from google.colab import drive
  drive.mount('/content/drive')

  !git clone https://github.com/evillag/uncertainty_gan.git
  !mv uncertainty_gan/mcd .
  !mv uncertainty_gan/feature_densities .
  !mv uncertainty_gan/test_bench .
  %rm -rf uncertainty_gan/

  !git clone https://gitlab.com/lambda-hse/lhcb-rich-gan-uncertainty.git
  !mv lhcb-rich-gan-uncertainty/experiments .
  !mv lhcb-rich-gan-uncertainty/src .
  %rm -rf lhcb-rich-gan-uncertainty/
  %rm -rf sample_data/
  %pip install tensorflow-addons

  # Dataset download and extraction
  !unzip -qq drive/MyDrive/cern/data/rich.zip

  # Model checkpoint download and extraction
  !unzip -qq drive/MyDrive/cern/data/checkpoints_dropout_0.01.zip

  # Model embeddings download and extraction
  !unzip -qq drive/MyDrive/cern/data/embeddings.zip

  # Results folder creation
  !mkdir /content/drive/MyDrive/cern/data/results

except:
  IN_COLAB = False

print(f'IN_COLAB: {IN_COLAB}')

IN_COLAB: False


In [2]:
import os

if not IN_COLAB:
  # Set before importing TensorFlow so its native logging picks up the level
  # from the first message on ('2' hides INFO and WARNING output).
  os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import numpy as np
import tensorflow as tf

if not IN_COLAB:
  # Ask Tensorflow to use GPU memory judiciously. Iterating over the device
  # list (instead of indexing [0]) keeps this cell working on CPU-only
  # machines, where the list is empty.
  for gpu_device in tf.config.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu_device, True)

from test_bench import get_checkpoint_name, load_particle_datasets, subsample_dataset
from test_bench.model import MonteCarloDropoutModel


# Test Bench for the Monte Carlo Dropout and Feature Density methods

1. Select sample data
2. Create a model
3. Generate targets with the generator in single-model inference mode
4. Estimate MCD uncertainty
5. Estimate FD uncertainty

In [3]:
import os

# Parameters
PARTICLE = 'proton'
CHECKPOINT_DP = 0.0001
DROPOUT_TYPE = 'bernoulli_structured'
CHECKPOINT_BASE = 'checkpoints/'
DATA_DIR = 'rich/'
SUB_SAMPLE_PERCENT = 0.0001

# MCD parameters
MCD_ENSEMBLE_SIZE = 300

# FD parameters
embeddings_dir = 'embeddings/'

# Save results path
output_dir = 'results/'
if IN_COLAB:
  output_dir = f'/content/drive/MyDrive/cern/data/{output_dir}'

# np.save does not create missing directories, so make sure the results folder
# exists before any of the save cells below run (no-op when already present).
os.makedirs(output_dir, exist_ok=True)

# Load data and Sample selection

In [4]:
# Load the PARTICLE dataset from DATA_DIR. The result is indexed further down by
# the keys 'feats_train' / 'targets_train' and 'feats_val' / 'targets_val'.
dataset = load_particle_datasets(PARTICLE, DATA_DIR)

Reading and concatenating datasets:
	rich\proton_+_down_2016_.csv
	rich\proton_+_up_2016_.csv
	rich\proton_-_down_2016_.csv
	rich\proton_-_up_2016_.csv
splitting to train/val/test
fitting the scaler
scaler train sample size: 1000000
scaler n_quantiles: 100000, time = 0.7408425807952881
scaling train set
scaling test set
converting dtype to <class 'numpy.float32'>
feats_train shape	(454724, 3)
targets_train shape	(454724, 5)
feats_val shape  	(272832, 3)
targets_val shape	(272832, 5)



In [5]:
# Subsample the validation split; SUB_SAMPLE_PERCENT controls how much is kept.
val_features = dataset['feats_val']
val_targets = dataset['targets_val']
x_sample, y_sample = subsample_dataset(val_features, val_targets, SUB_SAMPLE_PERCENT)
(x_sample.shape, y_sample.shape)

(TensorShape([27, 3]), TensorShape([27, 5]))

# Model creation

In [6]:
# Resolve the checkpoint folder for this particle / dropout configuration first,
# then build the model from it and grab its generator.
checkpoint_name = get_checkpoint_name(PARTICLE, CHECKPOINT_DP, DROPOUT_TYPE)
checkpoint_path = CHECKPOINT_BASE + checkpoint_name

model = MonteCarloDropoutModel(
    PARTICLE,
    dropout_rate=CHECKPOINT_DP,
    checkpoint_dir=checkpoint_path,
    debug=True,
)
generator = model.get_generator()

Generating model for proton with a dropout rate of 0.0001
Layer 0
Layer 1
Layer 2
Layer 3
Layer 4

Generator:

Model: "virtual_ensemble_model"
________________________________________________________________________________________________
 Layer (type)                              Output Shape                          Param #        
 Inputs (InputLayer)                       [(None, 3)]                           0              
                                                                                                
 NoiseInjection (NoiseInjection)           (None, 67)                            0              
                                                                                                
 Layer_0/Dense (Dense)                     (None, 128)                           8704           
                                                                                                
 Layer_0/LeakyReLU (LeakyReLU)             (None, 128)                           

## Single model prediction

In [7]:
# Switch the generator to single-model inference mode before predicting.
generator.single_model_inference_mode()
t_generated = generator.predict(x_sample)

# Header and values are emitted on separate lines.
print('Generated target:', t_generated, sep='\n')


Generated target:
[[-0.08103226 -0.12261854 -0.04804504  0.09772559  0.01369808]
 [-0.23752145 -0.1427123  -0.13845053 -0.03081239  0.11991075]
 [-0.35803813 -0.17356156 -0.1939515   0.27882186  0.10709971]
 [-0.1406405  -0.0327857   0.06726847  0.26267052  0.0725523 ]
 [-0.29181573 -0.18854555  0.04417044  0.17666477 -0.03244647]
 [-0.14478582 -0.11270503 -0.12632048  0.03210034  0.01816113]
 [-0.16994928 -0.11837488 -0.04517685 -0.01940154  0.03906797]
 [-0.3155026  -0.06509914  0.07237132  0.26501042  0.06209607]
 [-0.13202992 -0.17113003  0.03348824  0.17486927 -0.01286009]
 [-0.04872167 -0.1624827  -0.0034431   0.11652523 -0.12475193]
 [-0.12677765 -0.05199048  0.06095138 -0.03762685 -0.02320172]
 [-0.2569429  -0.10069627 -0.0960131   0.02658819  0.0396785 ]
 [-0.20108688 -0.05038339 -0.08040392 -0.02623529 -0.01493893]
 [-0.3254113  -0.1230049  -0.01371121  0.16174427 -0.01058929]
 [-0.09570699 -0.1695442  -0.0006408   0.08017218  0.07097784]
 [-0.16890696 -0.087522   -0.06962632

### Euclidean distance between real and generated targets

In [8]:
real_target = tf.constant(y_sample)
generated_target = tf.constant(t_generated)

# Row-wise L2 norm of the residual: one distance per sample (axis=1 reduces
# over the target components).
residual = tf.subtract(real_target, generated_target)
distance_single_pred = tf.norm(residual, axis=1)
print(
    'Euclidean distance (L2 norm) between real and generated targets:',
    distance_single_pred,
    sep='\n',
)

Euclidean distance (L2 norm) between real and generated targets:
tf.Tensor(
[1.0750118  1.5217443  2.343853   2.7050796  1.4576181  3.4827523
 1.5568006  1.415955   0.99176174 1.3966324  0.9213318  3.766989
 1.1762645  1.3272269  2.535774   2.375632   1.7997671  2.2176425
 3.6968408  2.3918388  1.6772178  1.7729249  1.1383957  0.89883566
 1.3817779  2.5438013  1.2953957 ], shape=(27,), dtype=float32)


In [9]:
# Persist the generated and the real targets for this particle under output_dir.
generated_path = f'{output_dir}{PARTICLE}_t_generated.npy'
real_path = f'{output_dir}{PARTICLE}_y_real.npy'
np.save(generated_path, t_generated)
np.save(real_path, y_sample)

## Monte Carlo Dropout method

In [10]:
from mcd.MCDEvaluator import MCDEvaluator

# Build an evaluator over the model with MCD_ENSEMBLE_SIZE as its ensemble size.
# NOTE(review): presumably each ensemble member is a forward pass with dropout
# active — confirm in MCDEvaluator.
mcd_evaluator = MCDEvaluator(model, MCD_ENSEMBLE_SIZE)
# evaluate() is unpacked as a pair; only the first element is kept.
mcd_uncertainty, _ = mcd_evaluator.evaluate(x_sample)
mcd_uncertainty


Generating ensemble(300) predictions


100%|██████████| 300/300 [00:04<00:00, 60.24it/s]


<tf.Tensor: shape=(27, 5), dtype=float32, numpy=
array([[0.01076716, 0.01232317, 0.00806856, 0.00986874, 0.00755464],
       [0.01440388, 0.01194277, 0.00957286, 0.010514  , 0.00940598],
       [0.01395028, 0.01180591, 0.0098174 , 0.0110699 , 0.00786071],
       [0.011423  , 0.01100715, 0.00773758, 0.00872663, 0.00626604],
       [0.01069109, 0.01177249, 0.00697898, 0.00870381, 0.00698918],
       [0.01283105, 0.01293545, 0.00773827, 0.01318455, 0.00818158],
       [0.01211472, 0.01275276, 0.00991683, 0.01055802, 0.00782606],
       [0.01179581, 0.01249789, 0.00789643, 0.0093854 , 0.008521  ],
       [0.01006566, 0.01166148, 0.00705296, 0.00979996, 0.00692104],
       [0.01297649, 0.01212419, 0.00830741, 0.00867565, 0.00822749],
       [0.01147509, 0.01327143, 0.0081468 , 0.01220207, 0.0092034 ],
       [0.01256346, 0.01391346, 0.00856029, 0.00957174, 0.00809941],
       [0.01276435, 0.01139864, 0.00766933, 0.01036213, 0.00775571],
       [0.01123512, 0.01141803, 0.00703333, 0.00969511

In [11]:
# Quick shape check of the MCD uncertainty tensor.
mcd_uncertainty.shape

TensorShape([27, 5])

In [12]:
# Persist the Monte Carlo Dropout uncertainties under output_dir.
mcd_uncertainty_path = f'{output_dir}{PARTICLE}_mcd_uncertainty.npy'
np.save(mcd_uncertainty_path, mcd_uncertainty)

## Feature Densities method

In [13]:
from feature_densities.feature_density_evaluator import FeatureDensityEvaluator

# Load the stored training embeddings for this particle and report their shape.
train_embeddings_path = f'{embeddings_dir}{PARTICLE}_train_embeddings.npy'
train_embeddings = np.load(train_embeddings_path)
print(train_embeddings.shape)

(454724, 128)


In [8]:
# Feature Densities evaluator built from the model and the loaded training
# embeddings, configured with the 'integration' likelihood method.
fd_evaluator = FeatureDensityEvaluator(
  model, train_embeddings, likelihood_method='integration', use_tf_version=True)

(454724, 128)
Generating an embeddings model
Fitting KDE functions to known embeddings


In [14]:
# Score the validation sample with the integration-based evaluator; the second
# element of the returned pair is discarded.
fd_uncertainty_integration, _ = fd_evaluator.evaluate(x_sample)

print('Feature Densities using INTEGRATION uncertainty score for x_sample:')
# Bare last expression so the notebook displays the result.
fd_uncertainty_integration

Calculating sample´s embeddings
Estimating sample´s feature densities
Calculating likelihoods with integration method


100%|██████████| 128/128 [01:45<00:00,  1.22it/s]

Feature Densities using INTEGRATION uncertainty score for x_sample:





<tf.Tensor: shape=(27,), dtype=float32, numpy=
array([0.97046876, 0.9696325 , 0.97154456, 0.96630245, 0.97286636,
       0.9746166 , 0.96553075, 0.9684683 , 0.9654089 , 0.97142977,
       0.96899945, 0.9692397 , 0.9694347 , 0.9637549 , 0.97479284,
       0.96872807, 0.9693329 , 0.9683467 , 0.96955067, 0.9686049 ,
       0.970806  , 0.97706944, 0.96915203, 0.9695235 , 0.9700593 ,
       0.97384274, 0.9706849 ], dtype=float32)>

In [15]:
# Persist the integration-based Feature Densities uncertainties under output_dir.
fd_integration_path = f'{output_dir}{PARTICLE}_fd_uncertainty_integration.npy'
np.save(fd_integration_path, fd_uncertainty_integration)

### Test with a training sample

In [26]:
# Repeat the integration-based FD scoring on a subsample of the training split
# (same SUB_SAMPLE_PERCENT as the validation sample).
x_sample_train, y_sample_train = subsample_dataset(dataset['feats_train'], dataset['targets_train'], SUB_SAMPLE_PERCENT)

# Reuses the fd_evaluator built earlier; the second returned element is discarded.
fd_uncertainty_integration_x_sample_train, _ = fd_evaluator.evaluate(x_sample_train)

print('Feature Densities using INTEGRATION uncertainty score for x_sample_train:')
fd_uncertainty_integration_x_sample_train

Fitting KDE functions to known embeddings
Calculating sample´s embeddings
Estimating sample´s feature densities
Calculating likelihoods with integration method


100%|██████████| 128/128 [02:58<00:00,  1.39s/it]

Feature Densities using INTEGRATION uncertainty score for x_sample_train:





<tf.Tensor: shape=(45,), dtype=float32, numpy=
array([0.9619562 , 0.96834004, 0.9704213 , 0.9669212 , 0.96809244,
       0.95906466, 0.9657856 , 0.96944016, 0.9678844 , 0.97052383,
       0.96470207, 0.9690555 , 0.9670368 , 0.9726632 , 0.9664482 ,
       0.9681256 , 0.96320534, 0.9627804 , 0.97448444, 0.97457176,
       0.9735359 , 0.9690675 , 0.9704136 , 0.96173954, 0.95964956,
       0.9675619 , 0.96438915, 0.9686352 , 0.9587869 , 0.9691076 ,
       0.9699587 , 0.9688715 , 0.9619854 , 0.96912503, 0.97349656,
       0.9660821 , 0.9749091 , 0.9677489 , 0.9679755 , 0.97203887,
       0.9651049 , 0.9608933 , 0.96106386, 0.9678883 , 0.9695483 ],
      dtype=float32)>

#### FD uncertainty estimation using normalized likelihood

In [15]:
# Second Feature Densities evaluator over the same model and training
# embeddings, differing from fd_evaluator only in likelihood_method='normalized'.
# NOTE(review): this constructs a fresh evaluator rather than reusing
# fd_evaluator — check whether the fitted state could be shared to save time.
fd_evaluator_normalized = FeatureDensityEvaluator(
  model, train_embeddings, use_tf_version=True, likelihood_method='normalized')

# Score the validation sample; the second returned element is discarded.
fd_uncertainty_normalized, _ = fd_evaluator_normalized.evaluate(x_sample)

print('Feature Densities using NORMALIZED uncertainty score for x_sample:')
fd_uncertainty_normalized

Generating an embeddings model
Fitting KDE functions to known embeddings
Calculating sample´s embeddings
Estimating sample´s feature densities
Calculating normalized likelihoods


  0%|          | 0/128 [01:12<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Persist the normalized-likelihood Feature Densities uncertainties under output_dir.
fd_normalized_path = f'{output_dir}{PARTICLE}_fd_uncertainty_normalized.npy'
np.save(fd_normalized_path, fd_uncertainty_normalized)

### Uncertainty visualization

### Generation of FD embeddings

In [None]:
# from feature_densities.feature_density_evaluator import create_embeddings_model
# embeddings_model = create_embeddings_model(model)

In [None]:
# train_embeddings, train_predictions = embeddings_model.predict(dataset['feats_train'])

In [None]:
# test_embeddings, test_predictions = embeddings_model.predict(dataset['feats_val'])

In [None]:
# !rm -r embeddings
# !mkdir embeddings

# np.save(embeddings_dir + f'{PARTICLE}_train_embeddings.npy', train_embeddings)
# np.save(embeddings_dir + f'{PARTICLE}_train_predictions.npy', train_predictions)
# np.save(embeddings_dir + f'{PARTICLE}_test_embeddings.npy', test_embeddings)
# np.save(embeddings_dir + f'{PARTICLE}_test_predictions.npy', test_predictions)