<a href="https://colab.research.google.com/github/evillag/uncertainty_gan/blob/main/test_bench/TestBench.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
IN_COLAB = True

try:
  import google.colab
  # Using Google Drive
  from google.colab import drive
  drive.mount('/content/drive')

  !git clone https://github.com/evillag/uncertainty_gan.git
  !mv uncertainty_gan/mcd .
  !mv uncertainty_gan/feature_densities .
  !mv uncertainty_gan/test_bench .
  %rm -rf uncertainty_gan/

  !git clone https://gitlab.com/lambda-hse/lhcb-rich-gan-uncertainty.git
  !mv lhcb-rich-gan-uncertainty/experiments .
  !mv lhcb-rich-gan-uncertainty/src .
  %rm -rf lhcb-rich-gan-uncertainty/
  %rm -rf sample_data/
  %pip install tensorflow-addons

  # Dataset download and extraction
  !unzip -qq drive/MyDrive/cern/data/rich.zip

  # Model checkpoint download and extraction
  !unzip -qq drive/MyDrive/cern/data/checkpoints_dropout_0.01.zip

  # Model embeddings download and extraction
  !unzip -qq drive/MyDrive/cern/data/embeddings.zip

except:
  IN_COLAB = False

print(f'IN_COLAB: {IN_COLAB}')

Mounted at /content/drive
Cloning into 'uncertainty_gan'...
remote: Enumerating objects: 123, done.[K
remote: Counting objects: 100% (123/123), done.[K
remote: Compressing objects: 100% (101/101), done.[K
remote: Total 123 (delta 58), reused 50 (delta 17), pack-reused 0[K
Receiving objects: 100% (123/123), 17.33 MiB | 10.60 MiB/s, done.
Resolving deltas: 100% (58/58), done.
Cloning into 'lhcb-rich-gan-uncertainty'...
remote: Enumerating objects: 210, done.[K
remote: Total 210 (delta 0), reused 0 (delta 0), pack-reused 210 (from 1)[K
Receiving objects: 100% (210/210), 2.94 MiB | 20.35 MiB/s, done.
Resolving deltas: 100% (94/94), done.
Collecting tensorflow-addons
  Downloading tensorflow_addons-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (611 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.8/611.8 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.7 (from tensorflow-addons)
  Downloading typeguard-2.13.3-py3-n

In [2]:
import numpy as np

from test_bench import get_checkpoint_name, load_particle_datasets, subsample_dataset
from test_bench.model import MonteCarloDropoutModel


# Test Bench for the Monte Carlo Dropout and Feature Densities methods

1. Select sample data
2. Create a model
3. Generate predictions with the generator in single-model inference mode
4. Estimate uncertainty with the Monte Carlo Dropout (MCD) method
5. Estimate uncertainty with the Feature Densities (FD) method

In [5]:
# Parameters
# Particle name; used to load the dataset, build the model, resolve the
# checkpoint name and name the embeddings files.
PARTICLE = 'pion'
# Dropout rate passed to the model and used to resolve the checkpoint name.
CHECKPOINT_DP = 0.01
# Dropout variant used to resolve the checkpoint name.
DROPOUT_TYPE = 'bernoulli_structured'
# Prefix prepended to the checkpoint name to form the checkpoint directory.
CHECKPOINT_BASE = 'checkpoints/'
# Directory the particle CSV files are read from.
DATA_DIR = 'rich/'
# Passed to subsample_dataset. Despite the name this is a fraction, not a
# percentage: 0.02 kept 10528 of 526449 validation rows in the recorded run.
SUB_SAMPLE_PERCENT = 0.02

# MCD parameters
# Ensemble size passed to the Monte Carlo Dropout evaluator.
MCD_ENSEMBLE_SIZE = 300

# FD parameters
# Directory prefix for the saved embeddings (.npy) files. Plain string: the
# original f-string had no placeholders.
embeddings_dir = 'embeddings/'

# Data loading and sample selection

In [4]:
# Load the data for PARTICLE from DATA_DIR. The result is indexed by split
# name in later cells (e.g. 'feats_val', 'targets_val', 'feats_train').
dataset = load_particle_datasets(PARTICLE, DATA_DIR)

Reading and concatenating datasets:
	rich/pion_-_down_2016_.csv
	rich/pion_+_up_2016_.csv
	rich/pion_-_up_2016_.csv
	rich/pion2_+_down_2016_.csv
	rich/pion2_-_up_2016_.csv
	rich/pion_+_down_2016_.csv
	rich/pion2_+_up_2016_.csv
	rich/pion2_-_down_2016_.csv
splitting to train/val/test
fitting the scaler
scaler train sample size: 2000000
scaler n_quantiles: 100000, time = 2.3905060291290283
scaling train set
scaling test set
converting dtype to <class 'numpy.float32'>
feats_train shape	(948527, 3)
targets_train shape	(948527, 5)
feats_val shape  	(526449, 3)
targets_val shape	(526449, 5)



In [6]:
# Sub-sample the validation features and targets together, using the
# SUB_SAMPLE_PERCENT setting from the parameters cell.
x_sample, y_sample = subsample_dataset(
    dataset['feats_val'],
    dataset['targets_val'],
    SUB_SAMPLE_PERCENT,
)
# Display both shapes as a quick sanity check.
(x_sample.shape, y_sample.shape)

(TensorShape([10528, 3]), TensorShape([10528, 5]))

# Model creation

In [7]:
# Build the Monte Carlo Dropout model for PARTICLE. The checkpoint directory
# is CHECKPOINT_BASE followed by the name resolved from the particle, dropout
# rate and dropout type.
model = MonteCarloDropoutModel(
    PARTICLE,
    dropout_rate=CHECKPOINT_DP,
    checkpoint_dir=f'{CHECKPOINT_BASE}{get_checkpoint_name(PARTICLE, CHECKPOINT_DP, DROPOUT_TYPE)}',
    debug=True,
)
# Keep a handle on the generator; the single-model prediction cell uses it.
generator = model.get_generator()

Generating model for pion with a dropout rate of 0.01
Layer 0
Layer 1
Layer 2
Layer 3
Layer 4

Generator:

Model: "virtual_ensemble_model"
________________________________________________________________________________________________
 Layer (type)                              Output Shape                          Param #        
 Inputs (InputLayer)                       [(None, 3)]                           0              
                                                                                                
 NoiseInjection (NoiseInjection)           (None, 67)                            0              
                                                                                                
 Layer_0/Dense (Dense)                     (None, 128)                           8704           
                                                                                                
 Layer_0/LeakyReLU (LeakyReLU)             (None, 128)                           0   

## Single model prediction

In [8]:
# Put the generator into single-model inference mode before predicting, then
# generate outputs for the sub-sampled features.
# NOTE(review): presumably this mode disables the dropout-based ensembling used
# by the MCD evaluation below -- confirm against the model implementation.
generator.single_model_inference_mode()
t_generated = generator.predict(x_sample)
# Bare last expression so the notebook renders the generated array.
t_generated



array([[-0.21959116, -0.9212151 , -0.22168373, -0.7220314 , -0.81101686],
       [ 1.1004065 , -0.8272944 ,  1.1263113 , -0.19565034,  0.22770289],
       [ 1.0872381 , -0.47873074,  0.6419121 , -0.82812566, -1.073178  ],
       ...,
       [-0.11386883, -1.057545  ,  0.06267808, -0.86642975, -0.93954736],
       [ 1.2638797 , -0.9108536 ,  1.1304615 , -0.7055115 , -0.4810682 ],
       [ 0.4428744 ,  0.08550289,  0.2506917 , -0.36984533,  1.4528008 ]],
      dtype=float32)

## Monte Carlo Dropout method

In [10]:
from mcd.MCDEvaluator import evaluate_model as mcd_evaluate_model

# Run the Monte Carlo Dropout evaluation on the sample with an ensemble of
# MCD_ENSEMBLE_SIZE predictions. Only the first returned value (the
# uncertainty) is kept; the second is discarded.
mcd_uncertainty, _ = mcd_evaluate_model(
    model,
    x_sample,
    MCD_ENSEMBLE_SIZE,
)
mcd_uncertainty


Generating ensemble(300) predictions


100%|██████████| 300/300 [00:11<00:00, 25.02it/s]


<tf.Tensor: shape=(10528, 5), dtype=float32, numpy=
array([[0.32179454, 0.56514   , 0.54485375, 0.5963936 , 0.6756564 ],
       [0.3774602 , 0.38062468, 0.32571095, 0.4942838 , 0.82462764],
       [0.13380627, 0.3833909 , 0.10821614, 0.6469189 , 0.7369904 ],
       ...,
       [0.3395108 , 0.45304912, 0.5515006 , 0.4815188 , 0.561837  ],
       [0.3831384 , 0.4118706 , 0.42855138, 0.5131822 , 0.69099325],
       [0.26160944, 0.27090713, 0.18020515, 0.40333822, 1.0599014 ]],
      dtype=float32)>

In [11]:
# Sanity check: display the shape of the MCD uncertainty tensor so it can be
# compared with the sample shapes shown earlier.
mcd_uncertainty.shape

TensorShape([10528, 5])

## Feature Densities method

### Uncertainty estimation

In [None]:
from feature_densities.feature_density_evaluator import evaluate_model as fd_evaluate_model

# Load the saved training-set embeddings for PARTICLE and show their shape.
train_embeddings = np.load(f'{embeddings_dir}{PARTICLE}_train_embeddings.npy')
print(train_embeddings.shape)

# Feature Densities evaluation using the 'integration' likelihood method.
# Only the first returned value (the uncertainty) is kept.
fd_uncertainty, _ = fd_evaluate_model(
    model,
    x_sample,
    known_embeddings=train_embeddings,
    likelihood_method='integration',
)

print('Feature Densities using INTEGRATION uncertainty score for x_sample:')
fd_uncertainty

(948325, 128)
Generating an embeddings model
Fitting KDE functions to known embeddings
Calculating sample´s embeddings
Estimating sample´s feature densities
Calculating likelihoods with integration method


  0%|          | 0/128 [00:00<?, ?it/s]

In [None]:
# Same Feature Densities evaluation, this time with the 'normalized'
# likelihood method. This reuses `train_embeddings` from the previous cell and
# overwrites `fd_uncertainty`.
fd_uncertainty, _ = fd_evaluate_model(
    model,
    x_sample,
    known_embeddings=train_embeddings,
    likelihood_method='normalized',
)

print('Feature Densities using NORMALIZED uncertainty score for x_sample:')
fd_uncertainty

### Generation of FD embeddings

In [None]:
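# Optional step, disabled by default: uncomment this cell and the following
# ones to regenerate the embeddings files stored under `embeddings_dir`.
# from feature_densities.feature_density_evaluator import create_embeddings_model
# embeddings_model = create_embeddings_model(model)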

In [None]:
# train_embeddings, train_predictions = embeddings_model.predict(dataset['feats_train'])

In [None]:
# test_embeddings, test_predictions = embeddings_model.predict(dataset['feats_val'])

In [None]:
# !rm -r embeddings
# !mkdir embeddings

# np.save(embeddings_dir + f'{PARTICLE}_train_embeddings.npy', train_embeddings)
# np.save(embeddings_dir + f'{PARTICLE}_train_predictions.npy', train_predictions)
# np.save(embeddings_dir + f'{PARTICLE}_test_embeddings.npy', test_embeddings)
# np.save(embeddings_dir + f'{PARTICLE}_test_predictions.npy', test_predictions)