<a href="https://colab.research.google.com/github/evillag/uncertainty_gan/blob/main/test_bench/TestBench.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

mkdir: cannot create directory ‘/content/drive/MyDrive/cern/data/results’: File exists


In [1]:
IN_COLAB = True

try:
  import google.colab
  # Using Google Drive
  from google.colab import drive
  drive.mount('/content/drive')

  !git clone https://github.com/evillag/uncertainty_gan.git
  !mv uncertainty_gan/mcd .
  !mv uncertainty_gan/feature_densities .
  !mv uncertainty_gan/test_bench .
  %rm -rf uncertainty_gan/

  !git clone https://gitlab.com/lambda-hse/lhcb-rich-gan-uncertainty.git
  !mv lhcb-rich-gan-uncertainty/experiments .
  !mv lhcb-rich-gan-uncertainty/src .
  %rm -rf lhcb-rich-gan-uncertainty/
  %rm -rf sample_data/
  %pip install tensorflow-addons

  # Dataset download and extraction
  !unzip -qq drive/MyDrive/cern/data/rich.zip

  # Model checkpoint download and extraction
  !unzip -qq drive/MyDrive/cern/data/checkpoints_dropout_0.01.zip

  # Model embeddings download and extraction
  !unzip -qq drive/MyDrive/cern/data/embeddings.zip

  # Results folder creation
  !mkdir /content/drive/MyDrive/cern/data/results

except:
  IN_COLAB = False

print(f'IN_COLAB: {IN_COLAB}')

Mounted at /content/drive
Cloning into 'uncertainty_gan'...
remote: Enumerating objects: 131, done.[K
remote: Counting objects: 100% (131/131), done.[K
remote: Compressing objects: 100% (107/107), done.[K
remote: Total 131 (delta 64), reused 52 (delta 19), pack-reused 0[K
Receiving objects: 100% (131/131), 17.34 MiB | 10.81 MiB/s, done.
Resolving deltas: 100% (64/64), done.
Cloning into 'lhcb-rich-gan-uncertainty'...
remote: Enumerating objects: 210, done.[K
remote: Total 210 (delta 0), reused 0 (delta 0), pack-reused 210 (from 1)[K
Receiving objects: 100% (210/210), 2.94 MiB | 8.75 MiB/s, done.
Resolving deltas: 100% (94/94), done.
Collecting tensorflow-addons
  Downloading tensorflow_addons-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (611 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.8/611.8 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.7 (from tensorflow-addons)
  Downloading typeguard-2.13.3-py3-no

In [2]:
import os

import numpy as np

from test_bench import get_checkpoint_name, load_particle_datasets, subsample_dataset
from test_bench.model import MonteCarloDropoutModel


# Test Bench for the Monte Carlo Dropout and Feature Density methods

1. Select sample data
2. Create a model
3. Generate a single target with single inference mode
4. Estimate MCD uncertainty
5. Estimate FD uncertainty

In [25]:
# Parameters
PARTICLE = 'pion'
CHECKPOINT_DP = 0.01
DROPOUT_TYPE = 'bernoulli_structured'
CHECKPOINT_BASE = 'checkpoints/'
DATA_DIR = 'rich/'
# NOTE(review): despite the name this looks like a fraction, not a percentage:
# 0.0001 of the 526449 validation rows produced a 52-row sample. Confirm against
# subsample_dataset.
SUB_SAMPLE_PERCENT = 0.0001

# MCD parameters
MCD_ENSEMBLE_SIZE = 300

# FD parameters
embeddings_dir = 'embeddings/'

# Save results path (kept as a string with a trailing slash because later cells
# build file names with `output_dir + ...`)
output_dir = 'results/'
if IN_COLAB:
  output_dir = f'/content/drive/MyDrive/cern/data/{output_dir}'

# np.save does not create missing directories, so make sure the target exists
# before any result is written (the local 'results/' folder is not created
# anywhere else in this notebook).
os.makedirs(output_dir, exist_ok=True)

# Data loading and sample selection

In [4]:
# Load the datasets for PARTICLE from DATA_DIR. The result is indexed by name
# in later cells ('feats_val' and 'targets_val' are the entries used here).
dataset = load_particle_datasets(PARTICLE, DATA_DIR)

Reading and concatenating datasets:
	rich/pion_-_down_2016_.csv
	rich/pion_+_up_2016_.csv
	rich/pion_-_up_2016_.csv
	rich/pion2_+_down_2016_.csv
	rich/pion2_-_up_2016_.csv
	rich/pion_+_down_2016_.csv
	rich/pion2_+_up_2016_.csv
	rich/pion2_-_down_2016_.csv
splitting to train/val/test
fitting the scaler
scaler train sample size: 2000000
scaler n_quantiles: 100000, time = 2.3753974437713623
scaling train set
scaling test set
converting dtype to <class 'numpy.float32'>
feats_train shape	(948527, 3)
targets_train shape	(948527, 5)
feats_val shape  	(526449, 3)
targets_val shape	(526449, 5)



In [26]:
# Subsample the validation features and targets; SUB_SAMPLE_PERCENT sets the size
x_sample, y_sample = subsample_dataset(
    dataset['feats_val'],
    dataset['targets_val'],
    SUB_SAMPLE_PERCENT,
)
# Show both shapes as the cell output
(x_sample.shape, y_sample.shape)

(TensorShape([52, 3]), TensorShape([52, 5]))

# Model creation

In [6]:
# Build the model for PARTICLE from the checkpoint that matches the configured
# dropout rate and dropout type, then grab its generator for the cells below.
model = MonteCarloDropoutModel(
    PARTICLE,
    dropout_rate=CHECKPOINT_DP,
    checkpoint_dir=f'{CHECKPOINT_BASE}{get_checkpoint_name(PARTICLE, CHECKPOINT_DP, DROPOUT_TYPE)}',
    debug=True,
)
generator = model.get_generator()

Generating model for pion with a dropout rate of 0.01
Layer 0
Layer 1
Layer 2
Layer 3
Layer 4

Generator:

Model: "virtual_ensemble_model"
________________________________________________________________________________________________
 Layer (type)                              Output Shape                          Param #        
 Inputs (InputLayer)                       [(None, 3)]                           0              
                                                                                                
 NoiseInjection (NoiseInjection)           (None, 67)                            0              
                                                                                                
 Layer_0/Dense (Dense)                     (None, 128)                           8704           
                                                                                                
 Layer_0/LeakyReLU (LeakyReLU)             (None, 128)                           0   

## Single model prediction

In [27]:
# Switch the generator to single-model inference before predicting, so the
# targets below come from one forward pass rather than an ensemble
# (NOTE(review): presumably this disables dropout at inference; confirm in
# test_bench.model).
generator.single_model_inference_mode()
# Generate one set of targets for the sampled features
t_generated = generator.predict(x_sample)
t_generated



array([[ 1.71494377e+00, -8.75371277e-01,  1.38217902e+00,
        -1.79405713e+00, -1.20285940e+00],
       [-8.63066077e-01, -5.68030655e-01, -3.46026927e-01,
        -4.68910396e-01, -5.03679872e-01],
       [-3.78945112e-01, -1.95473480e+00, -4.04925525e-01,
        -1.53682017e+00, -1.62722826e+00],
       [-7.26419032e-01, -1.88643575e+00, -3.21941942e-01,
        -1.25430036e+00, -1.37325430e+00],
       [ 2.02215552e+00, -1.96952581e+00,  1.80116665e+00,
        -1.58964348e+00, -1.64682269e+00],
       [ 8.42753232e-01, -9.59526956e-01,  1.06354046e+00,
        -4.87225056e-01, -1.00511372e-01],
       [-5.34297824e-01, -1.86930275e+00, -2.22154126e-01,
        -1.45167756e+00, -1.59293771e+00],
       [ 6.89221859e-01, -1.10910416e+00,  1.15184903e+00,
        -1.10462689e+00, -7.00429618e-01],
       [-1.08927584e+00,  1.23493612e-01, -1.47566342e+00,
         2.55021811e-01,  2.40261644e-01],
       [ 1.17554951e+00,  1.13552022e+00,  4.78784591e-01,
         2.42643428e+00

In [28]:
# Persist the generated targets next to the real ones for later comparison
np.save(f'{output_dir}{PARTICLE}_t_generated.npy', t_generated)
np.save(f'{output_dir}{PARTICLE}_y_real.npy', y_sample)

## Monte Carlo Dropout method

In [29]:
from mcd.MCDEvaluator import evaluate_model as mcd_evaluate_model

# Estimate the MCD uncertainty of the sample with an ensemble of
# MCD_ENSEMBLE_SIZE predictions; the second returned value is not needed here.
mcd_uncertainty, _ = mcd_evaluate_model(
    model,
    x_sample,
    MCD_ENSEMBLE_SIZE,
)
mcd_uncertainty


Generating ensemble(300) predictions


100%|██████████| 300/300 [00:09<00:00, 31.31it/s]


<tf.Tensor: shape=(52, 5), dtype=float32, numpy=
array([[0.11138407, 0.32759705, 0.08113888, 0.43188152, 0.8597005 ],
       [0.46332255, 0.41670755, 0.784654  , 0.4325334 , 0.47862163],
       [0.43393648, 0.5726564 , 0.61083406, 0.6372936 , 0.7354951 ],
       [0.43797037, 0.45652518, 0.6370475 , 0.4589556 , 0.5417607 ],
       [0.39896107, 0.4032338 , 0.3709528 , 0.46522623, 0.71632975],
       [0.38586852, 0.44261652, 0.3861187 , 0.6537902 , 0.9874884 ],
       [0.3414087 , 0.5845458 , 0.56717235, 0.53932136, 0.6436421 ],
       [0.21178134, 0.25543192, 0.14722744, 0.3583195 , 1.0692273 ],
       [0.23356725, 0.36762312, 0.31954572, 0.37006164, 0.37459737],
       [0.2843924 , 0.34751424, 0.2608903 , 0.5008928 , 0.86389905],
       [0.4463591 , 0.43958482, 0.51119494, 0.47689703, 0.6413875 ],
       [0.25449702, 0.38027492, 0.5036256 , 0.42295074, 0.49591246],
       [0.3798613 , 0.5347673 , 0.64450645, 0.6072182 , 0.711764  ],
       [0.34339485, 0.42521304, 0.7455092 , 0.4423738 

In [30]:
# Sanity check: the uncertainty tensor is expected to have the same shape as the
# sampled targets (one value per sample and target column).
mcd_uncertainty.shape

TensorShape([52, 5])

In [32]:
# Persist the Monte Carlo Dropout uncertainties
np.save(f'{output_dir}{PARTICLE}_mcd_uncertainty.npy', mcd_uncertainty)

## Feature Densities method

### Uncertainty estimation

In [33]:
from feature_densities.feature_density_evaluator import evaluate_model as fd_evaluate_model

# Precomputed embeddings of the training set, used as the known reference data
train_embeddings = np.load(f'{embeddings_dir}{PARTICLE}_train_embeddings.npy')
print(train_embeddings.shape)

# Feature Densities uncertainty with the 'integration' likelihood method; the
# second returned value is not needed here.
fd_uncertainty_integration, _ = fd_evaluate_model(
    model,
    x_sample,
    known_embeddings=train_embeddings,
    likelihood_method='integration',
)

print('Feature Densities using INTEGRATION uncertainty score for x_sample:')
fd_uncertainty_integration

(948325, 128)
Generating an embeddings model
Fitting KDE functions to known embeddings
Calculating sample´s embeddings
Estimating sample´s feature densities


100%|██████████| 52/52 [10:21<00:00, 11.95s/it]

Feature Densities using INTEGRATION uncertainty score for x_sample:





<tf.Tensor: shape=(52,), dtype=float64, numpy=
array([0.99696795, 0.98386122, 0.9930798 , 0.99029255, 0.99388116,
       0.99106039, 0.98962542, 0.99185572, 0.98346811, 0.99297008,
       0.99250436, 0.99122989, 0.99037003, 0.97890695, 0.99057272,
       0.99336079, 0.98162482, 0.9881128 , 0.98746889, 0.99346868,
       0.99161914, 0.98390812, 0.98245716, 0.98058026, 0.97937096,
       0.99569436, 0.98796848, 0.9909012 , 0.98491409, 0.99193857,
       0.98075132, 0.99061284, 0.99276764, 0.98459823, 0.98762967,
       0.9787404 , 0.98669493, 0.98364207, 0.98975249, 0.98866702,
       0.99224149, 0.9924707 , 0.99271423, 0.99475767, 0.99674197,
       0.99607984, 0.99230945, 0.98954991, 0.99243532, 0.99160392,
       0.98236469, 0.98252359])>

In [34]:
# Persist the Feature Densities uncertainties (integration method)
np.save(f'{output_dir}{PARTICLE}_fd_uncertainty_integration.npy', fd_uncertainty_integration)

In [None]:
# Feature Densities uncertainty with the 'normalized' likelihood method, using
# the same training embeddings as reference; the second returned value is not
# needed here.
fd_uncertainty_normalized, _ = fd_evaluate_model(
    model,
    x_sample,
    known_embeddings=train_embeddings,
    likelihood_method='normalized',
)

print('Feature Densities using NORMALIZED uncertainty score for x_sample:')
fd_uncertainty_normalized

Generating an embeddings model
Fitting KDE functions to known embeddings
Calculating sample´s embeddings
Estimating sample´s feature densities


In [None]:
# Persist the Feature Densities uncertainties (normalized method)
np.save(f'{output_dir}{PARTICLE}_fd_uncertainty_normalized.npy', fd_uncertainty_normalized)

### Generation of FD embeddings

In [None]:
# Disabled on purpose: the notebook normally loads precomputed embeddings from
# embeddings_dir. Uncomment this cell and the ones below only to regenerate them.
# from feature_densities.feature_density_evaluator import create_embeddings_model
# embeddings_model = create_embeddings_model(model)

In [None]:
# train_embeddings, train_predictions = embeddings_model.predict(dataset['feats_train'])

In [None]:
# test_embeddings, test_predictions = embeddings_model.predict(dataset['feats_val'])

In [None]:
# Disabled on purpose. WARNING: when enabled, this first deletes the existing
# embeddings folder and then writes the freshly computed train/test embeddings
# and predictions into a new one.
# !rm -r embeddings
# !mkdir embeddings

# np.save(embeddings_dir + f'{PARTICLE}_train_embeddings.npy', train_embeddings)
# np.save(embeddings_dir + f'{PARTICLE}_train_predictions.npy', train_predictions)
# np.save(embeddings_dir + f'{PARTICLE}_test_embeddings.npy', test_embeddings)
# np.save(embeddings_dir + f'{PARTICLE}_test_predictions.npy', test_predictions)