<a href="https://colab.research.google.com/github/evillag/uncertainty_gan/blob/main/CERN_UE_Output_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from keras.models import Model
import matplotlib.pyplot as plt
import numpy as np
import os
import tensorflow as tf

In [None]:
!git clone https://gitlab.com/lambda-hse/lhcb-rich-gan-uncertainty.git
!mv lhcb-rich-gan-uncertainty/experiments .
!mv lhcb-rich-gan-uncertainty/src .
!rm -r lhcb-rich-gan-uncertainty/
!rm -r sample_data/
!pip install tensorflow-addons

Cloning into 'lhcb-rich-gan-uncertainty'...
remote: Enumerating objects: 210, done.[K
remote: Total 210 (delta 0), reused 0 (delta 0), pack-reused 210 (from 1)[K
Receiving objects: 100% (210/210), 2.94 MiB | 13.57 MiB/s, done.
Resolving deltas: 100% (94/94), done.
Collecting tensorflow-addons
  Downloading tensorflow_addons-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (611 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.8/611.8 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.7 (from tensorflow-addons)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow-addons
Successfully installed tensorflow-addons-0.23.0 typeguard-2.13.3


In [None]:
from experiments.efficiency.uncertainty_model_train import train_model
from experiments.efficiency.uncertainty_models import uncertainty_mlp
from experiments.efficiency.uncertainty_utils import (
    efficiency_bands_with_uncertainty, efficiency_momentum_with_uncertainty)
from experiments.efficiency.utils import (
    efficiency_bands, efficiency_momentum, ensemble_and_ref_model_inference,
    ensemble_and_ref_model_inference_on_bands, tf_to_numpy_dataset,
    threshold_selection)
from src.cramer_gan_trainer import CramerGANTrainer
from src.dataset import CramerGANDataset
from src.datasets.utils_rich import (get_merged_typed_dataset,
                                     parse_dataset_np, parse_example)
from src.models.gans.discriminators.fcn_disc import RICHDiscriminator
from src.models.gans.generators.fcn_gen import RichMCDropFunc, VirtualEnsembleModel


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [None]:
# Dataset extraction: quietly unpacks rich.zip from drive/MyDrive/cern/data/
# into the working directory (nothing is downloaded here).
# NOTE(review): assumes Google Drive is already mounted at ./drive — confirm.
!unzip -qq drive/MyDrive/cern/data/rich.zip

In [None]:
# Model checkpoint extraction: quietly unpacks the dropout-0.01 checkpoint
# archive from drive/MyDrive/cern/data/ into the working directory (nothing is
# downloaded here).
# NOTE(review): assumes Google Drive is already mounted at ./drive — confirm.
!unzip -qq drive/MyDrive/cern/data/checkpoints_dropout_0.01.zip

In [None]:
# Particle types to load; only 'proton' is enabled, the full list is kept in
# the trailing comment for reference.
PARTICLES = ['proton'] # ["pion", "kaon", "muon", "proton"]
# NOTE(review): DROPOUTS, ENSEMBLES, NUM_REPS, SUB_SAMPLE_SIZE and THRESHOLD are
# not referenced by the cells visible in this notebook — presumably settings for
# the uncertainty experiments; confirm they are still needed.
DROPOUTS = [0.25, 0.3, 0.35, 0.4]
ENSEMBLES = [16, 32, 64, 128, 256]
NUM_REPS = 10
SUB_SAMPLE_SIZE = .3
THRESHOLD = 1.0

# Default `data_dir` of load_particle_datasets.
DATA_DIR = 'rich'
# Default `checkpoint_dir` of MonteCarloDroupoutModel.
CHECKPOINT_BASE = 'checkpoints'
# NOTE(review): not referenced by the cells visible here — presumably the
# checkpoint to restore; confirm.
CKPT_NUMBER = 'ckpt-21'


def get_checkpoint_name(particle):
    """Return the checkpoint name for ``particle``.

    The name is a fixed prefix (Bernoulli structured dropout, drop rate 0.01)
    followed by the formatted particle value.
    """
    prefix = 'bernoulli_structured_dropout_line_test_cramer_drop_rate_0.01_'
    return f'{prefix}{particle}'

In [None]:
def _split_by_line(df, slope=1, intercept=0):
    top_half = df[df['Brunel_ETA'] > df['Brunel_P'] * slope + intercept]
    bottom_half = df[df['Brunel_ETA'] <= df['Brunel_P'] * slope + intercept]

    top_half = top_half.reset_index(drop=True)
    bottom_half = bottom_half.reset_index(drop=True)

    return top_half, bottom_half


def split_by_line(df_train, df_test):
    """Keep disjoint sides of the default split line for train and test.

    Returns:
        Tuple of (top half of ``df_train``, bottom half of ``df_test``), both
        computed by ``_split_by_line`` with its default slope and intercept.
    """
    train_top, _ = _split_by_line(df_train)
    _, test_bottom = _split_by_line(df_test)
    return train_top, test_bottom


def load_particle_datasets(particle, data_dir=DATA_DIR):
    """Load the train/validation data of one particle and parse it to arrays.

    Args:
        particle: Particle name forwarded to ``get_merged_typed_dataset``.
        data_dir: Directory forwarded to ``get_merged_typed_dataset``.

    Returns:
        A flat dictionary for this single particle (callers are expected to
        key it by particle name themselves):
        {
            'data_train': data_train,
            'data_val': data_val,
            'scaler': scaler,
            'feats_train': feats_train,
            'targets_train': targets_train,
            'feats_val': feats_val,
            'targets_val': targets_val
        }
    """
    data_train, data_val, scaler = get_merged_typed_dataset(
        data_dir, particle, dtype=np.float32, log=True, sample_fn=split_by_line)

    # parse_dataset_np yields three values; the third one is not needed here.
    feats_train, targets_train, _ = parse_dataset_np(data_train)
    feats_val, targets_val, _ = parse_dataset_np(data_val)

    # Report the array shapes so the split sizes are visible in the cell output.
    print(f'feats_train shape\t{feats_train.shape}\n'
          f'targets_train shape\t{targets_train.shape}\n'
          f'feats_val shape  \t{feats_val.shape}\n'
          f'targets_val shape\t{targets_val.shape}\n')

    particle_data = {
        'data_train': data_train,
        'data_val': data_val,
        'scaler': scaler,
        'feats_train': feats_train,
        'targets_train': targets_train,
        'feats_val': feats_val,
        'targets_val': targets_val
    }
    return particle_data

# One entry per configured particle: particle name -> dict returned by load_particle_datasets.
datasets = {name: load_particle_datasets(name) for name in PARTICLES}

Reading and concatenating datasets:
	rich/proton_+_down_2016_.csv
	rich/proton_-_down_2016_.csv
	rich/proton_-_up_2016_.csv
	rich/proton_+_up_2016_.csv
splitting to train/val/test
fitting the scaler
scaler train sample size: 1000000
scaler n_quantiles: 100000, time = 1.9124493598937988
scaling train set
scaling test set
converting dtype to <class 'numpy.float32'>
feats_train shape	(454690, 3)
targets_train shape	(454690, 5)
feats_val shape  	(272463, 3)
targets_val shape	(272463, 5)



In [None]:
class MonteCarloDroupoutModel:
    """Builds a Monte Carlo dropout generator for one particle and restores it.

    The constructor creates a ``RichMCDropFunc`` generator and a
    ``RICHDiscriminator``, hands them to a ``CramerGANTrainer`` together with
    two legacy RMSprop optimizers, and calls ``restore_last()`` on the trainer.

    Args:
        particle: Particle name; used in the log message and the string form.
        dropout_rate: Forwarded to the generator as ``drop_rate``.
        log_dir: Forwarded to the trainer as ``log_dir``.
        checkpoint_dir: Forwarded to the trainer as ``checkpoint_dir``.
        debug: When true, print both model summaries and the checkpoint path.
    """

    def __init__(self, particle, dropout_rate,
                 log_dir='log_dir_tmp',
                 checkpoint_dir=CHECKPOINT_BASE,
                 debug=False):

        self.particle = particle
        self.dropout_rate = dropout_rate
        self.log_dir = log_dir
        # NOTE(review): the default does not depend on `particle`; confirm the
        # trainer finds the particle-specific checkpoint under this directory.
        self.checkpoint_dir = checkpoint_dir

        print(f'Generating model for {particle} with a dropout rate of {dropout_rate}')

        self._gen_config = {
            'drop_rate': dropout_rate,
            'dropout_type': 'bernoulli',
        }

        self._generator = RichMCDropFunc(**self._gen_config)
        # Build with a 3-column input (any batch size).
        self._generator.build((None, 3))
        self._discriminator = RICHDiscriminator()

        if debug:
            print("\nGenerator:\n")
            self._print_summary(self._generator, line_length=96)
            print("\nDiscriminator:\n")
            self._print_summary(self._discriminator)
            print(f"\nCheckpoint path: {self.checkpoint_dir}\n")

        # Model was trained with tensorflow 2.10.1, use the legacy optimizer
        self._generator_optimizer = tf.keras.optimizers.legacy.RMSprop(2e-4)
        self._discriminator_optimizer = tf.keras.optimizers.legacy.RMSprop(2e-4)

        self._trainer_config = {
            'generator': self._generator,
            'discriminator': self._discriminator,
            'generator_optimizer': self._generator_optimizer,
            'discriminator_optimizer': self._discriminator_optimizer,
            'checkpoint_dir': self.checkpoint_dir,
            'log_dir': log_dir
        }

        trainer = CramerGANTrainer(**self._trainer_config)
        # Restore pretrained model
        trainer.restore_last()

    @staticmethod
    def _print_summary(model, **summary_kwargs):
        """Show ``model.summary()`` without emitting a stray ``None``.

        Keras' ``summary()`` prints the table itself and returns ``None``, so
        wrapping it in ``print()`` adds a spurious "None" line. A non-``None``
        return value is still printed, in case a model returns its summary.
        """
        summary = model.summary(**summary_kwargs)
        if summary is not None:
            print(summary)

    def __str__(self):
        """Return ``"<particle>_<dropout_rate>"`` for ``str()`` and f-strings."""
        return f"{self.particle}_{self.dropout_rate}"

    def str(self):
        """Backward-compatible alias of ``__str__`` for existing callers."""
        return self.__str__()

    def get_generator(self) -> VirtualEnsembleModel:
        """Return the generator built (and restored) by the constructor."""
        return self._generator

In [None]:
# Build the proton generator with a 0.01 dropout rate and restore its
# checkpoint; debug=True also prints the model summaries and checkpoint path.
mc_model = MonteCarloDroupoutModel('proton', .01, debug=True)
gen1 = mc_model.get_generator()
# NOTE(review): presumably makes the virtual ensemble behave as a single model
# at inference time — confirm against VirtualEnsembleModel in src.
gen1.single_model_inference_mode()
gen1.summary()

Generating model for proton with a dropout rate of 0.01
Layer 0
Layer 1
Layer 2
Layer 3
Layer 4

Generator:

Model: "virtual_ensemble_model"
________________________________________________________________________________________________
 Layer (type)                              Output Shape                          Param #        
 Inputs (InputLayer)                       [(None, 3)]                           0              
                                                                                                
 NoiseInjection (NoiseInjection)           (None, 67)                            0              
                                                                                                
 Layer_0/Dense (Dense)                     (None, 128)                           8704           
                                                                                                
 Layer_0/LeakyReLU (LeakyReLU)             (None, 128)                           0 

In [None]:
def create_generator(dropout_rate, dropout_type='bernoulli'):
    """Create and build a new ``RichMCDropFunc`` generator.

    No weights are loaded here; callers copy them in afterwards if needed.

    Args:
        dropout_rate: Forwarded to the generator as ``drop_rate``.
        dropout_type: Forwarded to the generator as ``dropout_type``.

    Returns:
        The generator, built for a 3-column input with any batch size.
    """
    generator = RichMCDropFunc(drop_rate=dropout_rate, dropout_type=dropout_type)
    generator.build((None, 3))
    return generator

In [None]:
# Solution to read embeddings of any layer:

# 1. Create a new generator with the same architecture and copy gen1's weights into it
new_gen = create_generator(.01)
new_gen.set_weights(gen1.get_weights())

# 2. Select the generator input and the output of the layer of interest
input_layer = new_gen.input
# NOTE(review): the print below labels this output "Layer 4" — confirm that
# position 14 of `new_gen.layers` is the intended layer.
output_layer = new_gen.layers[14].output  # position 14 in new_gen.layers (0-based)

# 3. Create `new_model`, which returns both the intermediate layer output and the final output
new_model = Model(input_layer, [output_layer, new_gen.output])

# 4. Use `new_model` to predict on any input vector and get the embeddings
# Smoke test on a single random 3-feature row (unseeded, so values differ between runs).
input_data = np.random.rand(1, 3)
embedding, prediction = new_model.predict(input_data)

print('Layer 4 embedding:', embedding)
print('Final prediction:', prediction)

Layer 0
Layer 1
Layer 2
Layer 3
Layer 4
Layer 4 embedding: [[-0.06809298 -0.18732685  0.09716857 -0.11373783  0.00190123  0.290569
  -0.18462771 -0.0034719   0.11210608  0.11586802 -0.04847524 -0.03848957
   0.19875628  0.15195915 -0.28900376 -0.03553712 -0.05702229  0.13993752
   0.08076362  0.08133171  0.1166122  -0.3413875  -0.06982535  0.236406
  -0.03750167  0.12933367 -0.1591197   0.15464732  0.10413228  0.15769379
  -0.10908235 -0.10817184  0.14233832 -0.10946845 -0.31025255  0.04373488
  -0.14985712 -0.088806    0.24635313 -0.11679609  0.20422535  0.10386544
  -0.05323229  0.1349889  -0.15639372 -0.234209    0.07865154 -0.22183321
   0.09174708  0.06153359 -0.03441417  0.1776678  -0.05372012  0.0345464
   0.17155069 -0.08633559 -0.40486044 -0.24406838  0.23207858 -0.01874956
  -0.133167   -0.22324546  0.1348783  -0.06088768  0.23631108  0.39888772
   0.32739127 -0.09040491  0.00548531  0.2437619  -0.26109856 -0.00204775
   0.13505948  0.12771133 -0.3024594  -0.20442452  0.12383

In [None]:
# Output directory for the extracted arrays.
# NOTE: `dir` shadows the builtin of the same name; the name is kept because
# the following cell builds its output paths from it.
dir = 'drive/MyDrive/Colab Notebooks/outputs/proton/'
# np.save does not create missing parent directories, so make sure the target
# exists before writing (no-op when it is already there).
os.makedirs(dir, exist_ok=True)

dataset = datasets['proton']

train_feats = dataset['feats_train']
train_targets = dataset['targets_train']
test_feats = dataset['feats_val']
test_targets = dataset['targets_val']

print('Train features shape:', train_feats.shape)
print('Train targets shape:', train_targets.shape)
print('Test features shape:', test_feats.shape)
print('Test targets shape:', test_targets.shape)

# Persist the model inputs and reference targets alongside the embeddings.
np.save(dir + 'train_feats.npy', train_feats)
np.save(dir + 'train_targets.npy', train_targets)
np.save(dir + 'test_feats.npy', test_feats)
np.save(dir + 'test_targets.npy', test_targets)

Train features shape: (454690, 3)
Train targets shape: (454690, 5)
Test features shape: (272463, 3)
Test targets shape: (272463, 5)


In [None]:
# Run the complete train and test feature sets through the two-output model;
# each call yields (intermediate layer output, final generator output).
train_embeddings, train_predictions = new_model.predict(train_feats)
test_embeddings, test_predictions = new_model.predict(test_feats)

# Write every array to the output directory, in the same order as before.
for file_name, array in (
        ('train_embeddings.npy', train_embeddings),
        ('train_predictions.npy', train_predictions),
        ('test_embeddings.npy', test_embeddings),
        ('test_predictions.npy', test_predictions),
):
    np.save(dir + file_name, array)

