<a href="https://colab.research.google.com/github/evillag/uncertainty_gan/blob/main/CERN_UE_Output_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from keras.models import Model
import matplotlib.pyplot as plt
import numpy as np
import os
import tensorflow as tf

In [None]:
!git clone https://gitlab.com/lambda-hse/lhcb-rich-gan-uncertainty.git
!mv lhcb-rich-gan-uncertainty/experiments .
!mv lhcb-rich-gan-uncertainty/src .
!rm -r lhcb-rich-gan-uncertainty/
!rm -r sample_data/
!pip install tensorflow-addons

Cloning into 'lhcb-rich-gan-uncertainty'...
remote: Enumerating objects: 210, done.[K
remote: Total 210 (delta 0), reused 0 (delta 0), pack-reused 210 (from 1)[K
Receiving objects: 100% (210/210), 2.94 MiB | 14.55 MiB/s, done.
Resolving deltas: 100% (94/94), done.
mv: cannot move 'lhcb-rich-gan-uncertainty/experiments' to './experiments': Directory not empty
mv: cannot move 'lhcb-rich-gan-uncertainty/src' to './src': Directory not empty
rm: cannot remove 'sample_data/': No such file or directory


In [None]:
from experiments.efficiency.uncertainty_model_train import train_model
from experiments.efficiency.uncertainty_models import uncertainty_mlp
from experiments.efficiency.uncertainty_utils import (
    efficiency_bands_with_uncertainty, efficiency_momentum_with_uncertainty)
from experiments.efficiency.utils import (
    efficiency_bands, efficiency_momentum, ensemble_and_ref_model_inference,
    ensemble_and_ref_model_inference_on_bands, tf_to_numpy_dataset,
    threshold_selection)
from src.cramer_gan_trainer import CramerGANTrainer
from src.dataset import CramerGANDataset
from src.datasets.utils_rich import (get_merged_typed_dataset,
                                     parse_dataset_np, parse_example)
from src.models.gans.discriminators.fcn_disc import RICHDiscriminator
from src.models.gans.generators.fcn_gen import RichMCDropFunc, VirtualEnsembleModel


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [None]:
# Notebook-wide configuration.
# Particle types whose datasets are loaded below; the trailing comment lists the
# full set of names that can be enabled.
PARTICLES = ['pion'] # ["pion", 'kaon', "muon", "proton"]
# NOTE(review): the five settings below are not referenced by any cell visible in
# this notebook section; presumably they parametrise the efficiency/uncertainty
# experiments -- confirm before changing or removing them.
DROPOUTS = [0.25, 0.3, 0.35, 0.4]
ENSEMBLES = [16, 32, 64, 128, 256]
NUM_REPS = 10
SUB_SAMPLE_SIZE = .3
THRESHOLD = 1.0

# Default `data_dir` of load_particle_datasets (path on the mounted Google Drive).
DATA_DIR = '/content/drive/MyDrive/cern/data/rich'
# Default checkpoint root and checkpoint file name used by MonteCarloDroupoutModel.
CHECKPOINT_BASE = '/content/drive/MyDrive/cern/checkpoints/'
CKPT_NUMBER = 'ckpt-21'


def get_checkpoint_name(particle):
    """Return the checkpoint folder name for the given particle type.

    The result is a fixed experiment prefix followed by ``particle``; it is
    joined onto the checkpoint base directory by the model wrapper class.
    """
    prefix = 'bernoulli_structured_dropout_line_test_cramer_weighted_'
    return '{}{}'.format(prefix, particle)

In [None]:
def _split_by_line(df, slope=1, intercept=0):
    top_half = df[df['Brunel_ETA'] > df['Brunel_P'] * slope + intercept]
    bottom_half = df[df['Brunel_ETA'] <= df['Brunel_P'] * slope + intercept]

    top_half = top_half.reset_index(drop=True)
    bottom_half = bottom_half.reset_index(drop=True)

    return top_half, bottom_half


def split_by_line(df_train, df_test):
    """Keep the training rows above the default line and the test rows on/below it.

    Both frames are split with ``_split_by_line`` using its default slope and
    intercept; the returned pair is ``(train_top_half, test_bottom_half)``.
    """
    train_above, _ = _split_by_line(df_train)
    _, test_on_or_below = _split_by_line(df_test)
    return train_above, test_on_or_below


def load_particle_datasets(particle, data_dir=DATA_DIR):
    """Load the train/validation data of one particle type and parse it.

    The merged dataset is read through ``get_merged_typed_dataset`` (float32,
    ``log=True``) with ``split_by_line`` as the sampling function, then each
    split is passed through ``parse_dataset_np`` to obtain features and
    targets (the third value returned by the parser is discarded). The four
    resulting array shapes are printed.

    Args:
        particle: Particle name forwarded to ``get_merged_typed_dataset``.
        data_dir: Directory holding the dataset files.

    Returns:
        A flat dict for this single particle (the caller is the one that keys
        results by particle name) with the entries:
            'data_train', 'data_val', 'scaler',
            'feats_train', 'targets_train', 'feats_val', 'targets_val'
    """
    data_train, data_val, scaler = get_merged_typed_dataset(
        data_dir, particle, dtype=np.float32, log=True, sample_fn=split_by_line)

    # Start with the raw splits and scaler, then add the parsed arrays in the
    # same key order the dictionary has always had.
    bundle = {
        'data_train': data_train,
        'data_val': data_val,
        'scaler': scaler,
    }
    bundle['feats_train'], bundle['targets_train'], _ = parse_dataset_np(data_train)
    bundle['feats_val'], bundle['targets_val'], _ = parse_dataset_np(data_val)

    print(f'feats_train shape\t{bundle["feats_train"].shape}\n'
          f'targets_train shape\t{bundle["targets_train"].shape}\n'
          f'feats_val shape  \t{bundle["feats_val"].shape}\n'
          f'targets_val shape\t{bundle["targets_val"].shape}\n')

    return bundle

# Load every configured particle type once; results are keyed by particle name.
datasets = {name: load_particle_datasets(name) for name in PARTICLES}

Reading and concatenating datasets:
	/content/drive/MyDrive/cern/data/rich/pion_-_down_2016_.csv
	/content/drive/MyDrive/cern/data/rich/pion_+_down_2016_.csv
	/content/drive/MyDrive/cern/data/rich/pion_-_up_2016_.csv
	/content/drive/MyDrive/cern/data/rich/pion_+_up_2016_.csv
	/content/drive/MyDrive/cern/data/rich/pion2_-_down_2016_.csv
	/content/drive/MyDrive/cern/data/rich/pion2_+_down_2016_.csv
	/content/drive/MyDrive/cern/data/rich/pion2_-_up_2016_.csv
	/content/drive/MyDrive/cern/data/rich/pion2_+_up_2016_.csv
splitting to train/val/test
fitting the scaler
scaler train sample size: 2000000
scaler n_quantiles: 100000, time = 2.965564250946045
scaling train set
scaling test set
converting dtype to <class 'numpy.float32'>
feats_train shape	(947947, 3)
targets_train shape	(947947, 5)
feats_val shape  	(524521, 3)
targets_val shape	(524521, 5)



In [None]:
class MonteCarloDroupoutModel:
    """Builds a Bernoulli-dropout generator and restores its pretrained weights.

    On construction the class creates a ``RichMCDropFunc`` generator and a
    ``RICHDiscriminator``, wires them into a ``CramerGANTrainer`` and restores
    the requested checkpoint through that trainer. The restored generator is
    available via ``get_generator()``.

    Note: the spelling of the class name and of the ``chekpoint_file``
    parameter is kept as-is so existing callers keep working.
    """

    def __init__(self, particle, dropout_rate,
                 log_dir='log_dir_tmp',
                 checkpoint_base=CHECKPOINT_BASE,
                 chekpoint_file=CKPT_NUMBER,
                 debug=False):
        """Create the networks and restore the checkpoint.

        Args:
            particle: Particle name; selects the checkpoint folder through
                ``get_checkpoint_name``.
            dropout_rate: Value passed to the generator as ``drop_rate``.
            log_dir: Log directory handed to the trainer.
            checkpoint_base: Root directory containing the checkpoint folders.
            chekpoint_file: Checkpoint file name inside the particle's folder.
            debug: When true, print both model summaries and the checkpoint path.
        """
        self.particle = particle
        self.dropout_rate = dropout_rate
        self.log_dir = log_dir

        print(f'Generating model for {particle} with a dropout rate of {dropout_rate}')

        self._gen_config = {
            'drop_rate': dropout_rate,
            'dropout_type': 'bernoulli',
        }

        self._generator = RichMCDropFunc(**self._gen_config)
        # Build with a 3-feature input so the weights exist before restoring.
        self._generator.build((None, 3))
        self._discriminator = RICHDiscriminator()

        self._checkpoint_dir = os.path.join(checkpoint_base, get_checkpoint_name(self.particle))
        self._filename = os.path.join(self._checkpoint_dir, chekpoint_file)

        if debug:
            # Keras `summary()` prints the table itself and returns None, so it
            # must not be wrapped in print() (that would emit a stray "None").
            print("\nGenerator:\n")
            self._generator.summary(line_length=96)
            print("\nDiscriminator:\n")
            self._discriminator.summary()
            print(f"\nCheckpoint filename: {self._filename}\n")

        # Model was trained with tensorflow 2.10.1, use the legacy optimizer
        self._generator_optimizer = tf.keras.optimizers.legacy.RMSprop(2e-4)
        self._discriminator_optimizer = tf.keras.optimizers.legacy.RMSprop(2e-4)

        self._trainer_config = {
            'generator': self._generator,
            'discriminator': self._discriminator,
            'generator_optimizer': self._generator_optimizer,
            'discriminator_optimizer': self._discriminator_optimizer,
            'checkpoint_dir': self._checkpoint_dir,
            'log_dir': log_dir
        }
        # The trainer is only needed to restore the checkpoint into the
        # networks created above; it is not kept on the instance.
        trainer = CramerGANTrainer(**self._trainer_config)
        # Restore pretrained model
        trainer.restore(self._filename)

    def __str__(self):
        """Return ``"<particle>_<dropout_rate>"``."""
        return f"{self.particle}_{self.dropout_rate}"

    def get_generator(self) -> VirtualEnsembleModel:
        """Return the generator instance created (and restored) in ``__init__``."""
        return self._generator

In [None]:
# Restore the pretrained pion generator with dropout rate 0.1; debug=True also
# prints the model summaries and the checkpoint path.
mc_model = MonteCarloDroupoutModel(particle='pion', dropout_rate=0.1, debug=True)
gen1 = mc_model.get_generator()
# NOTE(review): presumably switches the virtual ensemble to behave as a single
# model at inference time -- confirm in src/models/gans/generators/fcn_gen.py.
gen1.single_model_inference_mode()
gen1.summary()

Generating model for pion with a dropout rate of 0.1
Layer 0
Layer 1
Layer 2
Layer 3
Layer 4

Generator:

Model: "virtual_ensemble_model"
________________________________________________________________________________________________
 Layer (type)                              Output Shape                          Param #        
 Inputs (InputLayer)                       [(None, 3)]                           0              
                                                                                                
 NoiseInjection (NoiseInjection)           (None, 67)                            0              
                                                                                                
 Layer_0/Dense (Dense)                     (None, 128)                           8704           
                                                                                                
 Layer_0/LeakyReLU (LeakyReLU)             (None, 128)                           0    

In [None]:
def create_generator(dropout_rate, dropout_type='bernoulli'):
    """Create and build a new ``RichMCDropFunc`` generator.

    Args:
        dropout_rate: Passed to the generator as ``drop_rate``.
        dropout_type: Passed to the generator as ``dropout_type``.

    Returns:
        The generator, already built for inputs of shape ``(None, 3)``.
        No checkpoint is loaded here.
    """
    model = RichMCDropFunc(drop_rate=dropout_rate, dropout_type=dropout_type)
    model.build((None, 3))
    return model

In [None]:
# Reading the activations ("embeddings") of an intermediate generator layer.

# Step 1: build a fresh generator with the same configuration as `gen1`
# (dropout rate 0.1) and copy the restored weights into it.
new_gen = create_generator(0.1)
new_gen.set_weights(gen1.get_weights())

# Step 2: pick the tensors to expose. The printed label further down calls
# layers[14] the "Layer 4" embedding; the mapping from index 14 to that layer
# is not shown here -- verify it against `new_gen.summary()`.
input_layer = new_gen.input
output_layer = new_gen.layers[14].output

# Step 3: wrap both tensors in a functional model that returns the
# intermediate activation together with the generator's final output.
new_model = Model(inputs=input_layer, outputs=[output_layer, new_gen.output])

# Step 4: smoke-test on one random 3-feature input vector.
input_data = np.random.rand(1, 3)
embedding, prediction = new_model.predict(input_data)

print('Layer 4 embedding:', embedding)
print('Final prediction:', prediction)

Layer 0
Layer 1
Layer 2
Layer 3
Layer 4
Layer 4 embedding: [[-0.13039863  0.07018615 -0.58620447  0.27404422 -0.5218892  -0.42251518
   0.1326365  -0.12117149 -0.6466348  -0.17054203 -0.03501824  0.40272358
  -0.3218141  -0.6264842   0.31098336  0.040048   -0.18954287  0.13452604
  -0.60610276  0.07645752 -0.49973622  0.18829209 -0.43715978 -0.03523372
  -0.61529243 -0.31356004 -0.1505895  -0.31181842  0.09629247 -0.24686044
  -0.5043816   0.06503329 -0.09505799 -0.5476164   0.04736098  0.4509455
  -0.56498504 -0.5798478  -0.540283    0.23396093  0.4231578   0.41542488
  -0.5008768   0.07033277  0.19642603  0.3640912   0.2949308   0.05807041
   0.14231616 -0.5481269   0.26094338 -0.37120897 -0.31159335  0.09640955
   0.4567487   0.48678613  0.3774986  -0.5941418   0.07282819 -0.14715923
  -0.21480635  0.6845972   0.10156496  0.72645295 -0.14876929 -0.23596102
   0.655008    0.26153165 -0.18069889  0.07549362  0.15985073 -0.4726729
   0.267219   -0.06900786  0.06351279  0.34687132  0.06

In [None]:
# Validation-split arrays of the pion dataset, used below as the test set.
pion_split = datasets['pion']
pion_test_feats = pion_split['feats_val']
pion_test_targets = pion_split['targets_val']

print('Pion test features shape:', pion_test_feats.shape)
print('Pion test targets shape:', pion_test_targets.shape)

Pion test features shape: (524521, 3)
Pion test targets shape: (524521, 5)


In [None]:
# Run the two-output model on all pion validation features; it returns the
# intermediate-layer activations and the generator's final predictions.
pion_embeddings, pion_predictions = new_model.predict(x=pion_test_feats)



In [None]:
# Persist the pion targets, embeddings and predictions as .npy files.
# The variable is named `output_dir` because the former name `dir` shadowed the
# builtin dir() for the rest of the kernel session.
output_dir = 'drive/MyDrive/Colab Notebooks/outputs/'
# np.save does not create missing folders, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

np.save(os.path.join(output_dir, 'pion_targets.npy'), pion_test_targets)
np.save(os.path.join(output_dir, 'pion_embeddings.npy'), pion_embeddings)
np.save(os.path.join(output_dir, 'pion_predictions.npy'), pion_predictions)