<a href="https://colab.research.google.com/github/evillag/uncertainty_gan/blob/main/CERN_UE_Output_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import os

import numpy as np
import tensorflow as tf
from keras.models import Model

In [None]:
# Clone the lhcb-rich-gan-uncertainty repository and keep only its `experiments`
# and `src` directories in the working directory; the rest of the clone is removed.
!git clone https://gitlab.com/lambda-hse/lhcb-rich-gan-uncertainty.git
!mv lhcb-rich-gan-uncertainty/experiments .
!mv lhcb-rich-gan-uncertainty/src .
!rm -r lhcb-rich-gan-uncertainty/
# Remove the `sample_data/` directory from the working directory.
!rm -r sample_data/
# Install the tensorflow-addons package (no version is pinned here).
!pip install tensorflow-addons

In [12]:
!pip install tensorflow-addon





In [13]:
from src.cramer_gan_trainer import CramerGANTrainer
from src.datasets.utils_rich import (get_merged_typed_dataset,
                                     parse_dataset_np)
from src.models.gans.discriminators.fcn_disc import RICHDiscriminator
from src.models.gans.generators.fcn_gen import RichMCDropFunc, VirtualEnsembleModel

In [None]:
# Dataset extraction: quietly unzip the `rich.zip` archive.
# NOTE(review): nothing is downloaded in this cell; the archive is read from
# `drive/MyDrive/...`, so Google Drive presumably has to be mounted at `./drive`
# beforehand - confirm.
!unzip -qq drive/MyDrive/cern/data/rich.zip

In [None]:
# Model checkpoint extraction: quietly unzip the checkpoint archive for drop rate 0.01.
# NOTE(review): nothing is downloaded in this cell; the archive is read from
# `drive/MyDrive/...`, so Google Drive presumably has to be mounted at `./drive`
# beforehand - confirm.
!unzip -qq drive/MyDrive/cern/data/checkpoints_dropout_0.01.zip

In [14]:
# Particle types to process. Only 'proton' is enabled; the trailing comment
# keeps the full list for reference.
PARTICLES = ['proton'] # ["pion", "kaon", "muon", "proton"]

# Default dataset directory handed to `load_particle_datasets`.
DATA_DIR = 'rich'
# Default checkpoint directory used by `MonteCarloDroupoutModel`.
CHECKPOINT_BASE = 'checkpoints'



def get_checkpoint_name(particle):
    """Build the checkpoint name for ``particle``.

    The name is a fixed prefix followed by the particle name. The drop rate
    ``0.01`` is hard-coded in that prefix.

    Args:
        particle: Particle identifier appended to the prefix (e.g. ``'proton'``).

    Returns:
        The checkpoint name as a string.
    """
    checkpoint_name = f'bernoulli_structured_dropout_line_test_cramer_drop_rate_0.01_{particle}'
    return checkpoint_name

In [15]:
def _split_by_line(df, slope=1, intercept=0):
    top_half = df[df['Brunel_ETA'] > df['Brunel_P'] * slope + intercept]
    bottom_half = df[df['Brunel_ETA'] <= df['Brunel_P'] * slope + intercept]

    top_half = top_half.reset_index(drop=True)
    bottom_half = bottom_half.reset_index(drop=True)

    return top_half, bottom_half


def split_by_line(df_train, df_test):
    """Keep the upper part of the train frame and the lower part of the test frame.

    Both frames are split with ``_split_by_line`` using its default line.

    Args:
        df_train: Training DataFrame; only rows above the line are returned.
        df_test: Test DataFrame; only rows on or below the line are returned.

    Returns:
        A tuple ``(train_top_half, test_bottom_half)``.
    """
    train_above, _ = _split_by_line(df_train)
    _, test_on_or_below = _split_by_line(df_test)
    return train_above, test_on_or_below


def load_particle_datasets(particle, data_dir=DATA_DIR):
    """Load the train/validation data of one particle and parse it into arrays.

    The data is obtained from ``get_merged_typed_dataset`` (float32, with
    ``split_by_line`` as the sampling function) and each split is then passed
    through ``parse_dataset_np``. The resulting array shapes are printed.

    Args:
        particle: Name of the particle whose dataset is loaded.
        data_dir: Directory handed to ``get_merged_typed_dataset``.

    Returns:
        A flat dictionary for this single particle with the keys
        ``'data_train'``, ``'data_val'``, ``'scaler'``, ``'feats_train'``,
        ``'targets_train'``, ``'feats_val'`` and ``'targets_val'``.
        The per-particle mapping ``{"<particle_name>": {...}}`` is assembled by
        the caller, not by this function.
    """
    train_set, val_set, fitted_scaler = get_merged_typed_dataset(
        data_dir,
        particle,
        dtype=np.float32,
        log=True,
        sample_fn=split_by_line,
    )

    # parse_dataset_np yields three values; the third one is not needed here.
    feats_train, targets_train, _ = parse_dataset_np(train_set)
    feats_val, targets_val, _ = parse_dataset_np(val_set)

    particle_data = {
        'data_train': train_set,
        'data_val': val_set,
        'scaler': fitted_scaler,
        'feats_train': feats_train,
        'targets_train': targets_train,
        'feats_val': feats_val,
        'targets_val': targets_val
    }

    print(f'feats_train shape\t{feats_train.shape}\n'
          f'targets_train shape\t{targets_train.shape}\n'
          f'feats_val shape  \t{feats_val.shape}\n'
          f'targets_val shape\t{targets_val.shape}\n')

    return particle_data

# One entry per configured particle, keyed by the particle name.
datasets = {name: load_particle_datasets(name) for name in PARTICLES}

Reading and concatenating datasets:
	../data/rich\proton_+_down_2016_.csv
	../data/rich\proton_+_up_2016_.csv
	../data/rich\proton_-_down_2016_.csv
	../data/rich\proton_-_up_2016_.csv
splitting to train/val/test
fitting the scaler
scaler train sample size: 1000000
scaler n_quantiles: 100000, time = 0.9000041484832764
scaling train set
scaling test set
converting dtype to <class 'numpy.float32'>
feats_train shape	(454724, 3)
targets_train shape	(454724, 5)
feats_val shape  	(272832, 3)
targets_val shape	(272832, 5)



In [16]:
class MonteCarloDroupoutModel:
    """Builds a ``RichMCDropFunc`` generator and restores its pretrained weights.

    On construction a generator and a ``RICHDiscriminator`` are created and
    handed, together with two legacy RMSprop optimizers, to a
    ``CramerGANTrainer``; ``trainer.restore_last()`` is then called to restore
    the pretrained model.

    NOTE(review): ``particle`` is only used for the log message and the string
    representation. The checkpoint location does not depend on it, so the
    caller must pass a ``checkpoint_dir`` that matches the particle.

    Args:
        particle: Particle name this model belongs to.
        dropout_rate: Drop rate passed to the generator as ``drop_rate``.
        log_dir: Log directory passed to the trainer.
        checkpoint_dir: Checkpoint directory passed to the trainer.
        debug: When true, print the generator/discriminator summaries and the
            checkpoint path.
    """

    def __init__(self, particle, dropout_rate,
                 log_dir='log_dir_tmp',
                 checkpoint_dir=CHECKPOINT_BASE,
                 debug=False):

        self.particle = particle
        self.dropout_rate = dropout_rate
        self.log_dir = log_dir
        self.checkpoint_dir = checkpoint_dir

        print(f'Generating model for {particle} with a dropout rate of {dropout_rate}')

        self._gen_config = {
            'drop_rate': dropout_rate,
            'dropout_type': 'bernoulli',
        }

        self._generator = RichMCDropFunc(**self._gen_config)
        self._generator.build((None, 3))
        self._discriminator = RICHDiscriminator()

        if debug:
            print("\nGenerator:\n")
            self._print_summary(self._generator, line_length=96)
            print("\nDiscriminator:\n")
            self._print_summary(self._discriminator)
            print(f"\nCheckpoint path: {self.checkpoint_dir}\n")

        # Model was trained with tensorflow 2.10.1, use the legacy optimizer
        self._generator_optimizer = tf.keras.optimizers.legacy.RMSprop(2e-4)
        self._discriminator_optimizer = tf.keras.optimizers.legacy.RMSprop(2e-4)

        self._trainer_config = {
            'generator': self._generator,
            'discriminator': self._discriminator,
            'generator_optimizer': self._generator_optimizer,
            'discriminator_optimizer': self._discriminator_optimizer,
            'checkpoint_dir': self.checkpoint_dir,
            'log_dir': log_dir
        }

        trainer = CramerGANTrainer(**self._trainer_config)
        # Restore pretrained model
        trainer.restore_last()

    @staticmethod
    def _print_summary(model, **summary_kwargs):
        """Show ``model.summary()`` without echoing a spurious ``None``.

        Keras' ``summary()`` prints the table itself and returns ``None``;
        wrapping it in ``print`` adds a stray "None" line. A non-``None``
        return value is still printed, in case a model overrides ``summary``.
        """
        summary = model.summary(**summary_kwargs)
        if summary is not None:
            print(summary)

    def __str__(self):
        """Return ``"<particle>_<dropout_rate>"`` so ``str(model)`` and f-strings work."""
        return f"{self.particle}_{self.dropout_rate}"

    def str(self):
        """Backward-compatible alias of ``__str__`` for existing ``model.str()`` callers."""
        return self.__str__()

    def get_generator(self) -> VirtualEnsembleModel:
        """Return the generator built (and restored) by the constructor."""
        return self._generator

In [17]:
# Build the proton model with a 0.01 drop rate; debug=True also prints the
# generator/discriminator summaries and the checkpoint path.
mc_model = MonteCarloDroupoutModel(particle='proton', dropout_rate=0.01, debug=True)

# Take the restored generator and switch it to its single-model inference mode
# before printing its summary.
gen1 = mc_model.get_generator()
gen1.single_model_inference_mode()
gen1.summary()

Generating model for proton with a dropout rate of 0.01
Layer 0
Layer 1
Layer 2
Layer 3
Layer 4

Generator:

Model: "virtual_ensemble_model_1"
________________________________________________________________________________________________
 Layer (type)                              Output Shape                          Param #        
 Inputs (InputLayer)                       [(None, 3)]                           0              
                                                                                                
 NoiseInjection (NoiseInjection)           (None, 67)                            0              
                                                                                                
 Layer_0/Dense (Dense)                     (None, 128)                           8704           
                                                                                                
 Layer_0/LeakyReLU (LeakyReLU)             (None, 128)                           

In [18]:
def create_generator(dropout_rate, dropout_type='bernoulli'):
    """Create a freshly built ``RichMCDropFunc`` generator in single-model inference mode.

    Args:
        dropout_rate: Value passed to the generator as ``drop_rate``.
        dropout_type: Value passed to the generator as ``dropout_type``.

    Returns:
        The generator, built for inputs of shape ``(None, 3)``. No weights are
        restored here.
    """
    generator = RichMCDropFunc(drop_rate=dropout_rate, dropout_type=dropout_type)
    generator.build((None, 3))
    generator.single_model_inference_mode()
    return generator

In [19]:
# Solution to read embeddings of any layer:

# 1. Create a new model with the same architecture and copy the restored
#    weights into it. The drop rate is read from `mc_model` so both generators
#    are always configured identically.
new_gen = create_generator(mc_model.dropout_rate)
new_gen.set_weights(gen1.get_weights())

# 2. Pick the layer of interest by its zero-based position in `new_gen.layers`
EMBEDDING_LAYER_INDEX = 14
input_layer = new_gen.input
output_layer = new_gen.layers[EMBEDDING_LAYER_INDEX].output

# 3. Create a `new_model` without optimizations that returns both the
#    embedding of the selected layer and the final generator output
new_model = Model(input_layer, [output_layer, new_gen.output])
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would add a stray "None" line).
new_model.summary()

Layer 0
Layer 1
Layer 2
Layer 3
Layer 4
Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Inputs (InputLayer)         [(None, 3)]               0         
                                                                 
 NoiseInjection (NoiseInject  (None, 67)               0         
 ion)                                                            
                                                                 
 Layer_0/Dense (Dense)       (None, 128)               8704      
                                                                 
 Layer_0/LeakyReLU (LeakyReL  (None, 128)              0         
 U)                                                              
                                                                 
 Layer_0/DropoutTrain (Dropo  (None, 128)              0         
 utTrain)                                                        
                   

In [20]:
# 4. Use `new_model` to predict on any input vector and get the embeddings
input_data = np.random.rand(1, 3)
embedding, prediction = new_model.predict(input_data)

original_model_prediction = gen1.predict(input_data)

print(f'Layer 4 {embedding.shape} embedding:\n{embedding}')
# Label each prediction with the model it came from so the two can be compared.
# NOTE(review): the model contains a NoiseInjection layer, so the two values
# presumably differ from call to call even with identical weights - confirm.
print('Final prediction (new_model):', prediction)
print('Final prediction (gen1):', original_model_prediction)

Layer 4 (1, 128) embedding:
[[ 0.12764037 -0.01327349  0.17507285 -0.344127    0.08596199  0.20315282
   0.0594095  -0.03482367  0.19064844  0.29219687 -0.06622943 -0.08309644
   0.08323199 -0.04771329 -0.07798733 -0.14236856  0.09624968  0.11993797
   0.02152829  0.00098584  0.12419684 -0.11989795  0.03352994  0.14096555
   0.08627124  0.18926719 -0.00098421  0.10983233  0.16482729  0.14274658
  -0.12298921  0.0275607   0.16230041 -0.13765405  0.03598827  0.06871919
   0.2860038  -0.07886286 -0.00436414 -0.01309698 -0.38272    -0.03287196
   0.19416624 -0.00488659 -0.07617358  0.2607302  -0.10444013 -0.03424028
  -0.15424722  0.14431304  0.06090955  0.03464188  0.14550951  0.06849828
   0.08043085  0.29488727  0.08121265 -0.13358432  0.07076782 -0.14085089
   0.00258403  0.02730661  0.13409364 -0.00519869  0.13262329 -0.00497463
  -0.31768432 -0.11523663  0.3876856   0.21075398 -0.05978448 -0.09931962
  -0.43261117 -0.09011538  0.09897996  0.21876644 -0.03022487  0.17012027
   0.07865

In [None]:
# NOTE: `dir` shadows the Python builtin of the same name; it is kept because
# the following cell reads this variable.
dir = 'drive/MyDrive/Colab Notebooks/outputs/proton/'
# np.save does not create missing directories, so make sure the target exists.
os.makedirs(dir, exist_ok=True)

dataset = datasets['proton']

train_feats = dataset['feats_train']
train_targets = dataset['targets_train']
test_feats = dataset['feats_val']
test_targets = dataset['targets_val']

print('Train features shape:', train_feats.shape)
print('Train targets shape:', train_targets.shape)
print('Test features shape:', test_feats.shape)
print('Test targets shape:', test_targets.shape)

# Persist the parsed feature/target arrays of both splits.
np.save(dir + 'train_feats.npy', train_feats)
np.save(dir + 'train_targets.npy', train_targets)
np.save(dir + 'test_feats.npy', test_feats)
np.save(dir + 'test_targets.npy', test_targets)

In [None]:
# `new_model` has two outputs, so each predict call yields (embeddings, predictions).
train_embeddings, train_predictions = new_model.predict(train_feats)
test_embeddings, test_predictions = new_model.predict(test_feats)

# Write the four arrays into the output directory, in the same order as before.
for file_name, array in (
    ('train_embeddings.npy', train_embeddings),
    ('train_predictions.npy', train_predictions),
    ('test_embeddings.npy', test_embeddings),
    ('test_predictions.npy', test_predictions),
):
    np.save(dir + file_name, array)

_____________________________________
tests

In [None]:
from src.datasets.utils_rich import dll_columns

num_iterations = 300
# Run the same input through the model repeatedly, keeping only the final
# prediction (second output) of every run.
preds = [new_model.predict(input_data)[1] for _ in range(num_iterations)]

# Header row followed by one row per statistic, reduced over the repetitions.
print('Par\t' + '\t'.join(dll_columns))
for label, reduce_fn in (('Min', np.min), ('Max', np.max), ('Std', np.std), ('Mea', np.mean)):
    values = reduce_fn(preds, axis=0).squeeze()
    print(label + '\t' + '\t'.join(str(num) for num in values))
