In [1]:
# Running on GPU?
import setGPU

import getpass
import h5py
import os
import pickle
import matplotlib.pyplot as plt
import matplotlib

from tqdm import tqdm

setGPU: Setting GPU to: 2


In [None]:
# Get permission to access EOS (Insert your NICE password)
import subprocess

# Hand the password to kinit on stdin instead of splicing it into a shell
# command line: the previous "echo %s | kinit" form let shell metacharacters
# in the password break (or inject into) the command.
# The trailing newline mirrors what `echo` appended.
# The last expression is kinit's exit code, so the cell still displays it.
subprocess.run(["kinit"], input=(getpass.getpass() + "\n").encode()).returncode

········


0

In [None]:
import json
import numpy as np
import pandas as pd

import keras
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Input, Dense, Lambda, BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.activations import sigmoid, linear, relu
from keras.models import Model, load_model
from keras.regularizers import l1, l2, l1_l2

from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import class_weight
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

Using TensorFlow backend.


In [None]:
# Primary Dataset (PD) names, keyed by a 1-based index in the order listed.
PDs = dict(enumerate([
    'BTagCSV',
    'BTagMu',
    'Charmonium',
    'DisplacedJet',
    'DoubleEG',
    'DoubleMuon',
    'DoubleMuonLowMass',
    'FSQJets',
    'HighMultiplicityEOF',
    'HTMHT',
    'JetHT',
    'MET',
    'MinimumBias',
    'MuonEG',
    'MuOnia',
    'NoBPTX',
    'SingleElectron',
    'SingleMuon',
    'SinglePhoton',
    'Tau',
    'ZeroBias',
], start=1))

# Select PD (key into PDs; 11 -> 'JetHT')
nPD = 11

In [None]:
# Directory holding the per-dataset .h5 input files (prefix used by get_file_list).
data_directory = "/eos/cms/store/user/fsiroky/consistentlumih5/"
# JSON file read with json.load and used by json_checker to label lumisections.
label_file = "/afs/cern.ch/user/t/tkrzyzek/Documents/Data-Certification/JetHT.json"
# Output directory for the model checkpoint and the pickled results.
model_directory = "/eos/user/t/tkrzyzek/autoencoder/standard/"
# Checkpoint base name; the file written/read is model_directory + model_name + ".h5".
model_name = "model"

In [None]:
def get_file_list(directory, pds, npd, typeof, extension, parts=("C", "D", "E", "F", "G", "H")):
    """Build the list of input file paths for one primary dataset.

    Each path has the form ``<directory><pds[npd]>_<part>_<typeof><extension>``.

    Args:
        directory: Path prefix, used verbatim (include the trailing separator).
        pds: Mapping from dataset index to dataset name.
        npd: Key into ``pds`` selecting the dataset.
        typeof: File-type tag inserted before the extension (e.g. "signal").
        extension: File extension including the leading dot.
        parts: Part letters to generate, one path per letter, in order.
            Defaults to the letters that used to be hard-coded here.

    Returns:
        A list of path strings, one per entry of ``parts``.

    Raises:
        KeyError: if ``npd`` is not a key of ``pds``.
    """
    dataset = pds[npd]
    return ["%s%s_%s_%s%s" % (directory, dataset, part, typeof, extension)
            for part in parts]

# All "background" paths first, then all "signal" paths, for the selected PD.
files = [path
         for kind in ("background", "signal")
         for path in get_file_list(data_directory, PDs, nPD, kind, ".h5")]

In [None]:
# Load good and bad jets
def get_data(files):
    """Read one dataset from each HDF5 file and stack them row-wise.

    The dataset read from a file is the one whose key equals the file's base
    name without its last three characters (the ".h5" suffix).

    Args:
        files: Iterable of HDF5 file paths.

    Returns:
        A 2-D numpy array with 2813 columns holding the rows of every file
        that could be opened, in input order. Shape (0, 2813) when nothing
        was read.

    Files that cannot be opened (OSError) are reported and skipped. Other
    errors, such as a missing dataset key, propagate.
    """
    # Seed with an empty block so the result keeps the (0, 2813) shape and
    # float dtype even when no file could be read.
    chunks = [np.empty([0,2813])]

    for file in files:
        jet = file.split("/")[-1][:-3]
        print("Reading: %s" % jet)
        try:
            # The context manager closes the file handle once the data is copied out.
            with h5py.File(file, "r") as h5file:
                chunks.append(h5file[jet][:])
        except OSError as error:
            print("This Primary Dataset doesn't have %s. %s" % (jet, error))
            continue

    # Concatenate once at the end rather than re-copying the growing array per file.
    return np.concatenate(chunks, axis=0)

# Wrap the stacked readout in a DataFrame; columns get the default integer labels.
data = pd.DataFrame(get_data(files))

In [None]:
# Copy the three metadata columns that are kept into named, typed columns.
data["run"] = data[2807].astype(int)
data["lumi"] = data[2808].astype(int)
data["inst_lumi"] = data[2809].astype(float)

# Drop the raw metadata block, order rows by run and then lumi, and renumber
# the index 0..N-1 so positional and label indexing agree afterwards.
data = (
    data
    .drop([2807, 2808, 2809, 2810, 2811, 2812], axis=1)
    .sort_values(["run", "lumi"], ascending=[True, True])
    .reset_index(drop=True)
)

# Stand-alone Series views of the metadata, aligned with the sorted frame.
runIDs, lumiIDs = data["run"].astype(int), data["lumi"].astype(int)
luminosity = data["inst_lumi"].astype(float)

In [None]:
# Apply labels
# Read the label JSON inside a context manager so the file handle is closed
# as soon as parsing finishes.
with open(label_file) as label_handle:
    output_json = json.load(label_handle)

def json_checker(json_file, orig_runid, orig_lumid):
    """Return 0 if the lumisection lies in a listed range for its run, else 1.

    Args:
        json_file: Mapping from run-number strings to a sequence of
            ``[first, last]`` ranges (both ends inclusive).
        orig_runid: Run number; converted with ``str(int(...))`` for the lookup.
        orig_lumid: Lumisection number to test against the run's ranges.

    Returns:
        0 when ``orig_lumid`` falls inside any range of the run, 1 otherwise
        (including when the run is not present in ``json_file``).
    """
    try:
        listed = any(bounds[0] <= orig_lumid <= bounds[1]
                     for bounds in json_file[str(int(orig_runid))])
    except KeyError:
        # Run absent from the mapping: treated the same as "not listed".
        listed = False
    return 0 if listed else 1

def add_flags(sample):
    """Label one row using its "run" and "lumi" entries.

    Returns whatever json_checker returns for the module-level ``output_json``:
    0 when the row's lumisection is listed there, 1 otherwise.
    """
    run_id, lumi_id = sample["run"], sample["lumi"]
    return json_checker(output_json, run_id, lumi_id)

# add_flags only reads "run" and "lumi", so apply it to those two columns
# rather than materialising every full-width row as a Series.
data["label"] = data[["run", "lumi"]].apply(add_flags, axis=1)

In [None]:
# Split the data
# The first SPLIT_FACTOR fraction of the (run, lumi)-sorted rows is used for
# training; everything after it is the test set.
SPLIT_FACTOR = 0.1

split = round(SPLIT_FACTOR*len(data))

train = data.iloc[:split]
test = data.iloc[split:]
before = train.shape[0]

# Test-set metadata is taken from `test` rather than by slicing the existing
# variables, so re-running this cell cannot shrink them and misalign them
# with y_test.
runIDs = test["run"].astype(int)
lumiIDs = test["lumi"].astype(int)
luminosity = test["inst_lumi"].astype(float)

# NOTE(review): 0:2806 selects columns 0..2805, so column 2806 (which the
# metadata drop keeps) is not used as a feature -- confirm this is intended.
X_train = train.iloc[:, 0:2806]
y_train = train["label"]

X_test = test.iloc[:, 0:2806]
y_test = test["label"]

# The scaler is fitted on all training rows (good and bad) and then applied
# unchanged to the test rows.
normalizer = StandardScaler()
X_train_norm = normalizer.fit_transform(X_train)
X_test_norm = normalizer.transform(X_test)

# Train only on good
# A plain boolean array is used because X_train_norm is a numpy array.
good_train = (y_train == 0).values
X_train = X_train[good_train]
X_train_norm = X_train_norm[good_train]

input_dim = X_train.shape[1]

In [None]:
# Fully connected autoencoder: input_dim -> 2000 -> 1000 -> 500 -> 1000 -> 2000 -> input_dim.
# Every hidden Dense layer gets its own regularizer instance and is followed by PReLU.
# NOTE(review): l1_l2(10e-5) passes 10e-5 (= 1e-4) as the single positional
# argument; presumably that sets only the l1 factor and leaves l2 at the
# library default -- confirm that is intended.
input_layer = Input(shape=(input_dim, ))

x = input_layer
for units in (2000, 1000, 500, 1000, 2000):
    x = Dense(units, kernel_regularizer=l1_l2(10e-5))(x)
    x = PReLU()(x)

# Unregularized output layer back at the input width, with linear activation.
x = linear(Dense(input_dim)(x))

autoencoder = Model(inputs=input_layer, outputs=x)

autoencoder.summary()

In [None]:
# Optimizer and loss: Adam with the hyper-parameters below, mean squared
# error between the input and its reconstruction.
adamm = keras.optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
autoencoder.compile(optimizer=adamm, loss='mean_squared_error')

# Stop once val_loss has not improved for 32 epochs.
early_stopper = EarlyStopping(
    monitor="val_loss",
    patience=32,
    verbose=True,
    mode="auto",
)

# Keep only the best-val_loss model on disk at model_directory + model_name + ".h5".
checkpoint_callback = ModelCheckpoint(
    ("%s%s.h5" % (model_directory, model_name)),
    monitor="val_loss",
    verbose=False,
    save_best_only=True,
    mode="min",
)

In [None]:
# Convert the feature frames to plain numpy arrays. np.asarray passes an
# existing ndarray through unchanged, so this cell can be re-run safely
# (calling `.values` again on an ndarray raised AttributeError).
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)

In [None]:
# Train the autoencoder to reconstruct its own (normalised, good-only) input:
# the same array is passed as both features and target. A quarter of these
# rows is held out via validation_split to produce the val_loss that both
# callbacks monitor.
autoencoder.fit(X_train_norm,
                X_train_norm,
                epochs=128,
                batch_size=256,
                validation_split=0.25,
                verbose=2,
                callbacks=[early_stopper, checkpoint_callback])

In [None]:
# Reload saved model
# This is the file written by checkpoint_callback (save_best_only=True), so
# `autoencoder` is replaced by the checkpointed model rather than keeping the
# in-memory state left at the end of fit().
autoencoder = load_model("%s%s.h5" % (model_directory, model_name))

In [None]:
# Run predictions
# Reconstructions of the normalised test rows; compared against X_test_norm below.
predictions = autoencoder.predict(X_test_norm)

def get_error_df(X_test, predictions, mode="allmean", n_highest = 100, legend=None):
    """Reduce the squared reconstruction error to one score per row.

    Args:
        X_test: 2-D array of inputs, one row per sample.
        predictions: Array of reconstructions with the same shape as ``X_test``.
        mode: Reduction to apply along the feature axis:
            "allmean" - mean squared error over all features;
            "topn"    - mean of the ``n_highest`` largest squared errors;
            "perobj"  - maximum over the per-slice mean squared errors, one
                        slice per entry of ``legend``.
        n_highest: Number of largest squared errors averaged in "topn" mode.
            When it is not smaller than the number of features, all features
            are averaged (previously this raised inside ``np.partition``).
        legend: For "perobj" mode, a sequence of dicts with "start" and "end"
            keys; each selects the columns ``start:end``. When omitted, a
            module-level ``legend`` is used if one exists.
            NOTE(review): "end" is exclusive here, whereas plot_var slices
            with ``end + 1`` -- confirm which convention a given legend uses.

    Returns:
        A 1-D array with one error value per row.

    Raises:
        ValueError: for an unknown ``mode``, or for "perobj" without a legend.
    """
    if mode not in ("allmean", "topn", "perobj"):
        # Previously an unknown mode silently returned None.
        raise ValueError("Unknown mode: %r" % (mode,))

    squared_error = np.power(X_test - predictions, 2)

    if mode == "allmean":
        return np.mean(squared_error, axis=1)

    elif mode == "topn":
        if n_highest >= squared_error.shape[1]:
            # Fewer features than requested: the "top n" is every feature.
            return np.mean(squared_error, axis=1)
        # Partitioning the negated errors moves the n_highest largest squared
        # errors into the first n_highest columns (in no particular order).
        temp = np.partition(-np.asarray(squared_error), n_highest)
        result = -temp[:,:n_highest]
        return np.mean(result, axis=1)

    # mode == "perobj"
    if legend is None:
        # Backward compatibility: this mode used to read a module-level `legend`.
        legend = globals().get("legend")
    if not legend:
        raise ValueError('mode="perobj" requires a non-empty `legend`')

    squared_error = np.asarray(squared_error)
    mses = [np.mean(squared_error[:, l["start"]:l["end"]], axis=1) for l in legend]
    return np.maximum.reduce(mses)
    
# Per-row anomaly score: "topn" mode with the default n_highest=100.
ae_error = get_error_df(X_test_norm, predictions, mode="topn")

In [None]:
# Load the stored baseline predictions and score them the same way as ae_error.
# The context manager closes the file handle after loading.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted location.
with open('/eos/user/t/tkrzyzek/autoencoder/lumi_dep/split01/ae_pred.p', "rb") as baseline_file:
    ae_pred_baseline = pickle.load(baseline_file)
ae_error_baseline = get_error_df(X_test_norm, ae_pred_baseline, mode="topn")

In [None]:
# Make ROC_curve

from sklearn.metrics import auc, roc_curve, roc_auc_score                          

def get_roc_curve(label, scores, names, title='ROC curves for AE with batchnorm'):
    """Plot one ROC curve per score array on a shared figure.

    Args:
        label: True binary labels, passed to ``roc_curve`` for every curve.
        scores: Sequence of score arrays, one per curve.
        names: Sequence of curve names, indexed in step with ``scores``;
            each legend entry shows the name followed by the curve's AUC.
        title: Figure title. Defaults to the previously hard-coded title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    fig, ax = plt.subplots()

    for i, score in enumerate(scores):
        fpr, tpr, thresholds = roc_curve(label, score)
        roc_auc = auc(fpr, tpr)

        ax.plot(fpr,
                tpr,
                linewidth=3,
                label=("%s AUC: %s" % (names[i], roc_auc)))

    ax.legend(frameon=False)
    ax.set_ylabel("Sensitivity (TPR)")
    # The x-axis shows the false positive rate returned by roc_curve; the
    # previous label called it "TNR".
    ax.set_xlabel("Fall-out (FPR)")
    ax.set_ylim([0, 1])
    ax.set_xlim([0, 1])
    ax.set_title(title)
    plt.show();

# Widen the default figure size for this and later figures, then compare the
# ROC curve of this run's scores against the baseline scores on the test labels.
plt.rcParams['figure.figsize'] = [20, 10]
get_roc_curve(y_test, [ae_error, ae_error_baseline], ['AE with cleaned data', 'AE'])

In [None]:
# model_directory = "/afs/cern.ch/user/t/tkrzyzek/Documents/Data-Certification/temp/"

In [None]:
# Persist the per-row scores; the context manager flushes and closes the file.
with open(model_directory + "ae_error_cleaned_4e7.p", "wb") as ae_error_file:
    pickle.dump(ae_error, ae_error_file)

In [None]:
# Persist the reconstructions; the context manager flushes and closes the file.
with open(model_directory + "pred_cleaned_4e7.p", "wb") as predictions_file:
    pickle.dump(predictions, predictions_file)

In [None]:
def pred_vs_feature2(y_val, x_val, y_class, y_name="", x_name="", selected=[], linear=False,
                    x_lim=None, y_lim=None, title=""):
    """Scatter the first series of ``y_val`` against ``x_val``, coloured by class.

    Only a single panel is drawn, so only ``y_val[0]`` (and ``y_name[0]`` when
    ``y_name`` is a sequence) is used.

    Args:
        y_val: Sequence of y-series; element 0 is plotted.
        x_val: x values, same length as the plotted y-series.
        y_class: Class per point; points with class 1 are drawn red and
            labelled "Bad", all others green and labelled "Good".
        y_name: Y-axis label, either a string or a sequence of labels with
            one entry per panel.
        x_name: X-axis label.
        selected: Indices of points to highlight with a larger green marker;
            each index is applied as-is to ``x_val`` and to the plotted
            y-series.
        linear: If False (default) the y-axis is logarithmic.
        x_lim: Optional ``(low, high)`` x-axis limits.
        y_lim: Optional ``(low, high)`` y-axis limits.
        title: Panel title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    fig, (ax1) = plt.subplots(1, 1, sharex=True)

    panels = [ax1]
    ax1.set_title(title)

    for panel_idx, axis in enumerate(panels):
        df = pd.DataFrame({'y_val': y_val[panel_idx],
                           'x_val': x_val,
                           'y_class': y_class})

        for name, group in df.groupby('y_class'):
            axis.plot(group.x_val,
                      group.y_val,
                      color="r" if name == 1 else "g",
                      marker='o',
                      ms=2,
                      linestyle='',
                      label="Bad" if name == 1 else "Good")

        # A dedicated loop variable is used here: reusing the panel index
        # made the highlighted points select the *axis* by point index and
        # left the panel index overwritten for the code below.
        for point in selected:
            axis.plot(x_val[point],
                      y_val[panel_idx][point],
                      color="g",
                      marker='o',
                      ms=4,
                      linestyle='')

        axis.legend()
        if not linear:
            axis.set_yscale('log')
        if x_lim:
            axis.set_xlim(x_lim[0], x_lim[1])
        if y_lim:
            axis.set_ylim(y_lim[0], y_lim[1])
        # A plain string is used as the label itself; indexing it failed for
        # the default "" and picked a single character otherwise.
        if isinstance(y_name, str):
            axis.set_ylabel(y_name)
        else:
            axis.set_ylabel(y_name[panel_idx])
        axis.grid()
    plt.xlabel(x_name)

    plt.show()

In [None]:
def _row_stats(values):
    """Return [mean, mean of absolute values, standard deviation] per row."""
    return [np.mean(values, axis=1),
            np.mean(np.abs(values), axis=1),
            np.std(values, axis=1)]


def plot_var(legend, ae_pred):
    """Plot per-row statistics of selected feature ranges against luminosity.

    For every entry of ``legend`` four figures are produced through
    pred_vs_feature2: the input features and the autoencoder output, first
    with free y-limits and then with y-limits (-5, 5).

    Reads the module-level ``X`` (input features), ``luminosity`` (x values)
    and ``y_test`` (classes); their rows must line up with ``ae_pred``.

    Args:
        legend: Sequence of dicts with "name", "start" and "end" keys; the
            columns ``start`` .. ``end`` (inclusive) are selected.
        ae_pred: 2-D array of autoencoder outputs.

    Returns:
        None.
    """
    # Accept X as either a DataFrame or an ndarray: `.iloc` failed once X had
    # been converted to a plain array.
    features = np.asarray(X)

    for var in legend:
        print("###########################################################################")
        print(var['name'])
        print("###########################################################################")

        columns = slice(var['start'], var['end'] + 1)
        input_stats = _row_stats(features[:, columns])
        ae_stats = _row_stats(ae_pred[:, columns])

        # First pass: no y scale set. Second pass: bounds [-5, 5].
        for y_lim in (None, (-5, 5)):
            pred_vs_feature2(input_stats,
                             luminosity,
                             y_test, #!!!
                             [var['name'],
                              var['name'] + " abs"],
                             "Luminosity",
                             title=var['name'],
                             x_lim=(0, 0.35),
                             y_lim=y_lim,
                             linear=True)

            pred_vs_feature2(ae_stats,
                             luminosity,
                             y_test, #!!!
                             [var['name'],
                              var['name'] + " AE pred"],
                             "Luminosity",
                             title=var['name'] + " AE pred",
                             x_lim=(0, 0.35),
                             y_lim=y_lim,
                             linear=True)


In [None]:
# First column index of the consecutive feature columns described by var_legend.
# The commented-out value below is an alternative starting column (tagged nvtx).
# start_legend = 1918 #nvtx
start_legend = 1414

In [None]:
# One single-column entry per statistic, at consecutive columns from start_legend.
var_legend = [{'start': start_legend + offset, 'end': start_legend + offset, 'name': label}
              for offset, label in enumerate(['Mean', 'RMS', 'Q1', 'Q2', 'Q3', 'Q4', 'Q5'])]

In [None]:
# plot_var reads its input features from the module-level name `X`.
# NOTE(review): X_test holds the un-normalised features, whereas `predictions`
# were produced from X_test_norm -- confirm that this raw-vs-normalised
# comparison is intended.
X = X_test

In [None]:
# Square figures for the per-variable scatter plots, then plot every entry of
# var_legend for the inputs and for the autoencoder predictions.
matplotlib.rcParams["figure.figsize"] = (10, 10)
plot_var(var_legend, predictions)