In [None]:
import numpy as np
import h5py

# Base directory of the HDF5 inputs (alternative locations kept for reference)
b_h5 = '/eos/cms/store/user/fsiroky/hdf5_data/'
# b_h5 = '/mnt/hdf5test/'
# b_h5   = '/home/test_local/'

# Primary datasets keyed by index. Entries left out on purpose:
#   8: 'FSQJets', 9: 'HighMultiplicityEOF'  -> not enough data, notebook fails
#   13: 'MinimumBias'                       -> not enough data
#   16: 'NoBPTX'
pds = {
    1: 'BTagCSV',
    2: 'BTagMu',
    3: 'Charmonium',
    4: 'DisplacedJet',
    5: 'DoubleEG',
    6: 'DoubleMuon',
    7: 'DoubleMuonLowMass',
    10: 'HTMHT',
    11: 'JetHT',
    12: 'MET',
    14: 'MuonEG',
    15: 'MuOnia',
    17: 'SingleElectron',
    18: 'SingleMuon',
    19: 'SinglePhoton',
    20: 'Tau',
    21: 'ZeroBias',
}

# Index (key of `pds`) of the primary dataset to load
nbr = 11

# One file / dataset name per era letter C..H, for background and for signal.
# File names are "<base dir><PD>_<era>_<kind>.h5"; the dataset stored inside
# each file is addressed as "<PD>_<era>_<kind>".
bg_files  = [b_h5 + pds[nbr] + '_' + era + '_background.h5' for era in 'CDEFGH']
bg_jets   = [pds[nbr] + '_' + era + '_background' for era in 'CDEFGH']
sig_files = [b_h5 + pds[nbr] + '_' + era + '_signal.h5' for era in 'CDEFGH']
sig_jets  = [pds[nbr] + '_' + era + '_signal' for era in 'CDEFGH']

      
def get_jets(bg_files, bg_jets, sig_files, sig_jets, n_files=None):
    """Load signal ("good") and background ("bad") rows from paired HDF5 files.

    For each index ``i`` the dataset ``bg_jets[i]`` is read from
    ``bg_files[i]`` and the dataset ``sig_jets[i]`` from ``sig_files[i]``.
    A pair is only kept when both files could be opened.

    Parameters
    ----------
    bg_files, sig_files : sequence of str
        Paths of the background / signal HDF5 files.
    bg_jets, sig_jets : sequence of str
        Dataset name to read inside the file at the same index.
    n_files : int, optional
        How many leading file pairs to load. ``None`` (default) keeps the
        historical behaviour of loading ``len(bg_files) - 4`` pairs.

    Returns
    -------
    (good_jets, bad_jets) : tuple of numpy.ndarray
        Row-wise concatenation of all signal / background arrays. Both have
        shape ``(0, 2802)`` when nothing was loaded.

    Notes
    -----
    An ``OSError`` while opening a file is reported and that pair skipped.
    Other errors (e.g. a missing dataset key, or a column count different
    from 2802) propagate to the caller.
    """
    n_features = 2802

    # Control which time intervals files per PD to load
    if n_files is None:
        n_files = len(bg_files) - 4
    n_files = max(0, min(n_files, len(bg_files)))

    # Seed with empty (0, n_features) arrays so the result shape and dtype
    # promotion match even when no file is loaded; concatenate once at the end
    # instead of re-copying the accumulated array on every iteration.
    good_chunks = [np.empty([0, n_features])]
    bad_chunks = [np.empty([0, n_features])]

    for i in range(n_files):
        try:
            # Context managers guarantee the handles are closed, including
            # when the second open fails after the first succeeded.
            with h5py.File(bg_files[i], 'r') as bg_jetfile:
                bg_jet = bg_jetfile[bg_jets[i]][:]
            with h5py.File(sig_files[i], 'r') as sig_jetfile:
                sig_jet = sig_jetfile[sig_jets[i]][:]
        except OSError:
            print("This Primary Dataset doesn't have ", bg_jets[i])
            continue

        bad_chunks.append(bg_jet)
        good_chunks.append(sig_jet)
        print( "Number of good lumis: ", len(sig_jet), " Number of bad lumis: ", len(bg_jet))

    good_jets = np.concatenate(good_chunks, axis=0)
    bad_jets = np.concatenate(bad_chunks, axis=0)
    return good_jets, bad_jets

# Seed intended for the randomized steps of this notebook (pass it as
# `random_state` to the sklearn helpers) so that runs are repeatable.
RANDOM_SEED = 42

In [None]:
# Read the signal ("good") and background ("bad") arrays for the selected PD
good_jets, bad_jets = get_jets(
    bg_files=bg_files,
    bg_jets=bg_jets,
    sig_files=sig_files,
    sig_jets=sig_jets,
)

In [None]:
# import setGPU  
import pandas as pd
from sklearn.utils import shuffle

#Assign good jets class label 0
df1 = pd.DataFrame(good_jets)
# df1 = df1.iloc[0:1000, :]   #Temporarily to make training faster
df1['class'] = 0

#Assign bad_jets class label  1
df2 = pd.DataFrame(bad_jets)
# df2 = df2.iloc[0:30, :]     #Temporarily to make training faster
df2['class'] = 1

# Drop the raw arrays as soon as they are wrapped, to keep peak memory down
del good_jets, bad_jets

#Concatenate them. The list is passed inline: keeping it in a named variable
#would hold references to df1/df2 and make the `del` below ineffective.
data = pd.concat([df1, df2])
del df1, df2

#Shuffle them randomly, seeded so that every run yields the same row order
#(and therefore the same downstream split and metrics)
data = shuffle(data, random_state=RANDOM_SEED)
data = data.reset_index(drop=True)

#Save labels and delete them from df not to cheat during training
labels = data['class'].astype(int)
del data['class']

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Standardize every feature column (zero mean, unit variance) before training.
# The fitted scaler is kept in `standard_scaler`, the raw scaled array in
# `np_scaled`, and the DataFrame view used downstream in `data_n`.
standard_scaler = preprocessing.StandardScaler()
standard_scaler.fit(data)
np_scaled = standard_scaler.transform(data)
data_n = pd.DataFrame(data=np_scaled)

# #Make training and dev set
# X_train, X_dev = train_test_split(data_n, test_size=0.05, random_state=RANDOM_SEED)

# X_train = X_train.values
# X_dev   = X_dev.values

In [None]:
from keras.layers import Input, Dense
from keras.models import Model
from keras.callbacks import TensorBoard, ModelCheckpoint
import my_metrics

#Reduce the dimensionality of data_n with a single-hidden-layer autoencoder
encoding_dim = 100
n_features = data_n.shape[1]   # width of the scaled input (2802 for this data)

# `input_layer` rather than `input`, so the Python builtin is not shadowed
input_layer = Input(shape=(n_features,))
encoded = Dense(encoding_dim, activation='relu')(input_layer)
# data_n is standardized (zero mean, unit variance), so targets are negative
# or larger than 1 for many entries. A sigmoid output can only produce values
# in (0, 1) and binary cross-entropy is only defined for targets in [0, 1];
# a linear output trained with mean squared error is the matching choice.
decoded = Dense(n_features, activation='linear')(encoded)
autoencoder = Model(inputs=input_layer, outputs=decoded)

# Encoder half: input -> bottleneck (shares weights with `autoencoder`)
encoder = Model(inputs=input_layer, outputs=encoded)

# Decoder half: bottleneck -> reconstruction, reusing the trained output layer
encoded_input = Input(shape=(encoding_dim,))
decoder_layer = autoencoder.layers[-1]
decoder = Model(inputs=encoded_input, outputs=decoder_layer(encoded_input))

#Use command "tensorboard --logdir logs" to launch TensorBoard
tensorboard = TensorBoard(log_dir='./logs',
                          histogram_freq=0,
                          write_graph=True,
                          write_images=True)

# To keep only the best weights, add a checkpoint callback, e.g.
# ModelCheckpoint(filepath="model_tidyAAosvm.h5", verbose=0, save_best_only=True)
# (save_best_only monitors val_loss, so it also needs validation data).

# NOTE(review): 'accuracy' and my_metrics.mcor are kept as before, but they are
# classification metrics and carry little meaning for a reconstruction task.
autoencoder.compile(optimizer='adam',
                    loss='mse',
                    metrics=['accuracy',
                             my_metrics.mcor]
                   )

# The network is trained to reproduce its own input; `.history` keeps just the
# per-epoch metrics dictionary used for plotting.
history = autoencoder.fit(data_n.values, data_n.values,
                          epochs=2,
                          batch_size=1000,
                          shuffle=True,
                          verbose=1,
                          callbacks=[tensorboard]).history



In [None]:
# Persist the trained autoencoder (architecture + weights + optimizer state)
autoencoder.save("model_tidyAAosvm.h5")

# autoencoder.save_weights('my_model_weights_jetC_50_300ep.h5')

from keras.models import load_model

# The model was compiled with the custom metric my_metrics.mcor. Keras stores
# custom functions by name only, so the callable has to be supplied again via
# `custom_objects`; without it load_model cannot deserialize the model.
# NOTE(review): assumes the function is registered under the name "mcor".
autoencoder = load_model("model_tidyAAosvm.h5",
                         custom_objects={"mcor": my_metrics.mcor})

In [None]:
import matplotlib.pyplot as plt
import matplotlib

# Training-loss curve per epoch. Only the training loss exists because fit()
# was called without validation data, so the legend lists a single entry
# (the previous ['train', 'test'] legend named a curve that is never drawn).
f = plt.figure()
plt.plot(history['loss'], label='train')
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(loc='upper right')

In [None]:
# Project every scaled row through the encoder half of the autoencoder.
# NOTE: this rebinds `encoded` from the Keras layer output to a NumPy array.
encoded = encoder.predict(data_n.values)
# decoded = decoder.predict(encoded)

# Wrap the bottleneck activations in a DataFrame for the OneClassSVM steps
new_enc = pd.DataFrame(data=encoded)
print(encoded.shape)

In [None]:
#Normalize data again for OneClassSVM this time
# encoded = pd.DataFrame(encoded)
# encoded_scaled= standard_scaler.fit_transform(encoded)
# new_enc = pd.DataFrame(encoded_scaled)

In [None]:
# Re-attach the class labels to the encoded rows, then separate the rows into
# signal (class 0) and background (class 1) frames.
new_enc["class"] = labels

sigVals  = new_enc.loc[new_enc["class"] == 0]
backVals = new_enc.loc[new_enc["class"] == 1]

In [None]:
from sklearn.model_selection import train_test_split

#Split sigVals into sigValsTrain and sigValsTest so that OneClassSVM can train on good only.
#Uses the notebook-wide RANDOM_SEED instead of a second hard-coded 42.
sigValsTrain, sigValsTest = train_test_split(sigVals, test_size=0.2,
                                             random_state=RANDOM_SEED)

# The label column is removed with drop(), which returns a new frame, rather
# than with in-place `del` on frames that were sliced out of `new_enc`.
sigValsTrain = sigValsTrain.drop('class', axis=1)

#Save labels for test signal and backvals for the roc curve
labels_sigValsTest = sigValsTest['class'].astype(int)
sigValsTest = sigValsTest.drop('class', axis=1)

labels_backVals = backVals['class'].astype(int)
backVals = backVals.drop('class', axis=1)

In [None]:
# Ground-truth labels for the ROC curve: test-signal labels first, then the
# background labels, flattened into one 1-D array.
print(labels_sigValsTest.shape)
print(labels_backVals.shape)
kk = np.concatenate((np.ravel(labels_sigValsTest), np.ravel(labels_backVals)))
print(kk.shape)

In [None]:
%%time
#Train OneClassSVM model on good only.
# Explicit import instead of `from sklearn.svm import *`, so it is clear where
# the name comes from and the notebook namespace is not flooded.
from sklearn.svm import OneClassSVM

nuVal    = 0.01     # nu parameter handed to OneClassSVM
gammaVal = "auto"   # RBF kernel coefficient setting handed to OneClassSVM
clf  = OneClassSVM(nu=nuVal, kernel='rbf', gamma=gammaVal)
clf.fit(sigValsTrain.values)

In [None]:
# Classify each sample set with the fitted OneClassSVM
# (+1 = predicted inlier, -1 = predicted outlier).
y_pred_train, y_pred_test, y_pred_outliers = (
    clf.predict(samples) for samples in (sigValsTrain, sigValsTest, backVals)
)

# Signal rejected as outlier -> false negative; background accepted -> false positive
falseNegTrain = np.count_nonzero(y_pred_train == -1)
falseNegTest  = np.count_nonzero(y_pred_test == -1)
falsePos      = np.count_nonzero(y_pred_outliers == 1)
truePosTrain  = np.count_nonzero(y_pred_train == 1)
truePosTest   = np.count_nonzero(y_pred_test == 1)

In [None]:
# Decision-function scores used for the classification plots, flattened to 1-D.
# Order: [0] signal train, [1] signal test, [2] background.
osvmHists = []
osvmArrs = [
    clf.decision_function(samples).ravel()
    for samples in (sigValsTrain, sigValsTest, backVals)
]

In [None]:

#%jsroot on9
%matplotlib inline
# NOTE: the former matplotlib.use('Agg') call is dropped on purpose. Switching
# to the non-interactive Agg backend right after `%matplotlib inline` stops
# the figures below from being rendered in the notebook.

#Sets up plot boundaries
plotMin = min(scores.min() for scores in osvmArrs[:3])
plotMax = max(scores.max() for scores in osvmArrs[:3])
binz = np.linspace(plotMin, plotMax, 200)


def _draw_score_hists(density):
    """Overlay the signal-train, signal-test and background score histograms
    on the current axes; `density=True` normalizes each one to unit area."""
    # `density` replaces the `normed` keyword, which matplotlib has removed.
    plt.hist(osvmArrs[0], density=density, bins=binz, edgecolor='red',
             facecolor='white', alpha=1, label="Signal Train", linewidth=1.5)
    plt.hist(osvmArrs[1], density=density, bins=binz, edgecolor='green',
             facecolor='white', alpha=1, label="Signal Test")
    plt.hist(osvmArrs[2], density=density, bins=binz, edgecolor='blue',
             facecolor='white', alpha=.5, label="Background")


#Creates first histogram of Un-normalized Classification
plt.figure(figsize=(7, 10))
plt.subplot(211)
_draw_score_hists(density=False)
plt.title("Classification Plot, OneClassSVM, Un-normalized, Nu = %s, Gamma = %s" % (nuVal,gammaVal))
plt.xlabel("Decision Function Score")
plt.ylabel("Counts per Bin")
plt.legend(loc = "upper left")

#Creates second histogram of Normalized Classification
plt.subplot(212)
_draw_score_hists(density=True)
plt.title("Classification Plot, OneClassSVM, Normalized, Nu = %s, Gamma = %s" % (nuVal,gammaVal))
plt.xlabel("Decision Function Score")
plt.ylabel("Counts per Bin")
plt.legend()

#Prints relevant statistics below
# Loss rate: share of test signal rejected as outlier.
# Pollution rate: share of background among everything accepted as signal.
print("Loss Rate                                           : ", (falseNegTest/(truePosTest+falseNegTest)*100))
print("Pollution Rate                                      : ", (falsePos/(truePosTest+falsePos))*100)
print("Number of errors on training set : ", falseNegTrain, " Percentage: ", (falseNegTrain/len(sigValsTrain)*100))
print("Number of errors on test set     : ", falseNegTest, " Percentage: ", (falseNegTest/len(sigValsTest)*100))
print("Number of errors on outliers set : ", falsePos, "  Percentage: ", (falsePos/len(backVals)*100))

In [None]:
#Creates ROC curve for the OneClassSVM
from sklearn.metrics import roc_curve, auc
print(labels.shape)

# Ground truth: 0 = signal (test part), 1 = background
yTest = kk
# Decision-function scores in the same order as yTest: signal test, then background
osvmScore = np.append(osvmArrs[1],osvmArrs[2])

# roc_curve treats label 1 (background) as the positive class and expects
# larger scores for positives. OneClassSVM.decision_function is the opposite:
# it is larger for inliers, i.e. for signal (label 0). The score is therefore
# negated so that "higher = more anomalous"; without this the curve is mirrored
# and the reported AUC is 1 - (true AUC).
fpr, tpr, _ = roc_curve(yTest, -osvmScore)
roc_auc = auc(fpr, tpr)

plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
        lw = lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw = lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC OSVM, Tuned, Nu = %s, Gamma = %s' % (nuVal,gammaVal))
plt.legend(loc="lower right")
plt.show()

In [None]:
# ROC curve using the autoencoder reconstruction error as the anomaly score.
from sklearn.metrics import roc_curve, auc

# The reconstruction has to be computed explicitly: the notebook-level name
# `decoded` is still the symbolic Keras output layer, not predicted values,
# so indexing it row by row does not give a reconstruction.
reconstructed = autoencoder.predict(data_n.values)

# Per-row Euclidean distance between the input and its reconstruction
dist = np.linalg.norm(data_n.values - reconstructed, axis=1)

# labels: 1 = bad (background); a larger reconstruction error is scored as
# more likely bad, which matches roc_curve's "higher score = positive" rule.
fpr, tpr, thresholds = roc_curve(labels, dist)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(10,6))
# Stray ')' removed from the legend text
plt.plot(fpr, tpr, color='red', label='AUC = %0.2f' % roc_auc)
plt.xlim((0,1))
plt.ylim((0,1))
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlabel('False Positive rate')
plt.ylabel('True Positive rate')
# Raw string: same characters as before, without invalid "\_" escape sequences.
# NOTE(review): the title text (100-80-100, synth_multidim) does not describe
# the model built in this notebook -- confirm and update.
plt.title(r'ROC Autoencoder 100-80-100 ReLU/Sigmoid synth\_multidim\_100\_000')
plt.legend(loc="lower right")
plt.show()