In [2]:
#get the good data

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from ast import literal_eval
import json

# Read the per-lumisection histogram table.
# (alternative input: 'GOLDEN_Tot_ZeroBias_UL2017_DataFrame_chargeInner_PXLayer_1.csv')
df = pd.read_csv('PixelStudy/ZeroBias_2017UL_DataFrame_ChargeInnerLayer2.txt')

# One chained pipeline instead of successive in-place edits:
#   1. parse the stringified bin contents into Python lists,
#   2. keep only lumisections that have entries,
#   3. index by (run, lumisection) while keeping both as ordinary columns,
#   4. sort the index.
df = (
    df.assign(histo=df['histo'].apply(literal_eval))
      .loc[lambda frame: frame.entries > 0]
      .copy()
      .set_index(['fromrun', 'fromlumi'], drop=False)
      .sort_index()
)

print(df.tail())
print(df.fromrun.nunique())

                  Unnamed: 0  fromrun  fromlumi                  hname  \
fromrun fromlumi                                                         
306462  95             67823   306462        95  chargeInner_PXLayer_1   
        96             67824   306462        96  chargeInner_PXLayer_1   
        97             67825   306462        97  chargeInner_PXLayer_1   
        98             67826   306462        98  chargeInner_PXLayer_1   
        99             67827   306462        99  chargeInner_PXLayer_1   

                  entries     Xmax  Xmin  Xbins  metype  \
fromrun fromlumi                                          
306462  95              0  80000.0   0.0    100       3   
        96              0  80000.0   0.0    100       3   
        97              0  80000.0   0.0    100       3   
        98              0  80000.0   0.0    100       3   
        99              0  80000.0   0.0    100       3   

                                                              histo

In [None]:
def PlotHist(df,run,nolabels=True,fsave=False,hname="chargeInner_PXLayer_2", mode=1):
    """Plot the histograms of every lumisection of one run.

    Parameters
    ----------
    df : DataFrame indexed by (fromrun, fromlumi) that also keeps 'fromrun'
        and 'fromlumi' as columns and stores the bin contents in 'histo'.
        A 'labels' column is read only when ``nolabels`` is False.
    run : run number to plot.
    nolabels : if True every lumisection is drawn with the default colours;
        if False, rows whose label is True are drawn in green and rows whose
        label is False in red (rows with any other label are not drawn).
    fsave : if True the figure is also written to a PNG named after
        ``hname`` and ``run``.
    hname : histogram name used in the title and in the file name.
    mode : 1 overlays the histograms of all lumisections;
        2 plots the histogram mean versus lumisection number.
    """
    Xmax=80000.0
    Xmin=0.0
    Xbins=100
    half_bin_width = (Xmax - Xmin) / (2.0 * Xbins)

    # Bin centres, to be used together with where='mid'.
    x = np.linspace(Xmin + half_bin_width, Xmax - half_bin_width, Xbins)

    # Slice the run out of each column once instead of on every lumisection.
    run_lumis = df['fromlumi'][run]
    run_histos = df['histo'][run]
    run_numbers = df['fromrun'][run]
    run_labels = None if nolabels else df['labels'][run]

    if mode == 1:
        plt.xlim(Xmin, Xmax)

    plt.title(hname + " " + str(run) + " (" + str(run_lumis.size) + ")")

    for ls in run_lumis:
        ahisto = np.array(run_histos[ls]).astype('float')

        # Decide the colour once per lumisection.
        if nolabels:
            colour = None
        else:
            flag = run_labels[ls]
            if flag == True:
                colour = "green"
            elif flag == False:
                colour = "red"
            else:
                continue  # neither True nor False: nothing was drawn before either

        if mode == 1:
            text = " LS " + str(run_lumis[ls]) + " Run " + str(run_numbers[ls])
            colour_kwargs = {} if colour is None else {"c": colour}
            plt.step(x, ahisto, where='mid', label=text, **colour_kwargs)

        elif mode == 2:
            total = ahisto.sum()
            if total == 0:
                # An empty histogram has no mean; skipping avoids a 0/0 NaN point.
                continue
            value = np.dot(ahisto, x) / total
            plt.scatter(float(ls), value, color=("blue" if colour is None else colour), marker=".")

    if mode == 1:
        plt.xlabel("Charge electrons")
        plt.ylabel("A.U")
    elif mode == 2:
        plt.xlabel("Lumisection")
        plt.ylabel("Histo Average")

    if fsave:
        fname = ""
        if mode == 1:
            fname = hname + "_" + str(run) + ".png"
        elif mode == 2:
            fname = hname + "_" + str(run) + "_trend.png"
        if fname:  # unknown mode: there is no sensible file name to write
            plt.savefig(fname)

    plt.show()
    
# Trend of the histogram mean versus lumisection for a single run.
PlotHist(df, run=301998, mode=2)

In [None]:
#add Golden JSON labels to the DF
import json

def checkLS(run,ls):
    """Tell whether lumisection ``ls`` of ``run`` lies in a listed range.

    Reads the module-level ``jsondata`` dict, whose keys are run numbers as
    strings and whose values are lists of ranges; element 0 and element 1 of
    each range are treated as inclusive lower and upper lumisection bounds.

    Always returns a bool. A run that is absent from ``jsondata`` yields
    False (previously the function fell off the end and returned None).
    """
    ranges = jsondata.get(str(run), [])
    return any(bounds[0] <= ls <= bounds[1] for bounds in ranges)

#load the golden json file
jsondata={}
with open('GoldenJSON17.json') as json_file:
    jsondata = json.load(json_file)

# Label every (run, lumisection) row in one pass and assign the whole column
# at once. The chained form df['labels'][run][ls] = ... writes into
# intermediate objects and is not guaranteed to modify df itself.
# bool() keeps the column strictly boolean even if checkLS returns None.
df['labels'] = [
    bool(checkLS(run_number, lumi))
    for run_number, lumi in zip(df['fromrun'], df['fromlumi'])
]

#print(df[df['labels']==True]) #to check against the Golden JSON

In [None]:
# Overlay plot for each run, coloured by the Golden JSON label
for run in df['fromrun'].unique():
    PlotHist(df, run, nolabels=False)

In [None]:
#use golden lumisections to train the model

# Golden (labels == True) rows form the training sample, the others the test sample.
is_golden = df['labels']==True
is_not_golden = df['labels']==False

X_trainS = np.stack(df['histo'][is_golden].values, axis=0) #convert list of array to a stack to feed the model
X_testS = np.stack(df['histo'][is_not_golden].values, axis=0)

from sklearn.preprocessing import normalize

X_train=normalize(X_trainS, norm='l1', axis=1) #normalise the sample, i.e the rows
X_test=normalize(X_testS, norm='l1', axis=1)

# Store the normalised histograms back in the frame, one list per row.
# The column is built positionally and assigned in a single statement:
# df['origin'].loc[mask] = ... is a chained assignment that may not reach df,
# and a list-of-lists value is ambiguous for .loc.
origin_rows = np.empty(len(df), dtype=object)
origin_rows[:] = ''  # rows with neither label keep the empty placeholder
for mask, rows in ((is_golden, X_train), (is_not_golden, X_test)):
    for position, row in zip(np.flatnonzero(mask.values), rows.tolist()):
        origin_rows[position] = row
df['origin'] = origin_rows

print(df.shape)
print(df.origin.shape)

In [None]:
#build the simple model
import math
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Input, Dense
from keras.layers.advanced_activations import PReLU
from keras.models import Model, load_model
from keras import backend as K
import tensorflow as tf

def mseTop10(y_true, y_pred):
    """Loss: mean of the 10 largest squared differences along the last axis."""
    squared_error = K.square(y_pred - y_true)
    # top_k returns (values, indices); only the values are needed.
    largest_ten = tf.nn.top_k(squared_error, k=10, sorted=True)[0]
    return K.mean(largest_ten, axis=-1)

# Number of input nodes = length of one normalised histogram row.
input_size=len(X_train[0])

input_layerA = Input(shape=(input_size, ))

# Dense stack: input -> 20 -> 10 -> 20 -> input_size.
encodedA = Dense(20, activation='sigmoid')(input_layerA)
encoded1A = Dense(10, activation='tanh')(encodedA)
encoded2A = Dense(20, activation='sigmoid')(encoded1A)
decoderA = Dense(input_size, activation='sigmoid')(encoded2A)

# The model maps the input layer to the decoder output and is compiled with
# the custom mseTop10 loss defined above.
autoencoder= Model(inputs=input_layerA, outputs=decoderA)
autoencoder.compile(optimizer='adam', loss=mseTop10)

# Give the layers predictable names (layer_0, layer_1, ...).
# NOTE(review): assigning layer.name may be rejected by newer Keras releases - confirm against the installed version.
for i, layer in enumerate(autoencoder.layers):
             layer.name = 'layer_' + str(i)

autoencoder.summary()

In [None]:
# Train the model: the input is also the target, 10% of the sample is held out for validation.
history = autoencoder.fit(
    x=X_train,
    y=X_train,
    batch_size=100,
    epochs=300,
    verbose=1,
    validation_split=0.1,
    shuffle=True,
)

In [None]:
#Save and load the model; for some reason this doesn't work on SWAN
#autoencoder.save("Test_AE.h5")
#autoencoder=load_model("Test_AE.h5")

In [None]:
# Dash patterns in matplotlib's (offset, on/off sequence) form.
line_styles = [
    (0, ()),
    (0, (1, 3)),
    (0, (3, 2)),
]

# Colour families keyed by name, then by shade number (50, 100 ... 900).
color_palette = {
    family: dict(zip((50, 100, 200, 300, 400, 500, 600, 700, 800, 900), shades))
    for family, shades in (
        ("Indigo", ("#E8EAF6", "#C5CAE9", "#9FA8DA", "#7986CB", "#5C6BC0",
                    "#3F51B5", "#3949AB", "#303F9F", "#283593", "#1A237E")),
        ("Teal", ("#E0F2F1", "#B2DFDB", "#80CBC4", "#4DB6AC", "#26A69A",
                  "#009688", "#00897B", "#00796B", "#00695C", "#004D40")),
    )
}

def plot_loss(data, title, max_epoch=100):
    """Plots the training and validation loss.

    Parameters
    ----------
    data : object with a ``history`` dict holding a 'loss' list and,
        optionally, a 'val_loss' list (e.g. the value returned by ``fit``).
    title : figure title.
    max_epoch : upper limit of the epoch axis. The default of 100 keeps the
        window that used to be hard-coded; pass None to show every recorded
        epoch.
    """
    losses = data.history
    plt.figure()
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title(title)

    plt.plot(losses['loss'], linestyle=line_styles[0], color=color_palette["Indigo"][900], linewidth=3)
    legend_entries = ["Train"]

    # The validation curve only exists when the fit was run with validation data.
    if 'val_loss' in losses:
        plt.plot(losses['val_loss'], linestyle=line_styles[2], color=color_palette["Teal"][300], linewidth=3)
        legend_entries.append("Validation")

    plt.legend(legend_entries, loc="upper right", frameon=False)
    plt.yscale("log")
    plt.xlim(0, len(losses['loss']) if max_epoch is None else max_epoch)
    plt.show()

# Loss curves of the training run above.
plot_loss(data=history, title="Original model loss")

In [None]:
#add predictions and mse as new columns in the AE
predictionTest=autoencoder.predict(X_test)
predictionTrain=autoencoder.predict(X_train)

mseTest=K.eval(mseTop10(X_test, predictionTest))
mseTrain=K.eval(mseTop10(X_train, predictionTrain))

# Row positions of the two samples, in the same order used to build X_train / X_test.
train_positions = np.flatnonzero((df['labels']==True).values)
test_positions = np.flatnonzero((df['labels']==False).values)

# Build both columns positionally and assign each one in a single statement:
# df['col'].loc[mask] = ... is a chained assignment that may not reach df.
prediction_rows = np.empty(len(df), dtype=object)
prediction_rows[:] = ""  # rows with neither label keep the empty placeholder
for positions, predictions in ((train_positions, predictionTrain), (test_positions, predictionTest)):
    for position, row in zip(positions, predictions.tolist()):
        prediction_rows[position] = row

# 'mse' is stored as a float column (NaN where no value exists) so that
# comparisons, means and histograms work on it directly.
mse_values = np.full(len(df), np.nan)
mse_values[train_positions] = np.asarray(mseTrain, dtype=float)
mse_values[test_positions] = np.asarray(mseTest, dtype=float)

df['prediction'] = prediction_rows
df['mse'] = mse_values

print(df.head())

In [None]:
def globalMSETrend(df,type=1):
    """Plot the per-run average MSE with its global mean and 1/3 SD lines.

    Parameters
    ----------
    df : DataFrame indexed by (fromrun, fromlumi) with a 'fromrun' column and
        the MSE column selected by ``type``.
    type : 1 reads the 'mse' column, 2 reads the 'mse2' column.

    Raises
    ------
    ValueError : if ``type`` is neither 1 nor 2.
    """
    columns = {1: 'mse', 2: 'mse2'}
    if type not in columns:
        raise ValueError("type must be 1 ('mse') or 2 ('mse2'), got " + repr(type))
    column = columns[type]

    runs = df['fromrun'].unique()
    if runs.size == 0:
        print("globalMSETrend: no runs to plot")
        return

    # One average per run, in the same order as `runs`.
    y = [(df[column][run]).mean() for run in runs]

    array = np.array(y, dtype=float)
    # Take the y-range from the data itself; fixed seeds (0 and 10) distort the
    # limits whenever the values do not straddle them.
    rmin = np.nanmin(array)
    rmax = np.nanmax(array)
    print(rmax, rmin)

    gmean=array.mean()
    size=float(array.size)
    gstd=array.std()
    print(size)

    fig=plt.figure()
    fig.set_size_inches(16, 9)

    first_run = runs.min()
    last_run = runs.max()
    plt.hlines(gmean, first_run, last_run, color="blue", label="Run average: " + str(gmean))
    plt.hlines(gmean+(1.0*gstd), first_run, last_run, color="red", label='1 SD (' + str(gstd) + ")")
    plt.hlines(gmean+(3.0*gstd), first_run, last_run, color="red", label='3 SD', linestyle=':')

    plt.ylim(rmin*0.9,rmax*1.1)
    plt.scatter(runs, y, marker='+', label='Data points')
    plt.xlabel("Run")
    plt.ylabel("average MSE")
    plt.yscale("log")
    plt.legend()
    plt.show()

In [None]:
# MSE trend of the train data (golden lumisections)
globalMSETrend(df.loc[df.labels == True], type=1)

In [None]:
# MSE trend of the test data (non-golden lumisections that have entries)
globalMSETrend(df.loc[(df.entries > 0) & (df.labels == False)], type=1)

In [None]:
import matplotlib.gridspec as gridspec

def CheckPredictions(df_test, run, ls, type=1, save=False):
    """Compare one input histogram with its reconstruction.

    The upper panel overlays the 'origin' histogram and the prediction; the
    lower panel shows the squared difference per bin.

    Parameters
    ----------
    df_test : DataFrame indexed by (fromrun, fromlumi) holding 'origin',
        'fromlumi', 'fromrun' and the prediction / MSE columns selected by
        ``type``.
    run, ls : run number and lumisection of the row to display.
    type : 1 reads 'prediction' and 'mse'; 2 reads 'prediction2' and 'mse2'.
    save : if True the figure is also written to a PNG file.

    Raises
    ------
    ValueError : if ``type`` is neither 1 nor 2 (previously uninitialised
        data would have been plotted).
    """
    columns = {1: ('prediction', 'mse'), 2: ('prediction2', 'mse2')}
    if type not in columns:
        raise ValueError("type must be 1 or 2, got " + repr(type))
    prediction_column, mse_column = columns[type]

    Xmax=80000.0
    Xmin=0.0
    hname="chargeInner_PXLayer_1"

    ahisto=np.array(df_test['origin'][run][ls])
    ahisto1=np.array(df_test[prediction_column][run][ls])
    mse=df_test[mse_column][run][ls]

    msebin=(ahisto-ahisto1)**2

    # Take the number of bins from the histogram so x always matches its length.
    Xbins=ahisto.shape[0]
    half_bin_width=(Xmax-Xmin)/(2.0*Xbins)

    # Bin centres (as in PlotHist), which is what where='mid' expects.
    x=np.linspace(Xmin+half_bin_width, Xmax-half_bin_width, Xbins)
    # Integer bin indices for the per-bin error panel.
    xbin=np.arange(Xbins)

    gs = gridspec.GridSpec(7,1)
    fig= plt.figure()
    fig.set_size_inches(5,5)

    axs0=plt.subplot(gs[:4, :])
    axs1=plt.subplot(gs[5:, :])

    axs0.step(x, ahisto, where='mid', label=(" Data LS " + str(df_test.fromlumi[run][ls]) + " Run " + str(df_test.fromrun[run][ls]) ))
    axs0.step(x, ahisto1, where='mid', label=" Reco MSE=" + str(mse))
    axs1.step(xbin, msebin, where='mid')
    axs0.set(xlabel="Charge electrons",ylabel="A.U")
    axs1.set(ylabel="MSE", xlabel="bin")
    axs0.legend()
    if save: plt.savefig(hname + "_" + str(run) + "_LS" +str(ls) + ".png")
    plt.show()
    # This function is called in loops; release the figure explicitly.
    plt.close(fig)

In [None]:
# Check a few prediction plots for the golden lumisections: at most 3 per run
# (the counter test `count > 2` lets three plots through).
golden_df = df[df.labels==True]  # loop-invariant, so build it once
for run in df['fromrun'].unique():
    count=0
    run_labels = df['labels'][run]
    for ls in df['fromlumi'][run]:
        if run_labels[ls] != True: continue  # only rows present in golden_df
        if (count >2): break  # limit reached: stop scanning this run
        count=count+1
        CheckPredictions(golden_df, run,ls)

In [None]:
def mseHisto(df):
    """Histogram the 'mse' column for the train and test samples.

    Train = rows with labels == True; test = rows with labels == False and
    entries > 0. Both histograms share one range computed from both samples,
    so no entry is dropped for falling outside the other sample's range.
    """
    train_mask = df.labels==True
    test_mask = (df.labels==False) & (df.entries>0)

    # Coerce to float and drop non-numeric placeholders / NaN before histogramming.
    g=pd.to_numeric(df['mse'][train_mask], errors='coerce').dropna().values
    w=pd.to_numeric(df['mse'][test_mask], errors='coerce').dropna().values

    combined=np.concatenate([g, w])
    if combined.size == 0:
        print("mseHisto: no MSE values to plot")
        return

    common_range=(combined.min()*0.8, combined.max()*1.2)
    if w.size:
        plt.hist(w,bins=100,range=common_range, alpha=0.5, color="red", label="Test")
    if g.size:
        plt.hist(g,bins=100,range=common_range, alpha=0.5, color="green", label="Train")
    plt.yscale('log')
    plt.legend()
    plt.show()
    
# MSE distributions of the train and test samples.
mseHisto(df=df)

In [None]:
# Inspect the non-golden lumisections whose MSE is at least 0.015, a threshold
# chosen from the histograms produced above.
# The default type (1) is used: the selection below reads the 'mse' column, and
# no cell shown in this notebook creates the 'prediction2' / 'mse2' columns
# that type 2 would read.
non_golden_df = df[df.labels==False]  # loop-invariant, so build it once
for run in df['fromrun'].unique():
    count=0
    run_labels = df['labels'][run]
    run_mse = df['mse'][run]
    for ls in df['fromlumi'][run]:
        if run_labels[ls] != False: continue  # only rows present in non_golden_df
        if run_mse[ls]< 0.015: continue
        #if (count >2): continue
        count=count+1
        CheckPredictions(non_golden_df, run,ls)

In [None]:
#Here you should add more cells to complete the study, i.e. find a good threshold on the MSE to separate anomalous histograms from standard ones.