In [None]:
#get the good data

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from ast import literal_eval
import json

# Read the per-lumisection histogram table (plain csv, or a csv inside a .zip).
df_in = pd.read_csv('<your csv>.csv')

# Keep a single histogram type and drop lumisections with too few entries
# to be statistically meaningful.
selection = (df_in.hname == '<your histogram') & (df_in.entries > 1000)
df = df_in.loc[selection].copy()

# The 'histo' column is stored as the string repr of a list; parse it back
# into a real list of numbers.
df['histo'] = df['histo'].apply(literal_eval)

# Index by (run, lumisection) while keeping both as regular columns, then sort
# so that df[col][run][ls] look-ups work on an ordered index.
df = df.set_index(['fromrun', 'fromlumi'], drop=False).sort_index()

print(df.head())
print(df.shape)

In [None]:
#Only needed for Tracking histograms, which can have several entries with the same name in the same (run, ls): keep just the first entry of each group

#df = df.groupby(df.index).agg({'fromrun':'first', 'fromlumi':'first', 'histo':'first','hname':'first','entries':'first'})

#df.set_index(['fromrun','fromlumi'], inplace=True, drop=False)
#df.sort_index(inplace=True)

#print(df.head())
#print(df.shape)

In [None]:
#example plot function
def PlotHist(run,ls):
    """Draw the histogram of one (run, lumisection) and save it to a PNG file.

    Reads the module-level DataFrame ``df`` (indexed by run and lumisection),
    plots its 'histo' entry as a step curve over 100 points spanning
    0..80000 and writes the figure to 'chargeInner_PXLayer_1_GOOD.png'.

    Args:
        run: run number, first level of the ``df`` index.
        ls: lumisection number, second level of the ``df`` index.
    """
    global df

    lower_edge, upper_edge, n_points = 0.0, 80000.0, 100
    positions = np.linspace(lower_edge, upper_edge, n_points)

    contents = df['histo'][run][ls]
    legend_text = (" LS " + str(df.fromlumi[run][ls]) + " Run " + str(df.fromrun[run][ls]) )

    plt.xlim(lower_edge, upper_edge)
    plt.step(positions, contents, where='mid', label=legend_text)
    plt.xlabel("Charge electrons")
    plt.ylabel("A.U")
    plt.legend()
    plt.savefig('chargeInner_PXLayer_1_GOOD.png')

PlotHist(301998,505)

In [None]:
#if needed rebin the histo
def rebinHisto(df,k):
    """Merge every ``k`` adjacent bins of each histogram in ``df['histo']``.

    Args:
        df: object whose ``'histo'`` entry is a sequence of histograms
            (normally a DataFrame column of lists). All histograms are
            assumed to have the same length as the first one.
        k: number of consecutive bins to sum into one new bin; must divide
            the histogram length exactly.

    Returns:
        A list with one rebinned histogram (list of bin sums) per input
        histogram, in the same order. If ``k`` does not divide the histogram
        length, a message is printed and an empty list is returned.
    """
    hlist = df['histo']

    # Take the first histogram by position: on a DataFrame indexed by
    # (run, lumi) a plain ``hlist[0]`` would be a label look-up and fail.
    first = hlist.iloc[0] if hasattr(hlist, 'iloc') else hlist[0]
    size = len(first)

    # The divisibility check does not depend on the histogram, so do it once.
    if size % k != 0:
        print("k must be a submultiple of " + str(size))
        return []

    # range() needs integers, hence the explicit step instead of size/k.
    return [
        [sum(histo[start:start + k]) for start in range(0, size, k)]
        for histo in hlist
    ]

#To be used only if a rebin of the histogram is necessary!!
#df['histo_rebin'] = rebinHisto(df,10)
#print(df.head())
#print(len(df.histo[0]))
#print(len(df.histo_rebin[0]))

In [None]:
#function to check if the rebinning is ok
def checkRebin(df):
    """Plot original and rebinned histograms side by side, one figure per run.

    For every run in ``df['fromrun']`` a two-panel figure is shown: the left
    panel overlays the 'histo' entry of each lumisection, the right panel the
    corresponding 'histo_rebin' entry.

    Args:
        df: DataFrame indexed by (run, lumisection) with columns 'fromrun',
            'fromlumi', 'histo' and 'histo_rebin'.
    """
    Xmax=2999.5
    Xmin=-.5

    for run in df['fromrun'].unique():

        fig, (ax1,ax2)=plt.subplots(1,2)
        fig.set_size_inches(16, 9)

        for ls in df['fromlumi'][run]:
            ahisto=df['histo'][run][ls]
            ahisto2=df['histo_rebin'][run][ls]

            # Derive the x positions from the histogram length so the check
            # works for any rebinning factor, not only 600 -> 60 bins.
            ax1.step(np.linspace(Xmin,Xmax,len(ahisto)), ahisto, where='mid')
            ax2.step(np.linspace(Xmin,Xmax,len(ahisto2)), ahisto2, where='mid')

        # Decorate each panel explicitly: the pyplot helpers (plt.xlim,
        # plt.title, ...) only act on the current axes, i.e. the last panel.
        for ax in (ax1, ax2):
            ax.set_xlim(Xmin,Xmax)
            ax.set_title(str(run))
            ax.set_xlabel("N tracks")
            ax.set_ylabel("A.U")

        plt.show()

In [None]:
#add Golden JSON labels to the DF
import json

def checkLS(run,ls,golden=None):
    """Tell whether a lumisection is listed in the golden JSON.

    Args:
        run: run number; looked up in the JSON by its string form.
        ls: lumisection number.
        golden: optional mapping ``{run_string: [[first_ls, last_ls], ...]}``.
            When omitted, the module-level ``jsondata`` is used.

    Returns:
        True if ``ls`` lies inside one of the inclusive ranges listed for
        ``run``; False otherwise, including when the run is not listed at all.
    """
    if golden is None:
        golden = jsondata

    # Runs missing from the JSON have no good ranges.
    ranges = golden.get(str(run), ())
    return any(bounds[0] <= ls <= bounds[1] for bounds in ranges)

#load the golden json file
jsondata={}
with open('<GOLDEN JSON>') as json_file:
    jsondata = json.load(json_file)

# Label every (run, lumisection) row in a single column assignment.
# Element-wise chained assignment (df['labels'][run][ls] = ...) may write to a
# temporary copy and leave df unchanged, so build the full column instead.
# bool() guarantees a strict True/False label for every row.
df['labels'] = [
    bool(checkLS(run, ls))
    for run, ls in zip(df['fromrun'], df['fromlumi'])
]

print(df[df['labels']==True]) #to check against the Golden JSON

In [None]:
#Train on long json runs and test on the rest (RECOMMENDED)

from sklearn.preprocessing import normalize

# Use the rebinned histograms when the optional rebinning step has been run,
# otherwise fall back to the original binning so this cell also works on a
# fresh kernel where 'histo_rebin' was never created.
feature_column = 'histo_rebin' if 'histo_rebin' in df.columns else 'histo'

# Golden-JSON lumisections form the training set, everything else the test set.
# .copy() makes both frames independent of df so that the columns added below
# are really stored (and no SettingWithCopyWarning is raised).
df_train=df.loc[df['labels']==True].copy()
df_test=df.loc[df['labels']==False].copy()

X_trainS = np.stack(df_train[feature_column].values, axis=0) #convert list of array to a stack to feed the model
X_testS = np.stack(df_test[feature_column].values, axis=0)

X_train=normalize(X_trainS, norm='l1', axis=1) #normalise the sample, i.e the rows
X_test=normalize(X_testS, norm='l1', axis=1)

# Keep the normalised histograms next to the raw ones for later plotting.
df_train['original']=X_train.tolist()
df_test['original']=X_test.tolist()

print(df_train.head())
print(df_test.head())

In [None]:
#alternative way to extract test and train data and put in dataframes

#from sklearn.preprocessing import normalize

#X_train, X_test, y_train, y_test = train_test_split(df['histo'].values,df['labels'].values, test_size=.2, random_state=21)
#X_trainI, X_testI, y_trainI, y_testI = train_test_split(df['histo'].index,df['labels'].values, test_size=.2, random_state=21)

#X_trainS = np.stack(X_train, axis=0) #convert list of array to a stack to feed the model
#X_testS = np.stack(X_test, axis=0)

#print(type(X_trainS[0]))

#X_train=normalize(X_trainS, norm='l1', axis=1) #normalise the sample, i.e the rows
#X_test=normalize(X_testS, norm='l1', axis=1)

#print(X_test[10].sum()) #verify rows are normalized

#df_test=df.loc[X_testI]
#df_train=df.loc[X_trainI]

#df_test.set_index(['fromrun','fromlumi'], inplace=True, drop=False)
#df_test.sort_index(inplace=True)

#df_train.set_index(['fromrun','fromlumi'], inplace=True, drop=False)
#df_train.sort_index(inplace=True)

#df_train['original']=X_train.tolist()
#df_test['original']=X_test.tolist()

#print(df_test.head())
#print(df_train.head())

In [None]:
#build the model
import math
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Input, Dense
from keras.layers.advanced_activations import PReLU
from keras.models import Model, load_model
from keras import backend as K
import tensorflow as tf

def mseTop10(y_true, y_pred):
    """Loss built from the ten largest squared errors of each sample.

    Squares the element-wise difference between prediction and target, keeps
    the 10 largest values along the last axis, averages them and returns the
    square root of that average.
    """
    squared_error = K.square(y_pred - y_true)
    largest, _indices = tf.nn.top_k(squared_error, k=10, sorted=True)
    return K.sqrt(K.mean(largest, axis=-1))

# Number of input features = length of the first row of X_train.
input_size=len(X_train[0])

input_layer = Input(shape=(input_size, ))

# Dense stack input_size -> 10 -> 3 -> 10 -> input_size, tanh on every layer;
# the 3-unit layer is the narrowest point of the network.
encoded = Dense(10, activation='tanh')(input_layer)
encoded1 = Dense(3, activation='tanh')(encoded)
encoded2 = Dense(10, activation='tanh')(encoded1)
decoder = Dense(input_size, activation='tanh')(encoded2)

# The model maps the input onto a reconstruction of the same size and is
# trained with the Adam optimizer and the custom mseTop10 loss.
autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer='adam', loss=mseTop10)

# Give the layers predictable names: layer_0, layer_1, ...
# NOTE(review): assigning to layer.name may be rejected by newer Keras
# versions - confirm against the installed version.
for i, layer in enumerate(autoencoder.layers):
             layer.name = 'layer_' + str(i)

autoencoder.summary()

In [None]:
#fit the model
# The autoencoder is trained to reproduce its own input, so X_train is passed
# both as data and as target. Training runs for 100 epochs with one sample per
# batch and no shuffling; validation_split=0.1 holds 10% of the samples out
# for validation. The returned History object is plotted in the next cell.
history = autoencoder.fit(X_train,X_train, epochs=100, batch_size=1, shuffle=False, verbose=2,validation_split=0.1)

In [None]:
#print(history)

# Line styles handed to matplotlib through the `linestyle` argument in
# plot_loss (entries 0 and 2 are the ones used there).
line_styles = [(0, ()), (0, (1, 3)), (0, (3, 2))]

# Hex colour codes grouped by colour family name, then by shade number
# (50 .. 900). plot_loss picks "Indigo" 900 and "Teal" 300.
color_palette = {"Indigo": {
                    50: "#E8EAF6",
                    100: "#C5CAE9",
                    200: "#9FA8DA",
                    300: "#7986CB",
                    400: "#5C6BC0",
                    500: "#3F51B5",
                    600: "#3949AB",
                    700: "#303F9F",
                    800: "#283593",
                    900: "#1A237E"},
                 "Teal": {      
                    50: "#E0F2F1",
                    100: "#B2DFDB",
                    200: "#80CBC4",
                    300: "#4DB6AC",
                    400: "#26A69A",
                    500: "#009688",
                    600: "#00897B",
                    700: "#00796B",
                    800: "#00695C",
                    900: "#004D40"}
                }

def plot_loss(data, title):
    """Plot training and validation loss per epoch on a log-scaled y axis.

    Args:
        data: object exposing ``history['loss']`` and ``history['val_loss']``
            (the value returned by ``autoencoder.fit``).
        title: figure title.
    """
    # (history key, line style, colour) for each curve, in legend order.
    curves = (
        ('loss', line_styles[0], color_palette["Indigo"][900]),
        ('val_loss', line_styles[2], color_palette["Teal"][300]),
    )

    plt.figure()
    for key, dashes, colour in curves:
        plt.plot(data.history[key], linestyle=dashes, color=colour, linewidth=3)

    plt.title(title)
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.yscale("log")
    # Only epochs 0..10 are displayed.
    plt.xlim(0,10)
    plt.legend(["Train", "Validation"], loc="upper right", frameon=False)
    plt.show()

plot_loss(history, "Original model loss")

In [None]:
#get predictions and mse and add to the df

# Training sample: reconstruction, per-row loss value, and both stored as columns.
predictionTrain = autoencoder.predict(X_train)
mseTrain = K.eval(mseTop10(X_train, predictionTrain))
df_train['prediction'] = predictionTrain.tolist()
df_train['mse'] = mseTrain

# Test sample: same treatment.
predictionTest = autoencoder.predict(X_test)
mseTest = K.eval(mseTop10(X_test, predictionTest))
df_test['prediction'] = predictionTest.tolist()
df_test['mse'] = mseTest

In [None]:
# Summary statistics of the per-lumisection loss: test sample first, then train.
for sample in (df_test, df_train):
    print(sample['mse'].describe())

In [None]:
def plotAnomalies(df,run, lumi,thr=3, mseMean=0.001465, mseStd=0.000818):
    """Plot input vs. reconstruction for a lumisection whose loss is anomalous.

    Nothing is drawn when the 'mse' value of (run, lumi) is below
    ``mseMean + thr * mseStd``.

    Args:
        df: DataFrame indexed by (run, lumisection) with columns 'mse',
            'original' and 'prediction'.
        run: run number.
        lumi: lumisection number.
        thr: threshold, in units of ``mseStd`` above ``mseMean``.
        mseMean: reference mean of the loss. The default is the value that
            used to be hard-coded here; note that the other functions in this
            notebook use different reference numbers.
        mseStd: reference standard deviation of the loss (same remark).

    Returns:
        None.
    """
    mse=df['mse'][run][lumi]

    if (mse < (mseMean+(thr*mseStd))) : return

    Xmax=80000.0
    Xmin=0.0
    title="chargeInner_PXLayer_1"

    histo_data=df['original'][run][lumi]
    histo_reco=df['prediction'][run][lumi]

    # One x position per bin of the stored histogram: its length depends on
    # whether (and by how much) the histograms were rebinned upstream.
    x= np.linspace(Xmin,Xmax,len(histo_data))
    plt.title(title)

    plt.xlim(Xmin,Xmax)
    plt.step(x, histo_data, where='mid', label=("Data Run: " + str(run) + " LS: " + str(lumi)))
    plt.step(x, histo_reco, where='mid', label=("Reco MSE: " + str(mse)))

    plt.legend()
    plt.show()

In [None]:
def dumpAnomalies(df,run,thr=3.0):
    """Print the lumisections of ``run`` whose 'mse' exceeds the threshold.

    The threshold is ``0.001911 + thr * 0.000659`` (hard-coded reference mean
    and standard deviation of the loss).

    Args:
        df: DataFrame with columns 'mse', 'fromrun' and 'fromlumi'.
        run: run number to inspect.
        thr: number of standard deviations above the reference mean.
    """
    reference_mean = 0.001911
    reference_std = 0.000659
    cutoff = reference_mean + (thr * reference_std)

    above_cutoff = df['mse'] > cutoff
    in_run = df['fromrun'] == run

    print(df['fromlumi'].loc[above_cutoff & in_run])

In [None]:
def perRunMSE(df_test,run):
    """Summarise one run: anomalous LS list, loss trend, and histogram overlays.

    Prints and dumps the lumisections whose 'mse' exceeds
    ``0.001911 + 3.0 * 0.000659``, draws the per-run trend plot and then shows
    a two-panel figure with the 'histo' curves of the lumisections below the
    limit ("standard", left) and at or above it ("anomalous", right).

    NOTE: this notebook defines perRunMSE a second time further down; when all
    cells are run in order the later definition replaces this one.

    Args:
        df_test: DataFrame indexed by (run, lumisection) with columns 'mse',
            'fromrun', 'fromlumi' and 'histo'.
        run: run number to summarise.
    """
    mseMean=0.001911 # reference mean of the loss
    mseStd=0.000659 # reference standard deviation of the loss
    thr=3.0

    Xmax=80000.0
    Xmin=0.0
    Xbins=100

    limit=(mseMean+(thr*mseStd))

    print(" Run: " + str(run) + " list of LS with MSE > " + str(limit))
    # Pass thr explicitly so the dump always matches the limit printed above.
    dumpAnomalies(df_test,run,thr)
    mseTrendPlot(df_test,run)

    fig, (ax1,ax2) =plt.subplots(ncols=2)
    fig.set_size_inches(8, 4)

    x= np.linspace(Xmin,Xmax,Xbins)
    in_run = df_test['fromrun']==run

    # (axes, title, row selection, line style) for each panel.
    panels = (
        (ax1, "standard", df_test['mse'] < limit, "-"),
        (ax2, "anomalous", df_test['mse'] >= limit, ":"),
    )

    for ax, title, selection, style in panels:
        # Decorate each panel through its own Axes object: the pyplot helpers
        # (plt.xlim, plt.xlabel, ...) would only affect the last panel.
        ax.set_title(title)
        ax.set_xlim(Xmin,Xmax)
        ax.set_xlabel("Charge electrons")
        ax.set_ylabel("A.U")

        for ls in np.array(df_test['fromlumi'].loc[selection & in_run]):
            histo=df_test['histo'][run][ls]
            ax.step(x, histo, where='mid', label=(" LS " + str(ls) + " Run " + str(run)), linestyle=style)

    plt.show()

In [None]:
def mseTrendPlot(df,run):
    """Scatter the per-lumisection loss of one run with reference lines.

    Solid lines mark the run's own mean (blue) and mean + 3 sigma (red);
    dotted lines mark the hard-coded global mean and global mean + 3 sigma.

    Args:
        df: DataFrame indexed by (run, lumisection) with columns 'fromlumi'
            and 'mse'.
        run: run number to plot.
    """
    lumis = np.array(df['fromlumi'][run])
    errors = np.array(df['mse'][run])

    # Statistics of this run versus the fixed reference numbers.
    run_mean, run_std = errors.mean(), errors.std()
    ref_mean, ref_std = 0.001911, 0.000659

    canvas = plt.figure()
    canvas.set_size_inches(8, 4)

    first_ls, last_ls = lumis.min(), lumis.max()

    plt.hlines(run_mean, first_ls, last_ls, color="blue", label="Run average")
    plt.hlines(run_mean+(3*run_std), first_ls, last_ls, color="red")

    plt.hlines(ref_mean, first_ls, last_ls, color="blue", label="Global average", linestyle=":")
    plt.hlines(ref_mean+(3.0*ref_std), first_ls, last_ls, color="red", linestyle=":")

    plt.ylim(0, errors.max()*1.2)
    plt.scatter(lumis, errors, marker='.', label=str(run))
    plt.legend()
    plt.show()

    

In [None]:
def mseHisto(df):
    """Show the distribution of the 'mse' column as a 100-bin log-scale histogram.

    The x range runs from 0.8 times the smallest value to 1.2 times the
    largest value.
    """
    values = np.array(df['mse'])
    lower = values.min()*0.8
    upper = values.max()*1.2

    plt.hist(values, bins=100, range=(lower, upper), alpha=0.5, color="green", label="MSE")
    plt.yscale('log')
    plt.legend()
    plt.show()

In [None]:
# Walk over every (run, lumisection) of the test sample; plotAnomalies itself
# decides whether the lumisection is anomalous enough to be drawn.
test_runs = df_test["fromrun"].unique()
for run in test_runs:
    run_lumis = df_test['fromlumi'][run]
    for ls in run_lumis:
        plotAnomalies(df_test, run, ls)

In [None]:
# Loss distribution of the sample, followed by one loss-trend plot per run.
# NOTE(review): the original comment said "plot train", but the code below
# uses df_test - confirm which sample is intended.
mseHisto(df_test)
for run in df_test["fromrun"].unique():
        mseTrendPlot(df_test, run)

In [None]:
def mseTrendPlotPerLS(df):
    """Scatter the loss of every lumisection of every run on a single figure.

    Points are green when the row's 'labels' value is truthy and red
    otherwise. The y axis spans 0 to (mean + 5 * std) of the 'mse' column.

    Args:
        df: DataFrame indexed by (run, lumisection) with columns 'fromrun',
            'fromlumi', 'mse' and 'labels'.
    """
    canvas = plt.figure()
    canvas.set_size_inches(16, 9)

    # Statistics over the whole frame, used only for the y-axis range.
    overall_mean = df['mse'].mean()
    overall_std = df['mse'].std()

    for run in df['fromrun'].unique():
        lumis = np.array(df['fromlumi'][run])
        errors = np.array(df['mse'][run])

        flags = np.array(df['labels'][run]).astype(float)
        point_colors = ["green" if flag else "red" for flag in flags]

        plt.scatter(lumis, errors, marker='+', c=point_colors)

    plt.ylim(0, overall_mean+(5*overall_std))
    plt.show()

In [None]:
def globalMSETrend(df):
    """Plot the average loss of each run against the run number.

    Each run is drawn green when the maximum of its 'labels' values is
    positive and red otherwise. Horizontal lines mark the mean of the per-run
    averages and that mean plus 1 and 3 standard deviations.

    Args:
        df: DataFrame indexed by (run, lumisection) with columns 'fromrun',
            'mse' and 'labels'.
    """
    # Running extremes of the per-run averages; printed before the scan and
    # not used afterwards.
    rmax = 0
    rmin = 10
    print(rmax, rmin)

    runs = df['fromrun'].unique()
    y = []
    colors = []

    for run in runs:
        val = (df['mse'][run]).mean()
        colors.append('green' if df['labels'][run].max() > 0 else "red")
        rmax = max(rmax, val)
        rmin = min(rmin, val)
        y.append(val)

    averages = np.array(y)
    gmean = averages.mean()
    gstd = averages.std()
    print(float(averages.size))

    canvas = plt.figure()
    canvas.set_size_inches(16, 9)

    first_run, last_run = runs.min(), runs.max()
    plt.hlines(gmean, first_run, last_run, color="blue", label="Run average: " + str(gmean))
    plt.hlines(gmean+(1.0*gstd), first_run, last_run, color="red", label='1 SD (' + str(gstd) + ")")
    plt.hlines(gmean+(3.0*gstd), first_run, last_run, color="red", label='3 SD', linestyle=':')

    # NOTE(review): this range (mean +/- 0.5 SD) is narrower than the 1 SD and
    # 3 SD lines drawn above - confirm that the zoom is intended.
    plt.ylim(gmean-0.5*gstd, gmean+0.5*gstd)
    plt.scatter(runs, y, marker='+', label='Data points', c=colors)
    plt.xlabel("Run")
    plt.ylabel("average MSE")
    plt.legend()

    plt.show()

In [None]:
def perRunMSE(df_test,run):
    """Summarise one run: anomalous LS list, loss trend, and histogram overlays.

    Prints and dumps the lumisections whose 'mse' exceeds
    ``0.001911 + 3.0 * 0.000659``, draws the per-run trend plot and then shows
    a two-panel figure with the 'histo' curves of the lumisections below the
    limit ("standard", left) and at or above it ("anomalous", right).

    NOTE: this notebook also defines perRunMSE in an earlier cell; when all
    cells are run in order this later definition is the one in effect.

    Args:
        df_test: DataFrame indexed by (run, lumisection) with columns 'mse',
            'fromrun', 'fromlumi' and 'histo'.
        run: run number to summarise.
    """
    mseMean=0.001911 # reference mean of the loss
    mseStd=0.000659 # reference standard deviation of the loss
    thr=3.0

    Xmax=80000.0
    Xmin=0.0
    Xbins=100

    limit=(mseMean+(thr*mseStd))

    print(" Run: " + str(run) + " list of LS with MSE > " + str(limit))
    # Pass thr explicitly so the dump always matches the limit printed above.
    dumpAnomalies(df_test,run,thr)
    mseTrendPlot(df_test,run)

    fig, (ax1,ax2) =plt.subplots(ncols=2)
    fig.set_size_inches(8, 4)

    x= np.linspace(Xmin,Xmax,Xbins)
    in_run = df_test['fromrun']==run

    # (axes, title, row selection, line style) for each panel.
    panels = (
        (ax1, "standard", df_test['mse'] < limit, "-"),
        (ax2, "anomalous", df_test['mse'] >= limit, ":"),
    )

    for ax, title, selection, style in panels:
        # Axes objects expose set_xlabel/set_ylabel/set_xlim (there is no
        # Axes.xlabel or Axes.ylabel); set them once per panel, not per curve.
        ax.set_title(title)
        ax.set_xlim(Xmin,Xmax)
        ax.set_xlabel("Charge electrons")
        ax.set_ylabel("A.U")

        for ls in np.array(df_test['fromlumi'].loc[selection & in_run]):
            histo=df_test['histo'][run][ls]
            ax.step(x, histo, where='mid', label=(" LS " + str(ls) + " Run " + str(run)), linestyle=style)

    plt.show()

In [None]:
#to save a model to json file
# Serialises the autoencoder with to_json() and writes the resulting string to
# "model.json" in the current working directory, overwriting any existing file.
# NOTE(review): to_json() presumably stores only the architecture, not the
# trained weights - confirm and save the weights separately if needed.
model_json = autoencoder.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)