In [1]:
import glob, h5py, math, time, os, json, argparse, datetime
import numpy as np
from FLKutils import *
from SampleUtils import *

import matplotlib as mpl
#mpl.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
plt.rcParams["font.family"] = "serif"
plt.style.use('classic')

# Problem setup

In [38]:
# Labels identifying the signal (anomalous) classes in the dataset
sig_labels = [3]
# Labels identifying the background classes in the dataset
bkg_labels = [0, 1, 2]

# Hyper-parameters of the NPLM model based on kernel methods
## number of kernels
M = 1000

## percentile of the distribution of pair-wise distances between reference-distributed points
flk_sigma_perc = 90

## L2 regularization coefficient
lam = 1e-6

## maximum number of iterations before the training is killed
iterations = 1000000

## number of toys to simulate
## (multiple experiments let you build up statistics for the test and quantify type I and II errors)
Ntoys = 10

## details about the save path
# NOTE(review): N_sig, N_ref and N_bkg are read below but are only assigned in the
# "null hypothesis" / "alternative hypothesis" cells further down. On a fresh kernel this
# cell raises NameError unless one of those cells has been executed first.
folder_out = './out/'
sig_string = ''
if N_sig:
    # e.g. '_SIG-3' for sig_labels=[3]
    sig_string = '_SIG' + ''.join('-%i'%(s) for s in sig_labels)
NP = '%s_NR%i_NB%i_NS%i_M%i_lam%s_iter%i/'%(sig_string, N_ref, N_bkg, N_sig,
                                             M, str(lam), iterations)
# exist_ok=True replaces the check-then-create pattern: no race, and the cell can be re-run
os.makedirs(folder_out+NP, exist_ok=True)

# Data loading

In [34]:
# Open the saved predictions archive (.npz); `a` exposes its arrays by name
a = np.load('./predictions/simclr_predictions.npz')

In [30]:
# Names of the arrays stored in the archive
a.files

['instances',
 'dim2',
 'dim4',
 'dim8',
 'dim16',
 'dim24',
 'dim8_embedding',
 'dim2_embedding',
 'dim16_embedding',
 'dim24_embedding',
 'dim32_embedding']

In [43]:
# Per-sample class labels, taken from the 'dim8' array
# NOTE(review): assumes 'dim8' holds the labels matching 'dim8_embedding' -- confirm with the producer of the file
target=a['dim8']

In [44]:
# Distinct class labels present in `target`
np.unique(target)

array([0, 1, 2, 3])

In [46]:
# Shape of the stored embedding before flattening
a['dim8_embedding'].shape

(20, 2048, 4)

In [47]:
# Flatten every leading axis of the stored embedding into one sample axis.
# The feature width is read from the array itself instead of being hard-coded to 4,
# and the archive member is read only once.
features = a['dim8_embedding']
features = features.reshape((-1, features.shape[-1]))

In [48]:
# Sanity check: (number of samples, number of features)
features.shape

(40960, 4)

In [49]:
# Sanity check: one label per row of `features`
target.shape

(40960,)

In [None]:
############ begin load data
# This part needs to be modified according to how the predictions of your model are stored.
# Here the predictions are saved in npz files inside `folder`.
print('Load data')
# `folder` must be the directory that contains the .npz files (not the path of one file),
# otherwise the glob pattern below matches nothing.
folder = './predictions'
data = {}
for file in glob.glob(os.path.join(folder, '*.npz')):
    # key each archive by its file name without directory and extension
    filename = os.path.splitext(os.path.basename(file))[0]
    # the context manager closes the archive once its arrays have been read into memory
    with np.load(file) as file_load:
        data[filename] = {k2: file_load[k2] for k2 in file_load.files}

# Same selection as in the exploratory cells above; adapt the keys to your own files.
predictions = data['simclr_predictions']
features = predictions['dim8_embedding']
features = features.reshape((-1, features.shape[-1]))
target = predictions['dim8']



In [50]:
# Select SIG and BKG classes.
# Each mask counts, per sample, how many of the requested labels match `target`.
mask_SIG, mask_BKG = np.zeros_like(target), np.zeros_like(target)
for class_mask, class_labels in ((mask_SIG, sig_labels), (mask_BKG, bkg_labels)):
    for class_label in class_labels:
        # in-place add, so mask_SIG / mask_BKG themselves are updated through the alias
        class_mask += 1*(target==class_label)

# keep the rows that matched at least one requested label
features_SIG = features[np.greater(mask_SIG, 0)]
features_BKG = features[np.greater(mask_BKG, 0)]
############ end load data

In [51]:
######## standardize data
# Both samples are rescaled with the per-feature mean and std of the background sample.
# `standardize` comes from the star imports; presumably it computes (x - mean) / std -- confirm.
print('standardize')
features_mean = np.mean(features_BKG, axis=0)
features_std = np.std(features_BKG, axis=0)
print('mean: ', features_mean)
print('std: ', features_std)
features_BKG = standardize(features_BKG, features_mean, features_std)
features_SIG = standardize(features_SIG, features_mean, features_std)



standardize
mean:  [-2.13186052 -2.12510567 -2.50298683 -3.32404615]
std:  [0.96278889 0.76894639 0.7268368  0.35038829]


In [52]:
#### Compute the sigma hyper-parameter from data.
#### Sigma is the width of the Gaussian kernels.
#### Following a heuristic, it is set to the `flk_sigma_perc`-th percentile of the distribution
#### of pair-wise distances between bkg-distributed points.
#### This doesn't need modifications, but one could in principle change it (see https://arxiv.org/abs/2408.12296)
# only the first 2000 background rows are passed to candidate_sigma
sigma_subset = features_BKG[:2000]
flk_sigma = candidate_sigma(sigma_subset, perc=flk_sigma_perc)
print('flk_sigma', flk_sigma)

flk_sigma 4.0


# Training

## null hypothesis

In [53]:
# Sample sizes for the null-hypothesis toys (no signal injected)
N_ref = 10000  # number of reference datapoints (mixture of non-anomalous classes)
N_bkg = 1000   # number of background datapoints in the data (mixture of non-anomalous classes present in the data)
N_sig = 0      # number of signal datapoints in the data (mixture of anomalous classes present in the data)
# ratio of expected background yield to reference size; passed as `weight` to the training
w_ref = N_bkg / N_ref

In [56]:
## run toys (null hypothesis)
# Each toy draws Poisson-fluctuated numbers of background/signal rows as "data" (label 1)
# plus N_ref background rows as "reference" (label 0), and collects the test statistic in t0.
print('Start running toys')
t0=np.array([])
# one seed per toy; drawn as floats and truncated to int because default_rng needs an integer
seeds = np.random.uniform(low=1, high=100000, size=(Ntoys,))
for i in range(Ntoys):
    seed = int(seeds[i])
    rng = np.random.default_rng(seed=seed)
    N_bkg_p = rng.poisson(lam=N_bkg, size=1)[0]
    N_sig_p = rng.poisson(lam=N_sig, size=1)[0]
    # Slicing past the end of an array silently returns fewer rows, which would leave
    # the features shorter than the labels built below -- fail loudly instead.
    if N_sig_p > features_SIG.shape[0] or N_bkg_p+N_ref > features_BKG.shape[0]:
        raise ValueError('toy %i needs %i SIG and %i BKG rows but only %i and %i are available'
                         %(i, N_sig_p, N_bkg_p+N_ref, features_SIG.shape[0], features_BKG.shape[0]))
    # in-place shuffles: the order of the pools carries over from one toy to the next
    rng.shuffle(features_SIG)
    rng.shuffle(features_BKG)
    features_s = features_SIG[:N_sig_p, :]
    features_b = features_BKG[:N_bkg_p+N_ref, :]
    # a dedicated name keeps the dataset-level `features` intact, so earlier cells can be re-run
    features_toy = np.concatenate((features_s,features_b), axis=0)

    # row order: N_sig_p signal + N_bkg_p background (data, 1), then N_ref background (reference, 0)
    label_R = np.zeros((N_ref,))
    label_D = np.ones((N_bkg_p+N_sig_p, ))
    labels  = np.concatenate((label_D,label_R), axis=0)

    # make reconstruction plots every 20 toys (can be changed)
    plot_reco = verbose = (i%20 == 0)
    flk_config = get_logflk_config(M,flk_sigma,[lam],weight=w_ref,iter=[iterations],seed=None,cpu=False)
    t, pred = run_toy('t0', features_toy, labels, weight=w_ref, seed=seed,
                      flk_config=flk_config, output_path='./', plot=plot_reco, savefig=plot_reco,
                      verbose=verbose)

    t0 = np.append(t0, t)

Start running toys


ImportError: cannot import name 'packaging' from 'pkg_resources' (/opt/conda/lib/python3.10/site-packages/pkg_resources/__init__.py)

In [57]:
# Environment check: installed torch version
# NOTE(review): torch is not imported explicitly in this notebook; presumably it arrives via the star imports -- confirm
torch.__version__

'2.0.0.post200'

In [58]:
# Environment check: CUDA version reported by torch
torch.version.cuda

'11.2'

In [None]:
## alternative hypothesis (signal injection)

In [None]:
# Sample sizes for the alternative-hypothesis toys (signal injected)
# NOTE(review): N_ref + N_bkg = 110000 rows are requested from the background pool, while the
# `features` array displayed above has 40960 rows in total -- confirm enough data is available.
N_ref = 100000  # number of reference datapoints (mixture of non-anomalous classes)
N_bkg = 10000   # number of background datapoints in the data (mixture of non-anomalous classes present in the data)
N_sig = 100     # number of signal datapoints in the data (mixture of anomalous classes present in the data)
# ratio of expected background yield to reference size; passed as `weight` to the training
w_ref = N_bkg / N_ref

In [None]:
## run toys (alternative hypothesis)
# Each toy draws Poisson-fluctuated numbers of background/signal rows as "data" (label 1)
# plus N_ref background rows as "reference" (label 0), and collects the test statistic in t1.
print('Start running toys')
t1=np.array([])
seeds = np.random.uniform(low=1, high=100000, size=(Ntoys,))
for i in range(Ntoys):
    # default_rng rejects float seeds, so truncate to int (as the null-hypothesis cell does)
    seed = int(seeds[i])
    rng = np.random.default_rng(seed=seed)
    N_bkg_p = rng.poisson(lam=N_bkg, size=1)[0]
    N_sig_p = rng.poisson(lam=N_sig, size=1)[0]
    # Slicing past the end of an array silently returns fewer rows, which would leave
    # the features shorter than the labels built below -- fail loudly instead.
    if N_sig_p > features_SIG.shape[0] or N_bkg_p+N_ref > features_BKG.shape[0]:
        raise ValueError('toy %i needs %i SIG and %i BKG rows but only %i and %i are available'
                         %(i, N_sig_p, N_bkg_p+N_ref, features_SIG.shape[0], features_BKG.shape[0]))
    # in-place shuffles: the order of the pools carries over from one toy to the next
    rng.shuffle(features_SIG)
    rng.shuffle(features_BKG)
    features_s = features_SIG[:N_sig_p, :]
    features_b = features_BKG[:N_bkg_p+N_ref, :]
    # a dedicated name keeps the dataset-level `features` intact, so earlier cells can be re-run
    features_toy = np.concatenate((features_s,features_b), axis=0)

    # row order: N_sig_p signal + N_bkg_p background (data, 1), then N_ref background (reference, 0)
    label_R = np.zeros((N_ref,))
    label_D = np.ones((N_bkg_p+N_sig_p, ))
    labels  = np.concatenate((label_D,label_R), axis=0)

    # make reconstruction plots every 20 toys (can be changed)
    plot_reco = verbose = (i%20 == 0)
    flk_config = get_logflk_config(M,flk_sigma,[lam],weight=w_ref,iter=[iterations],seed=None,cpu=False)
    # The first argument was the undefined name `manifold`; the null-hypothesis cell passes
    # the string 't0' in that position, so 't1' is used here by analogy.
    # NOTE(review): confirm the label run_toy expects.
    t, pred = run_toy('t1', features_toy, labels, weight=w_ref, seed=seed,
                      flk_config=flk_config, output_path='./', plot=plot_reco, savefig=plot_reco,
                      verbose=verbose)

    t1 = np.append(t1, t)

# Plot results

In [None]:
def Z_score_chi2(t,df):
    '''
    Convert a test-statistic value t into a Z-score under a chi2 distribution with df degrees of freedom.
    The chi2 survival probability of t is mapped to the standard-normal quantile
    that has the same upper-tail probability.
    '''
    p_value = chi2.sf(t, df)
    return -norm.ppf(p_value)

def Z_score_norm(t,mu, std):
    '''
    Convert a test-statistic value t into a Z-score under a normal distribution
    with mean mu and standard deviation std.
    '''
    p_value = norm.sf(t, mu, std)
    return -norm.ppf(p_value)

def plot_1distribution(t, df, xmin=0, xmax=300, nbins=10, save=False, ymax=None, output_path='', save_name='', label=''):
    '''
    Plot the normalized histogram of a test statistics sample (t) together with the
    target chi2 distribution (df must be specified!).
    The median and the error on the median are used to compute the median Z-score and its error,
    which are reported in the legend.

    t:           1D numpy array with the test statistic of each toy
    df:          degrees of freedom of the reference chi2
    xmin, xmax:  histogram range
    nbins:       number of histogram bins
    save:        if True, save the figure as <output_path><save_name>_distribution.pdf
    ymax:        upper limit of the y axis (None leaves the automatic limit)
    output_path: prefix of the saved file; nothing is saved when it is empty
    save_name:   file-name stem of the saved figure
    label:       sample name shown in the legend
    '''
    plt.rcParams["font.family"] = "serif"
    plt.style.use('classic')
    fig  = plt.figure(figsize=(12, 9))
    fig.patch.set_facecolor('white')
    # plot distribution histogram
    bins      = np.linspace(xmin, xmax, nbins+1)
    t_median  = np.median(t)
    t_std     = np.std(t)
    # Z is computed from the survival function (as plot_2distribution does) rather than
    # norm.ppf(chi2.cdf(...)): same value, but the cdf rounds to 1 for large t and gives Z = inf.
    Z_obs     = Z_score_chi2(t_median, df)
    # 1.2533 ~ sqrt(pi/2): normal-approximation standard error of the median
    t_obs_err = 1.2533*t_std*1./np.sqrt(t.shape[0])
    Z_obs_p   = Z_score_chi2(t_median+t_obs_err, df)
    Z_obs_m   = Z_score_chi2(t_median-t_obs_err, df)
    label  = 'sample %s\nsize: %i \nmedian: %s, std: %s\n'%(label, t.shape[0], str(np.around(t_median, 2)),str(np.around(t_std, 2)))
    label += 'Z = %s (+%s/-%s)'%(str(np.around(Z_obs, 2)), str(np.around(Z_obs_p-Z_obs, 2)), str(np.around(Z_obs-Z_obs_m, 2)))
    binswidth = (xmax-xmin)*1./nbins
    # weights normalize the histogram to unit area over [xmin, xmax]
    h = plt.hist(t, weights=np.ones_like(t)*1./(t.shape[0]*binswidth), color='lightblue', ec='#2c7fb8',
                 bins=bins, label=label)
    err = np.sqrt(h[0]/(t.shape[0]*binswidth))
    x   = 0.5*(bins[1:]+bins[:-1])
    plt.errorbar(x, h[0], yerr = err, color='#2c7fb8', marker='o', ls='')
    # plot reference chi2
    x  = np.linspace(chi2.ppf(0.0001, df), chi2.ppf(0.9999, df), 100)
    plt.plot(x, chi2.pdf(x, df),'midnightblue', lw=5, alpha=0.8, label=r'$\chi^2_{%i}$'%(df))
    font = font_manager.FontProperties(family='serif', size=14) 
    plt.legend(prop=font, frameon=False)
    plt.xlabel('t', fontsize=18, fontname="serif")
    plt.ylabel('Probability', fontsize=18, fontname="serif")
    plt.yticks(fontsize=16, fontname="serif")
    plt.xticks(fontsize=16, fontname="serif")
    if ymax is not None:
        plt.ylim(0., ymax)
    if save:
        if output_path=='':
            print('argument output_path is not defined. The figure will not be saved.')
        else:
            plt.savefig(output_path+ save_name+'_distribution.pdf')
            print('saved at %s'%(output_path+ save_name+'_distribution.pdf'))
    plt.show()
    plt.close(fig)
    return

def _median_Z_with_errors(sample, df):
    '''Return (Z, Z_plus, Z_minus): chi2 Z-score of the sample median and of median +/- its error.'''
    sample_median = np.median(sample)
    # 1.2533 ~ sqrt(pi/2): normal-approximation standard error of the median
    median_err = 1.2533*np.std(sample)*1./np.sqrt(sample.shape[0])
    return (Z_score_chi2(sample_median, df),
            Z_score_chi2(sample_median+median_err, df),
            Z_score_chi2(sample_median-median_err, df))

def _plot_normalized_hist(sample, bins, binswidth, color, ec, label):
    '''Draw a unit-area histogram of sample with error bars and return the highest bin content.'''
    scale  = sample.shape[0]*binswidth
    counts = plt.hist(sample, weights=np.ones_like(sample)*1./scale, color=color, ec=ec,
                      bins=bins, label=label)[0]
    err = np.sqrt(counts/scale)
    centers = 0.5*(bins[1:]+bins[:-1])
    plt.errorbar(centers, counts, yerr = err, color=ec, marker='o', ls='')
    return np.max(counts)

def plot_2distribution(t1, t2, df, xmin=0, xmax=300, ymax=None, nbins=10, save=False, output_path='', label1='1', label2='2', save_name='', print_Zscore=True):
    '''
    Plot the normalized histograms of two test statistics samples (t1 and t2) together with
    the target chi2 distribution (df must be specified!).
    For each sample the median and the error on the median are used to compute the asymptotic
    (chi2-based) median Z-score and its error. For t2 an empirical Z-score is also computed
    from the fraction of t1 entries that exceed the mean of t2.

    t1, t2:         1D numpy arrays with the test statistic of each toy
    df:             degrees of freedom of the reference chi2
    xmin, xmax:     histogram range
    ymax:           upper limit of the y axis (None: 1.2 times the highest bin)
    nbins:          number of histogram bins
    save:           if True, save the figure as <output_path><save_name>_2distribution.pdf
    output_path:    prefix of the saved file; nothing is saved when it is empty
    label1, label2: sample names shown in the legend
    save_name:      file-name stem of the saved figure
    print_Zscore:   if True, add the Z-scores to the legend entries
    '''
    plt.rcParams["font.family"] = "serif"
    plt.style.use('classic')
    fig  = plt.figure(figsize=(12, 9))
    fig.patch.set_facecolor('white')
    # plot distribution histogram
    bins      = np.linspace(xmin, xmax, nbins+1)
    binswidth = (xmax-xmin)*1./nbins
    # t1
    Z_obs, Z_obs_p, Z_obs_m = _median_Z_with_errors(t1, df)
    label  = '%s \nsize: %i\nmedian: %s, std: %s\n'%(label1, t1.shape[0], str(np.around(np.median(t1), 2)),str(np.around(np.std(t1), 2)))
    if print_Zscore:
        label += 'asymptotic Z = %s (+%s/-%s)'%(str(np.around(Z_obs, 2)), str(np.around(Z_obs_p-Z_obs, 2)), str(np.around(Z_obs-Z_obs_m, 2)))
    max1 = _plot_normalized_hist(t1, bins, binswidth, color='lightblue', ec='#2c7fb8', label=label)
    # t2
    Z_obs, Z_obs_p, Z_obs_m = _median_Z_with_errors(t2, df)
    # empirical p-value: fraction of t1 entries above the mean of t2
    t_empirical = np.sum(1.*(t1>np.mean(t2)))*1./t1.shape[0]
    empirical_lim = '='
    if t_empirical==0:
        # no t1 entry above the threshold: quote a lower limit on Z using p = 1/N
        empirical_lim='>'
        t_empirical = 1./t1.shape[0]
    # NOTE(review): as parenthesized, the sum runs over (mask + 1/N) and evaluates to count + 1;
    # a relative error of sqrt(1/count + 1/N) may have been intended -- confirm before changing.
    t_empirical_err = t_empirical*np.sqrt(1./np.sum(1.*(t1>np.mean(t2))+1./t1.shape[0]))
    Z_empirical = norm.ppf(1-t_empirical)
    Z_empirical_m = norm.ppf(1-(t_empirical+t_empirical_err))
    Z_empirical_p = norm.ppf(1-(t_empirical-t_empirical_err))

    label  = '%s \nsize: %i\nmedian: %s, std: %s\n'%(label2, t2.shape[0], str(np.around(np.median(t2), 2)),str(np.around(np.std(t2), 2)))
    if print_Zscore:
        label += 'asymptotic Z = %s (+%s/-%s) \n'%(str(np.around(Z_obs, 2)), str(np.around(Z_obs_p-Z_obs, 2)), str(np.around(Z_obs-Z_obs_m, 2)))
        label += 'empirical Z %s %s (+%s/-%s)'%(empirical_lim, str(np.around(Z_empirical, 2)), str(np.around(Z_empirical_p-Z_empirical, 2)), str(np.around(Z_empirical-Z_empirical_m, 2)))
    max2 = _plot_normalized_hist(t2, bins, binswidth, color='#8dd3c7', ec='seagreen', label=label)
    # plot reference chi2
    x  = np.linspace(chi2.ppf(0.0001, df), chi2.ppf(0.9999, df), 100)
    plt.plot(x, chi2.pdf(x, df),'midnightblue', lw=5, alpha=0.8, label=r'$\chi^2_{%i}$'%(df))
    font = font_manager.FontProperties(family='serif', size=20)
    plt.legend(ncol=1, loc='upper right', prop=font, frameon=False)
    plt.xlabel('$t$', fontsize=32, fontname="serif")
    plt.ylabel('Probability', fontsize=32, fontname="serif")
    plt.ylim(0., 1.2*np.maximum(max1, max2))
    if ymax is not None:
        plt.ylim(0., ymax)
    plt.yticks(fontsize=22, fontname="serif")
    plt.xticks(fontsize=22, fontname="serif")
    if save:
        if output_path=='':
            print('argument output_path is not defined. The figure will not be saved.')
        else:
            plt.savefig(output_path+ save_name+'_2distribution.pdf')
    plt.show()
    plt.close(fig)
    return

In [None]:
# plot null distribution
# The chi2 degrees of freedom are estimated from the sample mean of t0 (the mean of a chi2 equals its dof).
# `label` was the undefined name `k`; 'REF' matches the label used for t0 in the two-sample plot.
plot_1distribution(t0, df=np.mean(t0), xmin=0.9*np.min(t0), xmax=60, nbins=16, save=False, ymax=None, output_path='', save_name='', label='REF')

In [None]:
# plot alternative vs null distributions
# The chi2 degrees of freedom are estimated from the sample mean of the null toys t0.
# output_path was `folders[k]`, but neither `folders` nor `k` is defined in this notebook;
# with save=False the path is unused, so an empty string is passed.
plot_2distribution(t0, t1, df=np.mean(t0), xmin=np.min(t0), xmax=np.max(t1)*1.1, #ymax=0.03, 
                   nbins=19, label1='REF', label2='BKG+SIG', print_Zscore=True,
                   save=False, output_path='', save_name='')