In [1]:
# import system modules
import sys
import os
os.system('for a in /sys/bus/pci/devices/*; do echo 0 | tee -a $a/numa_node>/dev/null; done') # get rid of NUMA node warnings in my docker: https://github.com/tensorflow/tensorflow/issues/42738
import gc

# import standard numerical modules
import numpy as np
import math

# import machine learning modules
import tensorflow as tf
import keras.backend as K

# list the GPUs TensorFlow can see; an empty list printed below means no GPU was detected
gpu = tf.config.list_physical_devices('GPU') # make sure GPU usage is enabled
print(gpu) 

# extend the module search path with '../20240503' so that the DCTR module can be imported
sys.path.append('../20240503')
import DCTR


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# define conversion function

In [2]:
from madgraph.various.lhe_parser import EventFile # madgraph EventFile for getting event weights from lhe

def convert_valentinas_datasets_to_mine(tt_npz_list, top_npz_list, lhe_list, theta):
    '''Convert Valentina's dataset layout and shape into mine.

    Valentina's layout (last axis, 6 entries per particle):
        tt:  [pt, y, eta, phi, mass, E]
        top: [pt, y, eta, phi, mass, PID]
    mine (last axis, 9 entries per particle):
             [pt, y, phi, mass, eta, E, PID, w, theta]
             [0 , 1, 2  , 3   , 4  , 5, 6  , 7, 8    ]

    The npz files contain no event weights, so the weights are read from the
    matching lhe file. Entries missing in the source layout (PID for tt, E for
    top) are left at the zero they are padded with.

    args:
        tt_npz_list:  paths to the npz files of the tt-pair (array stored under key 'a')
        top_npz_list: paths to the npz files of top and anti-top (array stored under key 'a')
        lhe_list:     paths to the lhe files the npz files were created from
        theta:        value written into the theta column of every particle

    The three lists must have equal length (checked with an assert); entry i
    of each list belongs to the same run.

    returns:
        out:     array of shape (events, 3, 9) holding all converted files
                 stacked along the event axis; an empty (0, 3, 9) array if
                 every file was skipped
        skipped: lhe paths of all runs that were skipped (missing file,
                 differing event counts or 10 or fewer events in the lhe file)
    '''
    assert len(tt_npz_list) == len(top_npz_list) == len(lhe_list), 'Lengths of lists must be equal!'

    # create mapping from valentinas to mine
    # values that are not in the org list are kept in their place, as they will be appended in the correct order
    target_order  = ['pt', 'y', 'phi', 'mass', 'eta', 'E', 'PID', 'w', 'theta'] # my order
    org_order_tt  = ['pt', 'y', 'eta', 'phi', 'mass', 'E', 'PID', 'w', 'theta'] # tt Valentina
    org_order_top = ['pt', 'y', 'eta', 'phi', 'mass', 'PID', 'E', 'w', 'theta'] # top Valentina
    mapping_tt = [org_order_tt.index(obs) for obs in target_order]
    mapping_top = [org_order_top.index(obs) for obs in target_order]

    # converted files are collected here and stacked once at the end; this
    # works no matter which files get skipped (also the first one) and avoids
    # re-copying the growing result for every file
    converted = []
    skipped = []
    for i, (tt_path, top_path, lhe_path) in enumerate(zip(tt_npz_list, top_npz_list, lhe_list)):
        # check if files exist; the first missing one is reported
        missing = next((path for path in (tt_path, top_path, lhe_path) if not os.path.exists(path)), None)
        if missing is not None:
            print(f"file: {missing} doesn't exist; skipping")
            skipped.append(lhe_path)
            continue

        # get event weights from lhe file
        lhe = EventFile(lhe_path) # uses madgraphs EventFile function to open the lhe file
        wgt = [event.wgt for event in lhe]

        # load tt-dataset and top (and anti-top) dataset; the context manager closes the npz archive again
        with np.load(tt_path) as npz:
            tt = npz['a']
        with np.load(top_path) as npz:
            top = npz['a']

        # check that lhe file and converted npz contain same number of events
        # (all three counts have to agree; `a != b != c` would miss the case a == b != c)
        if not (len(tt) == len(wgt) == len(top)):
            print(f'different number of events in files! \n' + 
                  f'tt : len: {len(tt)} | file: {tt_path} \n' +
                  f'top: len: {len(top)} | file: {top_path} \n' +
                  f'lhe: len: {len(wgt)} | file: {lhe_path} \n' +
                  f'skipping files {i+1}')
            skipped.append(lhe_path)
            continue
        elif len(wgt) <= 10: # lhe file is broken
            print(f'file: {lhe_path} only has {len(wgt)} events; skipping')
            skipped.append(lhe_path)
            continue

        # pad the last axis with 3 zeros (6 -> 9 entries) and re-sort into my order
        tt = np.pad(tt, [(0,0), (0,0), (0, 3)])[..., mapping_tt]
        top = np.pad(top, [(0,0), (0,0), (0, 3)])[..., mapping_top]

        # concat to have tt-pair and top and anti-top for each event
        concat = np.concatenate((tt, top), axis=1)

        # add wgt: every particle has same wgt (as it is an event wgt),
        # so the (events, 1) column is broadcast over the particle axis
        concat[..., 7] = np.asarray(wgt)[:, np.newaxis]

        # add theta
        concat[..., 8] = theta

        converted.append(concat)
        print(f'finished processing file {i+1:0>4}', end='\r')

    if not converted:
        # nothing usable: keep the documented (events, 3, 9) layout with zero events
        return np.empty((0, 3, 9)), skipped
    if len(converted) == 1:
        # a single file needs no stacking (avoids an extra copy)
        return converted[0], skipped
    return np.concatenate(converted, axis=0), skipped


In [4]:
# build the lists of showered .npz files and of the lhe files they belong to

# X1 NNLO MiNNLO
X1_dir = '/tf/data/BachelorThesis_Data/showered/MiNNLO'
X1_lhe_dir = '/tf/data/BachelorThesis_Data/LHE/data/MiNNLO'
num_train = 1000 # number of lhe files

# walk the lhe directory tree only once and index every file by its name
# (if a name occurs in several sub-directories, the one walked last wins)
lhe_path_by_name = {}
for root, dirs, files in os.walk(X1_lhe_dir):
    for name in files:
        lhe_path_by_name[name] = f'{root}/{name}'

shower_tt_list = [f'{X1_dir}/ShowerTTR_seed{i+1}_0.855.npz' for i in range(num_train)]
shower_top_list = [f'{X1_dir}/ShowerTopR_seed{i+1}_0.855.npz' for i in range(num_train)]
# lhe files that were not found stay '' and are reported as missing by the conversion
lhe_list = [lhe_path_by_name.get(f'pwgevents-{i+1:0>4}.lhe', '') for i in range(num_train)]

# print every 250th entry as a sanity check
print(*shower_tt_list[:-1:250], sep='\n')
print(*shower_top_list[:-1:250], sep='\n')
print(*lhe_list[:-1:250], sep='\n')


/tf/data/BachelorThesis_Data/showered/MiNNLO/ShowerTTR_seed1_0.855.npz
/tf/data/BachelorThesis_Data/showered/MiNNLO/ShowerTTR_seed251_0.855.npz
/tf/data/BachelorThesis_Data/showered/MiNNLO/ShowerTTR_seed501_0.855.npz
/tf/data/BachelorThesis_Data/showered/MiNNLO/ShowerTTR_seed751_0.855.npz
/tf/data/BachelorThesis_Data/showered/MiNNLO/ShowerTopR_seed1_0.855.npz
/tf/data/BachelorThesis_Data/showered/MiNNLO/ShowerTopR_seed251_0.855.npz
/tf/data/BachelorThesis_Data/showered/MiNNLO/ShowerTopR_seed501_0.855.npz
/tf/data/BachelorThesis_Data/showered/MiNNLO/ShowerTopR_seed751_0.855.npz
/tf/data/BachelorThesis_Data/LHE/data/MiNNLO/0001-0500/pwgevents-0001.lhe
/tf/data/BachelorThesis_Data/LHE/data/MiNNLO/0001-0500/pwgevents-0251.lhe
/tf/data/BachelorThesis_Data/LHE/data/MiNNLO/0501-1000/pwgevents-0501.lhe
/tf/data/BachelorThesis_Data/LHE/data/MiNNLO/0501-1000/pwgevents-0751.lhe


In [5]:
# convert all MiNNLO files; theta = 1 is written into the theta column of this sample
X1, skipped = convert_valentinas_datasets_to_mine(
    tt_npz_list=shower_tt_list,
    top_npz_list=shower_top_list,
    lhe_list=lhe_list,
    theta=1,
)

# report which runs were left out and how many events were converted
print(f'{skipped = }')
print(f'{np.shape(X1) = }')


different number of events in files! 
tt : len: 7000 | file: /tf/data/BachelorThesis_Data/showered/MiNNLO/ShowerTTR_seed75_0.855.npz 
top: len: 7000 | file: /tf/data/BachelorThesis_Data/showered/MiNNLO/ShowerTopR_seed75_0.855.npz 
lhe: len: 6999 | file: /tf/data/BachelorThesis_Data/LHE/data/MiNNLO/0001-0500/pwgevents-0075.lhe 
skipping files 75
different number of events in files! 
tt : len: 7999 | file: /tf/data/BachelorThesis_Data/showered/MiNNLO/ShowerTTR_seed107_0.855.npz 
top: len: 7999 | file: /tf/data/BachelorThesis_Data/showered/MiNNLO/ShowerTopR_seed107_0.855.npz 
lhe: len: 7998 | file: /tf/data/BachelorThesis_Data/LHE/data/MiNNLO/0001-0500/pwgevents-0107.lhe 
skipping files 107
different number of events in files! 
tt : len: 7999 | file: /tf/data/BachelorThesis_Data/showered/MiNNLO/ShowerTTR_seed108_0.855.npz 
top: len: 7999 | file: /tf/data/BachelorThesis_Data/showered/MiNNLO/ShowerTopR_seed108_0.855.npz 
lhe: len: 7998 | file: /tf/data/BachelorThesis_Data/LHE/data/MiNNLO/00

In [6]:
# store the converted events as an uncompressed .npy file: needs more disk space
# than a compressed archive, but can be memory mapped and read partially
np.save('../Data/MiNNLO/showered/converted_lhe.npy', X1)

# keep a record of the lhe files that were skipped, one path per line
skipped_text = '\n'.join(map(str, skipped))
with open('../Data/MiNNLO/showered/skipped_files.txt', 'w') as outfile:
    outfile.write(skipped_text)


In [17]:
# build the lists of showered .npz files and of the lhe files they belong to

# X0 NLO hvq
X0_dir = '/tf/data/BachelorThesis_Data/showered/hvq'
X0_lhe_dir = '/tf/data/BachelorThesis_Data/LHE/data/POWHEG_new'
num_train = 30 # number of lhe files we want to use

# walk the lhe directory tree only once and remember every file path found in it
found_lhe_paths = set()
for root, dirs, files in os.walk(X0_lhe_dir):
    for name in files:
        found_lhe_paths.add(f'{root}/{name}')

shower_tt_list = [f'{X0_dir}/ShowerTT_seed{i+1}.npz' for i in range(num_train)]
shower_top_list = [f'{X0_dir}/ShowerTop_seed{i+1}.npz' for i in range(num_train)]

# the lhe file of run i+1 is expected at Results{i+1}/pwgevents.lhe;
# runs whose file was not found stay '' and are reported as missing by the conversion
lhe_list = []
for i in range(num_train):
    expected_lhe = f'{X0_lhe_dir}/Results{i+1}/pwgevents.lhe'
    lhe_list.append(expected_lhe if expected_lhe in found_lhe_paths else '')

# print every 10th entry as a sanity check
print(*shower_tt_list[:-1:10], sep='\n')
print(*shower_top_list[:-1:10], sep='\n')
print(*lhe_list[:-1:10], sep='\n')


/tf/data/BachelorThesis_Data/showered/hvq/ShowerTT_seed1.npz
/tf/data/BachelorThesis_Data/showered/hvq/ShowerTT_seed11.npz
/tf/data/BachelorThesis_Data/showered/hvq/ShowerTT_seed21.npz
/tf/data/BachelorThesis_Data/showered/hvq/ShowerTop_seed1.npz
/tf/data/BachelorThesis_Data/showered/hvq/ShowerTop_seed11.npz
/tf/data/BachelorThesis_Data/showered/hvq/ShowerTop_seed21.npz
/tf/data/BachelorThesis_Data/LHE/data/POWHEG_new/Results1/pwgevents.lhe
/tf/data/BachelorThesis_Data/LHE/data/POWHEG_new/Results11/pwgevents.lhe
/tf/data/BachelorThesis_Data/LHE/data/POWHEG_new/Results21/pwgevents.lhe


In [19]:
# convert hvq files 1-10; theta = 0 is written into the theta column of this sample
X0_1, skipped = convert_valentinas_datasets_to_mine(shower_tt_list[:10], shower_top_list[:10], lhe_list[:10], theta = 0)

print(f'{skipped = }')
print(f'{np.shape(X0_1) = }')

# save dataset to memory mappable npy array | takes more disk space than compressed, but allows partial reading from disk
np.save('../Data/POWHEG_hvq/showered/converted_lhe_01.npy', X0_1)

# the skip list belongs to the hvq sample, so it is stored next to the hvq data
# (it was written into the MiNNLO directory before)
with open('../Data/POWHEG_hvq/showered/skipped_files_01.txt', 'w') as outfile:
  outfile.write('\n'.join(str(i) for i in skipped))


skipped = []cessing file 0010
np.shape(x0_1) = (20000000, 3, 9)


In [21]:
# convert hvq files 11-20; theta = 0 is written into the theta column of this sample
X0_2, skipped = convert_valentinas_datasets_to_mine(shower_tt_list[10:20], shower_top_list[10:20], lhe_list[10:20], theta = 0)

print(f'{skipped = }')
print(f'{np.shape(X0_2) = }')

# save dataset to memory mappable npy array | takes more disk space than compressed, but allows partial reading from disk
np.save('../Data/POWHEG_hvq/showered/converted_lhe_02.npy', X0_2)

# the skip list belongs to the hvq sample, so it is stored next to the hvq data
# (it was written into the MiNNLO directory before)
with open('../Data/POWHEG_hvq/showered/skipped_files_02.txt', 'w') as outfile:
  outfile.write('\n'.join(str(i) for i in skipped))
    

different number of events in files! 
tt : len: 1064007 | file: /tf/data/BachelorThesis_Data/showered/hvq/ShowerTT_seed18.npz 
top: len: 1064007 | file: /tf/data/BachelorThesis_Data/showered/hvq/ShowerTop_seed18.npz 
lhe: len: 1064006 | file: /tf/data/BachelorThesis_Data/LHE/data/POWHEG_new/Results18/pwgevents.lhe 
skipping files 8
skipped = ['/tf/data/BachelorThesis_Data/LHE/data/POWHEG_new/Results18/pwgevents.lhe']
np.shape(x0_2) = (18000000, 3, 9)


In [22]:
# convert hvq files 21-30; theta = 0 is written into the theta column of this sample
X0_3, skipped = convert_valentinas_datasets_to_mine(shower_tt_list[20:30], shower_top_list[20:30], lhe_list[20:30], theta = 0)

print(f'{skipped = }')
print(f'{np.shape(X0_3) = }')

# save dataset to memory mappable npy array | takes more disk space than compressed, but allows partial reading from disk
np.save('../Data/POWHEG_hvq/showered/converted_lhe_03.npy', X0_3)

# the skip list belongs to the hvq sample, so it is stored next to the hvq data
# (it was written into the MiNNLO directory before)
with open('../Data/POWHEG_hvq/showered/skipped_files_03.txt', 'w') as outfile:
  outfile.write('\n'.join(str(i) for i in skipped))
    

skipped = []cessing file 0010
np.shape(x0_3) = (20000000, 3, 9)


# create normalized datasets

In [23]:
# base directory used by the cells below to build the paths of the normalized datasets and normalization arrays
data_dir = '../Data'


In [24]:
print(f'X0 POWHEG shape: {X0_1.shape}')

# normalize data
# called without a normalization array; the second return value (nrm_array) is
# passed back into DCTR.normalize_data for all other datasets below
# NOTE(review): presumably nrm_array holds the normalization parameters derived from X0_1 - confirm in DCTR
X0_nrm, nrm_array = DCTR.normalize_data(X0_1)


x0 POWHEG shape: (20000000, 3, 9)


In [25]:
# compare the normalization array of the showered data with the stored unshowered one

print('\n    showered nrm array:')
print(*np.array(nrm_array), sep = '\n')

# the unshowered normalization array is stored under key 'a' of the npz archive
old_nrm_path = f'{data_dir}/POWHEG_hvq/13TeV/01-02_nrm_array.npz'
old_nrm_array = np.load(old_nrm_path)['a']

print('\n    old unshowered nrm array:')
print(*old_nrm_array, sep = '\n')

print('\n    absolute difference between nrm arrays:')
nrm_difference = nrm_array - old_nrm_array
print(np.absolute(nrm_difference))



    showered nrm array:
[[ 3.68369168e+00  1.06112132e+00  1.00000000e+00]
 [ 7.08754186e-06  1.03498469e+00  0.00000000e+00]
 [-2.72177939e-04  1.81367905e+00  0.00000000e+00]
 [ 6.21529518e+00  2.75704736e-01  1.00000000e+00]
 [-2.38507705e-04  3.28245327e+00  0.00000000e+00]
 [ 6.63661480e+00  4.74529020e-01  1.00000000e+00]]
[[ 4.59578140e+00  7.09849452e-01  1.00000000e+00]
 [-3.64950627e-05  1.21198813e+00  0.00000000e+00]
 [ 1.92076137e-04  1.81349380e+00  0.00000000e+00]
 [ 1.71937692e+02  6.96427067e+00  0.00000000e+00]
 [ 2.62986637e-05  1.86665124e+00  0.00000000e+00]
 [-1.38155106e+01  1.56319402e-13  1.00000000e+00]]
[[ 4.59271086e+00  7.15799813e-01  1.00000000e+00]
 [ 7.12895509e-05  1.20624271e+00  0.00000000e+00]
 [ 1.74905783e-04  1.81404673e+00  0.00000000e+00]
 [ 1.71938232e+02  6.94523213e+00  0.00000000e+00]
 [ 3.15156027e-04  1.86482958e+00  0.00000000e+00]
 [-1.38155106e+01  1.56319402e-13  1.00000000e+00]]

    old unshowered nrm array:
[[3.65206736e+00 1.0123

In [26]:
# save the normalized first hvq dataset (X0_nrm was computed from X0_1 above)
np.save(f'{data_dir}/POWHEG_hvq/showered/normed_lhe_01.npy', X0_nrm)

print('X0_nrm saved')
# also save the normalization array, which is reused for every other dataset in this notebook
np.save(f'{data_dir}/POWHEG_hvq/showered/norm_array_lhe_01.npy', nrm_array)


x0_nrm saved


In [27]:
print(f'X0 POWHEG shape: {X0_2.shape}')

# normalize data
# reuses the nrm_array obtained from X0_1; the second return value is discarded
# NOTE: X0_nrm is overwritten here, the previous content has to be saved before this cell is run
X0_nrm, _ = DCTR.normalize_data(X0_2, nrm_array)


x0 POWHEG shape: (18000000, 3, 9)


In [28]:
# save the normalized second hvq dataset (X0_nrm currently holds the result for X0_2)
np.save(f'{data_dir}/POWHEG_hvq/showered/normed_lhe_02.npy', X0_nrm)

print('X0_nrm saved')


x0_nrm saved


In [29]:
print(f'X0 POWHEG shape: {X0_3.shape}')

# normalize data
# reuses the nrm_array obtained from X0_1; the second return value is discarded
# NOTE: X0_nrm is overwritten here, the previous content has to be saved before this cell is run
X0_nrm, _ = DCTR.normalize_data(X0_3, nrm_array)


x0 POWHEG shape: (20000000, 3, 9)


In [30]:
# save the normalized third hvq dataset (X0_nrm currently holds the result for X0_3)
np.save(f'{data_dir}/POWHEG_hvq/showered/normed_lhe_03.npy', X0_nrm)

print('X0_nrm saved')


x0_nrm saved


In [31]:
print(f'X1 MiNNLO shape: {X1.shape}')

# normalize data
# the MiNNLO sample is normalized with the same nrm_array as the hvq datasets
# (the one obtained from X0_1); the second return value is discarded
X1_nrm, _ = DCTR.normalize_data(X1, nrm_array)


X1 MiNNLO shape: (9543943, 3, 9)


In [32]:
# save the normalized MiNNLO dataset
np.save(f'{data_dir}/MiNNLO/showered/normed_lhe.npy', X1_nrm)

print('X1_nrm saved')


x1_nrm saved
