In [1]:
# import system modules
import sys
import os
os.system('for a in /sys/bus/pci/devices/*; do echo 0 | tee -a $a/numa_node>/dev/null; done') # get rid of NUMA node warnings in my docker: https://github.com/tensorflow/tensorflow/issues/42738
import gc

# import standard numerical modules
import numpy as np
import math

# import machine learning modules
import tensorflow as tf
import keras.backend as K

gpu = tf.config.list_physical_devices('GPU') # make sure GPU usage is enabled
tf.config.experimental.set_virtual_device_configuration(gpu[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=int(7.5*1024))]) # make sure tensorflow doesn't use too much memorty and crashes
print(gpu) 

sys.path.append('../20240503')
import DCTR


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# define conversion function

In [3]:
from madgraph.various.lhe_parser import EventFile # madgraph EventFile for getting event weights from lhe

def convert_valentinas_datasets_to_mine(tt_npz_list, top_npz_list, lhe_list, theta):
    '''converts valentinas dataset layout and shape into mine.
    valentinas: tt:  [pt, y, eta, phi, mass, E]
                top: [pt, y, eta, phi, mass, PID]
    mine:            [pt, y, phi, mass, eta, E, PID, w, theta]
                     [0 , 1, 2  , 3   , 4  , 5, 6  , 7, 8    ]

    args:
        tt_npz_list

    no event weights! We need to load lhe to find event weight.
    '''
    assert len(tt_npz_list) == len(top_npz_list) == len(lhe_list), 'Lengths of lists must be equal!'

    # create mapping from valentinas to mine
    target_order  = ['pt', 'y', 'phi', 'mass', 'eta', 'E', 'PID', 'w', 'theta'] # my order
    org_order_tt  = ['pt', 'y', 'eta', 'phi', 'mass', 'E', 'PID', 'w', 'theta'] # tt org
    org_order_top = ['pt', 'y', 'eta', 'phi', 'mass', 'PID', 'E', 'w', 'theta'] # top org
    mapping_tt = [org_order_tt.index(col) for col in target_order]
    mapping_top = [org_order_top.index(col) for col in target_order]
    
    out = []
    skipped = []
    for i, _ in enumerate(tt_npz_list):
        # check if files exist
        if not os.path.exists(tt_npz_list[i]):
            print(f"file: {tt_npz_list[i]} doesn't exist; skipping")
            skipped.append(i+1)
            continue
        elif not os.path.exists(top_npz_list[i]):
            print(f"file: {top_npz_list[i]} doesn't exist; skipping")
            skipped.append(i+1)
            continue
        elif not os.path.exists(lhe_list[i]):
            print(f"file: {lhe_list[i]} doesn't exist; skipping")
            skipped.append(i+1)
            continue
        
        # get event weights from lhe file
        wgt = []
        lhe = EventFile(lhe_list[i]) # uses madgraphs EventFile function to open the lhe file
        for event in lhe:
            w = event.wgt
            wgt.append(w)
        # print(f'{np.shape(wgt) = }')
        
        # load tt-dataset
        tt = []
        tt = np.load(tt_npz_list[i])['a']
        # print(f'{np.shape(tt) = }')
        
        # load top (and anti-top) dataset
        top = []
        top = np.load(top_npz_list[i])['a']
        # print(f'{np.shape(top) = }')

        # check that lhe file and converted npz contain same number of events
        if len(tt) != len(wgt) != len(top):
            print(f'different number of events in files! \n' + 
                  f'tt : len: {len(tt)} | file: {tt_npz_list[i]} \n' +
                  f'top: len: {len(top)} | file: {top_npz_list[i]} \n' +
                  f'lhe: len: {len(wgt)} | file: {lhe_list[i]} \n' +
                  f'skipping files {i+1}')
            skipped.append(i+1)
            continue
        elif len(wgt) <= 10:
            print(f'file: {lhe_list[i]} only has {len(wgt)} events; skipping')
            skipped.append(i+1)
            continue
        

        # pad arrays into correct final shape
        tt = np.pad(tt, [(0,0), (0,0), (0, 3)])
        top = np.pad(top, [(0,0), (0,0), (0, 3)])   
        # print(f'shape after padding {np.shape(tt) = }')
        # print(f'shape after padding {np.shape(top) = }') 

        # re-sort datasets
        tt = tt[..., mapping_tt] 
        top = top[..., mapping_top]

        # concat to have tt-pair and top and anti-top for each event
        concat = []
        concat = np.concatenate((tt, top), axis=1)
        # print(f'{np.shape(concat) = }')
        
        # add wgt
        wgt = [wgt] * 3 # every particle has same wgt (as it is an event wgt)
        concat[..., 7] = np.transpose(wgt)
        
        # add theta
        theta_ = np.full_like(concat[..., 8], theta)
        concat[..., 8] = theta_
        
        # add to out
        if i == 0:
            out = concat
        else:
            out = np.concatenate((out, concat), axis = 0)
        print(f'finished processing file {i+1:0>4}', end='\r')
    
    return out, skipped


In [4]:
# load showered .npz events 

# x1 NNLO MiNNLO
x1_path = '/tf/data/BachelorThesis_Data/showered/MiNNLO/'
x1_lhe_path = '/tf/data/BachelorThesis_Data/LHE/data/MiNNLO/'
num_train = 1000

shower_tt_list = [''] * num_train
shower_top_list = [''] * num_train
lhe_list = [''] * num_train
for i in range(num_train):
    shower_tt_list[i] = f'{x1_path}ShowerTTR_{i+1}_0.855.npz'
    shower_top_list[i] = f'{x1_path}ShowerTopR_{i+1}_0.855.npz'
    # find lhe file
    for root, dirs, files in os.walk(x1_lhe_path):
        for name in files:
            if name == f'pwgevents-{i+1:0>4}.lhe':
                lhe_list[i] = f'{root}/{name}'

print(*shower_tt_list[:-1:250], sep='\n')
print(*shower_top_list[:-1:250], sep='\n')
print(*lhe_list[:-1:250], sep='\n')


/tf/data/BachelorThesis_Data/showered/MiNNLO/ShowerTTR_1_0.855.npz
/tf/data/BachelorThesis_Data/showered/MiNNLO/ShowerTTR_251_0.855.npz
/tf/data/BachelorThesis_Data/showered/MiNNLO/ShowerTTR_501_0.855.npz
/tf/data/BachelorThesis_Data/showered/MiNNLO/ShowerTTR_751_0.855.npz
/tf/data/BachelorThesis_Data/showered/MiNNLO/ShowerTopR_1_0.855.npz
/tf/data/BachelorThesis_Data/showered/MiNNLO/ShowerTopR_251_0.855.npz
/tf/data/BachelorThesis_Data/showered/MiNNLO/ShowerTopR_501_0.855.npz
/tf/data/BachelorThesis_Data/showered/MiNNLO/ShowerTopR_751_0.855.npz
/tf/data/BachelorThesis_Data/LHE/data/MiNNLO/0001-0500/pwgevents-0001.lhe
/tf/data/BachelorThesis_Data/LHE/data/MiNNLO/0001-0500/pwgevents-0251.lhe
/tf/data/BachelorThesis_Data/LHE/data/MiNNLO/0501-1000/pwgevents-0501.lhe
/tf/data/BachelorThesis_Data/LHE/data/MiNNLO/0501-1000/pwgevents-0751.lhe


In [8]:
x1, skipped = convert_valentinas_datasets_to_mine(shower_tt_list, shower_top_list, lhe_list, theta = 1)

print(f'{skipped = }')
print(f'{np.shape(x1) = }')


different number of events in files! 
tt : len: 7000 | file: /tf/data/BachelorThesis_Data/showered/MiNNLO/ShowerTTR_75_0.855.npz 
top: len: 7000 | file: /tf/data/BachelorThesis_Data/showered/MiNNLO/ShowerTopR_75_0.855.npz 
lhe: len: 6999 | file: /tf/data/BachelorThesis_Data/LHE/data/MiNNLO/0001-0500/pwgevents-0075.lhe 
skipping files 75
different number of events in files! 
tt : len: 7999 | file: /tf/data/BachelorThesis_Data/showered/MiNNLO/ShowerTTR_107_0.855.npz 
top: len: 7999 | file: /tf/data/BachelorThesis_Data/showered/MiNNLO/ShowerTopR_107_0.855.npz 
lhe: len: 7998 | file: /tf/data/BachelorThesis_Data/LHE/data/MiNNLO/0001-0500/pwgevents-0107.lhe 
skipping files 107
different number of events in files! 
tt : len: 7999 | file: /tf/data/BachelorThesis_Data/showered/MiNNLO/ShowerTTR_108_0.855.npz 
top: len: 7999 | file: /tf/data/BachelorThesis_Data/showered/MiNNLO/ShowerTopR_108_0.855.npz 
lhe: len: 7998 | file: /tf/data/BachelorThesis_Data/LHE/data/MiNNLO/0001-0500/pwgevents-0108.l

In [11]:
# save dataset to memory mappable npy array | takes more disk space than compressed, but allows partial reading from disk
np.save('../Data/MiNNLO/showered/converted_lhe.npy', x1)
np.save('../Data/MiNNLO/showered/skipped_files.npy', skipped)


In [20]:
# load showered .npz events 

# x0 NLO hvq
x0_path = '/tf/data/BachelorThesis_Data/showered/hvq/'
x0_lhe_path = '/tf/data/BachelorThesis_Data/LHE/data/POWHEG_new/'
num_train = 50

shower_tt_list = [''] * num_train
shower_top_list = [''] * num_train
lhe_list = [''] * num_train
for i in range(num_train):
    shower_tt_list[i] = f'{x0_path}ShowerTT_{i+1}.npz'
    shower_top_list[i] = f'{x0_path}ShowerTop_{i+1}.npz'
    # find lhe file
    for root, dirs, files in os.walk(x0_lhe_path):
        for name in files:
            if f'{root}/{name}' == f'{x0_lhe_path}Results{i+1}/pwgevents.lhe':
                lhe_list[i] = f'{root}/{name}'

print(*shower_tt_list[:-1:20], sep='\n')
print(*shower_top_list[:-1:20], sep='\n')
print(*lhe_list[:-1:20], sep='\n')


/tf/data/BachelorThesis_Data/showered/hvq/ShowerTT_1.npz
/tf/data/BachelorThesis_Data/showered/hvq/ShowerTT_21.npz
/tf/data/BachelorThesis_Data/showered/hvq/ShowerTT_41.npz
/tf/data/BachelorThesis_Data/showered/hvq/ShowerTop_1.npz
/tf/data/BachelorThesis_Data/showered/hvq/ShowerTop_21.npz
/tf/data/BachelorThesis_Data/showered/hvq/ShowerTop_41.npz
/tf/data/BachelorThesis_Data/LHE/data/POWHEG_new/Results1/pwgevents.lhe
/tf/data/BachelorThesis_Data/LHE/data/POWHEG_new/Results21/pwgevents.lhe
/tf/data/BachelorThesis_Data/LHE/data/POWHEG_new/Results41/pwgevents.lhe


In [21]:
x0_1, skipped = convert_valentinas_datasets_to_mine(shower_tt_list[:10], shower_top_list[:10], lhe_list[:10], theta = 0)

print(f'{skipped = }')
print(f'{np.shape(x0_1) = }')

# save dataset to memory mappable npy array | takes more disk space than compressed, but allows partial reading from disk
np.save('../Data/POWHEG_hvq/showered/converted_lhe_01.npy', x0_1)
np.save('../Data/MiNNLO/showered/skipped_files_01.npy', skipped)


skipped = []cessing file 0009
np.shape(x1) = (8857945, 3, 9)


In [23]:
print(f'{skipped = }')
print(f'{np.shape(x0_1) = }')


skipped = []
np.shape(x0_1) = (20000000, 3, 9)


In [24]:
x0_2, skipped = convert_valentinas_datasets_to_mine(shower_tt_list[11:20], shower_top_list[11:20], lhe_list[11:20], theta = 0)

print(f'{skipped = }')
print(f'{np.shape(x0_2) = }')

# save dataset to memory mappable npy array | takes more disk space than compressed, but allows partial reading from disk
np.save('../Data/POWHEG_hvq/showered/converted_lhe_02.npy', x0_2)
np.save('../Data/MiNNLO/showered/skipped_files_02.npy', skipped)


different number of events in files! 
tt : len: 1064007 | file: /tf/data/BachelorThesis_Data/showered/hvq/ShowerTT_18.npz 
top: len: 1064007 | file: /tf/data/BachelorThesis_Data/showered/hvq/ShowerTop_18.npz 
lhe: len: 1064006 | file: /tf/data/BachelorThesis_Data/LHE/data/POWHEG_new/Results18/pwgevents.lhe 
skipping files 7
skipped = [7]essing file 0008
np.shape(x0_2) = (16000000, 3, 9)


In [25]:
x0_3, skipped = convert_valentinas_datasets_to_mine(shower_tt_list[21:30], shower_top_list[21:30], lhe_list[21:30], theta = 0)

print(f'{skipped = }')
print(f'{np.shape(x0_3) = }')

# save dataset to memory mappable npy array | takes more disk space than compressed, but allows partial reading from disk
np.save('../Data/POWHEG_hvq/showered/converted_lhe_03.npy', x0_3)
np.save('../Data/MiNNLO/showered/skipped_files_03.npy', skipped)


skipped = []cessing file 0008
np.shape(x0_3) = (18000000, 3, 9)


In [26]:
x0_4, skipped = convert_valentinas_datasets_to_mine(shower_tt_list[31:40], shower_top_list[31:40], lhe_list[31:40], theta = 0)

print(f'{skipped = }')
print(f'{np.shape(x0_4) = }')

# save dataset to memory mappable npy array | takes more disk space than compressed, but allows partial reading from disk
np.save('../Data/POWHEG_hvq/showered/converted_lhe_04.npy', x0_4)
np.save('../Data/MiNNLO/showered/skipped_files_04.npy', skipped)


skipped = []cessing file 0008
np.shape(x0_4) = (18000000, 3, 9)


In [27]:
x0_5, skipped = convert_valentinas_datasets_to_mine(shower_tt_list[41:], shower_top_list[41:], lhe_list[41:], theta = 0)

print(f'{skipped = }')
print(f'{np.shape(x0_5) = }')

# save dataset to memory mappable npy array | takes more disk space than compressed, but allows partial reading from disk
np.save('../Data/POWHEG_hvq/showered/converted_lhe_05.npy', x0_5)
np.save('../Data/MiNNLO/showered/skipped_files_05.npy', skipped)


skipped = []cessing file 0008
np.shape(x0_5) = (18000000, 3, 9)


# create normalized datasets

In [4]:
data_dir = '../Data'


In [31]:
# load data 01
x0 = []
x0 = np.load(f'{data_dir}/POWHEG_hvq/showered/converted_lhe_01.npy')
print(f'x0 POWHEG shape: {x0.shape}')

# normalize data
x0_nrm, nrm_array = DCTR.normalize_data(x0)


x0 POWHEG shape: (20000000, 3, 9)


In [32]:
# check nrm_array against unshowered
print(*nrm_array, sep = '\n')
old_nrm_array = np.load(f'{data_dir}/POWHEG_hvq/13TeV/01-02_nrm_array.npz')['a']
print('\n')
print(*old_nrm_array, sep = '\n')

print(np.absolute(nrm_array - old_nrm_array))


[(3.683839756313715, 1.0609460007389058, True), (8.612103828713617e-06, 1.034977498713008, False), (-0.0004003776885255839, 1.8138344846642072, False), (6.215296416594436, 0.27569957542245294, True), (-9.040021626823053e-05, 3.2822364242329503, False), (6.636608323497679, 0.47452490624975824, True)]
[(4.595774290081871, 0.7097691197460952, True), (-3.8328073562234005e-05, 1.2119908934987196, False), (0.000304694309451561, 1.8134625978763672, False), (171.93769206352815, 6.964270672174379, False), (-5.563968293036282e-05, 1.866646625911125, False), (-13.81551055796443, 1.5631940186722204e-13, True)]
[(4.592728535455571, 0.7155761005838078, True), (7.53851788734336e-05, 1.2062537312547548, False), (-1.294039825749624e-05, 1.8140254480324653, False), (171.9382316455017, 6.9452321287801695, False), (0.0003029718978719232, 1.8647740777094741, False), (-13.81551055796443, 1.5631940186722204e-13, True)]


[[3.65206736e+00 1.01234024e+00 1.00000000e+00]
 [1.71881058e-04 1.03624555e+00 0.000000

In [33]:
# save 
np.save(f'{data_dir}/POWHEG_hvq/showered/normed_lhe_01.npy', x0_nrm)

print('x0_nrm saved')
np.save(f'{data_dir}/POWHEG_hvq/showered/norm_array_lhe_01.npy', nrm_array)


x0_nrm saved


In [34]:
# load data 02
x0 = []
x0 = np.load(f'{data_dir}/POWHEG_hvq/showered/converted_lhe_02.npy')
print(f'x0 POWHEG shape: {x0.shape}')

# normalize data
x0_nrm, _ = DCTR.normalize_data(x0, nrm_array)


x0 POWHEG shape: (16000000, 3, 9)


In [35]:
# save 
np.save(f'{data_dir}/POWHEG_hvq/showered/normed_lhe_02.npy', x0_nrm)

print('x0_nrm saved')


x0_nrm saved


In [36]:
# load data 03
x0 = []
x0 = np.load(f'{data_dir}/POWHEG_hvq/showered/converted_lhe_03.npy')
print(f'x0 POWHEG shape: {x0.shape}')

# normalize data
x0_nrm, _ = DCTR.normalize_data(x0, nrm_array)


x0 POWHEG shape: (18000000, 3, 9)


In [37]:
# save 
np.save(f'{data_dir}/POWHEG_hvq/showered/normed_lhe_03.npy', x0_nrm)

print('x0_nrm saved')


x0_nrm saved


In [42]:
# load data 04
x0 = []
x0 = np.load(f'{data_dir}/POWHEG_hvq/showered/converted_lhe_04.npy')
print(f'x0 POWHEG shape: {x0.shape}')

# normalize data
x0_nrm, _ = DCTR.normalize_data(x0, nrm_array)


x0 POWHEG shape: (18000000, 3, 9)


In [43]:
# save 
np.save(f'{data_dir}/POWHEG_hvq/showered/normed_lhe_04.npy', x0_nrm)

print('x0_nrm saved')


x0_nrm saved


In [44]:
# x1 MiNNLO Dataset
# load data
x1 = []
x1 = np.load(f'{data_dir}/MiNNLO/showered/converted_lhe.npy')
print(f'x1 MiNNLO shape: {x1.shape}')

# normalize data
x1_nrm, _ = DCTR.normalize_data(x1, nrm_array)


x1 MiNNLO shape: (8857945, 3, 9)


In [45]:
# save 
np.save(f'{data_dir}/MiNNLO/showered/normed_lhe.npy', x1_nrm)

print('x1_nrm saved')


x1_nrm saved
