In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell execution, so edits to the project's .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

import autorootcwd  # Do not delete - adds the root of the project to the path

In [2]:
# Read the notebook's config file. The cells below index `config` with string
# keys such as "trex_config", "region", "samples", "features" and "output_path".
from data_processing.processing import read_yaml

config = read_yaml("config")

In [3]:
import os
from data_processing.processing import ConfigParser

# Build the parser from the `trex_config` entry of our config and pick the
# region this notebook works on.
parser = ConfigParser(config["trex_config"])
region = config["region"] # Our region of interest

# Basic properties exposed by the parser
print(f"{parser.ntuple_base_path=}")
print(f"{parser.ntuple_name=}")
print(f"{parser.regions=}")
print(f"{parser.samples=}")

# Values the original author notes are substituted automatically from the
# replacement file
print(f"{parser.luminosity=}")
print(f"{parser.weight_expr('ttW')=}")

# Preselection of the chosen region: the cut expression, then every feature
# that expression uses
print(f"{parser.cut_expr(region)=}")
print(f"{parser.cut_features(region)=}")

parser.ntuple_base_path='/eos/atlas/atlascerngroupdisk/phys-higgs/HSG8/multilepton_ttWttH/v08/v0801/systematics-full/nominal/'
parser.ntuple_name='nominal'
parser.regions={'lep-pt-0': {'BLOCK_NAME': 'lep-pt-0', 'Type': 'SIGNAL', 'Variable': ['lep_Pt_0/1e3', 10, 0, 200], 'VariableTitle': 'Leading lepton p_{T} [GeV]', 'Label': '2lSS 1#tau 4j 1b', 'ShortLabel': 'SR - 2lSS 1#tau', 'TexLabel': '\\mathbf{2lSS 1\\tau 4j 1b}', 'Selection': 'XXX_TRIGGER_SELECTION && XXX_2LEPTON_SS_SELECTION && XXX_LEPTON_PROMPT_SELECTION && nTaus_OR==1 && nJets_OR_DL1r_85>=1 && nJets_OR>=4 && XXX_EXCLUSION_Z_PEAK', 'LogScale': 'FALSE', 'Ymin': '0.01', 'DataType': 'DATA'}}
parser.samples={'ttH': {'BLOCK_NAME': 'ttH', 'Type': 'SIGNAL', 'Title': 't#bar{t}H', 'TexTitle': '$t\\bar{t}H$', 'Group': 't#bar{t}H', 'FillColor': '2', 'LineColor': '1', 'NtupleFiles': 'XXX_ttH_samples', 'Selection': 'XXX_TAU_PROMPT && XXX_LEP_PROMPT'}, 'ttW': {'BLOCK_NAME': 'ttW', 'Type': 'BACKGROUND', 'Title': 't#bar{t}W', 'TexTitle': '$t\\

In [16]:
# Print the input files of every configured sample as a LaTeX itemize list.

# Characters that are reserved in LaTeX text mode and their escaped forms.
# Sample names such as "ttW_EW" contain "_" and would otherwise break the
# generated snippet when compiled.
_LATEX_ESCAPES = str.maketrans({
    "_": r"\_",
    "&": r"\&",
    "%": r"\%",
    "$": r"\$",
    "#": r"\#",
    "{": r"\{",
    "}": r"\}",
})


def _latex_escape(text):
    """Return `text` with LaTeX text-mode special characters escaped."""
    return str(text).translate(_LATEX_ESCAPES)


files = parser.files_by_process(full_path=False)

print("\\begin{itemize}")
for sample in config["samples"]:
    # Strip each entry (the raw names carry leading whitespace) and join with
    # ", " so the item does not end with a dangling separator.
    entries = ", ".join(
        f"\\texttt{{{_latex_escape(file.strip())}}}" for file in files[sample]
    )
    print(f"\\item \\textbf{{{_latex_escape(sample)}}}: {entries}")
print("\\end{itemize}")

\begin{itemize}
\item \textbf{ttH}: \texttt{mc16a/p4498/346343}, \texttt{ mc16a/p4498/346344}, \texttt{ mc16a/p4498/346345}, \texttt{ mc16d/p4498/346343}, \texttt{ mc16d/p4498/346344}, \texttt{ mc16d/p4498/346345}, \texttt{ mc16e/p4498/346343}, \texttt{ mc16e/p4498/346344}, \texttt{ mc16e/p4498/346345}, 
\item \textbf{ttW}: \texttt{mc16a/p4416/700168}, \texttt{ mc16d/p4416/700168}, \texttt{ mc16e/p4416/700168}, 
\item \textbf{ttW_EW}: \texttt{mc16a/p4590/700205}, \texttt{ mc16d/p4590/700205}, \texttt{ mc16e/p4590/700205}, 
\item \textbf{ttZ}: \texttt{mc16a/p4416/504330}, \texttt{ mc16a/p4416/504334}, \texttt{ mc16a/p4416/504342}, \texttt{ mc16d/p4416/504330}, \texttt{ mc16d/p4416/504334}, \texttt{ mc16d/p4416/504342}, \texttt{ mc16e/p4416/504330}, \texttt{ mc16e/p4416/504334}, \texttt{ mc16e/p4416/504342}, 
\item \textbf{ttbar}: \texttt{mc16a/p4308/410470}, \texttt{ mc16d/p4308/410470}, \texttt{ mc16e/p4308/410470}, 
\item \textbf{VV}: \texttt{mc16a/p4416/364250}, \texttt{ mc16a/p4416/

In [5]:
# We need to select which features we are interested in.
# Among these features there are object features - nested arrays of variable length.
# Both lists are read from the files referenced by config["features"] and
# config["array_features"].
requested_features = set(read_yaml(config["features"]))   # Features explicitly requested for training
object_features = read_yaml(config["array_features"])

cut_features = parser.cut_features(region)                # Read the features used in the preselection

# The features to read are the requested ones plus everything the preselection
# needs. Keep `requested_features` separate so we can still tell afterwards
# which cut features were also requested for training.
train_features = sorted(requested_features | set(cut_features))

# Print them, right-aligned, tagging object features and cut features.
max_len = max((len(f) for f in train_features), default=0)
for feature in train_features:
    tags = ""
    if feature in object_features:
        tags += " (object)"
    if feature in cut_features:
        # A cut feature is "cut only" when it was not requested for training
        # and is present solely because the preselection uses it.
        tags += " (cut + train)" if feature in requested_features else " (cut only)"
    print(f"{feature:>{max_len}s}{tags}")

                                  DRjj_lead
                                     DRll01
                      DeltaR_max_lep_bjet77
                         DeltaR_min_lep_jet
                     DeltaR_min_lep_jet_fwd
                                         HT
                                 HT_fwdJets
                             HT_inclFwdJets
                                    HT_jets
                                     HT_lep
                                    HT_taus
                                    MLepMet
                                        Mb1
                                        Mlb
                                      Mll01 (cut only)
                                    Mlll012
                                  Mllll0123
                                  MtLep1Met
                                   MtLepMet
                                     Ptll01
                                 best_Z_Mll
                           best_Z_other_Mll
                     

In [7]:
from data_processing.processing import read_region

# Read the region's data for the selected samples. Per the original author's
# note, this opens all the files, reads the features and the weight into
# awkward arrays and then converts them to numpy.
# NOTE(review): only the "ttH" sample is read here; the full list in
# config["samples"] was previously passed instead - confirm which is intended.
nested_size = 6
data = read_region(
    region,
    parser,
    train_features,
    object_features,
    nested_size=nested_size,
    samples=["ttH"],
)

Using selection from ttH sample


ttH: 100%|██████████| 9/9 [00:09<00:00,  1.10s/it]


Concatenating data from 1 processes
['taus_fromPV_0', 'DRjj_lead', 'nJets_OR', 'lep_nTrackParticles_1', 'minDeltaR_LJ_0', 'minOSMll', 'lep_nInnerPix_1', 'mjjMax_frwdJet', 'lep_ID_0', 'lep_Eta_1', 'DeltaR_max_lep_bjet77', 'max_eta', 'HT_jets', 'lep_RadiusCO_1', 'lep_chargeIDBDTResult_recalc_rel207_tight_0', 'minDeltaR_LJ_2', 'lep_Phi_0', 'taus_charge_0', 'lep_sigd0PV_0', 'lep_Z0SinTheta_1', 'nFwdJets_OR', 'nJets_OR_DL1r_85', 'lep_Eta_0', 'lep_Z0SinTheta_0', 'taus_eta_0', 'lep_isTightLH_0', 'lep_isolationLoose_VarRad_0', 'taus_width_0', 'nTaus_OR_Pt25', 'minDeltaR_LJ_1', 'DRll01', 'lep_isMedium_1', 'lep_isTightLH_1', 'nTaus_OR', 'lep_Mtrktrk_atPV_CO_0', 'lep_isMedium_0', 'flag_JetCleaning_LooseBad', 'custTrigMatch_LooseID_FCLooseIso_SLTorDLT', 'lep_Pt_1', 'best_Z_other_Mll', 'Mll01', 'lep_Mtrktrk_atConvV_CO_0', 'lep_ambiguityType_1', 'passPLIVTight_1', 'DeltaR_min_lep_jet_fwd', 'lep_EtaBE2_0', 'lep_E_0', 'taus_decayMode_0', 'taus_passEleOLR_0', 'taus_passJVT_0', 'eta_frwdjet', 'sumPsbtag

In [8]:
# Yield of the entries flagged by `data.selected`: the plain count first,
# then the sum of their weights.
selected_mask = data.selected
raw_yield = selected_mask.sum()
weighted_yield = data.w[selected_mask].sum()
print(f"- Raw: {raw_yield}")
print(f"- Weighted: {weighted_yield}")

- Raw: 15293
- Weighted: 12.219593768734246


In [9]:
from data_processing.processing import process_data

# Load the two auxiliary lists named in the config, then hand them to
# process_data together with the data read above.
categorical_features = read_yaml(config["categorical_features"])
invalid_values = read_yaml(config["invalid_values"])

processed_data = process_data(data, categorical_features, invalid_values)
print(processed_data)

Casting categorical features to int


Setting invalid values to NaN: 100%|██████████| 3/3 [00:00<00:00, 111.20it/s]
Remapping categorical values: 100%|██████████| 15/15 [00:01<00:00,  9.01it/s]


ProcessedData(x_categorical=array([[0, 0, 2, ..., 1, 0, 0],
       [3, 2, 3, ..., 1, 1, 0],
       [0, 3, 2, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 1, 1, 0],
       [3, 1, 1, ..., 1, 1, 0],
       [0, 1, 4, ..., 1, 1, 0]]), x_continuous=array([[ 0.17283437,  1.9655886 , -1.1318482 , ..., -0.5983114 ,
         1.0931675 , -0.46944162],
       [ 0.21584857,  0.6836461 ,  0.29361293, ..., -1.1430243 ,
        -0.68236023,         nan],
       [ 0.07322147,  0.6836461 , -1.1318482 , ...,  1.5718334 ,
         0.17994665,         nan],
       ...,
       [ 0.25622228, -1.2392677 ,         nan, ...,  0.63650024,
        -0.32565463,         nan],
       [ 0.11676958,  0.04267483, -0.41911766, ...,  0.53737384,
        -1.5578518 , -1.5633839 ],
       [ 0.32225856, -0.5982964 ,         nan, ..., -0.6154019 ,
                nan,         nan]], dtype=float32), y=array([0, 0, 0, ..., 0, 0, 0]), w=array([0.        , 0.00139961, 0.        , ..., 0.00037856, 0.00050356,
       0.       

In [10]:
# Write the processed dataset to disk
from data_processing.processing import save_data

# The destination is the config's "output_path", placed under "data_processing".
output_subpath = config["output_path"]
output_path = os.path.join("data_processing", output_subpath)

save_data(processed_data, output_path)

In [11]:
# Reload what was just saved and print a per-class yield table as a sanity check
from data_processing.processing import load_data

data = load_data(output_path)

# Running totals over all classes (raw counts and sums of weights, before and
# after applying `data.selected`).
total_raw = total_weighted = total_selected_raw = total_selected_weighted = 0

for class_index, class_name in enumerate(data.y_names):
    # Weights of the entries labelled with this class, with and without the
    # `selected` flag applied.
    in_class = data.y == class_index
    class_weights = data.w[in_class]
    selected_class_weights = data.w[in_class & data.selected]

    n_raw, n_weighted = class_weights.shape[0], class_weights.sum()
    n_sel_raw, n_sel_weighted = selected_class_weights.shape[0], selected_class_weights.sum()

    print(f"{class_name:>10s}: {n_raw:>10d} raw {n_weighted:>12.4f} weighted {n_sel_raw:>10d} raw {n_sel_weighted:>12.4f} weighted")

    total_raw += n_raw
    total_weighted += n_weighted
    total_selected_raw += n_sel_raw
    total_selected_weighted += n_sel_weighted


print(f"{'Total':>10s}: {total_raw:>10d} raw {total_weighted:>12.4f} weighted {total_selected_raw:>10d} raw {total_selected_weighted:>12.4f} weighted")

       ttH:     834970 raw     523.4175 weighted      15293 raw      12.2196 weighted
     Total:     834970 raw     523.4175 weighted      15293 raw      12.2196 weighted
