In [1]:
import pprint
import json
from pathlib import Path
import pickle

from tqdm import tqdm
import anatools.data as data
import tensorflow as tf
from statsmodels.stats.weightstats import DescrStatsW
from tensorflow.keras.models import load_model

from hhdm_analysis.xgb.controllers import XGBLearner, XGBModel

# Disable GPUs: hand TensorFlow an empty list of visible GPU devices, so the
# TensorFlow/Keras work done later in this notebook does not use a GPU.
tf.config.set_visible_devices([], 'GPU')

2023-02-16 19:58:48.612238: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2023-02-16 19:58:52.480845: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-02-16 19:58:52.489548: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2023-02-16 19:58:53.156582: E tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:927] could not open file to read NUMA node: /sys/bus/pci/devices/0000:03:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-02-16 19:58:53.156882: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:03:00.0 name: NVIDIA GeForce GTX 1660 computeCapability: 7.5
coreClock: 1.8GHz coreCount: 22 deviceMemorySize: 6.00GiB deviceMemoryBandwidth: 178.86GiB/s
2023-02-16 19:58:53.157019: I tensorflow/stream_executor/platform

# Setup config

In [2]:
# Period / year identifiers and the location of the input datasets.
period = '18'
year_style = 2018
dataset_year = "2018"
basedir = "/home/gamoreir/SanDisk/physics/hhdmAnalysis_deepJet_Regions/datasets"

# The dataset name is the directory that contains `basedir`. Going through
# pathlib instead of splitting on '/' keeps this correct even if `basedir`
# is later edited to end with a trailing slash.
dataset_name = Path(basedir).parent.name

# Data folder (kept as a plain string: it is interpolated into f-strings below)
data_path = f"./data/{dataset_name}/{dataset_year}"
Path(data_path).mkdir(parents=True, exist_ok=True)

# Setup models folders
models_path = f"./models/{dataset_name}/{dataset_year}"
Path(models_path).mkdir(parents=True, exist_ok=True)

# Read metadata

In [3]:
# Parse the metadata file located one directory above this notebook.
with open("../metadata.json", "r") as metadata_file:
    metadata = json.load(metadata_file)

# Pull one entry per dataset group out of metadata["datasets"].
# Later cells call `.get(period)` on these, so each is presumably a mapping
# keyed by period - TODO confirm against metadata.json.
ST, TT, ZZ, WZ, DY, RESIDUAL, DATA = (
    metadata.get("datasets").get(group_key)
    for group_key in ("ST", "TT", "ZZ", "WZ", "DY", "RESIDUAL", "DATA")
)

# Load datasets

In [4]:
# Columns requested from the input files.
variables = [
    "RegionID",
    "evtWeight",
    "MLP_score_torch",
    "LeadingLep_pt",
    "LepLep_pt",
    "LepLep_deltaR",
    "LepLep_deltaM",
    "MET_pt",
    "MET_LepLep_Mt",
    "MET_LepLep_deltaPhi",
    "TrailingLep_pt",
    "MT2LL",
    "Nbjets",
]
ds = data.read_files(basedir, period, mode="normal", features=variables)

# Join the per-period members of each group under a single key in `ds`,
# in the same order as before: ST, TT, ZZ, WZ, DYJetsToLL, Residual.
for joined_name, group in (
    ("ST", ST),
    ("TT", TT),
    ("ZZ", ZZ),
    ("WZ", WZ),
    ("DYJetsToLL", DY),
    ("Residual", RESIDUAL),
):
    data.join_datasets(ds, joined_name, group.get(period), mode="normal")

# Datasets to be used: every signal sample followed by the joined groups.
signal_names = [name for name in ds.keys() if name.startswith("Signal_")]
used_datasets = signal_names + ["ST", "TT", "ZZ", "WZ", "DYJetsToLL", "Residual"]

for dt_name in used_datasets:
    print(dt_name, ds[dt_name].shape)

# Delete every other dataset. The names are collected first so the dict is
# not modified while it is being iterated.
datasets_to_delete = [name for name in ds.keys() if name not in used_datasets]
for dt_name in datasets_to_delete:
    del ds[dt_name]


Loading datasets...


100%|██████████| 73/73 [00:06<00:00, 11.27it/s]


Signal_1000_100 (184662, 13)
Signal_1000_200 (155516, 13)
Signal_1000_300 (163180, 13)
Signal_1000_400 (174503, 13)
Signal_1000_600 (47113, 13)
Signal_1000_800 (148510, 13)
Signal_400_100 (112655, 13)
Signal_400_200 (35615, 13)
Signal_500_100 (130495, 13)
Signal_500_200 (140136, 13)
Signal_500_300 (118287, 13)
Signal_600_100 (134052, 13)
Signal_600_200 (156038, 13)
Signal_600_300 (145565, 13)
Signal_600_400 (128733, 13)
Signal_800_100 (156662, 13)
Signal_800_200 (148385, 13)
Signal_800_300 (160871, 13)
Signal_800_400 (169710, 13)
Signal_800_600 (138418, 13)
ST (94330, 13)
TT (2647163, 13)
ZZ (1924672, 13)
WZ (24816, 13)
DYJetsToLL (5897214, 13)
Residual (537577, 13)


# Models metadata

In [5]:
# Stem shared by the model, stats and output file names built in later cells.
base_model_name = "multi_signal"

# Input columns selected from every dataset and passed to both models.
# The order presumably has to match the one used at training time - TODO confirm.
features = [
    "LeadingLep_pt", "LepLep_deltaM", "LepLep_deltaR", "LepLep_pt",
    "MET_LepLep_Mt", "MET_LepLep_deltaPhi", "MET_pt",
    "MT2LL", "Nbjets", "TrailingLep_pt",
]

# Predict using XGB

In [6]:
# Load the XGB model stored under `models_path`.
xgb_model = XGBModel(model_fpath=f"{models_path}/XGB_{base_model_name}-clf.model")

# Predict each dataset and store the result in a new "XGB_score" column.
# Only the values are iterated: the key was never used, and naming it
# `dataset_name` overwrote the config-level `dataset_name` that the setup cell
# uses to build `data_path` / `models_path`.
for dataset in tqdm(ds.values()):
    X_features = dataset[features]
    Y_pred = xgb_model.predict(X_features, features)
    dataset["XGB_score"] = Y_pred

100%|██████████| 26/26 [00:21<00:00,  1.23it/s]


# Predict using MLP Keras

In [7]:
# Load model
mlp_model = load_model(f"{models_path}/MLP_{base_model_name}-checkpoint.h5")

# Load zscore stats (a "mean" and a "std" entry per feature). The context
# manager closes the file handle, which a bare `json.load(open(...))` leaks.
with open(f"{data_path}/MLP_{base_model_name}-weighted_stats.json", "r") as stats_file:
    zscore = json.load(stats_file)

# Predict each dataset and store the result in a new "MLP_score_keras" column.
# Only the values are iterated: the key was never used, and naming it
# `dataset_name` overwrote the config-level `dataset_name` that the setup cell
# uses to build `data_path` / `models_path`.
for dataset in tqdm(ds.values()):
    # Work on a copy so the standardisation never touches the stored columns.
    X_features = dataset[features].copy()

    # Since the model was trained under processed data, we need to preprocess it to predict
    for feature in features:
        X_features.loc[:, feature] = (X_features[feature] - zscore[feature]["mean"]) / zscore[feature]["std"]

    Y_pred = mlp_model.predict(X_features, batch_size=256)
    dataset["MLP_score_keras"] = Y_pred

2023-02-16 19:59:22.071587: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-02-16 19:59:22.071867: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1261] Device interconnect StreamExecutor with strength 1 edge matrix:
2023-02-16 19:59:22.071890: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1267]      
  0%|          | 0/26 [00:00<?, ?it/s]2023-02-16 20:00:04.487995: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2023-02-16 20:00:04.490778: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2599995000 Hz
100%|██████████| 26/26 [1:07:35<00:00, 155.97s/it]


# Save predicted datasets

Prediction plots are made in a separate Jupyter notebook: the Keras prediction is slow, so the scored datasets are saved here once and can be reloaded there without re-running it.

In [8]:
# Serialise the whole `ds` dict to a single pickle file under `data_path`.
predicted_data_fpath = f"{data_path}/{base_model_name}-predicted-data.pickle"
with open(predicted_data_fpath, "wb") as output_file:
    pickle.dump(ds, output_file)