In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import anatools.data as data
import anatools.analysis as ana
from sklearn.model_selection import train_test_split

from hhdm_analysis.xgb.controllers import XGBLearner, XGBModel

ana.start()
plt.style.use("default")

# Setup config

In [2]:
period = '18'
year_style = 2018
dataset_year = "2018"
basedir = '/home/gamoreir/SanDisk/physics/hhdmAnalysis/datasets'
dataset_name = basedir.split('/')[-2]

# Setup output folders
data_path = f"./data/{dataset_name}/{dataset_year}"
Path(data_path).mkdir(parents=True, exist_ok=True)

# Read metadata

In [3]:
with open("../metadata.json", "r") as f:
    metadata = json.load(f)

ST = metadata.get("datasets").get("ST")
TT = metadata.get("datasets").get("TT")
ZZ = metadata.get("datasets").get("ZZ")
WZ = metadata.get("datasets").get("WZ")
DY = metadata.get("datasets").get("DY")
RESIDUAL = metadata.get("datasets").get("RESIDUAL")
DATA = metadata.get("datasets").get("DATA")

# Read data

In [4]:
variables = ["RegionID", "evtWeight", "LeadingLep_pt", "LepLep_pt", "LepLep_deltaR", "LepLep_deltaM", "MET_pt", "MET_LepLep_Mt", "MET_LepLep_deltaPhi", "TrailingLep_pt", "MT2LL", "Nbjets"]
ds = data.read_files(basedir, period, mode="normal", features=variables)

data.join_datasets(ds, "ST", ST.get(period), mode="normal")
data.join_datasets(ds, "TT", TT.get(period), mode="normal")
data.join_datasets(ds, "ZZ", ZZ.get(period), mode="normal")
data.join_datasets(ds, "WZ", WZ.get(period), mode="normal")
data.join_datasets(ds, "DYJetsToLL", DY.get(period), mode="normal")
data.join_datasets(ds, "Residual", RESIDUAL.get(period), mode="normal")
data.join_datasets(ds, "Data", DATA.get(period), mode="normal")

print("Signal_400_100", ds["Signal_400_100"].shape)
print("Signal_1000_100", ds["Signal_1000_100"].shape)
print("ST", ds["ST"].shape)
print("TT", ds["TT"].shape)
print("ZZ", ds["ZZ"].shape)
print("WZ", ds["WZ"].shape)
print("DYJetsToLL", ds["DYJetsToLL"].shape)
print("Residual", ds["Residual"].shape)
print("Data", ds["Data"].shape)

del ds["Data"]


Loading datasets...


100%|███████████████████████████████████████████████████████████████████████████████████| 76/76 [00:05<00:00, 14.19it/s]


Signal_400_100 (170831, 12)
Signal_1000_100 (233290, 12)
ST (67082, 12)
TT (1924246, 12)
ZZ (3514699, 12)
WZ (33700, 12)
DYJetsToLL (6975315, 12)
Residual (1536837, 12)
Data (825992, 12)


# Pre-process datasets

* Filter Signal Region
* XGBoost do not handle negative instance weights (so we remove then)
* Label Background as 0 and Signal as 1
* Weights are normalized (after splitting into train and test) in order to give the same importance for Signal and Background while minimizing the loss function.

In [13]:
# Used for scaling the weights (this helps xgboost)
scale_factor = 10**6

# Filtering SR
ds = {k: v[v.RegionID == 0] for k,v in ds.items()}

# Selecting backgrounds and signals
model_name = "XGB_Signal_400_100"
signal_name = "Signal_400_100"
signal = ds[signal_name]
backgrounds = [ds["ST"], ds["TT"], ds["ZZ"], ds["WZ"], ds["DYJetsToLL"], ds["Residual"]]

# Remove negative values
backgrounds = [bkg[bkg.evtWeight >= 0] for bkg in backgrounds]
signal = signal[signal.evtWeight >= 0]

# Labeling
signal.loc[:, "Label"] = 1
for bkg in backgrounds:
    bkg.loc[:, "Label"] = 0

# Spliting
backgrounds = [train_test_split(bkg, test_size=0.3, random_state=42) for bkg in backgrounds]
X_train_signal, X_test_signal = train_test_split(signal, test_size=0.3, random_state=42)

# Group backgrounds
X_train_bkgs = pd.concat([X_train_bkg for X_train_bkg, _ in backgrounds])
X_test_bkgs = pd.concat([X_test_bkg for _, X_test_bkg in backgrounds])

# Normalize background weights
X_train_bkgs["modelWeight"] = scale_factor*(X_train_bkgs.evtWeight/X_train_bkgs.evtWeight.sum())
X_test_bkgs["modelWeight"] = scale_factor*(X_test_bkgs.evtWeight/X_test_bkgs.evtWeight.sum())

# Normalize signal weights
X_train_signal["modelWeight"] = scale_factor*(X_train_signal.evtWeight/X_train_signal.evtWeight.sum())
X_test_signal["modelWeight"] = scale_factor*(X_test_signal.evtWeight/X_test_signal.evtWeight.sum())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


# Group signal and background

In [14]:
# Group signal and background
X_train = pd.concat([X_train_signal, X_train_bkgs]).reset_index(drop=True)
X_test = pd.concat([X_test_signal, X_test_bkgs]).reset_index(drop=True)

# Normalize weights
X_train["modelWeight"] = scale_factor*(X_train.modelWeight/X_train.modelWeight.sum())
X_test["modelWeight"] = scale_factor*(X_test.modelWeight/X_test.modelWeight.sum())

# Shuffle
X_train = X_train.sample(frac=1).reset_index(drop=True)
X_test = X_test.sample(frac=1).reset_index(drop=True)

In [15]:
print(X_train.shape)
X_train.head()

(2457371, 14)


Unnamed: 0,LeadingLep_pt,LepLep_deltaM,LepLep_deltaR,LepLep_pt,MET_LepLep_Mt,MET_LepLep_deltaPhi,MET_pt,MT2LL,Nbjets,RegionID,TrailingLep_pt,evtWeight,Label,modelWeight
0,118.219742,16.727974,1.865665,115.298645,309.480621,2.773204,214.882675,121.749962,1,0,36.648872,0.005811,1,10.225851
1,86.122032,10.482124,1.407866,130.248779,199.10881,3.028722,76.336334,96.877045,1,0,59.314102,0.026212,0,0.062488
2,117.953575,10.43866,1.476635,125.647324,337.633087,2.953659,228.832214,125.939247,1,0,29.772533,0.005303,1,9.33202
3,93.209534,0.568504,2.360109,76.788666,138.952026,2.037838,86.688362,54.29319,1,0,26.384375,0.001638,0,0.003906
4,43.329895,20.864021,2.234646,45.388794,138.749039,1.847677,166.544891,68.943497,1,0,25.141319,0.029824,0,0.071098


In [16]:
print(X_test.shape)
X_test.head()

(1053165, 14)


Unnamed: 0,LeadingLep_pt,LepLep_deltaM,LepLep_deltaR,LepLep_pt,MET_LepLep_Mt,MET_LepLep_deltaPhi,MET_pt,MT2LL,Nbjets,RegionID,TrailingLep_pt,evtWeight,Label,modelWeight
0,93.097275,0.027267,1.374939,120.12011,101.016167,1.353405,54.155746,48.35722,1,0,52.50156,0.227285,0,1.267609
1,102.865639,1.667076,2.062619,105.710144,175.437637,2.677288,76.85788,62.279606,1,0,20.111681,0.157309,0,0.877338
2,83.709175,22.379219,1.601908,95.557968,198.641464,2.438401,117.123535,64.633224,1,0,21.648035,0.034425,0,0.191993
3,114.047249,8.643394,1.09071,167.813721,306.843719,2.660169,148.716064,145.504089,2,0,81.15834,0.001708,0,0.009526
4,218.087784,23.888603,0.384864,353.988007,177.581558,0.965898,103.270355,75.844833,1,0,141.593185,0.023674,0,0.132033


# Save processed data

In [17]:
# Pop unused variables at this point
X_train.pop("RegionID")
X_test.pop("RegionID")
# X_train.pop("evtWeight")
# X_test.pop("evtWeight")

X_train.to_csv(f"{data_path}/{model_name}-train-data.csv", index=False)
X_test.to_csv(f"{data_path}/{model_name}-test-data.csv", index=False)