In [1]:
import json
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import anatools.analysis as ana

from hhdm_analysis.xgb.controllers import XGBLearner, XGBModel

# Initialise the anatools analysis helpers.
# NOTE(review): what `ana.start()` configures is not visible from this notebook — confirm.
ana.start()
# Use matplotlib's "default" style sheet for any figures.
# NOTE(review): presumably placed after ana.start() to override styling it applies — confirm.
plt.style.use("default")

# Setup config

In [2]:
# Data-taking period identifiers for this run.
# NOTE(review): `period` and `year_style` are not read by any cell visible in this
# notebook — presumably kept for consistency with sibling notebooks; confirm.
period = 'APV_16'
year_style = 2016
dataset_year = "APV_2016"

# NOTE(review): machine-specific absolute path. In the visible cells it is only used
# to derive `dataset_name`; nothing is read from this location here.
basedir = '/home/gamoreir/SanDisk/physics/hhdmAnalysis_deepJet_Regions/datasets'

# Name of the directory that contains `datasets/`. Using Path(...).parent.name gives
# the same value as basedir.split('/')[-2] for the path above, but keeps working if
# `basedir` is ever written with a trailing slash (where split('/')[-2] would
# silently yield 'datasets' instead).
dataset_name = Path(basedir).parent.name

# Data folder (relative to the notebook's working directory)
data_path = f"./data/{dataset_name}/{dataset_year}"

# Setup output folders: created on demand, re-running the cell is harmless
models_path = f"./models/{dataset_name}/{dataset_year}"
Path(models_path).mkdir(parents=True, exist_ok=True)

# Read data

In [3]:
model_name = "XGB_multi_signal"

# Read the train and test CSV files for this model into a dict keyed by split name.
_frames = {
    split: pd.read_csv(f"{data_path}/{model_name}-{split}-data.csv")
    for split in ("train", "test")
}
X_train, X_test = _frames["train"], _frames["test"]

# Pull the non-feature columns out of the frames (pop removes them in place),
# keeping each one as a plain array. Whatever columns remain stay in X_*.
W_train, W_test = (frame.pop("modelWeight").values for frame in (X_train, X_test))
Y_train, Y_test = (frame.pop("Label").values for frame in (X_train, X_test))
evtWeight_train, evtWeight_test = (
    frame.pop("evtWeight").values for frame in (X_train, X_test)
)

In [4]:
# Preview the first rows of the training frame after the weight/label columns were popped.
X_train.head()

Unnamed: 0,LeadingLep_pt,LepLep_deltaM,LepLep_deltaR,LepLep_pt,MET_LepLep_Mt,MET_LepLep_deltaPhi,MET_pt,MT2LL,Nbjets,TrailingLep_pt
0,67.43445,0.40641,1.489798,99.975555,181.92041,3.033582,82.99966,90.96022,1,63.790764
1,293.91672,0.598915,0.393085,470.74545,268.21515,1.800546,62.23665,126.52729,1,180.17274
2,100.05968,1.220863,0.965787,184.397,118.42452,1.118869,67.50872,57.55273,2,85.24776
3,188.21326,1.586067,0.688611,267.78235,102.35561,0.827248,60.545155,26.82146,1,93.01645
4,229.02917,0.291946,0.444957,410.39297,600.294,2.569872,238.47949,297.66135,1,181.79301


In [5]:
# Inspect the training weights taken from the "modelWeight" column.
W_train

array([0.8191494 , 0.13844149, 0.8200503 , ..., 0.08255574, 1.80496039,
       0.00693738])

In [6]:
# Inspect the training targets taken from the "Label" column.
Y_train

array([0, 0, 0, ..., 0, 0, 0])

# Train

In [7]:
# Hyper-parameters handed to the learner. "n_estimators" is listed here for
# convenience but is split off below and passed separately.
params = dict(
    n_estimators=500,
    min_child_weight=4,
    learning_rate=0.01,
    subsample=0.60,
    colsample_bytree=0.80,
    max_depth=5,
    eval_metric="error",
)

# Or read from json
# NOTE(review): `signal_name` is not defined in any cell visible in this notebook —
# confirm the intended file stem before enabling this line.
# params = json.load(open(f"{models_path}/{signal_name}-bst-params.json"))

# Remove the round count from `params`; it is the second argument to train().
num_boost_round = params.pop("n_estimators")

# Input columns given to the learner, in this exact order.
features = [
    "LeadingLep_pt",
    "LepLep_pt",
    "LepLep_deltaR",
    "LepLep_deltaM",
    "MET_pt",
    "MET_LepLep_Mt",
    "MET_LepLep_deltaPhi",
    "TrailingLep_pt",
    "MT2LL",
    "Nbjets",
]

learner = XGBLearner(X_train, Y_train, W_train, features)
learner.train(params, num_boost_round)

# Write the trained model to the models folder under two file names
# (one via save_model, one via dump_model).
learner.save_model(f"{models_path}/{model_name}-clf.model")
learner.dump_model(f"{models_path}/{model_name}-raw.model")

# Notify end of notebook

In [8]:
from IPython.display import HTML, Javascript, clear_output, display

# JavaScript snippet executed by the notebook front end: it speaks and then
# pops up the message "Process completed!".
_completion_js = """
  var msg = new SpeechSynthesisUtterance();
  msg.text = "Process completed!";
  window.speechSynthesis.speak(msg);
  alert("Process completed!")
"""

display(Javascript(_completion_js))

<IPython.core.display.Javascript object>