In [3]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import anatools.analysis as ana

from hhdm_analysis.xgb.controllers import XGBLearner, XGBModel

ana.start()
plt.style.use("default")

# Setup config

In [None]:
period = '18'
year_style = 2018
dataset_year = "2018"
basedir = '/home/gamoreir/SanDisk/physics/hhdmAnalysis/datasets'
dataset_name = basedir.split('/')[-2]

# Data folder
data_path = f"./data/{dataset_name}/{dataset_year}"

# Setup output folders
models_path = f"./models/{dataset_name}/{dataset_year}"
Path(models_path).mkdir(parents=True, exist_ok=True)

# Read data

In [None]:
model_name = "XGB_multi_signal"

X_train = pd.read_csv(f"{data_path}/{model_name}-train-data.csv")
X_test = pd.read_csv(f"{data_path}/{model_name}-test-data.csv")
W_train = X_train.pop("modelWeight").values
W_test = X_test.pop("modelWeight").values
Y_train = X_train.pop("Label").values
Y_test = X_test.pop("Label").values
evtWeight_train = X_train.pop("evtWeight").values
evtWeight_test = X_test.pop("evtWeight").values

In [None]:
X_train.head()

In [None]:
W_train

In [None]:
Y_train

# Grid search

In [8]:
# params = {
#     "n_estimators": 500,
#     "min_child_weight": 6,
#     "learning_rate": 0.5,
#     "subsample": 0.95,
#     "colsample_bytree": 0.55,
#     "max_depth": 9,
#     "eval_metric": "logloss"
# } score 0.833

params = {
    'n_estimators': [100, 500, 1000, 2000, 3000, 5000, 7000, 9000],
    'min_child_weight': range(1,10),
    'learning_rate': [1e-3, 1e-2, 1e-1, 0.3, 0.5, 0.7, 0.9, 1.],
    'subsample': np.arange(0.50, 1.01, 0.05),
    'colsample_bytree': np.arange(0.50, 1.01, 0.05),
    'max_depth': range(3,11),
    'eval_metric': ['logloss', 'error']
}

learner = XGBLearner(X_train, Y_train, W_train, njobs=12)
res = learner.find_hyperparams(hyperparams_grid=params, n_splits=5, n_iter=30)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 2/5] END colsample_bytree=0.55, eval_metric=logloss, learning_rate=0.5, max_depth=9, min_child_weight=6, n_estimators=500, subsample=0.9500000000000004;, score=0.831 total time=224.7min
[CV 1/5] END colsample_bytree=0.6000000000000001, eval_metric=error, learning_rate=0.001, max_depth=10, min_child_weight=3, n_estimators=500, subsample=0.8500000000000003;, score=0.830 total time=280.3min




[CV 4/5] END colsample_bytree=0.55, eval_metric=logloss, learning_rate=0.5, max_depth=9, min_child_weight=6, n_estimators=500, subsample=0.9500000000000004;, score=0.832 total time=227.5min
[CV 3/5] END colsample_bytree=0.6000000000000001, eval_metric=error, learning_rate=0.001, max_depth=10, min_child_weight=3, n_estimators=500, subsample=0.8500000000000003;, score=0.832 total time=279.7min
[CV 3/5] END colsample_bytree=0.55, eval_metric=logloss, learning_rate=0.5, max_depth=9, min_child_weight=6, n_estimators=500, subsample=0.9500000000000004;, score=0.833 total time=226.9min
[CV 2/5] END colsample_bytree=0.6000000000000001, eval_metric=error, learning_rate=0.001, max_depth=10, min_child_weight=3, n_estimators=500, subsample=0.8500000000000003;, score=0.830 total time=283.4min
[CV 1/5] END colsample_bytree=0.55, eval_metric=logloss, learning_rate=0.5, max_depth=9, min_child_weight=6, n_estimators=500, subsample=0.9500000000000004;, score=0.832 total time=224.0min
[CV 5/5] END colsamp

KeyboardInterrupt: 

# Save parameters

In [None]:
with open(f"{models_path}/{model_name}-bst-params.json", "w") as f:
    json.dump(res["hyperparameters"], f, ensure_ascii=False, indent=4)

# Notify end of notebook

In [None]:
from IPython.display import clear_output, display, HTML, Javascript

display(Javascript("""
  var msg = new SpeechSynthesisUtterance();
  msg.text = "Process completed!";
  window.speechSynthesis.speak(msg);
  alert("Process completed!")
"""))