In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import glob
import json
import gc

# Keep rendered DataFrames short: show at most 10 rows.
pd.set_option("display.max_rows", 10)

## Implementation of metrics

In [2]:
from sklearn.metrics import roc_curve
from scipy.interpolate import interp1d

def rej_fixed_eff(truth, score, weight, efficiencies):
    """Background rejection at the requested signal efficiencies.

    A weighted ROC curve is built with ``roc_curve``. Rejection is taken as
    the inverse of the false-positive rate, and the rejection-versus-
    true-positive-rate curve is interpolated (``interp1d`` defaults) and
    evaluated at ``efficiencies``.

    Parameters
    ----------
    truth : array-like
        True labels, forwarded to ``roc_curve``.
    score : array-like
        Classifier scores, forwarded to ``roc_curve``.
    weight : array-like
        Per-sample weights, forwarded as ``sample_weight``.
    efficiencies : array-like or float
        True-positive-rate value(s) at which the rejection is evaluated.

    Returns
    -------
    numpy.ndarray
        Interpolated rejection for each entry of ``efficiencies``.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(truth, score, sample_weight=weight)

    # Points with zero false-positive rate would give an infinite rejection,
    # so they are excluded before inverting.
    keep = false_pos_rate != 0

    rejection_at = interp1d(true_pos_rate[keep], 1.0 / false_pos_rate[keep], copy=False)
    return rejection_at(efficiencies)

In [3]:
# Input variable names: used both to select the columns read from the
# HDF5 samples and as the feature columns of the xgboost DMatrix.
# NOTE(review): "1p" presumably stands for 1-prong tau ID -- confirm.
tid_1pvars = [
    "TauJets.centFrac",
    "TauJets.etOverPtLeadTrk",
    "TauJets.innerTrkAvgDist",
    "TauJets.absipSigLeadTrk",
    "TauJets.SumPtTrkFrac",
    "TauJets.ChPiEMEOverCaloEME",
    "TauJets.EMPOverTrkSysP",
    "TauJets.ptRatioEflowApprox",
    "TauJets.mEflowApprox",
]

## Load samples and define analysis

In [4]:
def _load_sample(path):
    """Read one HDF5 sample and prepare it for xgboost.

    Only the input variables in ``tid_1pvars`` plus the ``"weight"`` and
    ``"is_sig"`` columns are read from ``path``.

    Returns
    -------
    tuple
        ``(weights, is_sig, dmatrix)``: the weight and label columns as numpy
        arrays, and an ``xgb.DMatrix`` built from the input variables with
        those labels and weights attached.
    """
    sample = pd.read_hdf(path, columns=tid_1pvars + ["weight", "is_sig"])

    # Series.get_values() was deprecated in pandas 0.25 and removed in 1.0;
    # to_numpy() is the documented replacement.
    weights = sample["weight"].to_numpy()
    is_sig = sample["is_sig"].to_numpy()
    dmatrix = xgb.DMatrix(sample[tid_1pvars], label=sample["is_sig"], weight=sample["weight"])

    # `sample` is a local, so the DataFrame is released when the function
    # returns; no explicit deletion is needed.
    return weights, is_sig, dmatrix


train_weights, train_is_sig, dtrain = _load_sample("samples/train.h5")
# Collect between the two loads so both DataFrames are not kept alive at once.
gc.collect()

test_weights, test_is_sig, dtest = _load_sample("samples/test.h5")
gc.collect()

162

In [5]:
def do_analysis(filename, max_trees=1000):
    """Evaluate one trained model on the test sample.

    Parameters
    ----------
    filename : str
        Path to a JSON model description. It must contain an ``"identifier"``
        (the booster is loaded from ``models/<identifier>.model``) and a
        ``"config"`` mapping.
    max_trees : int, optional
        Upper bound on the number of trees used for prediction. Defaults to
        1000, the cap that was previously hard-coded.

    Returns
    -------
    dict
        ``"best_iteration"``, ``"id"`` and the rejections ``"rej30"``,
        ``"rej50"``, ``"rej70"`` (at 30%, 50% and 70% efficiency on the test
        sample), updated with every entry of the model's ``"config"``.
        Config entries overwrite metric entries that share a key.
    """
    with open(filename) as f:
        model_desc = json.load(f)

    bst = xgb.Booster(model_file="models/{}.model".format(model_desc["identifier"]))
    best_iteration = int(bst.attributes()["best_iteration"])

    # The "best_iteration" attribute is a 0-based round index, so the best
    # model consists of best_iteration + 1 trees. Passing the index itself
    # drops the best round, and an index of 0 would be read by xgboost as
    # "no limit" (all trees).
    # NOTE(review): assumes num_parallel_tree == 1 -- confirm.
    ntree = min(max_trees, best_iteration + 1)

    # NOTE(review): newer xgboost releases replace `ntree_limit` with
    # `iteration_range`; confirm against the installed version.
    # Only test-sample scores are needed for the metrics below.
    test_scores = bst.predict(dtest, ntree_limit=ntree)

    rej30, rej50, rej70 = rej_fixed_eff(test_is_sig, test_scores, test_weights, [0.3, 0.5, 0.7])

    # Collection of metrics
    ret = {
        "best_iteration": best_iteration,
        "id": model_desc["identifier"],
        "rej30": rej30,
        "rej50": rej50,
        "rej70": rej70,
    }
    ret.update(model_desc["config"])

    return ret

## Select processed files

In [6]:
# Collect the description files of all models flagged as processed.
# glob.glob() returns paths in arbitrary order, so they are sorted to keep
# the analysis order (and the row order of the results table) reproducible.
processed = []
for model_desc in sorted(glob.glob("models/*.json")):
    with open(model_desc) as f:
        desc = json.load(f)
    if desc["processed"]:
        processed.append(model_desc)

## Loop over analysis

In [None]:
# Accumulator for the per-model result dictionaries.
rets = list()

In [None]:
# Start from an empty list so that re-running this cell does not append a
# second copy of every result.
rets = []
for model_desc in processed:
    ret = do_analysis(model_desc)
    print(ret)  # progress output, one line per analysed model
    rets.append(ret)

In [18]:
# One row per analysed model, one column per metric / config key.
x = pd.DataFrame(data=rets)

In [19]:
# Rank the models by rejection at 50% efficiency, best first.
x.sort_values(by="rej50", ascending=False)

Unnamed: 0,alpha,best_iteration,colsample_bylevel,colsample_bytree,eta,gamma,id,lambda,max_delta_step,max_depth,min_child_weight,objective,rej30,rej50,rej70,scale_pos_weight,sketch_eps,subsample,tree_method
2,0,8775,1,1,0.01,0,2,1,0,8,2500.0,binary:logistic,351.085125,117.663566,44.810868,1,0.005,1,auto
1,0,9856,1,1,0.01,0,1,1,0,6,2500.0,binary:logistic,330.235617,112.548047,43.38682,1,0.005,1,auto
0,0,9998,1,1,0.01,0,0,1,0,4,2500.0,binary:logistic,303.019898,105.218275,41.181184,1,0.005,1,auto
