In [1]:
import numpy as np
import awkward as ak
import hist
import warnings
import pickle
from coffea.ml_tools.torch_wrapper import torch_wrapper
import matplotlib.pyplot as plt
import hist
from sklearn.metrics import roc_curve, auc
import math
import os
import uproot
import json

In [2]:
warnings.filterwarnings('ignore', 'invalid value')
warnings.filterwarnings('ignore', 'No format')

In [3]:
hgg = ak.from_parquet('/scratch365/cmoore24/training/data/ecfs/hgg_ecfs.parquet')
qcd = ak.firsts(ak.from_parquet('/scratch365/cmoore24/training/data/ecfs/qcd_ecfs.parquet'))

In [4]:
hgg = hgg[ak.flatten(hgg.msoftdrop < 200)]
hgg = hgg[ak.flatten(hgg.msoftdrop > 40)]
hgg = hgg[ak.flatten(hgg.pt < 600)]
hgg = hgg[ak.flatten(hgg.pt > 450)]

In [5]:
qcd = qcd[qcd.msoftdrop < 200]
qcd = qcd[qcd.msoftdrop > 40]
qcd = qcd[qcd.pt < 600]
qcd = qcd[qcd.pt > 450]

In [6]:
mask = ak.ones_like(hgg[hgg.fields[0]], dtype='bool')
mask = ak.fill_none(mask, True)
for j in hgg.fields:
    if hgg[j].fields == []:
        mask = mask & (~ak.is_none(ak.nan_to_none(hgg[j])))
    else:
        for i in hgg[j].fields:
            mask = mask & (~ak.is_none(ak.nan_to_none(hgg[j][i])))
hgg = hgg[ak.flatten(mask)]

In [7]:
mask = ak.ones_like(qcd[qcd.fields[0]], dtype='bool')
mask = ak.fill_none(mask, True)
for j in qcd.fields:
    if qcd[j].fields == []:
        mask = mask & (~ak.is_none(ak.nan_to_none(qcd[j])))
    else:
        for i in qcd[j].fields:
            mask = mask & (~ak.is_none(ak.nan_to_none(qcd[j][i])))
qcd = qcd[mask]

In [8]:
with open('/scratch365/cmoore24/training/hgg/binary/ecfs_project/ratios_equal0.pkl', 'rb') as f:
    ecf_list0 = pickle.load(f)
with open('/scratch365/cmoore24/training/hgg/binary/ecfs_project/ratios_equal1.pkl', 'rb') as f:
    ecf_list1 = pickle.load(f)
with open('/scratch365/cmoore24/training/hgg/binary/ecfs_project/ratios_equal2.pkl', 'rb') as f:
    ecf_list2 = pickle.load(f)

In [9]:
with open('ratios0.pkl', 'rb') as f:
    ecf_list0 = pickle.load(f)
with open('ratios1.pkl', 'rb') as f:
    ecf_list1 = pickle.load(f)
with open('ratios2.pkl', 'rb') as f:
    ecf_list2 = pickle.load(f)

In [10]:
with open('/scratch365/cmoore24/training/hgg/binary/ecfs_project/ratios_equal0.json', 'r') as f:
    ecf_json0 = json.load(f)
with open('/scratch365/cmoore24/training/hgg/binary/ecfs_project/ratios_equal1.json', 'r') as f:
    ecf_json1 = json.load(f)
with open('/scratch365/cmoore24/training/hgg/binary/ecfs_project/ratios_equal2.json', 'r') as f:
    ecf_json2 = json.load(f)

In [11]:
with open('ratios0.json', 'r') as f:
    ecf_json0 = json.load(f)
with open('ratios1.json', 'r') as f:
    ecf_json1 = json.load(f)
with open('ratios2.json', 'r') as f:
    ecf_json2 = json.load(f)

In [12]:
ecf_list = ecf_list0 + ecf_list1 + ecf_list2
ecf_json = (ecf_json0 | ecf_json1) | ecf_json2

In [13]:
class EnergyCorrelatorFunctionTagger(torch_wrapper):
    def prepare_awkward(self, events, scaler):
        fatjets = events

        imap = {
            'vars': {
                f'{use_ecf}': fatjets.ecf,
            },
        }

        retmap = {
            k: ak.concatenate([x[:, np.newaxis] for x in imap[k].values()], axis=1)
            for k in imap.keys()
        }
        x = ak.values_astype(scaler.transform(retmap['vars']), "float32")
        return (x,), {}

In [14]:
def get_cut(qcd_scores, break_val):
    hrange=(ak.min(qcd_scores), ak.max(qcd_scores))
    proportion=1.0
    i = 0
    while proportion > 0.15:
        qcd_hist = np.histogram(qcd_scores, bins=10000, 
                     range=hrange
                    )
        largest_bin_indices = np.argsort(qcd_hist[0])[-100:]
        largest_bin_vals = qcd_hist[1][largest_bin_indices]
        hrange = (largest_bin_vals[0], ak.max(qcd_scores))
        proportion = sum(qcd_hist[0])/len(qcd_scores)
        # print(proportion)
        i += 1
        if i > break_val:
            break
    cumulative_distribution = np.cumsum(qcd_hist[0][min(largest_bin_indices):max(largest_bin_indices)])
    total_count = cumulative_distribution[-1]
    half_count = total_count / 2
    median_bin_index = np.where(cumulative_distribution >= half_count)[0][0]
    cut = qcd_hist[1][median_bin_index]
    return cut

In [15]:
with open('../jsons/subregion_event_totals.json', 'r') as f:
    totals = json.load(f)
with open('../jsons/my_xsecs.json', 'r') as f:
    xsecs = json.load(f)

In [18]:
a = 0
for use_ecf in ecf_list[:]:
    index = ecf_json[use_ecf]['index']
    model = f'/scratch365/cmoore24/training/hgg/binary/ecfs_project/outputs/models/traced_model{index}.pt'
    scaler = f'/scratch365/cmoore24/training/hgg/binary/ecfs_project/outputs/scalers/scaler{index}.pkl'
    with open(scaler, 'rb') as f:
        scaler = pickle.load(f)
        
    dash = use_ecf.find('/')
    asterisk = use_ecf.find('*')
    numerator = use_ecf[:dash]
    denominator = use_ecf[dash+1:asterisk]
    exponent = use_ecf[asterisk+2:]

    num_hgg = hgg.ECFs[numerator]
    den_hgg = hgg.ECFs[denominator]
    hgg_ecf = num_hgg/(den_hgg**float(exponent))
    nan_mask = np.isnan(hgg_ecf)
    hgg_ecf = hgg_ecf[~nan_mask]

    hgg_sub_array = ak.zip({
                        'ecf': hgg_ecf, 
                        # 'ecf': hgg.d2b1,
                        'msoftdrop': hgg.msoftdrop[~nan_mask],
                        },
                       depth_limit=1,
                      )

    hgg_sub_array = hgg_sub_array[~(hgg_sub_array.ecf == np.inf)]

    num_qcd = qcd.ECFs[numerator]
    den_qcd = qcd.ECFs[denominator]
    qcd_ecf = num_qcd/(den_qcd**float(exponent))
    nan_mask = np.isnan(qcd_ecf)
    qcd_ecf = qcd_ecf[~nan_mask]

    qcd_sub_array = ak.zip({
                        'ecf': qcd_ecf, 
                        # 'ecf': qcd.d2b1,
                        'msoftdrop': qcd.msoftdrop[~nan_mask],
                        },
                       depth_limit=1,
                      )

    qcd_sub_array = qcd_sub_array[~(qcd_sub_array.ecf == np.inf)]
    
    tagger = EnergyCorrelatorFunctionTagger(model)
    hgg_scores = tagger(hgg_sub_array, scaler)[:,0]
    qcd_scores = tagger(qcd_sub_array, scaler)[:,0]

    bkg_zeros = ak.zeros_like(qcd_scores)
    sig_ones = ak.ones_like(hgg_scores)
    combined = ak.concatenate([qcd_scores,hgg_scores])
    combined_truth = ak.concatenate([bkg_zeros, sig_ones])

    try:
        fpr, tpr, thresholds = roc_curve(combined_truth, combined)
    except:
        continue
    roc_auc = auc(fpr, tpr)
    if roc_auc < 0.6:
        continue

    cut = get_cut(qcd_scores, 49)

    mask = ~((qcd_scores > cut))
    qcd_cut_msd = qcd_sub_array.msoftdrop[mask]
    qcd_fail_hist = hist.Hist.new.Reg(40, 40, 200, name='msd', label='QCD MSD').Weight()
    qcd_fail_hist.fill(msd=qcd_cut_msd);

    mask = ((qcd_scores > cut))
    qcd_cut_msd = qcd_sub_array.msoftdrop[mask]
    qcd_pass_hist = hist.Hist.new.Reg(40, 40, 200, name='msd', label='QCD MSD').Weight()
    qcd_pass_hist.fill(msd=qcd_cut_msd);

    scale = ((44.99*(xsecs['qcd']['qcd_470to600']*1000))/totals['qcd']['470to600'])
    qcd_pass_hist.view(flow=True)[:] *= scale
    qcd_fail_hist.view(flow=True)[:] *= scale

    total_qcd_hist = qcd_pass_hist + qcd_fail_hist

    sculpt_value = sum(total_qcd_hist[30:].density() - qcd_pass_hist[30:].density())
    if sculpt_value <= a:
        a = sculpt_value
        print(a)
        print(use_ecf)
        print('\n')

-1.734723475976807e-17
1e3^0.5/1e2^1.0**0.5


-5.898059818321144e-17
1e3^0.5/1e2^1.5**0.3333333333333333


-5.898059818321144e-17
2e3^3.5/1e2^1.0**7.0


-7.28583859910259e-17
1e4^0.5/1e3^2.0**0.25


-7.28583859910259e-17
1e4^0.5/2e3^0.5**0.5


-1.0061396160665481e-16
1e4^0.5/2e3^4.0**0.0625


-1.0408340855860843e-16
1e4^0.5/3e3^1.0**0.16666666666666666


-1.1449174941446927e-16
1e4^2.0/1e3^2.5**0.8


-1.1796119636642288e-16
4e4^1.5/2e3^0.5**6.0


-1.214306433183765e-16
4e5^0.5/1e3^2.5**0.8


-1.249000902703301e-16
4e5^3.0/2e3^0.5**12.0




  return self._module.asarray(obj, dtype=dtype)
  return self._module.asarray(obj, dtype=dtype)
  return self._module.asarray(obj, dtype=dtype)
  return self._module.asarray(obj, dtype=dtype)
  return self._module.asarray(obj, dtype=dtype)


KeyboardInterrupt: 

In [None]:
# 1e2^0.5/1e2^2.0**0.25 / 0.637
# 1e2^4.0/1e2^2.0**2.0 / 0.638
# 1e3^0.5/3e3^2.0**0.08333333333333333 / 0.640
# 1e3^1.5/2e3^4.0**0.1875 / 0.622 ***
# 1e3^3.0/2e3^3.0**0.5 / 0.602
# 1e5^2.0/9e5^1.5**0.14814814814814814 / 0.624 ****
# 1e5^2.0/10e5^0.5**0.4 / 0.612 ****
# 1e5^3.0/10e5^1.5**0.2 /0.615 ****
# 1e5^3.5/3e5^2.5**0.4666666666666667 / 0.617 ****
# 2e5^0.5/6e5^1.0**0.16666666666666666 / 0.650
# 2e5^1.5/2e5^3.5**0.42857142857142855 / 0.624 
# 2e5^1.5/6e5^1.5**0.3333333333333333 / 0.619
# 2e5^2.0/1e5^3.5**1.1428571428571428 / 0.609
# 2e5^3.0/1e5^4.0**1.5 / 0.614
# 3e5^1.5/5e5^4.0**0.225 / 0.611
# 3e5^2.0/3e5^3.5**0.5714285714285714 / 0.613
# 3e5^4.0/2e5^2.5**2.4 / 0.609
# 4e5^0.5/8e5^3.0**0.08333333333333333 / 0.638
# 4e5^1.5/1e5^2.5**2.4 / 0.618
# 6e5^1.0/4e5^3.0**0.5 / 0.635
# 7e5^1.0/2e5^1.0**3.5 / 0.634 meh
# 7e5^1.0/9e5^1.0**0.7777777777777778 / 0.619
# 8e5^0.5/5e5^2.0**0.4 / 0.668
# 9e5^0.5/6e5^3.5**0.21428571428571427 / 0.646
# 9e5^1.0/8e5^3.0**0.375 / 0.613
# 9e5^1.5/2e5^2.0**3.375 / 0.608
# 10e5^0.5/8e5^3.5**0.17857142857142858 / 0.637
# 10e5^1.0/6e5^3.5**0.47619047619047616 / 0.609
# 10e5^1.0/10e5^2.5**0.4 / 0.609
# 
# 
# 2e3^1.0/1e2^3.5**0.5714285714285714 / 0.602
# 3e3^0.5/1e2^3.5**0.42857142857142855 / 0.628
# 1e4^0.5/1e2^3.5**0.14285714285714285 / 0.641
# 1e4^0.5/1e3^3.0**0.16666666666666666 / 0.650
# 1e4^0.5/2e3^1.5**0.16666666666666666 / 0.648
# 1e4^1.0/1e3^2.5**0.4 / 0.628
# 1e4^1.0/1e3^3.5**0.2857142857142857 / 0.629
# 1e4^1.0/2e3^2.0**0.25 / 0.628
# 1e4^1.0/2e3^3.0**0.16666666666666666 / 0.630
# 1e4^1.0/3e3^1.5**0.2222222222222222 / 0.628
# 1e4^1.5/1e3^4.0**0.375 / 0.621
# 1e4^1.5/2e3^1.0**0.75 / 0.605
# 1e4^1.5/2e3^3.0**0.25 / 0.621
# 1e4^2.0/1e3^4.0**0.5 / 0.615
# 1e4^2.0/2e3^2.0**0.5 / 0.611
# 1e4^2.0/2e3^3.0**0.3333333333333333 / 0.615
# 1e4^2.0/2e3^4.0**0.25 / 0.617
# 1e4^2.0/3e3^1.5**0.4444444444444444 / 0.610
# 1e4^2.5/2e3^3.0**0.4166666666666667 / 0.610
# 2e4^0.5/1e3^4.0**0.25 / 0.643
# 2e4^1.0/3e3^2.0**0.3333333333333333 / 0.621
# 3e4^0.5/1e3^0.5**3.0 / 0.631
# 3e4^0.5/2e3^3.0**0.25 / 0.644
# 3e4^1.0/3e3^1.5**0.6666666666666666 / 0.617
# 4e4^0.5/3e3^1.0**0.6666666666666666 / 0.643
# 4e4^1.5/2e3^3.5**0.8571428571428571 / 0.613
# 5e4^0.5/2e3^3.5**0.35714285714285715 / 0.657
# 6e4^0.5/3e3^2.0**0.5 / 0.659
# 1e5^1.0/1e3^2.5**0.4 / 0.638
# 1e5^1.0/3e3^2.5**0.13333333333333333 / 0.640
# 1e5^1.5/3e3^3.5**0.14285714285714285 / 0.623
# 1e5^1.5/3e4^2.0**0.25 / 0.630
# 1e5^2.0/3e3^4.0**0.16666666666666666 / 0.623
# 1e5^3.0/2e3^2.5**0.6 / 0.608
# 1e5^3.0/3e3^4.0**0.25 / 0.615
# 2e5^0.5/1e3^4.0**0.25 / 0.647
# 2e5^0.5/2e3^0.5**1.0 / 0.635
# 2e5^0.5/3e3^2.0**0.16666666666666666 / 0.646
# 3e5^0.5/3e3^0.5**1.0 / 0.623
# 3e5^1.5/1e3^4.0**1.125 / 0.601
# 4e5^0.5/1e2^3.0**0.6666666666666666 / 0.616
# 4e5^0.5/1e3^2.5**0.8 / 0.644
# 5e5^0.5/1e3^2.5**1.0 / 0.651 MAYBE
# 5e5^0.5/2e3^3.0**0.4166666666666667 / 0.643
# 5e5^0.5/3e3^1.0**0.8333333333333334 / 0.632
# 5e5^1.0/3e3^3.5**0.47619047619047616 / 0.622
# 6e5^0.5/1e2^4.0**0.75 / 0.617
# 6e5^1.5/3e4^3.0**1.0 / 0.622
# 7e5^0.5/3e3^1.5**0.7777777777777778 / 0.645
# 7e5^1.0/2e3^3.0**1.1666666666666667 / 0.613
# 8e5^1.0/3e3^2.0**1.3333333333333333 / 0.600
# 9e5^0.5/3e3^3.5**0.42857142857142855 / 0.653
# 9e5^1.0/3e3^2.5**1.2 / 0.614
# 10e5^0.5/2e3^2.5**1.0 / 0.650
# 10e5^0.5/2e3^3.5**0.7142857142857143 / 0.652