In [1]:
import numpy as np
import awkward as ak
import hist
import warnings
import pickle
from coffea.ml_tools.torch_wrapper import torch_wrapper
import hist
from sklearn.metrics import roc_curve, auc
import os
import json

In [2]:
warnings.filterwarnings('ignore', 'invalid value')
warnings.filterwarnings('ignore', 'No format')
warnings.filterwarnings('ignore', 'overflow encountered in cast')
warnings.filterwarnings('ignore', 'divide by zero encountered in divide')

In [3]:
# Load the Hgg sample (treated as signal in the scan loop further down).
# The selection cells below wrap every hgg mask in ak.flatten, so the jet
# fields are presumably nested one level per event — TODO confirm layout.
hgg = ak.from_parquet('/scratch365/cmoore24/training/data/ecfs/hgg_ecfs.parquet')

In [4]:
# Load the QCD sample (treated as background in the scan loop further down).
# ak.firsts keeps only the first entry of each sub-list, so qcd is indexed
# directly below, without the ak.flatten used for hgg.
qcd = ak.firsts(ak.from_parquet('/scratch365/cmoore24/training/data/ecfs/qcd_ecfs.parquet'))

In [5]:
# Kinematic window for hgg: 40 < msoftdrop < 200 and 450 < pt < 600.
# Requirements are applied one after another, each evaluated on the array
# left over from the previous requirement.
for _passes in (
    lambda jets: jets.msoftdrop < 200,
    lambda jets: jets.msoftdrop > 40,
    lambda jets: jets.pt < 600,
    lambda jets: jets.pt > 450,
):
    hgg = hgg[ak.flatten(_passes(hgg))]

In [6]:
# Kinematic window for qcd: 40 < msoftdrop < 200 and 450 < pt < 600.
# Requirements are applied one after another, each evaluated on the array
# left over from the previous requirement.
for _passes in (
    lambda jets: jets.msoftdrop < 200,
    lambda jets: jets.msoftdrop > 40,
    lambda jets: jets.pt < 600,
    lambda jets: jets.pt > 450,
):
    qcd = qcd[_passes(qcd)]

In [7]:
# Drop hgg entries that have a missing or NaN value in any leaf column.
# A "leaf" is either a top-level field without sub-fields, or a sub-field
# of a record-typed top-level field.
hgg_leaves = []
for field_name in hgg.fields:
    column = hgg[field_name]
    if column.fields == []:
        hgg_leaves.append(column)
    else:
        hgg_leaves.extend(column[sub_name] for sub_name in column.fields)

# Start from an all-True mask shaped like the first field.
mask = ak.fill_none(ak.ones_like(hgg[hgg.fields[0]], dtype='bool'), True)
# NOTE(review): ak.is_none is called with its default axis (0). If the hgg
# columns are nested one level per event, NaNs converted to None inside the
# sub-lists would not be flagged here — confirm whether axis=1 is intended.
for leaf in hgg_leaves:
    mask = mask & (~ak.is_none(ak.nan_to_none(leaf)))
hgg = hgg[ak.flatten(mask)]

In [8]:
# Drop qcd entries that have a missing or NaN value in any leaf column.
# A "leaf" is either a top-level field without sub-fields, or a sub-field
# of a record-typed top-level field.
qcd_leaves = []
for field_name in qcd.fields:
    column = qcd[field_name]
    if column.fields == []:
        qcd_leaves.append(column)
    else:
        qcd_leaves.extend(column[sub_name] for sub_name in column.fields)

# Start from an all-True mask shaped like the first field, then AND in the
# "value is present and not NaN" test for every leaf.
mask = ak.fill_none(ak.ones_like(qcd[qcd.fields[0]], dtype='bool'), True)
for leaf in qcd_leaves:
    mask = mask & (~ak.is_none(ak.nan_to_none(leaf)))
qcd = qcd[mask]

In [9]:
# Read the three pickled ECF-ratio lists from the project directory.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
# NOTE(review): the following cell assigns ecf_list0/1/2 again from files in
# the working directory, so on a top-to-bottom run these values are replaced.
_loaded_lists = []
for _pickle_path in (
    '/scratch365/cmoore24/training/hgg/binary/ecfs_project/ratios_equal0.pkl',
    '/scratch365/cmoore24/training/hgg/binary/ecfs_project/ratios_equal1.pkl',
    '/scratch365/cmoore24/training/hgg/binary/ecfs_project/ratios_equal2.pkl',
):
    with open(_pickle_path, 'rb') as f:
        _loaded_lists.append(pickle.load(f))
ecf_list0, ecf_list1, ecf_list2 = _loaded_lists

In [10]:
# Read the three pickled ECF-ratio lists from the working directory.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
# NOTE(review): this rebinds the ecf_list0/1/2 names that the preceding cell
# loaded from the project directory.
_loaded_lists = []
for _pickle_path in ('ratios0.pkl', 'ratios1.pkl', 'ratios2.pkl'):
    with open(_pickle_path, 'rb') as f:
        _loaded_lists.append(pickle.load(f))
ecf_list0, ecf_list1, ecf_list2 = _loaded_lists

In [11]:
# Read the three ECF-ratio JSON files from the project directory.
# NOTE(review): the following cell assigns ecf_json0/1/2 again from files in
# the working directory, so on a top-to-bottom run these values are replaced.
_loaded_json = []
for _json_path in (
    '/scratch365/cmoore24/training/hgg/binary/ecfs_project/ratios_equal0.json',
    '/scratch365/cmoore24/training/hgg/binary/ecfs_project/ratios_equal1.json',
    '/scratch365/cmoore24/training/hgg/binary/ecfs_project/ratios_equal2.json',
):
    with open(_json_path, 'r') as f:
        _loaded_json.append(json.load(f))
ecf_json0, ecf_json1, ecf_json2 = _loaded_json

In [12]:
# Read the three ECF-ratio JSON files from the working directory.
# NOTE(review): this rebinds the ecf_json0/1/2 names that the preceding cell
# loaded from the project directory.
_loaded_json = []
for _json_path in ('ratios0.json', 'ratios1.json', 'ratios2.json'):
    with open(_json_path, 'r') as f:
        _loaded_json.append(json.load(f))
ecf_json0, ecf_json1, ecf_json2 = _loaded_json

In [13]:
# Join the three ratio collections into the single sequence that the scan
# loop iterates over (presumably plain lists, so `+` concatenates — confirm).
ecf_list = ecf_list0 + ecf_list1 + ecf_list2

In [14]:
# Merge the three ratio-metadata dicts; on a duplicate key the later dict wins.
ecf_json = {**ecf_json0, **ecf_json1, **ecf_json2}

In [15]:
# Normalisation inputs used when scaling the histograms in the scan loop:
# per-sample event totals and cross sections.
with open('../jsons/subregion_event_totals.json', 'r') as totals_file, \
        open('../jsons/my_xsecs.json', 'r') as xsecs_file:
    totals = json.load(totals_file)
    xsecs = json.load(xsecs_file)

In [16]:
class EnergyCorrelatorFunctionTagger(torch_wrapper):
    """torch_wrapper subclass whose model input is a single scaled `ecf` column."""

    def prepare_awkward(self, events, scaler, function):
        """Build the model inputs from ``events.ecf``.

        Parameters
        ----------
        events
            Array exposing an ``ecf`` field.
        scaler
            Object with a ``transform`` method; applied to the stacked column.
        function
            Label used as the key of the single input column.

        Returns
        -------
        tuple
            ``((x,), {})`` where ``x`` is the scaler output cast to float32.
        """
        columns = {'vars': {f'{function}': events.ecf}}

        # Give every column a trailing axis and join the columns side by side.
        stacked = {}
        for group, members in columns.items():
            stacked[group] = ak.concatenate(
                [column[:, np.newaxis] for column in members.values()],
                axis=1,
            )

        scaled = scaler.transform(stacked['vars'])
        return (ak.values_astype(scaled, "float32"),), {}

In [17]:
def get_cut(qcd_scores, break_val, bins=10000, n_top=100, max_fraction=0.60):
    """Pick a score threshold at the median of the most populated score region.

    The scores are histogrammed repeatedly. After each pass the lower end of
    the histogram range moves up to the lower edge of the least populated of
    the ``n_top`` fullest bins, which zooms in on the peak. Zooming stops once
    the histogram holds at most ``max_fraction`` of all scores, or after more
    than ``break_val`` passes. The cut is the lower edge of the bin where the
    cumulative count, taken over the span of the ``n_top`` fullest bins of the
    final histogram, first reaches half of that span's total.

    Parameters
    ----------
    qcd_scores : array-like
        One-dimensional collection of scores.
    break_val : int
        Zooming stops once the number of passes exceeds this value.
    bins : int, optional
        Number of histogram bins per pass (default 10000).
    n_top : int, optional
        How many of the fullest bins define the peak region (default 100).
        Must be at least 1.
    max_fraction : float, optional
        Zooming continues while the histogram holds more than this fraction
        of the scores (default 0.60).

    Returns
    -------
    float
        The selected bin edge.

    Raises
    ------
    ValueError
        If ``qcd_scores`` is empty.
    """
    n_scores = len(qcd_scores)
    if n_scores == 0:
        raise ValueError("get_cut requires at least one score")

    # The upper end of the range never moves, so compute it once.
    score_max = ak.max(qcd_scores)
    hrange = (ak.min(qcd_scores), score_max)

    proportion = 1.0
    passes = 0
    while proportion > max_fraction:
        counts, edges = np.histogram(qcd_scores, bins=bins, range=hrange)
        # argsort is ascending, so the tail holds the n_top fullest bins and
        # top_bins[0] is the least populated of them.
        top_bins = np.argsort(counts)[-n_top:]
        hrange = (edges[top_bins[0]], score_max)
        proportion = counts.sum() / n_scores
        passes += 1
        if passes > break_val:
            break

    # Span of the fullest bins, inclusive of the highest-index one.
    first_bin = top_bins.min()
    last_bin = top_bins.max()
    cumulative = np.cumsum(counts[first_bin:last_bin + 1])
    median_offset = np.where(cumulative >= cumulative[-1] / 2)[0][0]
    # median_offset counts from first_bin, so shift it back to an index into
    # the full edge array before looking up the edge.
    return edges[first_bin + median_offset]

In [18]:
# Start from an empty JSON object; the scan loop below re-reads this file and
# adds one entry per ECF ratio.
with open('ecf_results.json', 'w') as results_file:
    results_file.write(json.dumps({}))

In [19]:
# Scan every ECF ratio: score both samples with that ratio's model, then store
# the ROC AUC, a mass-sculpting metric and a signal-significance figure in
# 'ecf_results.json'. The file is re-read and re-written on every iteration,
# so results gathered so far survive an interrupted run.
for ecf in ecf_list[:]:
    # --- Per-ratio model and scaler ---------------------------------------
    index = ecf_json[ecf]['index']
    model = f'/scratch365/cmoore24/training/hgg/binary/ecfs_project/outputs/models/traced_model{index}.pt'
    scaler_path = f'/scratch365/cmoore24/training/hgg/binary/ecfs_project/outputs/scalers/scaler{index}.pkl'
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(scaler_path, 'rb') as f:
        scaler = pickle.load(f)

    # --- Parse '<numerator>/<denominator>**<exponent>' ---------------------
    dash = ecf.find('/')
    asterisk = ecf.find('*')
    numerator = ecf[:dash]
    denominator = ecf[dash+1:asterisk]
    exponent = ecf[asterisk+2:]  # skip both characters of '**'

    # --- Ratio for the signal sample, with NaN and +inf entries removed ----
    num_hgg = hgg.ECFs[numerator]
    den_hgg = hgg.ECFs[denominator]
    hgg_ecf = num_hgg/(den_hgg**float(exponent))
    nan_mask = np.isnan(hgg_ecf)
    hgg_ecf = hgg_ecf[~nan_mask]

    hgg_sub_array = ak.zip({
                        'ecf': hgg_ecf,
                        'msoftdrop': hgg.msoftdrop[~nan_mask],
                        },
                       depth_limit=1,
                      )

    hgg_sub_array = hgg_sub_array[~(hgg_sub_array.ecf == np.inf)]

    # --- Same for the background sample ------------------------------------
    num_qcd = qcd.ECFs[numerator]
    den_qcd = qcd.ECFs[denominator]
    qcd_ecf = num_qcd/(den_qcd**float(exponent))
    nan_mask = np.isnan(qcd_ecf)
    qcd_ecf = qcd_ecf[~nan_mask]

    qcd_sub_array = ak.zip({
                        'ecf': qcd_ecf,
                        'msoftdrop': qcd.msoftdrop[~nan_mask],
                        },
                       depth_limit=1,
                      )

    qcd_sub_array = qcd_sub_array[~(qcd_sub_array.ecf == np.inf)]

    # --- Model scores; entries with a NaN score are dropped -----------------
    tagger = EnergyCorrelatorFunctionTagger(model)
    hgg_scores = tagger(hgg_sub_array, scaler, ecf)[:,0]
    qcd_scores = tagger(qcd_sub_array, scaler, ecf)[:,0]

    nan_mask2 = np.isnan(hgg_scores)
    hgg_sub_array = hgg_sub_array[~nan_mask2]
    hgg_scores = hgg_scores[~nan_mask2]

    nan_mask2 = np.isnan(qcd_scores)
    qcd_sub_array = qcd_sub_array[~nan_mask2]
    qcd_scores = qcd_scores[~nan_mask2]

    # --- ROC AUC with background labelled 0 and signal labelled 1 -----------
    bkg_zeros = ak.zeros_like(qcd_scores)
    sig_ones = ak.ones_like(hgg_scores)
    combined = ak.concatenate([qcd_scores,hgg_scores])
    combined_truth = ak.concatenate([bkg_zeros, sig_ones])

    try:
        fpr, tpr, thresholds = roc_curve(combined_truth, combined)
        roc_auc = auc(fpr, tpr)
    except Exception:
        # Best effort: record the ratio as failed and move on. Catching
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        with open('ecf_results.json', 'r') as f:
            results = json.load(f)

        results[ecf] = {'roc_auc': None, 'sculpt_metric': None, 's_sb': None}

        with open('ecf_results.json', 'w') as f:
            json.dump(results, f)
        continue

    # NOTE(review): get_cut names its first parameter `qcd_scores`, but the
    # signal scores are passed here — confirm which sample should set the cut.
    cut = get_cut(hgg_scores, 50)

    # --- Soft-drop mass histograms for the fail / pass regions --------------
    mask = ~((qcd_scores > cut))
    qcd_cut_msd = qcd_sub_array.msoftdrop[mask]
    qcd_fail_hist = hist.Hist.new.Reg(40, 40, 200, name='msd', label='QCD MSD').Weight()
    qcd_fail_hist.fill(msd=qcd_cut_msd);

    mask = ~((hgg_scores > cut))
    hgg_cut_msd = ak.flatten(hgg_sub_array.msoftdrop[mask])
    hgg_fail_hist = hist.Hist.new.Reg(40, 40, 200, name='msd', label='Hgg MSD').Weight()
    hgg_fail_hist.fill(msd=hgg_cut_msd);

    mask = ((qcd_scores > cut))
    qcd_cut_msd = qcd_sub_array.msoftdrop[mask]
    qcd_pass_hist = hist.Hist.new.Reg(40, 40, 200, name='msd', label='QCD MSD').Weight()
    qcd_pass_hist.fill(msd=qcd_cut_msd);

    mask = ((hgg_scores > cut))
    hgg_cut_msd = ak.flatten(hgg_sub_array.msoftdrop[mask])
    hgg_pass_hist = hist.Hist.new.Reg(40, 40, 200, name='msd', label='Hgg MSD').Weight()
    hgg_pass_hist.fill(msd=hgg_cut_msd);

    # --- Normalise the histograms -------------------------------------------
    # NOTE(review): 44.99, 0.0817 and the factor 1000 are unexplained
    # constants (presumably luminosity, a branching fraction and a unit
    # conversion — confirm). Only the 'qcd_470to600' entry is used for QCD.
    scale = ((44.99*(xsecs['qcd']['qcd_470to600']*1000))/totals['qcd']['470to600'])
    qcd_pass_hist.view(flow=True)[:] *= scale
    qcd_fail_hist.view(flow=True)[:] *= scale

    scale = ((44.99*(xsecs['hgg']*0.0817*1000))/totals['hgg'])
    hgg_pass_hist.view(flow=True)[:] *= scale
    hgg_fail_hist.view(flow=True)[:] *= scale

    total_qcd_hist = qcd_pass_hist + qcd_fail_hist

    # Sculpting: summed absolute difference between the mass-shape densities
    # of all QCD and of QCD passing the cut.
    sculpt_metric = sum(abs(total_qcd_hist.density() - qcd_pass_hist.density()))

    # Per-bin s/sqrt(s+b) in the pass region, combined in quadrature.
    sorsb = hgg_pass_hist.values() / np.sqrt(hgg_pass_hist.values() + qcd_pass_hist.values())
    strength = np.sqrt(np.sum(sorsb**2))

    # --- Store the result ---------------------------------------------------
    with open('ecf_results.json', 'r') as f:
        results = json.load(f)

    # NaN is not valid JSON, so every NaN metric is stored as null, matching
    # the nulls written when the ROC computation fails.
    results[ecf] = {
        'roc_auc': None if np.isnan(roc_auc) else roc_auc,
        'sculpt_metric': None if np.isnan(sculpt_metric) else sculpt_metric,
        's_sb': None if np.isnan(strength) else strength,
    }

    with open('ecf_results.json', 'w') as f:
        json.dump(results, f)

  return impl(*broadcasted_args, **(kwargs or {}))


In [85]:
ecf

'2e4^3.5/2e4^0.5**7.0'

In [86]:
ecf_list.index(ecf)

1274

In [87]:
len(ecf_list)

9184

In [None]:
1187