In [1]:
%load_ext autoreload

In [2]:
%autoreload 1

In [3]:
%aimport event_selector
%aimport looper_utils
%aimport make_plots

In [4]:
import awkward as ak
import uproot
import numpy as np
from yahist import Hist1D, Hist2D

## Skim Parameters

In [39]:
# Data-taking year. It selects which ./metadata/chunklist_<year>_<skim_version>.json
# is loaded below and is appended to the names of the histogram files written at the end.
year = '2018'
# Skim version tag, used together with `year` in the same input and output file names.
skim_version = 'v4'  #currently available x3 or v4

## Load data

In [40]:
%%bash
ls metadata/

chunklist.json
chunklist_2016_v4.json
chunklist_2017_v4.json
chunklist_2018_v4.json
totalWeights.json
totalWeights_2016_v4.json
totalWeights_2017_v4.json
totalWeights_2018_v4.json
xsection.json
xsection_2016_v4.json
xsection_2017_v4.json
xsection_2018_v4.json
xsection_v4.json


In [41]:
import json

# Chunk list for the configured year / skim version, keyed by sample name.
chunklist_path = './metadata/chunklist_' + year + '_' + skim_version + '.json'
with open(chunklist_path) as chunklist_file:
    chunks = json.load(chunklist_file)
chunks.keys()

dict_keys(['EGamma_2018A', 'EGamma_2018B', 'EGamma_2018C', 'EGamma_2018D', 'DYJets', 'ttbar', 'ttG', 'ttGG', 'ZG', 'WG', 'GJets_HT40To100', 'GJets_HT100To200', 'GJets_HT200To400', 'GJets_HT400To600', 'GJets_HT600ToInf', 'QCD_pT30To40', 'QCD_pT40ToInf', 'Diphoton', 'VH', 'signal'])

## Process data

In [42]:
def process(args):
    """Select events from one input chunk and fill histograms per event category.

    Parameters
    ----------
    args : sequence
        Chunk descriptor, forwarded unchanged to ``event_selector.prepare_inputs``
        and ``event_selector.get_gHidx``. Only ``args[0]`` is inspected here, with
        substring checks that flag data ("EGamma"/"DoubleEG") and signal-like
        ("HHggtautau"/"VH") inputs.
        NOTE(review): presumably (file name, first entry, last entry) -- confirm
        against ``event_selector``.

    Returns
    -------
    dict
        ``{category: {histogram name: histogram}}`` for the categories
        "dipho", "0lep_1tau", "0lep_2tau", "1lep_1tau", "2lep_0tau" and
        "1lep_0tau". The "1lep_1tau" entry additionally holds the histograms
        produced by ``make_plots.process_1tau_1lep``.
    """
    # Project modules are imported inside the function, presumably so that the
    # function resolves them itself when it is executed on a Dask worker.
    import event_selector
    import looper_utils
    import make_plots

    obj_list = ["electron", "muon", "tau", "photon", "others"]

    isData = "EGamma" in args[0] or "DoubleEG" in args[0]
    isSigLike = "HHggtautau" in args[0] or "VH" in args[0]
    events = event_selector.prepare_inputs(args, obj_list, isData=isData)

    mgg = events.ggMass
    weights = events.genWeight

    ## need to change gHidx
    gHidx = event_selector.get_gHidx(args)
    g_Hidx = gHidx.gHidx
    # Event-level diphoton selection (the "_v1" variant does not take g_Hidx).
    mask_diphoton = event_selector.select_photon_byEvent_v1(events.Photon, mgg, isSigLike)

    # Object-level photon selection; the selected photons drive the overlap removal below.
    mask_photon = event_selector.select_photon(events.Photon, g_Hidx, mgg)
    photons_selected = events.Photon[mask_photon]
    nPho = ak.num(photons_selected)

    # Loose taus / electrons / muons, each cleaned against the SELECTED photons
    # with a dR threshold of 0.2.
    mask_tau = event_selector.select_tau(events.Tau, "all", isTight=False)
    mask_tau = mask_tau & looper_utils.mask_by_dR(events.Tau, photons_selected, 0.2)
    nTau = ak.num(events.Tau[mask_tau])

    mask_ele = event_selector.select_electron(events.Electron, isTight=False)
    mask_ele = mask_ele & looper_utils.mask_by_dR(events.Electron, photons_selected, 0.2)
    nEle = ak.num(events.Electron[mask_ele])

    mask_mu = event_selector.select_muon(events.Muon, isTight=False)
    mask_mu = mask_mu & looper_utils.mask_by_dR(events.Muon, photons_selected, 0.2)
    nMu = ak.num(events.Muon[mask_mu])

    # Per-event boolean: the charges of all selected taus, electrons and muons sum to zero.
    sum_charge = (
        ak.sum(events.Tau[mask_tau].charge, axis=1)
        + ak.sum(events.Electron[mask_ele].charge, axis=1)
        + ak.sum(events.Muon[mask_mu].charge, axis=1)
        == 0
    )

    # dR / invariant-mass requirements on the lepton pair. The mass window is left
    # at (0, 500) here; previously used windows were (30, 140) for tau-tau and
    # (20, 120) for lepton-tau.
    mask_dR_mll_0lep_2tau = looper_utils.mask_by_dR_mll_0lep_2tau(events.Tau[mask_tau], (0.2,3.5), (0,500))
    mask_dR_mll_1mu_1tau = looper_utils.mask_by_dR_mll_1lep_1tau(events.Tau[mask_tau], events.Muon[mask_mu], (0.3,3.5), (0,500))
    mask_dR_mll_1ele_1tau = looper_utils.mask_by_dR_mll_1lep_1tau(events.Tau[mask_tau], events.Electron[mask_ele], (0.3,3.5), (0,500))

    # Event categories, all built on top of the diphoton preselection.
    mask_dipho     = mask_diphoton & (nPho == 2)
    mask_0lep_1tau = mask_dipho    & (nTau == 1) & (nEle == 0) & (nMu == 0)
    mask_0lep_2tau = mask_dipho    & (nTau == 2) & (nEle == 0) & (nMu == 0) & (sum_charge) & (mask_dR_mll_0lep_2tau)
    mask_2lep_0tau = mask_dipho    & (nTau == 0) & (nEle + nMu == 2) & (sum_charge)
    mask_1lep_0tau = mask_dipho    & (nTau == 0) & (nEle + nMu == 1)

    mask_1mu_1tau  = mask_dipho    & (nTau == 1) & (nEle == 0) & (nMu == 1) & (sum_charge) & (mask_dR_mll_1mu_1tau)
    mask_1ele_1tau = mask_dipho    & (nTau == 1) & (nEle == 1) & (nMu == 0) & (sum_charge) & (mask_dR_mll_1ele_1tau)
    mask_1lep_1tau = (mask_1mu_1tau) | (mask_1ele_1tau)

    masks = {"dipho": mask_dipho,
             "0lep_1tau": mask_0lep_1tau,
             "0lep_2tau": mask_0lep_2tau,
             "1lep_1tau": mask_1lep_1tau,
             "2lep_0tau": mask_2lep_0tau,
             "1lep_0tau": mask_1lep_0tau}

    hists = {}

    for key, mask in masks.items():
        print (key)
        # Slice once per category instead of once per histogramming call.
        cat_weights = weights[mask]
        cat_taus = events.Tau[mask_tau][mask]
        cat_muons = events.Muon[mask_mu][mask]
        cat_electrons = events.Electron[mask_ele][mask]

        hists[key] = make_plots.process_event(events[mask], cat_weights)
        hists[key].update(make_plots.process_diphoton(events.Photon[mask], g_Hidx[mask], mgg[mask], cat_weights))
        hists[key].update(make_plots.process_tau(cat_taus, cat_weights))
        hists[key].update(make_plots.process_muon(cat_muons, cat_weights))
        hists[key].update(make_plots.process_electron(cat_electrons, cat_weights))
        if key == "1lep_1tau":
            # update() merges in place and returns None, so its result is not kept.
            hists[key].update(make_plots.process_1tau_1lep(cat_taus, cat_muons, cat_electrons, cat_weights))

    return hists

In [11]:
%%time
# Run the selection in this kernel on the first chunk of the "signal" sample and time it.
hists = process(chunks["signal"][0])
# Alternative inputs kept for reference:
#  - an explicit tuple built from the chunk's first element plus (0, 200)
#    (NOTE(review): presumably an entry range -- confirm against event_selector)
#  - a "DoubleEG_Run2016B" chunk; that key is not part of the 2018 chunk list
#hists = process((chunks["signal"][0][0], 0, 200) )
#hists = process(chunks["DoubleEG_Run2016B"][0])

dipho
0lep_1tau
0lep_2tau
1lep_1tau
2lep_0tau
1lep_0tau
CPU times: user 5.9 s, sys: 78.2 ms, total: 5.97 s
Wall time: 7.16 s


In [12]:
#hists['dipho']['MET']
hists['0lep_2tau']['MET']

bin,content
"(0,5)",21.99 ± 4.68829
"(5,10)",43 ± 6.55744
"(10,15)",65 ± 8.06226
"(15,20)",59 ± 7.68115
[92 rows hidden],[92 rows hidden]
"(480,485)",0 ± 0
"(485,490)",0 ± 0
"(490,495)",0 ± 0
"(495,500)",11 ± 3.31662


In [11]:
hists['dipho']['pho_pT1'].integral
#hists['dipho']['pho_pT1'].integral_error

16344.0

## send to dask

In [1]:
#from dask.distributed import Client
#client = Client('tcp://169.228.130.74:5608')
#client

0,1
Client  Scheduler: tcp://169.228.130.74:5608  Dashboard: http://169.228.130.74:12332/status,Cluster  Workers: 15  Cores: 15  Memory: 60.00 GB


In [14]:
from dask.distributed import Client
# Start a Dask client with 30 single-threaded workers, each limited to 4GB of memory.
# These sizes are hard-coded; adjust them to the machine this notebook runs on.
client = Client(memory_limit='4GB', n_workers=30, threads_per_worker=1)
client

0,1
Client  Scheduler: tcp://127.0.0.1:24335  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 30  Cores: 30  Memory: 120.00 GB


In [31]:
def g():
    """Return the paths matching ``./*py`` relative to the current working directory.

    The list comes straight from ``glob``; no sorting or filtering is applied.
    """
    from glob import glob as find_paths

    return find_paths("./*py")
g()

['./__init__.py',
 './cachepreload.py',
 './condor_utils.py',
 './event_selector.py',
 './loop.py',
 './looper_utils.py',
 './make_plots.py',
 './utils.py',
 './make_plots_from_json.py']

In [32]:
x = client.submit(g)

In [33]:
client.gather(x)

['./__init__.py',
 './cachepreload.py',
 './condor_utils.py',
 './event_selector.py',
 './loop.py',
 './looper_utils.py',
 './make_plots.py',
 './utils.py',
 './make_plots_from_json.py']

In [None]:
%%time
from dask.distributed import as_completed
import collections, functools, operator 

futures = {}
results = {}

cat_keys = ["dipho", "0lep_1tau", "0lep_2tau", "1lep_1tau", "2lep_0tau", "1lep_0tau"]

for process_key in chunks:
    print ("start process {}".format(process_key))
    futures[process_key] = client.map(process, chunks[process_key], retries=5)
    
    results_local = []
    results[process_key] = {}
    ## do you really need this while?
    while len(results_local) < len(chunks[process_key]): 
        ac = as_completed(futures[process_key], with_results=True)
        for future, result in ac:
            results_local.append(result)
    
    ## merge histograms by the key
    for cat_key in cat_keys: 
        dicts = [results_local[i][cat_key] for i in range(len(results_local))]  
        counter = collections.Counter() 
        for d in dicts:  
            counter.update(d) 
        results[process_key][cat_key] = dict(counter) 

start process EGamma_2018A
start process EGamma_2018B
start process EGamma_2018C
start process EGamma_2018D
start process DYJets
start process ttbar
start process ttG
start process ttGG
start process ZG
start process WG
start process GJets_HT40To100
start process GJets_HT100To200
start process GJets_HT200To400
start process GJets_HT400To600
start process GJets_HT600ToInf
start process QCD_pT30To40
start process QCD_pT40ToInf
start process Diphoton


## save to disk?

- folder1: cat (dipho, 0lep_1tau, 0lep_2tau...)
    + folder2: process (data, signal, ZG...)
        - folder3: hists (pT, eta, phi...)

In [32]:
client.shutdown()

In [35]:
results.keys()

dict_keys(['DoubleEG_Run2017B', 'DoubleEG_Run2017C', 'DoubleEG_Run2017D', 'DoubleEG_Run2017E', 'DoubleEG_Run2017F', 'DYJets', 'ttbar', 'ttG', 'ttGG', 'ZG', 'WG', 'GJets_HT40To100', 'GJets_HT100To200', 'GJets_HT200To400', 'GJets_HT400To600', 'GJets_HT600ToInf', 'QCD_pT30To40', 'QCD_pT40ToInf', 'Diphoton', 'VH', 'signal'])

In [36]:
results['signal'].keys()

dict_keys(['dipho', '0lep_1tau', '0lep_2tau', '1lep_1tau', '2lep_0tau', '1lep_0tau'])

In [37]:
results['signal']['dipho'].keys()

dict_keys(['MET', 'tautau_SVFit', 'tautauA_SVFit', 'tautauL_SVFit', 'pho_pT1', 'pho_pT2', 'pho_pTom1', 'pho_pTom2', 'pho_eta1', 'pho_eta2', 'pho_phi1', 'pho_phi2', 'pho_id1', 'pho_id2', 'tau_pT1', 'tau_eta1', 'tau_phi1', 'tau_deeptau_vs_j_1', 'tau_deeptau_vs_m_1', 'tau_deeptau_vs_e_1', 'n_tau', 'mtautau', 'dR_tautau', 'tau_pT2', 'tau_eta2', 'tau_phi2', 'tau_deeptau_vs_j_2', 'tau_deeptau_vs_m_2', 'tau_deeptau_vs_e_2', 'muon_pT1', 'muon_eta1', 'muon_phi1', 'muon_iso1', 'n_muon', 'mmumu', 'dR_mumu', 'muon_pT2', 'muon_eta2', 'muon_phi2', 'muon_iso2', 'electron_pT1', 'electron_eta1', 'electron_phi1', 'electron_iso1', 'n_electron', 'mee', 'dR_ee', 'electron_pT2', 'electron_eta2', 'electron_phi2', 'electron_iso2'])

In [38]:
%%time
from subprocess import call
import json

process_keys = results.keys()
cat_keys = results['signal'].keys()

#tag = "basic_dR_mll_cut_blind"
tag = "basic_dR_cut"

for cat_key in cat_keys:
    for process_key in process_keys:
        dirname = './hists/' + tag + '/' + cat_key + '/' + process_key + '/'
        call('mkdir -p ' + dirname, shell=True)
        for hist_key in results[process_key][cat_key].keys():
            histname = dirname + hist_key +'_'+year+'_'+skim_version
            results[process_key][cat_key][hist_key].to_json(histname+".json")
            with open(histname, "w") as f:
                data = json.dump(results[process_key][cat_key][hist_key].to_json(histname+".json"), f)

CPU times: user 3.62 s, sys: 4.15 s, total: 7.77 s
Wall time: 12.1 s
