In [2]:
%matplotlib inline
import pandas as pd
import uproot
import matplotlib.pyplot as plt
import numpy as np
import glob
import uproot4

from dask.distributed import Client, LocalCluster
from yahist import Hist1D, Hist2D

from condor_utils import make_htcondor_cluster
from utils import get_results, clear_tree_cache, plot_timeflow

## Skim Parameters

In [26]:
# Data-taking year; selects the sample paths ('2016', '2017' or '2018') and tags the output file names.
year         = '2018'
# Skim version tag; appended to every sample directory name and to the output file names.
skim_version = 'v4'  # currently available: x3 or v4 (NOTE(review): "x3" may be a typo for "v3" -- confirm)

## Cross sections (TODO: the ZH cross section may be wrong — verify)

In [27]:
# Cross sections per data-taking year, keyed by sample name.
# NOTE(review): per the inline comments, backgrounds are in pb while "signal"
# is in fb -- whoever consumes this table must account for the unit mismatch.
xs = {}

xs['2018'] = {
    "DYJets"            : 6529.0,
    "ttbar"             : 831.76,
    "ttG"               : 4.078,
    "ttGG"              : 0.01687,
    "ZG"                : 55.6,
    "WG"                : 191.4,
    "GJets_HT40To100"   : 18640.0,
    "GJets_HT100To200"  : 8631.0,
    "GJets_HT200To400"  : 2185.0,
    "GJets_HT400To600"  : 257.7,
    "GJets_HT600ToInf"  : 85.4,
    "QCD_pT30To40"      : 24810.0,
    "QCD_pT40ToInf"     : 118100.0,
    "Diphoton"          : 84.4,
    "ZH"                : 0.002006453, #https://github.com/cms-analysis/flashgg/blob/dev_legacy_runII/MetaData/data/cross_sections.json
    "VH"                : 0.00512, #https://github.com/cmstas/HggAnalysisDev/blob/main/Preselection/data/samples_and_scale1fb_ttH.json#L394
    "signal"            : 0.0098 #fb, all bkg are in pb
}

# 2017 uses the same values as 2018. Store a copy rather than a second
# reference to the same dict, so that editing one year cannot silently change
# the other.
xs['2017'] = dict(xs['2018'])

xs['2016'] = {
    "DYJets"            : 5941.0,
    "ttbar"             : 830.,
    "ttG"               : 3.819,
    "ttGG"              : 0.01731,
    "ZG"                : 123.8,
    "WG"                : 510.6,
    "GJets_HT40To100"   : 23100.0,
    "GJets_HT100To200"  : 9110.0,
    "GJets_HT200To400"  : 2280.0,
    "GJets_HT400To600"  : 273.0,
    "GJets_HT600ToInf"  : 94.5,
    "QCD_pT30To40"      : 22110.0,
    "QCD_pT40ToInf"     : 113400.0,
    "Diphoton"          : 84.4,
    "ZH"                : 0.002006453, #https://github.com/cms-analysis/flashgg/blob/dev_legacy_runII/MetaData/data/cross_sections.json
    "VH"                : 0.00512, #https://github.com/cmstas/HggAnalysisDev/blob/main/Preselection/data/samples_and_scale1fb_ttH.json#L394
    "signal"            : 0.0098 #fb, all bkg are in pb
}

xs

{'2018': {'DYJets': 6529.0,
  'ttbar': 831.76,
  'ttG': 4.078,
  'ttGG': 0.01687,
  'ZG': 55.6,
  'WG': 191.4,
  'GJets_HT40To100': 18640.0,
  'GJets_HT100To200': 8631.0,
  'GJets_HT200To400': 2185.0,
  'GJets_HT400To600': 257.7,
  'GJets_HT600ToInf': 85.4,
  'QCD_pT30To40': 24810.0,
  'QCD_pT40ToInf': 118100.0,
  'Diphoton': 84.4,
  'ZH': 0.002006453,
  'VH': 0.00512,
  'signal': 0.0098},
 '2017': {'DYJets': 6529.0,
  'ttbar': 831.76,
  'ttG': 4.078,
  'ttGG': 0.01687,
  'ZG': 55.6,
  'WG': 191.4,
  'GJets_HT40To100': 18640.0,
  'GJets_HT100To200': 8631.0,
  'GJets_HT200To400': 2185.0,
  'GJets_HT400To600': 257.7,
  'GJets_HT600ToInf': 85.4,
  'QCD_pT30To40': 24810.0,
  'QCD_pT40ToInf': 118100.0,
  'Diphoton': 84.4,
  'ZH': 0.002006453,
  'VH': 0.00512,
  'signal': 0.0098},
 '2016': {'DYJets': 5941.0,
  'ttbar': 830.0,
  'ttG': 3.819,
  'ttGG': 0.01731,
  'ZG': 123.8,
  'WG': 510.6,
  'GJets_HT40To100': 23100.0,
  'GJets_HT100To200': 9110.0,
  'GJets_HT200To400': 2280.0,
  'GJets_HT40

In [29]:
import json

# Write the cross-section table to the metadata directory, tagged by skim version.
filename = ''.join(['./metadata/xsection_', skim_version, '.json'])
with open(filename, mode="w") as xsec_out:
    json.dump(obj=xs, fp=xsec_out, indent=4)

## total weight

In [5]:
#from dask.distributed import Client
#client = Client('tcp://169.228.130.5:18875')
#client

In [7]:
from dask.distributed import Client

# Start a local Dask client: 24 single-threaded workers, 4GB memory limit each.
client = Client(
    n_workers=24,
    threads_per_worker=1,
    memory_limit='4GB',
)
client

0,1
Client  Scheduler: tcp://127.0.0.1:26675  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 24  Cores: 24  Memory: 96.00 GB


In [36]:
# Shut down the Dask client.
# NOTE(review): this cell sits above the cells that call `client.map`, so a
# top-to-bottom run would reach the weight computation with the client already
# shut down -- run it only after all Dask work is finished.
client.shutdown()

In [22]:
def get_totWeight(args):
    """Return the summed ``genEventSumw`` of one ROOT file.

    Parameters
    ----------
    args : sequence
        Only ``args[0]`` is used; it is the path of the ROOT file to open.
        Any further elements are ignored.

    Returns
    -------
    The sum over all entries of the ``genEventSumw`` branch of the file's
    ``Runs`` tree, as returned by ``np.sum``.
    """
    fname = args[0]
    # Open the file in a ``with`` block so the handle is released as soon as
    # the branch has been read, instead of leaking one open file per call.
    with uproot4.open(fname) as f:
        t = f["Runs"]
        return np.sum(t["genEventSumw"].array())

In [33]:
from dask.distributed import as_completed

# Per-sample Dask futures and per-sample summed weights, both keyed by sample name.
futures_w = {}
results_w = {}

for key in chunks:
    # Skip the EGamma / DoubleEG samples (presumably collision data without
    # generator weights -- confirm).
    if "EGamma" in key or "DoubleEG" in key:
        continue
    print ("start process {}".format(key))

    # get_totWeight reads only the file path (element 0 of a chunk) and sums
    # the whole "Runs" tree of that file. Submitting one task per chunk would
    # count a file once per chunk it was split into, so submit each distinct
    # file exactly once.
    unique_files = sorted({chunk[0] for chunk in chunks[key]})
    futures_w[key] = client.map(get_totWeight, [(fname,) for fname in unique_files], retries=5)

    # A single pass over as_completed yields every future once it finishes.
    results_local = [result for _, result in as_completed(futures_w[key], with_results=True)]

    results_w[key] = np.sum(np.array(results_local))

start process DYJets
start process ttbar
start process ttG
start process ttGG
start process ZG
start process WG
start process GJets_HT40To100
start process GJets_HT100To200
start process GJets_HT200To400
start process GJets_HT400To600
start process GJets_HT600ToInf
start process QCD_pT30To40
start process QCD_pT40ToInf
start process Diphoton
start process VH
start process signal


In [34]:
# Display the summed weight obtained for each sample.
results_w

{'DYJets': 17799598587.564648,
 'ttbar': 292228710764.1156,
 'ttG': 33778755.65431488,
 'ttGG': 25147.44199621101,
 'ZG': 4275148495.5808506,
 'WG': 9350616834.534428,
 'GJets_HT40To100': 7948819.204814303,
 'GJets_HT100To200': 9795369.458845828,
 'GJets_HT200To400': 17788245.78757894,
 'GJets_HT400To600': 4650962.691182763,
 'GJets_HT600ToInf': 4970069.563275842,
 'QCD_pT30To40': 14597800.0,
 'QCD_pT40ToInf': 18997403.0,
 'Diphoton': 6074273.1,
 'VH': 3800454.18850238,
 'signal': 917906.0}

In [35]:
# Write the per-sample summed weights to the metadata directory, tagged by year and skim version.
filename_w = ''.join(['./metadata/totalWeights_', year, '_', skim_version, '.json'])
with open(filename_w, mode="w") as weights_out:
    json.dump(obj=results_w, fp=weights_out, indent=4)

## File chunks (NOTE: the 2018 QCD and signal samples are missing; the 2017 samples are used instead!)

In [30]:
%%time
# Base directory that holds one sub-directory per skimmed sample.
filepath = '/hadoop/cms/store/user/legianni/ProjectMetis/'

# Glob patterns for every simulated sample start out empty; the year-specific
# branch that follows assigns the real patterns.
(
    DY_files, ttbar_files, ttGG_files, ttG_files, ZG_files, WG_files,
    GJets_HT40To100_files, GJets_HT100To200_files, GJets_HT200To400_files,
    GJets_HT400To600_files, GJets_HT600ToInf_files,
    QCD_pT30To40_files, QCD_pT40ToInf_files,
    Diphoton_files, bkg_res_ZH, bkg_res_VH, sig_,
) = ('',) * 17

# Maps a sample name to its list of files; filled by the year-specific branch.
data_fileset = {}

# Select the glob pattern of every simulated sample, and the file lists of the
# data samples, for the requested year. Each sample lives in a directory named
# "<dataset>____<skim_version>" under `filepath`.
if year == '2018':
    DY_files               =  filepath + "DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8_18____" + skim_version + "/*root"
    ttbar_files            =  filepath + "TTJets_TuneCP5_13TeV-amcatnloFXFX-pythia8_18____" + skim_version + "/*root"
    ttG_files              =  filepath + 'TTGJets_TuneCP5_13TeV-amcatnloFXFX-madspin-pythia8_18____' + skim_version + "/*root"
    ttGG_files             =  filepath + 'TTGG_0Jets_TuneCP5_13TeV_amcatnlo_madspin_pythia8_18____' + skim_version + "/*root"
    ZG_files               =  filepath + "ZGToLLG_01J_5f_TuneCP5_13TeV-amcatnloFXFX-pythia8_18*____" + skim_version + "/*root"
    WG_files               =  filepath + "WGToLNuG_01J_5f_TuneCP5_13TeV-amcatnloFXFX-pythia8_18____" + skim_version + "/*root"
    GJets_HT40To100_files  =  filepath + "GJets_HT-40To100_TuneCP5_13TeV-madgraphMLM-pythia8_18____" + skim_version + "/*root"
    GJets_HT100To200_files =  filepath + "GJets_HT-100To200_TuneCP5_13TeV-madgraphMLM-pythia8_18____" + skim_version + "/*root"
    GJets_HT200To400_files =  filepath + "GJets_HT-200To400_TuneCP5_13TeV-madgraphMLM-pythia8_18____" + skim_version + "/*root"
    GJets_HT400To600_files =  filepath + "GJets_HT-400To600_TuneCP5_13TeV-madgraphMLM-pythia8_18____" + skim_version + "/*root"
    GJets_HT600ToInf_files =  filepath + "GJets_HT-600ToInf_TuneCP5_13TeV-madgraphMLM-pythia8_18____" + skim_version + "/*root"
    # These two QCD patterns point at the "_17" (2017) directories rather than 2018 ones.
    QCD_pT30To40_files     =  filepath + "QCD_Pt-30to40_DoubleEMEnriched_MGG-80toInf_TuneCP5_13TeV_Pythia8_17____" + skim_version + "/*root"
    QCD_pT40ToInf_files    =  filepath + "QCD_Pt-40toInf_DoubleEMEnriched_MGG-80toInf_TuneCP5_13TeV_Pythia8_17____" + skim_version + "/*root"
    Diphoton_files         =  filepath + "DiPhotonJetsBox_MGG-80toInf_13TeV-Sherpa_18____" + skim_version + "/*root"
    bkg_res_ZH             =  filepath + "ggZH_HToGG_ZToLL_M125_TuneCP5_13TeV-powheg-pythia8_18____" + skim_version + "/*.root"
    bkg_res_VH             =  filepath + "VHToGG_M125_13TeV_amcatnloFXFX_madspin_pythia8_18____" + skim_version + "/*.root"
    # Using the 2017 signal sample!
    sig_                   =  filepath + "HHggtautau_Era2017____" + skim_version + "/*.root"
    data_fileset = {
        "EGamma_2018A":glob.glob(filepath + "EGamma_Run2018A____" + skim_version + "/*root"),
        "EGamma_2018B":glob.glob(filepath + "EGamma_Run2018B____" + skim_version + "/*root"),
        "EGamma_2018C":glob.glob(filepath + "EGamma_Run2018C____" + skim_version + "/*root"),
        "EGamma_2018D":glob.glob(filepath + "EGamma_Run2018D____" + skim_version + "/*root")
    }
    #remove corrupted files :(
    #data_fileset['EGamma_2018D'].remove('/hadoop/cms/store/user/legianni/ProjectMetis/EGamma_Run2018D____v4/test_nanoaodSkim_367.root')

elif year == '2017':
    DY_files               =  filepath + "DYJetsToLL_M-50_TuneCP5_13TeV-amcatnloFXFX-pythia8_17____" + skim_version + "/*root"
    ttbar_files            =  filepath + "TTJets_TuneCP5_13TeV-amcatnloFXFX-pythia8_17____" + skim_version + "/*root"
    ttG_files              =  filepath + 'TTGJets_TuneCP5_13TeV-amcatnloFXFX-madspin-pythia8_17*____' + skim_version + "/*root"
    ttGG_files             =  filepath + 'TTGG_0Jets_TuneCP5_13TeV_amcatnlo_madspin_pythia8_17____' + skim_version + "/*root"
    ZG_files               =  filepath + "ZGToLLG_01J_5f_TuneCP5_13TeV-amcatnloFXFX-pythia8_17____" + skim_version + "/*root"
    WG_files               =  filepath + "WGToLNuG_01J_5f_TuneCP5_13TeV-amcatnloFXFX-pythia8_17____" + skim_version + "/*root"
    GJets_HT40To100_files  =  filepath + "GJets_HT-40To100_TuneCP5_13TeV-madgraphMLM-pythia8_17____" + skim_version + "/*root"
    GJets_HT100To200_files =  filepath + "GJets_HT-100To200_TuneCP5_13TeV-madgraphMLM-pythia8_17____" + skim_version + "/*root"
    GJets_HT200To400_files =  filepath + "GJets_HT-200To400_TuneCP5_13TeV-madgraphMLM-pythia8_17____" + skim_version + "/*root"
    GJets_HT400To600_files =  filepath + "GJets_HT-400To600_TuneCP5_13TeV-madgraphMLM-pythia8_17____" + skim_version + "/*root"
    GJets_HT600ToInf_files =  filepath + "GJets_HT-600ToInf_TuneCP5_13TeV-madgraphMLM-pythia8_17____" + skim_version + "/*root"
    QCD_pT30To40_files     =  filepath + "QCD_Pt-30to40_DoubleEMEnriched_MGG-80toInf_TuneCP5_13TeV_Pythia8_17____" + skim_version + "/*root"
    QCD_pT40ToInf_files    =  filepath + "QCD_Pt-40toInf_DoubleEMEnriched_MGG-80toInf_TuneCP5_13TeV_Pythia8_17____" + skim_version + "/*root"
    Diphoton_files         =  filepath + "DiPhotonJetsBox_MGG-80toInf_13TeV-Sherpa_17____" + skim_version + "/*root"
    bkg_res_ZH             =  filepath + "ggZH_HToGG_ZToLL_M125_13TeV_powheg_pythia8_17____" + skim_version + "/*.root"
    bkg_res_VH             =  filepath + "VHToGG_M125_13TeV_amcatnloFXFX_madspin_pythia8_17____" + skim_version + "/*.root"
    sig_                   =  filepath + "HHggtautau_Era2017____" + skim_version + "/*root"

    data_fileset = {
        "DoubleEG_Run2017B":glob.glob(filepath + "DoubleEG_Run2017B____" + skim_version + "/*root"),
        "DoubleEG_Run2017C":glob.glob(filepath + "DoubleEG_Run2017C____" + skim_version + "/*root"),
        "DoubleEG_Run2017D":glob.glob(filepath + "DoubleEG_Run2017D____" + skim_version + "/*root"),
        "DoubleEG_Run2017E":glob.glob(filepath + "DoubleEG_Run2017E____" + skim_version + "/*root"),
        "DoubleEG_Run2017F":glob.glob(filepath + "DoubleEG_Run2017F____" + skim_version + "/*root")
    }

elif year == '2016':
    DY_files               =  filepath + "DYJetsToLL_M-50_TuneCUETP8M1_13TeV-amcatnloFXFX-pythia8_16____" + skim_version + "/*root"
    ttbar_files            =  filepath + "TTJets_TuneCUETP8M2T4_13TeV-amcatnloFXFX-pythia8_16____" + skim_version + "/*root"
    ttG_files              =  filepath + 'TTGJets_TuneCUETP8M1_13TeV-amcatnloFXFX-madspin-pythia8_16*____' + skim_version + "/*root"
    ttGG_files             =  filepath + 'TTGG_0Jets_TuneCUETP8M1_13TeV_amcatnlo_madspin_pythia8_16____' + skim_version + "/*root"
    ZG_files               =  filepath + "ZGTo2LG_TuneCUETP8M1_13TeV-amcatnloFXFX-pythia8_16____" + skim_version + "/*root"
    WG_files               =  filepath + "WGToLNuG_TuneCUETP8M1_13TeV-amcatnloFXFX-pythia8_16*____" + skim_version + "/*root"
    GJets_HT40To100_files  =  filepath + "GJets_HT-40To100_TuneCUETP8M1_13TeV-madgraphMLM-pythia8_16*____" + skim_version + "/*root"
    GJets_HT100To200_files =  filepath + "GJets_HT-100To200_TuneCUETP8M1_13TeV-madgraphMLM-pythia8_16*____" + skim_version + "/*root"
    GJets_HT200To400_files =  filepath + "GJets_HT-200To400_TuneCUETP8M1_13TeV-madgraphMLM-pythia8_16*____" + skim_version + "/*root"
    GJets_HT400To600_files =  filepath + "GJets_HT-400To600_TuneCUETP8M1_13TeV-madgraphMLM-pythia8_16*____" + skim_version + "/*root"
    GJets_HT600ToInf_files =  filepath + "GJets_HT-600ToInf_TuneCUETP8M1_13TeV-madgraphMLM-pythia8_16*____" + skim_version + "/*root"
    QCD_pT30To40_files     =  filepath + "QCD_Pt-30to40_DoubleEMEnriched_MGG-80toInf_TuneCUETP8M1_13TeV_Pythia8_16____" + skim_version + "/*root"
    QCD_pT40ToInf_files    =  filepath + "QCD_Pt-40toInf_DoubleEMEnriched_MGG-80toInf_TuneCUETP8M1_13TeV_Pythia8_16____" + skim_version + "/*root"
    Diphoton_files         =  filepath + "DiPhotonJetsBox_MGG-80toInf_13TeV-Sherpa_16____" + skim_version + "/*root"
    # No 2016 ggZH pattern is defined (sample missing?); the 2017 ("_17") one is used instead.
    bkg_res_ZH             =  filepath + "ggZH_HToGG_ZToLL_M125_13TeV_powheg_pythia8_17____" + skim_version + "/*.root"
    bkg_res_VH             =  filepath + "VHToGG_M125_13TeV_amcatnloFXFX_madspin_pythia8_16____" + skim_version + "/*.root"
    sig_                   =  filepath + "HHggtautau_Era2016____" + skim_version + "/*root"

    data_fileset = {
        "DoubleEG_Run2016B"  :glob.glob(filepath + "DoubleEG_Run2016B____"   + skim_version + "/*root"),
        "DoubleEG_Run2016B-2":glob.glob(filepath + "DoubleEG_Run2016B-2____" + skim_version + "/*root"),
        "DoubleEG_Run2016C"  :glob.glob(filepath + "DoubleEG_Run2016C____"   + skim_version + "/*root"),
        "DoubleEG_Run2016D"  :glob.glob(filepath + "DoubleEG_Run2016D____"   + skim_version + "/*root"),
        "DoubleEG_Run2016E"  :glob.glob(filepath + "DoubleEG_Run2016E____"   + skim_version + "/*root"),
        "DoubleEG_Run2016F"  :glob.glob(filepath + "DoubleEG_Run2016F____"   + skim_version + "/*root"),
        "DoubleEG_Run2016G"  :glob.glob(filepath + "DoubleEG_Run2016G____"   + skim_version + "/*root"),
        "DoubleEG_Run2016H"  :glob.glob(filepath + "DoubleEG_Run2016H____"   + skim_version + "/*root")
    }
else:
    # Fail loudly: carrying on with the empty default patterns would silently
    # produce empty filesets and empty metadata files.
    raise ValueError("year not recognised: {!r} (expected '2016', '2017' or '2018')".format(year))


import glob  # redundant: glob is already imported at the top of the notebook

# Glob pattern of each non-resonant background sample, in processing order.
_background_patterns = {
    "DYJets": DY_files,
    "ttbar": ttbar_files,
    "ttG": ttG_files,
    "ttGG": ttGG_files,
    "ZG": ZG_files,
    "WG": WG_files,
    "GJets_HT40To100": GJets_HT40To100_files,
    "GJets_HT100To200": GJets_HT100To200_files,
    "GJets_HT200To400": GJets_HT200To400_files,
    "GJets_HT400To600": GJets_HT400To600_files,
    "GJets_HT600ToInf": GJets_HT600ToInf_files,
    "QCD_pT30To40": QCD_pT30To40_files,
    "QCD_pT40ToInf": QCD_pT40ToInf_files,
    "Diphoton": Diphoton_files,
}

# Expand every pattern into the list of matching files.
background_fileset = {
    sample: glob.glob(pattern) for sample, pattern in _background_patterns.items()
}

# Resonant backgrounds; the ZH entry is currently disabled.
background_res_fileset = {
    #"ZH":glob.glob( bkg_res_ZH ),
    "VH": glob.glob(bkg_res_VH),
}

signal_fileset = {"signal": glob.glob(sig_)}

# All file lists grouped by category.
filesets = {
    "data": data_fileset,
    "background": background_fileset,
    "background_res": background_res_fileset,
    "signal": signal_fileset,
}

CPU times: user 106 ms, sys: 26.9 ms, total: 133 ms
Wall time: 510 ms


In [31]:
%%time
import utils

# Chunk size handed to utils.get_chunking (presumably events per chunk -- confirm).
chunk_size = 5e5
# Sample name -> list of chunks returned by utils.get_chunking.
chunks = {}
# Running totals over all samples.
tot_evts = 0
tot_chunks = 0

_summary_line = "{}: total {} events, got {} files, return {} chunks"

# Process the categories in a fixed order: data first, signal last.
for cat in ("data", "background", "background_res", "signal"):
    for key, fnames in filesets[cat].items():
        cks, evts = utils.get_chunking(tuple(fnames), chunk_size, treename="Events")
        chunks[key] = cks
        tot_evts += evts
        tot_chunks += len(cks)
        print(_summary_line.format(key, evts, len(fnames), len(cks)))

print()
print("total events to be processed: {0:8.2e}".format(tot_evts))
print("total chunks: {}".format(tot_chunks))

EGamma_2018A: total 5330291 events, got 240 files, return 240 chunks
EGamma_2018B: total 2930675 events, got 98 files, return 98 chunks
EGamma_2018C: total 2741760 events, got 110 files, return 110 chunks
EGamma_2018D: total 13290273 events, got 426 files, return 426 chunks
DYJets: total 1038 events, got 1 files, return 1 chunks
ttbar: total 218450 events, got 171 files, return 171 chunks
ttG: total 100983 events, got 7 files, return 7 chunks
ttGG: total 297085 events, got 3 files, return 3 chunks
ZG: total 325140 events, got 40 files, return 40 chunks
WG: total 118375 events, got 21 files, return 21 chunks
GJets_HT40To100: total 18333 events, got 6 files, return 6 chunks
GJets_HT100To200: total 41666 events, got 9 files, return 9 chunks
GJets_HT200To400: total 135132 events, got 15 files, return 15 chunks
GJets_HT400To600: total 36974 events, got 7 files, return 7 chunks
GJets_HT600ToInf: total 37204 events, got 9 files, return 9 chunks
QCD_pT30To40: total 5220 events, got 10 files, r

In [32]:
# Write the chunk list to the metadata directory, tagged by year and skim version.
filename_chunk = ''.join(['./metadata/chunklist_', year, '_', skim_version, '.json'])
with open(filename_chunk, mode="w") as chunks_out:
    json.dump(obj=chunks, fp=chunks_out, indent=4)