In [None]:
import uproot
import pandas as pd
import numpy as np
from os import listdir
import os
import h5py
import time
#from ROOT import TH1F
import matplotlib.pyplot as plt

The following file (*infofile.py*) loads a Python dictionary (*info*) which stores all the cross-sections and numbers of events needed to scale each MC process to the right luminosity. The scale factor is given by

$$sf_{\mathscr{L}} = \frac{\sigma[fb]*\mathscr{L}}{N_{ev}^{simulated}}.$$

$N_{ev}^{simulated}$ is the number of originally simulated events for a given MC sample, $\sigma$ is the cross-section in femtobarns [fb] and $\mathscr{L}$ is the integrated luminosity of the data sample. For the 13 TeV openData release $\mathscr{L} = 10.6~fb^{-1}$. Note that the cross-section in the infofile is given in picobarns [pb] (1 pb = 1000 fb).

In [None]:
from infofile import infos

# Re-key the sample metadata by DSID, keeping only the cross-section ('xsec'),
# the sum of weights ('sumw') and the number of events ('events') per sample.
info = {
    sample['DSID']: {field: sample[field] for field in ('xsec', 'sumw', 'events')}
    for sample in infos.values()
}

Set the integrated luminosity of the data set (in $fb^{-1}$).

In [None]:
# Integrated luminosity of the data sample in fb^-1 (value quoted for the 13 TeV openData release)
lumi = 10.6

The lists below define the DSIDs to be identified with each signal and background type.

In [None]:
# DSIDs (dataset IDs) belonging to each signal sample type
SUSYC1C1 = [392501, 392502, 392504, 392506, 392507, 392509, 392513, 392517, 392518, 392521]

SUSYslsl = [392916, 392918, 392920, 392924, 392925, 392936, 392942, 392951, 392962, 392964,
            392982, 392985, 392996, 392999]

# DSIDs belonging to each background type.
# Z+jets and W+jets are contiguous DSID ranges (range() excludes its upper bound).
Zjets = list(range(364100, 364142))  # 364100 ... 364141

Wjets = list(range(364156, 364198))  # 364156 ... 364197

Diboson = [361600, 361601, 361602, 361603, 361604, 361606, 361607, 361609, 361610]

# 410013 was previously mistyped as 4100013 (seven digits), so that sample
# could never be matched to the 'Top' category.
Top = [410000, 410011, 410012, 410013, 410014, 410025, 410026]

Higgs = [341081, 343981, 345041, 345318, 345319]

# Category name -> list of DSIDs, for backgrounds and signals respectively
fileIDs_bkg = {'Diboson':Diboson, 'Zjets':Zjets, 'Wjets':Wjets, 'Top':Top, 'Higgs':Higgs}
fileIDs_sig = {'SUSYC1C1':SUSYC1C1, 'SUSYslsl':SUSYslsl}

This function calculates the scale factor needed to scale MC to the right luminosity. Instead of storing all the individual scale factors, weights and cross-sections, this function calculates the final scale factor so that the individual branches (i.e. scaleFactor_\*, mcWeight, etc.) do not need to be stored.

In [None]:
def calc_sf(xsec,lumi,nev,mcWeight,scaleFactor_PILEUP,scaleFactor_ELE,scaleFactor_MUON,scaleFactor_BTAG,scaleFactor_LepTRIGGER):
    """Return the total event weight that scales an MC event to the given luminosity.

    The result is the product of the MC weight and all per-event scale factors,
    multiplied by the luminosity scale factor (xsec * lumi) / nev.

    Parameters:
        xsec: cross-section of the sample.
        lumi: integrated luminosity; must be positive.
        nev: normalisation of the sample (number of simulated events / sum of weights).
        mcWeight, scaleFactor_*: per-event weight and scale factors.

    Returns:
        The combined weight, or 0 if ``lumi`` is not positive.

    NOTE(review): xsec and lumi must be in matching units (e.g. fb and fb^-1).
    The notebook text says the infofile stores xsec in pb -- confirm whether a
    factor of 1000 is needed by the caller.
    """
    if lumi <= 0:
        # '{:g}' accepts both ints and floats; the previous '{:d}' raised
        # ValueError for a float luminosity instead of reporting the problem.
        print("Lumi {:g} is not valid".format(lumi))
        return 0
    wgt = (mcWeight)*(scaleFactor_PILEUP)*(scaleFactor_ELE)*(scaleFactor_MUON)*(scaleFactor_BTAG)*(scaleFactor_LepTRIGGER)
    return wgt * ((xsec*lumi)/nev)

The following function specifies the skimming. Due to memory consumption it is not feasible to read all events, so some skimming is required.

In [None]:
def skimming(lep_n,lep_pt,met):
    """Return True if the event FAILS the skim (and should be dropped), False if it is kept.

    An event is kept only when met is at least 50000, there are exactly two
    leptons, and both of the first two lep_pt entries are at least 25000.
    """
    # The checks short-circuit in this order, so lep_pt is only indexed
    # once lep_n is known to be exactly 2.
    rejected = (
        met < 50000
        or lep_n != 2
        or lep_pt[0] < 25000
        or lep_pt[1] < 25000
    )
    return bool(rejected)

Specify the branches to keep in the hd5 files. If you would like to check all the available branches, open a file and do events.keys().

In [None]:
# Branches (wildcards allowed) to read from the input tree.
# Other candidates, currently not read:
#   'jet_*', 'photon_*', 'fatjet_*', 'tau_*', 'ditau_m', 'truth_*'
branches = [
    'eventNumber',
    'channelNumber',
    'mcWeight',
    'scaleFactor_PILEUP',
    'scaleFactor_ELE',
    'scaleFactor_MUON',
    'scaleFactor_BTAG',
    'scaleFactor_LepTRIGGER',
    'lep_*',
    'met_*',
    'jet_n',
]

The following parameters should be set to specify how many events will be read in every chunk, the directory of the files you would like to convert into hd5 files, and a tag which will be appended to the output filename to reflect the skimming applied.

In [None]:
# Number of events to read in each round
chunksize = 500000
# How often to write to an hd5 file (the smaller, the more files)
printevry = 500000
# Directory containing the input files (only files ending with .root are used)
# Alternative input: "/scratch3/eirikgr/openData_13TeV/Data"
indir   = "/scratch3/eirikgr/openData_13TeV/MC/SM_Backgrounds/"

# The data type is the last path component of indir; if indir ends with a
# trailing "/" that component is empty, so fall back to the one before it.
_indir_parts = indir.split("/")
datatype = _indir_parts[-1] or _indir_parts[-2]
print("INFO \t Data type is {:s}".format(datatype))

# Classify the input as signal, data or background (important for ML classification).
# Anything that is neither signal nor data is treated as background MC.
isSignal = datatype == "BSM_Signal_Samples"
isData = datatype == "Data"
isMC = not (isSignal or isData)
if datatype not in ("SM_Backgrounds", "BSM_Signal_Samples", "Data"):
    print("ERROR \t Datatype {:s} is unknown. Setting as background".format(datatype))

# Tag appended to the output file names to record the skim options used
skimtag = "2L_pt25_25_met50"

In [None]:
# Counters: files seen in total, and files / events read / events kept since
# the last output file was written.
n_allfiles   = 0
nfile   = 0
totev   = 0
totev_skim = 0
out_filenum = 1
# Skimmed chunks waiting to be written. Collecting them in a list and
# concatenating once per output file avoids re-copying the accumulated frame
# for every chunk.
pending = []


def _write_pending(frames, filenum):
    """Concatenate the buffered chunks, write them to a numbered hd5 file and report the counts."""
    path = os.path.join(indir, "%s_%s_num_%i.h5" %(datatype,skimtag,filenum))
    # NOTE(review): 'table=True' is kept exactly as before; confirm that the
    # installed pandas version still accepts this keyword.
    pd.concat(frames).to_hdf(path,'mini',mode='w',table=True)
    # Guard against a division by zero when no events were read at all
    kept_pct = (float(totev_skim)/float(totev))*100. if totev else 0.
    print("INFO  \t Read {:d} events in {:d} files, for which {:d} ({:.2f}%) were written to {:s}"
        .format(totev,nfile,totev_skim,kept_pct,path))


# Sorted so that the assignment of events to output files is reproducible
root_files = sorted(f for f in listdir(indir) if f.endswith('.root'))

for f in root_files:
    n_allfiles += 1
    # Open the tree holding the events
    events = uproot.open(os.path.join(indir, f))["mini"]
    nentries = events.numentries
    print("INFO  \t Opening file {:d}/{:d}: {:s} with {:d} events".format(n_allfiles,len(root_files),f,nentries))

    # Real data has no MC category; previously this name was left undefined
    # for data, which made the 'MCType' assignment below fail.
    mccat = "Data"
    if not isData:
        # The DSID is the second dot-separated field of the file name
        file_id = int(f.split('.')[1])
        if file_id not in info:
            print("ERROR \t Could not find any info for file id {:d}. Skipping.".format(file_id))
            continue
        xsec = float(info[file_id]['xsec'])
        nev  = float(info[file_id]['sumw'])

        # Find the MC type (defined by the dictionaries a few cells above)
        if isMC:
            categories = fileIDs_bkg
        elif isSignal:
            categories = fileIDs_sig
        else:
            categories = {}
        mccat = next((key for key, ids in categories.items() if file_id in ids), "")
        if not mccat:
            print("ERROR \t Could not find category for DSID {:d}. Skipping".format(file_id))
            continue
        print("INFO  \t ID {:d} in category {:s} has xsec = {:.1f} fb and nev = {:.2f} ".format(file_id,mccat,xsec,nev))

    n = 1
    prev_n = 0

    # Read the file in chunks of [prev_n, next_n). Looping on prev_n < nentries
    # never issues an empty read, also when nentries is an exact multiple of
    # chunksize (or zero).
    while prev_n < nentries:

        # Measure the time per chunk and estimate the time to completion
        if n == 1:
            start = time.time()
        else:
            dur = time.time() - start
            dur_sec = chunksize/dur
            m, s = divmod((nentries-prev_n)/dur_sec, 60)
            h, m = divmod(m, 60)
            print("INFO  \t Event/sec  = {:.0f}. ETC = {:d}h{:02d}m{:02d}s".format(dur_sec,int(h),int(m),int(s)))
            start = time.time()

        # Get the range of events to read (entrystop is exclusive)
        next_n = min(n*chunksize, nentries)
        print("INFO  \t Reading entries {:d} - {:d} of {:d}. Total so far: {:d}".format(prev_n,next_n,nentries,totev_skim))
        df = events.pandas.df(branches,flatten=False,entrystart=prev_n,entrystop=next_n)

        totev += len(df.index)

        # If MC: get the scale factor corresponding to the total data luminosity (set above).
        # If data: set the weight to 1. Uses the isData flag (rather than the
        # file name) so it agrees with the xsec/nev lookup above.
        if isData:
            df['wgt'] = 1.0
        else:
            df['wgt'] = np.vectorize(calc_sf)(xsec,lumi,nev,df.mcWeight,
                                              df.scaleFactor_PILEUP,df.scaleFactor_ELE,
                                              df.scaleFactor_MUON,df.scaleFactor_BTAG,
                                              df.scaleFactor_LepTRIGGER)

        # Signal/background label (important for ML classification)
        df['isSignal'] = bool(isSignal and not (isMC or isData))
        df['MCType'] = mccat

        ###########
        # Skimming: skimming() returns True for events that fail the selection
        ###########
        failed = np.vectorize(skimming)(df.lep_n,df.lep_pt,df.met_et)
        n_failed = int(np.count_nonzero(failed))
        print("INFO  \t Dropping {:d} events ({:.1f}%)".format(n_failed,(n_failed/len(df.index))*100.))
        df = df[~failed]

        ###########
        # Slimming: drop the columns that are no longer needed once 'wgt' is computed
        ###########
        df = df.drop(['lep_trigMatched','lep_truthMatched','mcWeight','scaleFactor_PILEUP','scaleFactor_ELE','scaleFactor_MUON','scaleFactor_BTAG','scaleFactor_LepTRIGGER'],axis=1)

        pending.append(df)
        totev_skim += len(df.index)

        # Release the name; the frame itself lives on in 'pending'
        del df

        # Write an output file once enough skimmed events have been collected
        if totev_skim > printevry:
            _write_pending(pending, out_filenum)
            out_filenum += 1
            totev = 0
            totev_skim = 0
            nfile = 0
            pending = []

        # The next chunk starts exactly where this one stopped. The previous
        # 'n*chunksize + 1' silently skipped one entry per chunk.
        prev_n = next_n
        n += 1

    nfile += 1
    del events

# Make sure we write the last events, if any are still buffered. Without this
# check the final write failed when the last chunk had just been flushed.
if pending:
    _write_pending(pending, out_filenum)
    pending = []
else:
    print("INFO  \t No remaining events to write")