In [1]:
# Signal mass point; used below to build the signal sample name ("TstarTstar_M-<masspoint>")
# and as the first component of the datacard file name.
# NOTE(review): presumably given in GeV — confirm.
masspoint = 2500

In [2]:
import os

import hist
import uproot

In [3]:
def write_histogram(pname, hin, hout, fout):
    """
    Copy a single object from an input file into an already opened output file.

    pname: path of the input file, opened read-only via uproot.open
    hin:   key of the object inside the input file
    hout:  key under which the object is stored in the output file
    fout:  writable, mapping-like output file (e.g. the result of uproot.recreate)

    Always returns True.
    """
    with uproot.open(pname) as source:
        # assign while the input file is still open
        fout[hout] = source[hin]

    return True

def write_hadded_histograms(pnames, hin, hout, fout):
    """
    Sum the same histogram over several input files and store the sum in the output file.

    pnames: sequence of input file paths; must contain at least one entry
    hin:    key of the histogram inside every input file
    hout:   key under which the summed histogram is stored in the output file
    fout:   writable, mapping-like output file (e.g. the result of uproot.recreate)
    """
    # the first file provides the object that all others are accumulated into
    with uproot.open(pnames[0]) as first_file:
        summed = first_file[hin].to_hist()

    for path in pnames[1:]:
        with uproot.open(path) as next_file:
            summed += next_file[hin].to_hist()

    fout[hout] = summed

def getCorrelationString(year, correlations):
    """
    Build the suffix that names the group(s) of years a nuisance is correlated across.

    year:         the year to look up, e.g. "UL18"
    correlations: iterable of groups, each group being a list of year strings,
                  e.g. [["UL16", "UL17", "UL18"]] or [["UL16"], ["UL17"], ["UL18"]]

    Returns the concatenation of all years of every group that contains `year`
    (e.g. "UL16UL17UL18"), or "not_applicable" if no group contains it.
    """
    parts = []
    for group in correlations:
        # A separate loop variable is essential here: reusing `year` would overwrite
        # the argument and make later groups match against the wrong year.
        if year in group:
            parts.extend(group)

    return "".join(parts) or "not_applicable"

In [4]:
# Column widths of the datacard, in characters
N1 = 79  # name column of a systematics line
N2 = 8   # added to N1 for the label column of the process block
N3 = 20  # one column per process

# Global settings used when locating the input histograms
sensitiveVariable = "pt_ST"
file_prefix = "uhh2.AnalysisModuleRunner."
base_path = "/nfs/dust/cms/user/flabe/TstarTstar/data/DNN/"

class Datacard():
    """
    Writer for one datacard text file and the matching ROOT file with shape histograms.

    Creating an instance immediately writes ``cards/<year>/<fname>.dat``;
    ``create_rootfile()`` writes ``cards/<year>/<fname>.root`` on request.
    The text layout (imax/jmax/kmax, shapes, autoMCStats) follows the CMS Combine
    datacard conventions.

    The class relies on module-level names defined elsewhere in the notebook:
    N1, N2, N3, base_path, file_prefix, sensitiveVariable, norm_uncertainties,
    shape_uncertanties, write_histogram and getCorrelationString.
    """

    def __init__(self, year, mass_point, svar, channel, region, processes):
        """
        year:       year tag, e.g. "UL18"; selects the output folder and per-year values
        mass_point: signal mass point, only used in the file name
        svar:       name of the sensitive variable, only used in the file name
        channel:    lepton channel; part of the file name and of the input histogram folder
        region:     name of the single bin written to the datacard
        processes:  list of process names taken from MC; "datadriven" is appended internally
        """
        self.year = year
        self.mass_point = mass_point
        self.svar = svar
        self.channel = channel
        self.region = region
        self.fname = f"{self.mass_point}_{self.svar}_{self.channel}_{self.region}"
        # build a new list so that the caller's list is not modified
        self.processes = list(processes) + ["datadriven"]
        self.generate_datacard()

    def write_block_header(self, f, block_name: str):
        """Write a comment line with the capitalised block name followed by a dashed rule."""
        f.write(f"# {block_name.capitalize()}\n")
        f.write(N1 * "-" + "\n")

    def write_parameters(self, f):
        """Write the imax/jmax/kmax lines and the ``shapes`` line pointing to the ROOT file."""
        self.write_block_header(f, "parameters")
        f.write(f"imax 1\njmax {len(self.processes)-1}\nkmax *\n")
        f.write(f"shapes * {self.region} {self.fname}.root "
                f"$PROCESS $PROCESS_$SYSTEMATIC\n\n")

    def write_channels(self, f):
        """Write the bin/observation block; the observation is -1 (taken from the shapes file)."""
        self.write_block_header(f, "channels")
        f.write(f"bin          {self.region}\n")
        f.write("observation  -1\n\n")

    def pad(self, s, n_pad):
        """Return ``str(s)`` right-padded with spaces to ``n_pad`` characters (never truncated)."""
        return str(s).ljust(n_pad)

    def write_processes(self, f):
        """Write the bin / process name / process id / rate lines, one column per process."""
        padded_processes = [self.pad(x, N3) for x in self.processes]
        padded_ids = [self.pad(i, N3) for i in range(len(self.processes))]
        self.write_block_header(f, "processes")
        f.write(self.pad("bin", N1 + N2) + len(self.processes) * self.pad(self.region, N3) + "\n")
        f.write(self.pad("process", N1 + N2) + "".join(padded_processes) + "\n")
        f.write(self.pad("process", N1 + N2) + "".join(padded_ids) + "\n")
        f.write(self.pad("rate", N1 + N2) + len(self.processes) * self.pad("-1", N3))
        f.write("\n\n")

    def write_lnN_systematics(self, f):
        """
        Write one ``lnN`` line per entry of the global ``norm_uncertainties``.

        A per-process value may be a plain value or a dict keyed by year, in which case
        the entry for ``self.year`` is written. Processes without an entry get "-".
        """
        self.write_block_header(f, "systematics")
        for nuisance, per_process in norm_uncertainties.items():
            f.write(self.pad(nuisance, N1) + self.pad("lnN", N2))
            for process in self.processes:
                if process in per_process:
                    np_val = per_process[process]
                    if isinstance(np_val, dict):
                        np_val = np_val[self.year]
                    f.write(self.pad(np_val, N3))
                else:
                    f.write(self.pad("-", N3))
            f.write("\n")

    def _write_shape_row(self, f, name, applies):
        """Write one ``shape`` line: "1" for processes where ``applies(process)`` is true, else "-"."""
        f.write(self.pad(name, N1) + self.pad("shape", N2))
        for process in self.processes:
            f.write(self.pad(1 if applies(process) else "-", N3))
        f.write("\n")

    def write_shape_systematics(self, f):
        """Write one ``shape`` line per entry of the global ``shape_uncertanties``."""
        for shape_np in shape_uncertanties:
            applicable_processes = shape_uncertanties[shape_np][0]
            correlations = shape_uncertanties[shape_np][1]

            # the suffix encodes which years share this nuisance parameter
            correlationstring = getCorrelationString(self.year, correlations)

            self._write_shape_row(f, shape_np + "_" + correlationstring,
                                  lambda process: process in applicable_processes)

    def write_JECJER(self, f):
        """Write the JEC and JER ``shape`` lines for every process except "datadriven"."""
        # both JEC and JER will be treated as uncorrelated between years
        correlationstring = self.year
        for prefix in ("JEC_", "JER_"):
            self._write_shape_row(f, prefix + correlationstring,
                                  lambda process: process != "datadriven")

    def write_PDF_MCscale(self, f):
        """Write per-process PDF lines followed by per-process MCscale lines."""
        # these will be treated as correlated between years, but uncorrelated between samples!
        correlationstring = "UL16UL17UL18"

        for prefix in ("PDF_", "MCscale_"):
            for process in self.processes:
                if process == "datadriven":
                    continue
                # each line applies only to the process it is named after
                self._write_shape_row(f, prefix + process + "_" + correlationstring,
                                      lambda other, target=process: other == target)

    def write_datadriven(self, f):
        """Write the fit-function ``shape`` line of the datadriven background."""
        # these will be correlated through the years, and only apply to the "datadriven" process
        correlationstring = "UL16UL17UL18"
        self._write_shape_row(f, "datadrivenFitFunction_" + correlationstring,
                              lambda process: process == "datadriven")

    def generate_datacard(self):
        """Write the datacard text file to ``cards/<year>/<fname>.dat``."""
        # create the output folder on first use so that a fresh checkout works
        os.makedirs(f"cards/{self.year}", exist_ok=True)
        with open(f"cards/{self.year}/{self.fname}.dat", 'w') as f:
            self.write_parameters(f)
            self.write_channels(f)
            self.write_processes(f)
            self.write_lnN_systematics(f)
            self.write_shape_systematics(f)
            #self.write_JECJER(f)
            #self.write_PDF_MCscale(f)
            self.write_datadriven(f)
            f.write("* autoMCStats 10")

    def create_rootfile(self):
        """
        Write ``cards/<year>/<fname>.root`` with all histograms referenced by the datacard.

        Copies, in this order: nominal MC histograms, data (as "data_obs"), the Up/Down
        variations of every entry in ``shape_uncertanties``, the JEC/JER variations, and
        the datadriven background with its fit-function variations.
        Currently returns 1 before the PDF & scale variations are copied.
        """
        os.makedirs(f"cards/{self.year}", exist_ok=True)
        with uproot.recreate(f"cards/{self.year}/{self.fname}.root") as fout:

            shape_path = base_path + self.year + "/hadded/"
            shape_folder = "SignalRegion_" + self.channel
            hin_base = shape_folder + "/" + sensitiveVariable
            # everything except the datadriven background is read from MC files
            mc_processes = [p for p in self.processes if p != "datadriven"]

            # moving the nominal ones
            all_pnames = []
            for process in mc_processes:
                pname = shape_path + file_prefix + "MC." + process + ".root"
                all_pnames.append(pname)
                write_histogram(pname, hin_base + "_nominal", process, fout)

            # data
            pname = shape_path + file_prefix + "DATA.DATA.root"
            write_histogram(pname, hin_base + "_nominal", "data_obs", fout)

            # as a fix for the moment, I'll write the total background as data!
            # write_hadded_histograms(all_pnames, hin_base + "_nominal", "data_obs", fout)

            # move histograms for "normal" shape systematics
            for shape_np in shape_uncertanties:
                applicable_processes = shape_uncertanties[shape_np][0]
                correlations = shape_uncertanties[shape_np][1]

                correlationstring = getCorrelationString(self.year, correlations)

                for process in self.processes:
                    if process in applicable_processes:
                        pname = shape_path + file_prefix + "MC." + process + ".root"
                        for variation in ("Up", "Down"):
                            write_histogram(pname, hin_base + "_" + shape_np + variation,
                                            process + "_" + shape_np + "_" + correlationstring + variation, fout)

            # JEC and JER
            correlationstring = self.year
            for JE in ["JEC", "JER"]:
                for direction in ["up", "down"]:
                    JECJER_path = base_path + self.year + "/" + JE + "_" + direction + "/hadded/"

                    for process in mc_processes:
                        # Read the "_nominal" histogram of the *varied* sample. Reading it from
                        # shape_path instead would store the unvaried nominal histogram as
                        # both the Up and the Down template.
                        pname = JECJER_path + file_prefix + "MC." + process + ".root"
                        write_histogram(pname, hin_base + "_nominal",
                                        process + "_" + JE + "_" + correlationstring + direction.capitalize(), fout)

            # datadriven
            datadriven_base_path = "/nfs/dust/cms/user/flabe/TstarTstar/data/DNN_datadriven"

            pname = datadriven_base_path + "/" + self.year + "/hadded/uhh2.AnalysisModuleRunner.DATA.datadrivenBG.root"
            baseline = "SignalRegion_" + self.channel + "/" + sensitiveVariable + "_nominal"
            write_histogram(pname, baseline, "datadriven", fout)

            # datadriven variations
            variations = "SR_datadrivenUp_" + self.channel + "/" + sensitiveVariable + "_nominal"
            write_histogram(pname, variations, "datadriven_datadrivenFitFunction_UL16UL17UL18Up", fout)
            variations = "SR_datadrivenDown_" + self.channel + "/" + sensitiveVariable + "_nominal"
            write_histogram(pname, variations, "datadriven_datadrivenFitFunction_UL16UL17UL18Down", fout)

            print("ATTENTION SKIP SOME FOR TESTS")
            return 1

            # NOTE: everything below is unreachable while the early return above is in place.
            # PDF & scale
            external_base_path = "/nfs/dust/cms/user/flabe/TstarTstar/ULegacy/CMSSW_10_6_28/src/UHH2/TstarTstar/macros/rootmakros/files"

            correlationstring = "UL16UL17UL18"
            for what in ["PDF", "scale"]:
                for process in self.processes:
                    if not process == "datadriven":
                        pname = external_base_path + "/" + what + "_" + self.year + "_" + process + ".root"
                        write_histogram(pname, process + "_" + what +"_up",
                                                process + "_" + what + "_" + correlationstring + "Up", fout)
                        write_histogram(pname, process + "_" + what + "_down",
                                                process + "_" + what + "_" + correlationstring + "Down", fout)


In [5]:
# Basic configuration used by the cells below

# data-taking periods (UL16 will be combined by hadding)
years = ["UL16", "UL17", "UL18"]
# the electron and muon channels are treated separately
channels = ["electron", "muon"]
# only the signal and the top backgrounds are taken from MC
MC_samples = [f"TstarTstar_M-{masspoint}", "TTbar", "ST"]

In [6]:
# Normalisation (lnN) uncertainties: nuisance name -> sample -> year -> value.
# A '-' marks a year in which the nuisance is not applied; every MC sample gets the same values.
_lumi_values_per_year = {
    "lumi_13TeV_UL16": {"UL16": 1.01, "UL17": '-', "UL18": '-'},
    "lumi_13TeV_UL17": {"UL16": '-', "UL17": 1.02, "UL18": '-'},
    "lumi_13TeV_UL18": {"UL16": '-', "UL17": '-', "UL18": 1.015},
    "lumi_13TeV_UL16UL17UL18": {"UL16": 1.006, "UL17": 1.009, "UL18": 1.02},
    "lumi_13TeV_UL17UL18": {"UL16": '-', "UL17": 1.006, "UL18": 1.002},
}

norm_uncertainties = {
    nuisance: {sample: dict(values) for sample in MC_samples}  # one independent dict per sample
    for nuisance, values in _lumi_values_per_year.items()
}
norm_uncertainties

{'lumi_13TeV_UL16': {'TstarTstar_M-2500': {'UL16': 1.01,
   'UL17': '-',
   'UL18': '-'},
  'TTbar': {'UL16': 1.01, 'UL17': '-', 'UL18': '-'},
  'ST': {'UL16': 1.01, 'UL17': '-', 'UL18': '-'}},
 'lumi_13TeV_UL17': {'TstarTstar_M-2500': {'UL16': '-',
   'UL17': 1.02,
   'UL18': '-'},
  'TTbar': {'UL16': '-', 'UL17': 1.02, 'UL18': '-'},
  'ST': {'UL16': '-', 'UL17': 1.02, 'UL18': '-'}},
 'lumi_13TeV_UL18': {'TstarTstar_M-2500': {'UL16': '-',
   'UL17': '-',
   'UL18': 1.015},
  'TTbar': {'UL16': '-', 'UL17': '-', 'UL18': 1.015},
  'ST': {'UL16': '-', 'UL17': '-', 'UL18': 1.015}},
 'lumi_13TeV_UL16UL17UL18': {'TstarTstar_M-2500': {'UL16': 1.006,
   'UL17': 1.009,
   'UL18': 1.02},
  'TTbar': {'UL16': 1.006, 'UL17': 1.009, 'UL18': 1.02},
  'ST': {'UL16': 1.006, 'UL17': 1.009, 'UL18': 1.02}},
 'lumi_13TeV_UL17UL18': {'TstarTstar_M-2500': {'UL16': '-',
   'UL17': 1.006,
   'UL18': 1.002},
  'TTbar': {'UL16': '-', 'UL17': 1.006, 'UL18': 1.002},
  'ST': {'UL16': '-', 'UL17': 1.006, 'UL18': 1.0

In [7]:
# Shape uncertainties whose histograms are named "pt_ST_<systematic><Variation>",
# with <Variation> being "Up" or "Down".
# By construction these are only relevant for the processes taken from MC.
# Structure: systematic name -> [samples it applies to, groups of years that are correlated].
_shape_correlation_scheme = [
    # (systematic, fully correlated between years?)
    ("pu", True),
    ("prefiring", False),
    ("btagging_hf", True),
    ("btagging_hfstats1", False),
    ("btagging_hfstats2", False),
    ("btagging_lf", True),
    ("btagging_lfstats1", False),
    ("btagging_lfstats2", False),
    ("btagging_cferr1", True),
    ("btagging_cferr2", True),
    ("sfelec_id", True),
    ("sfelec_reco", True),
    ("sfelec_trigger", False),
    ("sfmu_id", False),
    ("sfmu_iso", False),
    ("sfmu_trigger", False),
]

shape_uncertanties = {
    name: [
        MC_samples,
        # one group holding all years, or one single-year group per year
        [years] if fully_correlated else [[single] for single in ("UL16", "UL17", "UL18")],
    ]
    for name, fully_correlated in _shape_correlation_scheme
}

# Still to be handled in addition: murmuf variations, JEC/JER and PDFs.
# These are usually stored in separate files.

In [8]:
# Build the datacard for the last entry of `years`; the constructor writes the .dat file right away.
# NOTE(review): the channel "mu" is not one of the entries of `channels` ("electron", "muon") —
# confirm it matches the folder names in the input files ("SignalRegion_<channel>").
datacard = Datacard(years[2], masspoint, "pt_ST", "mu", "SR", MC_samples)

In [9]:
# Write the ROOT file holding the shape histograms for this datacard.
datacard.create_rootfile()

ATTENTION SKIP SOME FOR TESTS


1