# Coffea processor


In [None]:
from coffea import hist, util

import coffea.processor as processor
from coffea.nanoevents import NanoEventsFactory, NanoAODSchema
import awkward as ak
import uproot

from pprint import pprint

The code below introduces some basic concepts for writing code using Coffea.

There are three primary pieces to the Coffea code:

The processor, which contains all of the analysis cuts and fills the histogram in the process function.
The second cell defines the files we want to run over and then runs the code using run_uproot_job.
After we run the processor, we can then plot any of the histograms we have generated.
To test any changes you make to the histograms, you will have to rerun each of the three cells below.



In [None]:
class MuonSelector(processor.ProcessorABC):
    """Coffea processor that histograms the pt of tight muons.

    Events are kept when they contain exactly one muon passing the tight
    selection; the pt of the tight muons in those events is filled into the
    ``muon_pt`` histogram, categorised by dataset name.
    """

    def __init__(self):
        # Every output the processor produces (here a single histogram) is
        # declared up front and stored in the accumulator dictionary, which
        # is filled later in process().
        axes = (
            # Categorical axis holding the dataset name
            hist.Cat("dataset", "Dataset"),
            # Muon pt axis: 40 bins between 0 and 200
            hist.Bin("pt", "$p_{T}$ [GeV]", 40, 0, 200),
        )
        histograms = {'muon_pt': hist.Hist("Counts", *axes)}
        self._accumulator = processor.dict_accumulator(histograms)

    @property
    def accumulator(self):
        """The accumulator dictionary built in ``__init__``."""
        return self._accumulator

    def process(self, events):
        """Apply the object and event selection to ``events`` and fill histograms.

        The dataset name is read from ``events.metadata['dataset']`` and the
        muon collection from ``events.Muon``. Returns the filled accumulator.
        """
        # Start from the accumulator defined in __init__
        output = self.accumulator.identity()
        dataset = events.metadata['dataset']

        # All muon variables (columns) are reachable as attributes of events.Muon
        muons = events.Muon

        # Tight muon definition: pt > 30, |eta| < 2.4, tight ID, relIso < 0.15.
        # Each requirement is a per-muon boolean mask; they are AND-ed together.
        passes_pt = muons.pt > 30
        passes_eta = abs(muons.eta) < 2.4
        passes_id = muons.tightId
        passes_iso = muons.pfRelIso04_all < 0.15
        is_tight = passes_pt & passes_eta & passes_id & passes_iso

        # array[mask] keeps only the muons passing the tight selection
        tight_muons = muons[is_tight]

        # Event-level requirement: exactly one tight muon
        has_one_tight_muon = ak.num(tight_muons) == 1

        # Histograms need flat input, so the jagged pt array is flattened
        # before filling.
        selected_pt = ak.flatten(tight_muons[has_one_tight_muon].pt)
        output['muon_pt'].fill(dataset=dataset, pt=selected_pt)

        return output

    def postprocess(self, accumulator):
        """Return the accumulator unchanged."""
        return accumulator

In [None]:
# Datasets to run over: each key is a dataset name, each value is the list of
# skim files belonging to that dataset.
skimDir = "/udrive/staff/dnoonan/Skims"
fileset = {
    "TTGamma": [
        skimDir + "/TTGamma_SingleLept_2016_skim.root",
    ],
    "TTbar": [
        skimDir + "/TTbarPowheg_Semilept_2016_skim_1of10.root",
        skimDir + "/TTbarPowheg_Semilept_2016_skim_2of10.root",
    ],
    "WGamma": [
        skimDir + "/WGamma_2016_skim.root",
    ],
}


class SkimmedSchema(NanoAODSchema):
    """NanoAODSchema adjusted for the skims: the cross reference from muons
    to FSRPhotons is removed before the schema is built."""

    def __init__(self, base_form):
        # Drop the cross-reference entry (no error if it is already absent),
        # then hand the form to the standard NanoAODSchema initializer.
        contents = base_form["contents"]
        contents.pop("Muon_fsrPhotonIdx", None)
        super().__init__(base_form)

# Run the processor over every dataset in fileset, reading the files with uproot
output = processor.run_uproot_job(
    fileset,                     # dictionary of datasets to run on, defined earlier in this cell
    "Events",                    # name of the TTree opened in each file
    MuonSelector(),              # instance of the Coffea processor defined above
    processor.futures_executor,  # executor used to run the jobs
    chunksize=100000,            # in each chunk, use 100k events
    executor_args=dict(schema=SkimmedSchema, workers=4),  # workers = 4, parallelize jobs, running 4 at once
#    maxchunks=3, #limit to using only 3 chunks for each dataset (useful for testing purposes)
)


In [None]:
# Plot the muon_pt histogram from the processor output, overlaying the
# entries of the 'dataset' axis and stacking them.
hist.plot1d(output['muon_pt'],overlay='dataset',stack=True)

### Histogram Scaling

When comparing a Monte-Carlo to Data, we need to scale the MC to the number of events we expect to see in a given amount of data.

$\text{N}_\text{expected} = \sigma \cdot L$

 - $\text{N}_\text{expected}$ = Number of events expected
 - $\sigma$ = cross section of a specific process
 - $L$ = integrated luminosity of data
 
In MC, we often generate far more events than we expect (for better statistical uncertainties), so we need to rescale the MC distributions.  This is done by reweighting each MC dataset, where the weight applied is the ratio of the number of events expected to the number of events produced in the MC ($\text{N}_\text{MC}$)

$\Huge w = \frac{\text{N}_\text{expected}}{\text{N}_\text{MC}} = \frac{\sigma \cdot L}{\text{N}_\text{MC}}$

The number of events in MC and the cross section will change for each dataset

#### Cross sections
| Process | Cross Section (pb) |
| :--- | :---: |
| TTGamma (single lepton) | 7.509 |
| TTbar (single lepton) | 380.095 |
| WGamma | 489 |

#### Number of events

The $\text{N}_\text{MC}$ value should be the total number of events produced in the MC sample.
Normally, in NanoAOD, you could keep track of the number of events that are in the files you process (tallying the total number of events in each sample, across all chunks processed).

However, since we are running on skims of the full MC sets, some of the events have already been removed.  In this case, we instead get the total from the `hEvents` histogram stored in each skim file, which tracks the number of events processed while producing the skim.


In [None]:
#the code below uses uproot to open a histogram in the root file, used to track the total
#  number of events processed while producing the skim

# nEvents maps each dataset name in fileset to the event total summed over its files
nEvents = {}
for d in fileset:
    nEvents[d] = 0
    for fName in fileset[d]:
        # Use the opened file as the context manager so it is closed once the
        # histogram has been read. The names deliberately avoid `hist`: binding
        # that name here would overwrite the coffea `hist` module imported at
        # the top of the notebook and break every later cell that uses it.
        with uproot.open(fName) as skimFile:
            eventCounts = skimFile['hEvents'].values()
            # NOTE(review): bins 0 and 2 of hEvents are summed; the meaning of
            # the individual bins is not visible here -- confirm against the
            # code that produced the skims.
            nEvents[d] += eventCounts[0] + eventCounts[2]


### Calculate weights

Make a dictionary, containing the weights to apply for each dataset in fileset
The dictionary should have the same key names as are in fileset, since these are what get used as the 'dataset' in the histogram axis.

The actual CMS data you are using corresponds to an integrated luminosity $L = X.XX \text{fb}^{-1}$ 

In [None]:
###############
## To Do
## Make new dictionary named weights, containing the luminosity and cross section based weights for each sample
###############



In [None]:
#loop over objects in the output, and scale them to the expected number of events using the weights

## Next Steps

### Z-boson selector

In this step, we're going to try to find events from a Z boson.

You are going to implement a selection looking for events with exactly two leptons that have opposite charge.  These leptons should pass the same 'tight' selections used in the previous notebook for selecting electrons and muons.

We are looking for events that have exactly two leptons of one flavor (either two electrons, or two muons) and where the two leptons have opposite charge (one electron and one positron, or one muon and one antimuon).

Then, make the following plots:
 - $p_T$ of the leading muon in the event
 - $p_T$ of the leading electron in the event
 - Mass of the combination of the two leptons
 - Difference between the two leptons in eta
 - Difference between the two leptons in phi
 - Difference between the two leptons in R


In [None]:
class Zselector(processor.ProcessorABC):
    """Coffea processor selecting Z-boson candidate events in the muon channel.

    Events are kept when they contain exactly two tight muons of opposite
    charge; the pt of the tight muons in those events is filled into the
    ``muon_pt`` histogram. The electron channel and the remaining plots
    (dilepton mass, delta eta, delta phi, delta R) are still to be added.
    """

    def __init__(self):
        ### This function is where the histograms are defined and any other initialization happens

        ### Muon pt
        #Declare an axis for the dataset
        dataset_axis = hist.Cat("dataset","Dataset")

        #Declare an axis for the muon pt
        muon_pt_axis = hist.Bin("pt","$p_{T}$ [GeV]", 40, 0, 200)

        #Define the accumulator object, a dictionary storing all of the histograms and counters
        #that we will fill later in the process function
        self._accumulator = processor.dict_accumulator({
            'muon_pt': hist.Hist("Counts", dataset_axis, muon_pt_axis),
        }
        )

    @property
    def accumulator(self):
        """The accumulator dictionary built in ``__init__``."""
        return self._accumulator

    def process(self, events):
        """Apply the muon and event selection to ``events`` and fill histograms.

        The dataset name is read from ``events.metadata['dataset']`` and the
        muon collection from ``events.Muon``. Returns the filled accumulator.
        """
        ## This gets us the accumulator dictionary we defined in init
        output = self.accumulator.identity()

        ## To access variables from the ntuples, use the "events" object
        ## The dataset name is part of events.metadata
        dataset = events.metadata['dataset']

        muons = events.Muon

        # Same tight muon definition as in MuonSelector:
        # pt > 30, |eta| < 2.4, tight ID, and relIso < 0.15
        muonSelectTight = ((muons.pt>30) &
                           (abs(muons.eta)<2.4) &
                           (muons.tightId) &
                           (muons.pfRelIso04_all < 0.15)
                          )
        tightMuons = muons[muonSelectTight]

        # Select events with exactly two tight muons of opposite charge.
        # With exactly two muons, opposite charge means the charges sum to zero.
        # NOTE(review): assumes the skims keep the Muon charge column -- confirm.
        eventSelection = ((ak.num(tightMuons)==2) &
                          (ak.sum(tightMuons.charge, axis=1)==0))

        # Fill the muon_pt histogram using the tightMuons in events that pass our selection
        # Note that ak.flatten() is required when filling a histogram to remove the jaggedness
        output['muon_pt'].fill(dataset=dataset,
                              pt=ak.flatten(tightMuons[eventSelection].pt))

        ######
        ### Step 4. Fill the ele_pt histogram you defined earlier
        ######


        ######

        return output

    def postprocess(self, accumulator):
        """Return the accumulator unchanged."""
        return accumulator