In [1]:
from __future__ import print_function
import functools
import numpy as np
import pandas as pd
import ROOT;
import root_numpy as rnp

Welcome to ROOTaaS 6.06/06


# Loading the input into numpy

Using root_numpy to make structured arrays.

In [100]:
# Alternative input (the jetjet_JZ3W sample), kept commented out for quick switching:
#filename = '/Users/sfarrell/Atlas/xaod/mc15_13TeV.361023.Pythia8EvtGen_A14NNPDF23LO_jetjet_JZ3W.merge.DAOD_EXOT3.e3668_s2576_s2132_r7728_r7676_p2613/DAOD_EXOT3.08204445._000002.pool.root.1'
# Active input (the GG_RPV10_1000_250 sample). Adjacent string literals are
# concatenated by the parser: base directory, dataset directory, file name.
filename = (
    '/Users/sfarrell/Atlas/xaod/'
    'mc15_13TeV.403554.MadGraphPythia8EvtGen_A14NNPDF23LO_GG_RPV10_1000_250.merge.DAOD_EXOT3.e5079_a766_a821_r7676_p2646/'
    'DAOD_EXOT3.08548063._000001.pool.root.1'
)

In [101]:
# Maps each branch to read from the tree onto the short field name used in
# the rest of the notebook. The two container prefixes are factored out so
# only the per-variable suffix differs from line to line.
clusterPrefix = 'CaloCalTopoClustersAuxDyn.'
fatJetPrefix = 'AntiKt10LCTopoTrimmedPtFrac5SmallR20JetsAux.'
branchMap = {
    clusterPrefix + 'calEta': 'ClusEta',
    clusterPrefix + 'calPhi': 'ClusPhi',
    clusterPrefix + 'calE': 'ClusE',
    fatJetPrefix + 'pt': 'FatJetPt',
    fatJetPrefix + 'eta': 'FatJetEta',
    fatJetPrefix + 'phi': 'FatJetPhi',
    fatJetPrefix + 'm': 'FatJetM',
}

In [103]:
# Read the requested branches of the CollectionTree into a structured array.
# The branch list is materialised once and reused for the renaming below, so
# the (branch -> short name) pairing is explicit and does not depend on
# dict.keys() and dict.values() being sequences (they are views on Python 3,
# which dtype.names does not accept).
branchNames = list(branchMap)
entries = rnp.root2array(filename, treename='CollectionTree',
                         branches=branchNames)
# To read only a subset of events while testing, pass e.g. start=0, stop=10000.

# Rename the fields to the short names. This relies on root2array returning
# the fields in the order the branches were requested, as the original
# keys()/values() pairing also did.
entries.dtype.names = tuple(branchMap[name] for name in branchNames)
print('Entries:', entries.size)

Entries: 9973


In [104]:
# Display the structured dtype of the loaded array: one named field per
# renamed branch.
entries.dtype

dtype([('FatJetPhi', 'O'), ('FatJetEta', 'O'), ('FatJetM', 'O'), ('ClusEta', 'O'), ('ClusE', 'O'), ('ClusPhi', 'O'), ('FatJetPt', 'O')])

# Indexing and selection with numpy
Since the data is stored in a structured array, we can index it by field name and apply selections either to a single event or to all events at once.

In [105]:
# Two ways to dump one variable for a specific event; both print the same values:
#  1. pick the event (row) first, then the field within that record
#  2. pick the field (column) first, then the event within that column
print(entries[0]['FatJetPt'])
print(entries['FatJetPt'][0])

[ 556584.4375      295621.21875     201845.921875    161042.140625
   93007.1640625    71875.1484375    69584.0625       58491.34765625
   56537.53125      50488.59375      48581.41796875   45340.34375
   42120.62890625]
[ 556584.4375      295621.21875     201845.921875    161042.140625
   93007.1640625    71875.1484375    69584.0625       58491.34765625
   56537.53125      50488.59375      48581.41796875   45340.34375
   42120.62890625]


In [106]:
# Perform an object selection on one event. The comparison is element-wise,
# so the displayed result holds one boolean per fat jet in that event.
# NOTE(review): 300000 is presumably 300 GeV with pt stored in MeV — confirm.
event = entries[3]
event['FatJetPt'] > 300000

array([ True, False, False, False, False, False, False, False, False, False], dtype=bool)

In [107]:
# Select fatjets with pt > 200 GeV for all events in one go.
# Each element of entries['FatJetPt'] is itself an array (one per event), so
# the per-event cut is wrapped with np.vectorize and an object output type.
def passesPtCut(jetPts):
    """Return the boolean mask of jets in one event above the pt threshold."""
    return jetPts > 200000

f = np.vectorize(passesPtCut, otypes=[np.ndarray])
selectedJets = f(entries['FatJetPt'])
print(selectedJets)

[ array([ True,  True,  True, False, False, False, False, False, False,
       False, False, False, False], dtype=bool)
 array([ True,  True,  True, False, False, False, False, False, False], dtype=bool)
 array([ True,  True,  True,  True, False, False, False, False, False,
       False, False], dtype=bool)
 ...,
 array([ True,  True,  True,  True, False, False, False, False, False,
       False, False, False, False, False, False, False], dtype=bool)
 array([ True,  True,  True,  True, False, False, False, False, False,
       False, False, False], dtype=bool)
 array([ True,  True, False, False, False, False, False], dtype=bool)]


In [108]:
# Count the selected jets in every event (summing a boolean mask counts its
# True entries), then flag the events that have at least 2 of them.
countSelected = np.vectorize(sum)
numJets = countSelected(selectedJets)
selectedEvents = np.greater_equal(numJets, 2)
print(numJets)
print(selectedEvents)

[3 3 4 ..., 4 4 2]
[ True  True  True ...,  True  True  True]


## Physics selections and variables

In [165]:
# Unit constants, relative to the base unit of the stored values
# (presumably MeV, given how the cuts in this notebook are written).
GeV = 1e3
TeV = 1e3 * GeV

# Decorator that vectorizes a function returning an array: np.vectorize with
# the output type fixed to object, so each call may return a whole ndarray.
array_vectorize = functools.partial(
    np.vectorize,
    otypes=[np.ndarray],
)

@array_vectorize
def selectFatJets(fatJetPts, fatJetEtas):
    """Build the fat-jet selection mask for one event.

    A jet passes when its pt is above 200 GeV and its |eta| is below 2.
    Both inputs are numpy arrays with one entry per jet and are expected to
    have the same shape; the result is a boolean array of that shape.
    """
    return np.logical_and(fatJetPts > 200*GeV, np.fabs(fatJetEtas) < 2.)

@np.vectorize
def selectBaselineEvents(selectedFatJets, fatJetPts):
    """Baseline event selection
    Selects events with at least 3 selected fat jets and requires the leading
    fat jet to have PT > 440 GeV to satisfy the trigger efficiency plateau.

    Arguments (per event):
        selectedFatJets: boolean mask of the fat jets passing the jet selection
        fatJetPts: array of fat jet pt values
    Returns True if the event passes both requirements, False otherwise.
    """
    # Check the multiplicity first: an event failing it is rejected without
    # looking at fatJetPts, so an event with no fat jets at all is simply
    # rejected instead of making np.max raise on an empty array.
    if np.sum(selectedFatJets) < 3:
        return False
    # The leading-jet requirement uses the highest pt among all fat jets
    return bool(np.max(fatJetPts) > 440*GeV)

@np.vectorize
def calculateSumJetMass(selectedFatJets, fatJetMs):
    """Sum the masses of up to the first four selected fat jets of one event.

    The jets are taken in stored order, which is assumed (not checked) to be
    decreasing PT, so these are meant to be the four leading selected jets.
    """
    selectedMasses = fatJetMs[selectedFatJets]
    firstFour = selectedMasses[:4]
    return firstFour.sum()

@np.vectorize
def dEtaLeadingFatJets(selectedFatJets, fatJetEtas):
    """Calculates |dEta| between the first two selected fat jets of one event.

    The jets are taken in stored order, which is assumed (not checked) to be
    decreasing PT, so these are meant to be the two leading selected jets.

    Arguments (per event):
        selectedFatJets: boolean mask of the fat jets passing the jet selection
        fatJetEtas: array of fat jet eta values, same shape as the mask
    Raises AssertionError if fewer than two jets are selected.
    """
    # Count the jets passing the mask rather than the mask length: the
    # two-value unpacking below needs two *selected* jets.
    assert np.sum(selectedFatJets) >= 2, 'need at least two selected fat jets'
    eta1, eta2 = fatJetEtas[selectedFatJets][:2]
    return abs(eta1 - eta2)

@np.vectorize
def selectSignalEvents(isBaseline, selectedFatJets, summedMass, fatJetEtas):
    """Decide whether one event passes the signal region selection.

    Requirements, applied in order:
      - the event passes the baseline selection
      - at least 4 selected fat jets
      - |dEta| between the first two selected fat jets is at most 1.4
      - summed jet mass above 800 GeV for exactly 4 selected fat jets,
        or above 600 GeV for 5 or more
    """
    if not isBaseline:
        return False
    nSelected = np.sum(selectedFatJets)
    leadingEta, subleadingEta = fatJetEtas[selectedFatJets][:2]
    if nSelected < 4:
        return False
    if abs(leadingEta - subleadingEta) > 1.4:
        return False
    # Exactly 4 jets gets the tighter mass cut; 5 or more gets the looser one
    massThreshold = 800*GeV if nSelected == 4 else 600*GeV
    return (summedMass > massThreshold)

In [166]:
# Try the selections on the loaded events: build the per-event jet masks,
# apply the baseline selection, and report how many events survive.
selectedFatJets = selectFatJets(entries['FatJetPt'], entries['FatJetEta'])
baselineEvents = selectBaselineEvents(selectedFatJets, entries['FatJetPt'])
numBaseline, numTotal = np.sum(baselineEvents), entries.size
print('Baseline selected events: %d / %d' % (numBaseline, numTotal))

Baseline selected events: 7045 / 9973


In [167]:
# Summed jet mass for every event; only the baseline events are printed
summedMasses = calculateSumJetMass(selectedFatJets, entries['FatJetM'])
baselineSummedMasses = summedMasses[baselineEvents]
print(baselineSummedMasses)

[  62587.33691406  641813.953125    688428.45507812 ...,  702532.22558594
  343132.76757812  495058.11132812]


In [168]:
# Apply the signal region selection on top of the baseline flags and report
# how many events survive out of the full sample.
signalEvents = selectSignalEvents(
    baselineEvents,
    selectedFatJets,
    summedMasses,
    entries['FatJetEta'],
)
numSignal = np.sum(signalEvents)
print('Signal events: %d / %d' % (numSignal, entries.size))

Signal events: 254 / 9973


# Indexing and selection with pandas
I'm not sure this stuff is useful, but I'm probably not yet familiar enough with pandas.

In [11]:
# Build a DataFrame from the structured array: one column per field, one row
# per event, with each cell holding that event's array of values.
df = pd.DataFrame.from_records(entries)
# NOTE(review): this displays the whole frame; consider df.head() to keep the
# rendered output small.
df

Unnamed: 0,FatJetPhi,FatJetEta,FatJetM,ClusEta,ClusE,ClusPhi,FatJetPt
0,"[0.00596846779808, 2.87110900879, -1.84715402126]","[-0.954404592514, -1.61063969135, 2.0010535717]","[61639.0390625, 10999.8671875, 13028.1142578]","[-1.6227, -0.943131, -0.923665, -1.02153, -1.5...","[498223.0, 134677.0, 74380.8, 52298.0, 45132.2...","[2.87337, 0.105046, 0.0983033, 0.0939832, 2.86...","[232080.96875, 221981.203125, 57221.3164062]"
1,"[0.66264218092, -2.33152222633, -2.80688095093]","[-1.68494164944, 1.09755623341, -0.360555291176]","[161539.8125, 105711.546875, 32847.0742188]","[-1.34636, -2.14971, 0.876069, -1.32248, -1.25...","[254646.0, 493897.0, 166191.0, 123737.0, 76182...","[0.713631, 0.563024, -2.13269, 0.772785, 0.723...","[380771.625, 267307.875, 83987.46875]"
2,"[2.68377804756, -1.07684326172, 0.519797444344...","[-0.257283210754, 1.31243598461, -2.1067028045...","[41038.1914062, 28292.6894531, 7954.24072266, ...","[-0.355851, 1.42148, -0.236781, -0.279007, 1.3...","[106985.0, 121855.0, 50254.8, 40061.0, 65259.4...","[2.69817, -1.14446, 2.54611, 2.6899, -1.10427,...","[343135.0, 144681.859375, 108444.523438, 96598..."
3,"[-2.94045710564, 0.313296169043]","[2.46938467026, 0.30729714036]","[33095.2695312, 27802.3574219]","[2.49492, 0.28934, 2.44046, 2.46279, 0.266242,...","[679644.0, 112411.0, 457538.0, 248921.0, 44433...","[-2.99868, 0.372021, -2.88921, -2.93311, 0.293...","[332282.34375, 264231.71875]"
4,"[-2.19297027588, 1.13155245781]","[-0.219372928143, -1.07450163364]","[43562.7539062, 13052.03125]","[-1.07233, -1.09638, -0.133159, -0.112744, -0....","[142078.0, 90911.1, 51356.1, 24854.0, 24746.2,...","[1.11448, 1.13409, -2.13287, -2.09358, -2.2086...","[217022.5625, 194112.046875]"
5,"[0.276071876287, -2.77678489685, 2.91632199287...","[-2.21385741234, 0.795198857784, -1.8358240127...","[78253.46875, 95443.828125, 20252.1992188, 156...","[-2.19028, 0.438731, -2.12773, 1.11994, -2.325...","[363842.0, 102581.0, 321279.0, 49539.6, 130510...","[0.371298, -2.89956, 0.354993, -2.82553, 0.351...","[343388.25, 257815.546875, 103880.03125, 48012..."
6,"[-2.29030370712, 0.937134921551, 0.752174854279]","[-1.16092443466, 2.77408599854, -1.18175673485]","[40867.0664062, 13770.5966797, 2712.07250977]","[2.7437, -1.16182, -1.14853, -1.13438, -1.1819...","[831665.0, 195261.0, 61551.7, 47485.0, 48263.6...","[0.924463, -2.34814, -2.34821, -2.37065, -2.30...","[271225.875, 143822.78125, 115238.101562]"
7,"[-2.82683777809, 0.138846471906, 0.996263563633]","[-1.69611680508, -0.994612157345, -2.65157485008]","[78389.4140625, 37778.734375, 17217.7460938]","[-0.968674, -1.56852, -1.52438, -1.59856, -1.0...","[297711.0, 326939.0, 160782.0, 82174.1, 36549....","[0.128344, -2.81806, -2.83585, -2.7594, 0.0807...","[343359.125, 306999.6875, 48254.609375]"
8,"[2.32210016251, -0.785404801369]","[0.899966597557, 1.38102436066]","[81008.109375, 49577.9414062]","[0.860151, 1.45404, 0.905788, 0.887156, 1.3249...","[269632.0, 273276.0, 97259.9, 67126.0, 48833.6...","[2.39106, -0.740536, 2.40817, 2.26278, -0.8403...","[343885.625, 293594.78125]"
9,"[0.454613238573, -2.60988664627]","[-1.22083210945, -0.0625708028674]","[45789.78125, 43100.296875]","[-1.21689, -0.110082, -1.16486, -0.0648095, -1...","[209207.0, 83647.7, 97040.7, 53129.9, 97680.5,...","[0.411506, -2.58741, 0.314041, -2.55891, 0.479...","[354046.125, 343449.78125]"


# Timing tests
These are informal micro-benchmarks comparing alternative implementations; they are not part of the analysis and can be skipped.

In [131]:
# Compare two ways of picking the four largest jet masses of the first event:
# a full sort versus a partial partition. The partition variant yields the
# same four values, but not necessarily in sorted order.
fatJetMs = entries['FatJetM'][0]
%timeit np.sort(fatJetMs)[-4:]
%timeit fatJetMs[np.argpartition(fatJetMs, -4)[-4:]]

The slowest run took 16.97 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 2.09 µs per loop
The slowest run took 14.98 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 2.34 µs per loop


In [149]:
x = np.random.random(10000) - 0.5
%timeit np.absolute(x) > 0.1
%timeit np.logical_and(x > 0.1, x < -0.1)

The slowest run took 11.52 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 8.42 µs per loop
The slowest run took 17.07 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 9.56 µs per loop


In [90]:
# Compare the two indexing orders for reaching one event's FatJetPt array:
# event (row) first versus field (column) first.
%timeit entries[0]['FatJetPt']
%timeit entries['FatJetPt'][0]

The slowest run took 18.19 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 3: 603 ns per loop
The slowest run took 14.15 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 3: 219 ns per loop


In [89]:
def f1():
    """Benchmark body: loop over events, look up FatJetPt inside each record,
    then loop over that event's jets. Does no work per jet."""
    for entry in entries:
        for jet in entry['FatJetPt']:
            pass
        
def f2():
    """Benchmark body: take the FatJetPt column once, loop over its per-event
    arrays, then loop over each event's jets. Does no work per jet."""
    for jets in entries['FatJetPt']:
        for jet in jets:
            pass

# Time the record-first loop (f1) against the column-first loop (f2)
%timeit f1()
%timeit f2()

100 loops, best of 3: 5.22 ms per loop
100 loops, best of 3: 3.28 ms per loop
