# Awkward Array toy for NanoAOD builder function

### ak.Array

In [1]:
import awkward as ak
import numpy as np
import pandas as pd

Defining a simple ak array

In [2]:
a = ak.Array([[1, 2, 3], [], [4, 5]])

ak.num gives us a way in which we can store information about the number of subentries

In [3]:
subs = ak.num(a)
print(subs)
sum = ak.sum(subs)
print(sum)

[3, 0, 2]
5


We can flatten and unflatten easily

In [4]:
flat = ak.flatten(a, axis=1) # flatten along the subentry axis
print(flat)

ak.unflatten(flat, subs)

[1, 2, 3, 4, 5]


We also can apply a mask over the ak.Array, using the entries substructure, just like the efficiency net output

In [5]:
flat_mask = np.array([True, False, False, False, True])

import numpy as np

np.random.seed(0)

x = np.random.rand(5)
y = np.random.rand(5)

flat_mask = x > y

print(flat_mask)

mask = ak.unflatten(flat_mask, subs)

print(mask)

[False  True False False  True]
[[False, True, False], [], [False, True]]


Then we have 2 ways in which we can apply the mask:
* using ak.mask, keeping `None` as a placeholder for masked-out entries 

In [6]:
ak.mask(a, mask)

* using the __getitem__() method, since it works as numpy

In [7]:
a[mask]

We can also concatenate event-wise

In [8]:
b = ak.Array([[1], [2, 3], []])

ak.concatenate([a, b], axis=1)

or just append the new events to the old array

In [9]:
ak.concatenate([a, b], axis=0)

### ak.Record aka awkward RDataFrames

#### Basic operations

Now we simulate a RDataFrame data structure and see how it works. ak.from_rdataframe() returns an ak.Array with the following structure:
```
[{event_0 all_vars}, {varX : [valX_0, valX_1, ...], varY : [valY_0, ...]}, {...}, ...]
```

In [10]:
awk = ak.Array(
    [
        {"Electron_pt": [21.6, 15], "Electron_eta": [0.108, 18], "Muon_pt": [15.8]},
        {"Electron_pt": [], "Electron_eta": [], "Muon_pt": [16.2]},
        {"Electron_pt": [56], "Electron_eta": [0.6], "Muon_pt": []},
    ]
)

awk.show(type=True)

type: 3 * {
    Electron_pt: var * float64,
    Electron_eta: var * float64,
    Muon_pt: var * float64
}
[{Electron_pt: [21.6, 15], Electron_eta: [0.108, 18], Muon_pt: [15.8]},
 {Electron_pt: [], Electron_eta: [], Muon_pt: [16.2]},
 {Electron_pt: [56], Electron_eta: [0.6], Muon_pt: []}]


We can select multiple columns

In [11]:
awk[["Electron_pt", "Electron_eta"]]

Entries structures

In [12]:
ev = ak.num(awk, axis=0) # "events"
print(ev)

sub = ak.num(awk, axis=1) # "subentries"
print(sub)

3
[{Electron_pt: 2, Electron_eta: 2, Muon_pt: 1}, ..., {Electron_pt: 1, ...}]


We cannot flatten these kind of structures, but we can for its content


In [13]:
pt = ak.flatten(awk["Muon_pt"], axis=1)
n = ak.num(awk["Muon_pt"], axis=1)
pt.show(type=True)

ak.unflatten(pt, n)

type: 2 * float64
[15.8,
 16.2]


The same holds for masking, since we are not able to unflatten to the entire structure

Converting to Pandas dataframe is easy, but we should pay attention on columns lenght, as always

In [60]:
awk = ak.Array(
    [
        {"Electron_pt": [21.6, 15, 45], "Electron_eta": [0.108, 18, 0.6]},
        {"Electron_pt": [], "Electron_eta": []},
        {"Electron_pt": [56, 12, 30], "Electron_eta": [0.6, 0.4, 0.3]},
    ]
)

awk = awk[awk.Electron_pt > 20]
awk.show()

df = ak.to_dataframe(awk)
print(df) # we lost the second event muon

# df = ak.to_dataframe(awk[["Muon_pt"]])
# # print(df)

n = ak.num(awk["Electron_pt"], axis=1)
print(n)

df = ak.to_dataframe(awk[["Electron_pt", "Electron_eta"]]).reset_index(drop=True)
# print(df)


[{Electron_pt: [21.6, 45], Electron_eta: [0.108, ...]},
 {Electron_pt: [], Electron_eta: []},
 {Electron_pt: [56, 30], Electron_eta: [0.6, 0.3]}]
                Electron_pt  Electron_eta
entry subentry                           
0     0                21.6         0.108
      1                45.0         0.600
2     0                56.0         0.600
      1                30.0         0.300
[2, 0, 2]


This is the kind of structure that we use in evaluating our models. To return back to the RDataframe the old script should work.

In [102]:
d = dict(zip(df.columns, df.values.T))
original = ak.zip(d)
# original = ak.Record(d)
# original = ak.Record(dict(zip(["Electron_pt", "Electron_eta"], df.values.T)))
# original = ak.unflatten(original, n)
original.show(limit_cols=1000)

# original = ak.unflatten(original, n)

for var in original.fields:
    print(original[var])
    original[var] = ak.unflatten(original[var], n, axis=1)

# original.show(limit_cols=1000)

# original.show()
# print(type(original))
# original = ak.unflatten(original, n)
# original.show(limit_cols=1000)
# print(type(original))
# original = original[ak.argsort(original.Electron_pt, axis=1, ascending=False)]
# original.show(limit_cols=1000)

[{Electron_pt: 21.6, Electron_eta: 0.108},
 {Electron_pt: 45, Electron_eta: 0.6},
 {Electron_pt: 56, Electron_eta: 0.6},
 {Electron_pt: 30, Electron_eta: 0.3}]
[21.6, 45, 56, 30]


ValueError: while calling

    ak.unflatten(
        array = <Array [21.6, 45, 56, 30] type='4 * float64'>
        counts = <Array [2, 0, 2] type='3 * int64'>
        axis = 1
        highlevel = True
        behavior = None
    )

Error details: structure imposed by 'counts' does not fit in the array or partition at axis=1

Concatenation:

In [16]:
awk_2 = ak.Array(
    [
        {"Electron_pt": [12, 40], "Electron_eta": [0.4, 0.5], "Muon_pt": []},
        {"Electron_pt": [20.0], "Electron_eta": [0.2], "Muon_pt": []},
        {"Electron_pt": [21.0], "Electron_eta": [0.3], "Muon_pt": []},
    ]
)

ak.concatenate([awk, awk_2], axis=1)

#### Merging into a single RDataFrame


Make a different Array, containing different objetcs

In [17]:
awk_2 = ak.Array([{"Jet_pt": [12, 23]}, {"Jet_pt": [22]}, {"Jet_pt": [25]}])

We merge two Arrays of Records resembling the RDataFrame structure by defining the following dictionary

In [18]:
dict_1 = dict(zip(awk.fields, [awk[field] for field in awk.fields]))
dict_2 = dict(zip(awk_2.fields, [awk_2[field] for field in awk_2.fields]))

total = dict_1 | dict_2
print(total)

{'Electron_pt': <Array [[21.6, 15], [], [56]] type='3 * var * float64'>, 'Electron_eta': <Array [[0.108, 18], [], [0.6]] type='3 * var * float64'>, 'Muon_pt': <Array [[15.8], [16.2], []] type='3 * var * float64'>, 'Jet_pt': <Array [[12, 23], [22], [25]] type='3 * var * int64'>}


and then make an awkward array

In [19]:
merged = ak.zip(total, depth_limit=1)
merged.show()

[{Electron_pt: [21.6, 15], Electron_eta: [0.108, 18], Muon_pt: [15.8], ...},
 {Electron_pt: [], Electron_eta: [], Muon_pt: [16.2], Jet_pt: [22]},
 {Electron_pt: [56], Electron_eta: [0.6], Muon_pt: [], Jet_pt: [25]}]


Since we have to loop over the fields of both old and new arrays, I do not know how much efficient this method is. 
Another way is to append the selected record, looping only on the columns to be appended

In [20]:
awk["Jet_pt"] = awk_2["Jet_pt"]
awk.show(type=True, limit_cols=1000)

type: 3 * {
    Electron_pt: var * float64,
    Electron_eta: var * float64,
    Muon_pt: var * float64,
    Jet_pt: var * int64
}
[{Electron_pt: [21.6, 15], Electron_eta: [0.108, 18], Muon_pt: [15.8], Jet_pt: [12, 23]},
 {Electron_pt: [], Electron_eta: [], Muon_pt: [16.2], Jet_pt: [22]},
 {Electron_pt: [56], Electron_eta: [0.6], Muon_pt: [], Jet_pt: [25]}]
