# Generating Files for Joins

Use ServiceX (SX) to generate parquet files for testing the join infrastructure. This notebook also uses the new ServiceX frontend.

Constants:

In [1]:
# Z -> ee Monte Carlo sample (Release 21), given by its full dataset name.
ds_name = (
    "mc16_13TeV:mc16_13TeV.361106.PowhegPythia8EvtGen_AZNLOCTEQ6L1_Zee.deriv.DAOD_PHYS.e3601_e5984_s3126_r10201_r10210_p5313"
)

What we'll need for imports

In [2]:
from func_adl_servicex_xaodr22 import calib_tools
from func_adl_servicex_xaodr22.event_collection import Event
from rich.console import Console
from rich.table import Table
from servicex import DatasetGroup, ResultFormat, RucioDatasetIdentifier, ServiceXClient

# import logging
# logging.basicConfig(level=logging.DEBUG)

The base dataset creation:

In [3]:
# Client for the ServiceX backend named "test4", and the Rucio dataset to read
# (num_files=10 asks for only 10 files of the dataset).
sx = ServiceXClient(backend="test4")
did = RucioDatasetIdentifier(ds_name, num_files=10)

# Raw func_adl query over the dataset; results are requested in parquet format.
ds_raw = sx.func_adl_dataset(
    did,
    codegen="atlasr21",
    title="Zee",
    result_format=ResultFormat.parquet,
    item_type=Event,
)

# Attach a calibration configuration to the raw query.
# Desired API, once available: ds = calib_tools.apply_calibration(ds_raw, "PHYS")
# NOTE(review): the sample is DAOD_PHYS but the "PHYSLITE" default config is used
# here - confirm this is the intended workaround.
ds = calib_tools.query_update(ds_raw, calib_tools.default_config("PHYSLITE"))

# Central Electron $p_T$

We'll select all central electrons ($|\eta| < 2.5$) with $p_T$ greater than 25 GeV.

In [4]:
# Per event: the run and event numbers, plus the electrons from the "Electrons"
# collection that pass pT > 25 GeV and |eta| < 2.5.
# The nested lambda takes `ele` (not `e`) so it does not shadow the event parameter.
good_ele = ds.Select(
    lambda e: {
        "run": e.EventInfo("EventInfo").runNumber(),
        "event": e.EventInfo("EventInfo").eventNumber(),
        "good_ele": e.Electrons("Electrons").Where(
            lambda ele: (ele.pt() / 1000 > 25.0) and (abs(ele.eta()) < 2.5)
        ),
    }
)

First just the $p_T$, and then the $\eta$ and $\phi$:

In [5]:
# Run/event numbers plus the pT (divided by 1000) of every selected electron.
electron_pt = good_ele.Select(
    lambda e: {
        "run": e.run,
        "event": e.event,
        "pt": e.good_ele.Select(lambda ele: ele.pt() / 1000.0),
    }
)

In [6]:
# Run the pT query and ask for signed URLs; the returned TransformedResults object
# is displayed as this cell's output.
electron_pt.as_signed_urls()

Output()

TransformedResults(hash='bbdc4655382fa28e16eb92f903db51def76f6e9c2bc9139730ee912c25cc9b0a', title='Zee', codegen='atlasr21', request_id='59aadc72-7348-4412-bfc5-0ab851009120', submit_time=datetime.datetime(2023, 8, 14, 20, 32, 54, 892279, tzinfo=datetime.timezone.utc), data_dir='C:/Users/gordo/AppData/Local/Temp/59aadc72-7348-4412-bfc5-0ab851009120', file_list=['C:/Users/gordo/AppData/Local/Temp/59aadc72-7348-4412-bfc5-0ab851009120/root___xcache.af.uchicago.edu_1094__root___atlasxrootd-kit.gridka.de_1094__pnfs_gridka.de_atlas_disk-only_atlasdatadisk_rucio_mc16_13TeV_02_48_DAOD_PHYS.30899209._000003.pool.root.parquet', 'C:/Users/gordo/AppData/Local/Temp/59aadc72-7348-4412-bfc5-0ab851009120/root___xcache.af.uchicago.edu_1094__root___atlasxrootd-kit.gridka.de_1094__pnfs_gridka.de_atlas_disk-only_atlasdatadisk_rucio_mc16_13TeV_03_e6_DAOD_PHYS.30899209._000243.pool.root.parquet', 'C:/Users/gordo/AppData/Local/Temp/59aadc72-7348-4412-bfc5-0ab851009120/root___xcache.af.uchicago.edu_1094__root

In [7]:
# Same pT query, this time fetched with as_files(); the returned TransformedResults
# object is displayed as this cell's output.
electron_pt.as_files()

Output()

TransformedResults(hash='bbdc4655382fa28e16eb92f903db51def76f6e9c2bc9139730ee912c25cc9b0a', title='Zee', codegen='atlasr21', request_id='59aadc72-7348-4412-bfc5-0ab851009120', submit_time=datetime.datetime(2023, 8, 14, 20, 32, 54, 892279, tzinfo=datetime.timezone.utc), data_dir='C:/Users/gordo/AppData/Local/Temp/59aadc72-7348-4412-bfc5-0ab851009120', file_list=['C:/Users/gordo/AppData/Local/Temp/59aadc72-7348-4412-bfc5-0ab851009120/root___xcache.af.uchicago.edu_1094__root___atlasxrootd-kit.gridka.de_1094__pnfs_gridka.de_atlas_disk-only_atlasdatadisk_rucio_mc16_13TeV_02_48_DAOD_PHYS.30899209._000003.pool.root.parquet', 'C:/Users/gordo/AppData/Local/Temp/59aadc72-7348-4412-bfc5-0ab851009120/root___xcache.af.uchicago.edu_1094__root___atlasxrootd-kit.gridka.de_1094__pnfs_gridka.de_atlas_disk-only_atlasdatadisk_rucio_mc16_13TeV_03_e6_DAOD_PHYS.30899209._000243.pool.root.parquet', 'C:/Users/gordo/AppData/Local/Temp/59aadc72-7348-4412-bfc5-0ab851009120/root___xcache.af.uchicago.edu_1094__root

In [8]:
# Run/event numbers plus eta and phi of every selected electron.
electron_etaphi = good_ele.Select(
    lambda e: {
        "run": e.run,
        "event": e.event,
        "eta": e.good_ele.Select(lambda ele: ele.eta()),
        "phi": e.good_ele.Select(lambda ele: ele.phi()),
    }
)

And the number of calorimeter clusters, along with $p_T$ for matching

In [9]:
# Run/event numbers plus, for every selected electron, its pT (divided by 1000)
# and its number of calorimeter clusters.
electron_calo = good_ele.Select(
    lambda e: {
        "run": e.run,
        "event": e.event,
        "pt": e.good_ele.Select(lambda ele: ele.pt() / 1000.0),
        "n_calo": e.good_ele.Select(lambda ele: ele.nCaloClusters()),
    }
)

## Another slice of electrons

Grab $p_T$, $\eta$, and $\phi$ for central electrons ($|\eta| < 2.5$) with $p_T$ between 20 and 25 GeV.

In [10]:
# Per event: the run and event numbers, plus the electrons from the "Electrons"
# collection with 20 GeV < pT <= 25 GeV and |eta| < 2.5.
# The nested lambda takes `ele` (not `e`) so it does not shadow the event parameter.
# The dictionary key stays "good_ele" because the follow-up query reads `e.good_ele`.
medium_ele = ds.Select(
    lambda e: {
        "run": e.EventInfo("EventInfo").runNumber(),
        "event": e.EventInfo("EventInfo").eventNumber(),
        "good_ele": e.Electrons("Electrons").Where(
            lambda ele: (ele.pt() / 1000 > 20.0) and (ele.pt() / 1000 <= 25.0) and (abs(ele.eta()) < 2.5)
        ),
    }
)

In [11]:
# Run/event numbers plus pT (divided by 1000), eta and phi of every electron
# in the 20-25 GeV slice.
medium_ele_ptetaphi = medium_ele.Select(
    lambda e: {
        "run": e.run,
        "event": e.event,
        "pt": e.good_ele.Select(lambda ele: ele.pt() / 1000.0),
        "eta": e.good_ele.Select(lambda ele: ele.eta()),
        "phi": e.good_ele.Select(lambda ele: ele.phi()),
    }
)

## The Missing $E_T$

This is an event level variable, so there is only one of these.

There is currently a bug, so this isn't working yet!

In [12]:
# TODO: re-enable this query once the missing ET bug mentioned above is fixed.
# It would produce one missing ET value (divided by 1000) per event.
# missing_et = ds.Select(lambda e: {
#     "run": e.EventInfo("EventInfo").runNumber(),
#     "event": e.EventInfo("EventInfo").eventNumber(),
#     "missing_et": e.MissingET().met()/1000.0,
# })

## Number of interactions per crossing

Like missing $E_T$, this is a quantity with a single value per event, so it can stand in for missing $E_T$ until that is fixed.

In [13]:
# One row per event: run/event numbers and the actual interactions per crossing.
crossings = ds.Select(
    lambda e: {
        "run": e.EventInfo("EventInfo").runNumber(),
        "event": e.EventInfo("EventInfo").eventNumber(),
        "interactions": e.EventInfo("EventInfo").actualInteractionsPerCrossing(),
    }
)

## Fetch it all

Now that we have built all the queries, let's fetch them!

In [25]:
# Registry of the queries to fetch, keyed by output name. These keys are the names
# passed to doit() and shown in the "Name" column of the summary table.
everything = {
    "electron_pt": electron_pt,
    "electron_etaphi": electron_etaphi,
    "medium_ele_ptetaphi": medium_ele_ptetaphi,
    # Disabled until the missing ET query works:
    # "missing_et": missing_et,
    "interactions": crossings,
    "n_calo": electron_calo,
}

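The plan is to use a `DatasetGroup` to fetch them all together, and then rebuild the results into a dictionary so we can easily print out the information we need. (That code is currently commented out below; the queries are instead fetched one at a time.)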

In [15]:
# Disabled: DatasetGroup-based fetch of all queries at once, kept for reference.
# list_of_everything = [(k,v) for k,v in everything.items()]
# results = DatasetGroup([v[1].as_signed_urls() for v in list_of_everything]).gather_results()


# r_everything = {k:v for k,v in zip([v[0] for v in list_of_everything], results)}

In [16]:
# Show which query names are registered.
# NOTE(review): the saved output lists 'crossings' while the dict is defined with the
# key "interactions" - the output looks stale; re-run this cell after a kernel restart.
everything.keys()
# Disabled DatasetGroup-based fetch (same lines as in the previous cell):
# list_of_everything = [(k,v) for k,v in everything.items()]
# results = DatasetGroup([v[1].as_signed_urls() for v in list_of_everything]).gather_results()
# r_everything = {k:v for k,v in zip([v[0] for v in list_of_everything], results)}

dict_keys(['electron_pt', 'electron_etaphi', 'medium_ele_ptetaphi', 'crossings', 'n_calo'])

In [20]:
# Results of each fetched query, keyed by the same names used in `everything`.
r_everything = {}


async def doit(name):
    """Fetch the signed URLs for one named query and record the result.

    Looks up ``everything[name]``, awaits its ``as_signed_urls_async()`` call and
    stores the returned value in ``r_everything[name]``. The async variant is awaited
    (instead of calling the blocking ``as_signed_urls()``) so this coroutine actually
    yields to the event loop while waiting.

    Raises:
        KeyError: if ``name`` is not a key of ``everything``.
    """
    r_everything[name] = await everything[name].as_signed_urls_async()

In [21]:
# Fetch the 'electron_pt' query; doit() stores the result in r_everything.
await doit('electron_pt')

Output()

In [22]:
# Fetch the 'electron_etaphi' query; doit() stores the result in r_everything.
await doit('electron_etaphi')

Output()

In [23]:
# Fetch the 'medium_ele_ptetaphi' query; doit() stores the result in r_everything.
await doit('medium_ele_ptetaphi')

Output()

In [26]:
# Fetch the 'interactions' query; doit() stores the result in r_everything.
await doit('interactions')

Output()

In [27]:
# Fetch the 'n_calo' query; doit() stores the result in r_everything.
await doit('n_calo')

Output()

In [28]:
# Summarise every fetched query in a rich table: its name, its request id, and the
# number of signed URLs that came back.
t = Table(title="Zee")
for heading in ("Name", "Request ID", "# Files"):
    t.add_column(heading)

for name, result in r_everything.items():
    # add_row is given strings only, so the file count is converted explicitly.
    t.add_row(name, result.request_id, str(len(result.signed_url_list)))

console = Console()
console.print(t)