In [1]:
import json
import math
import os
import time
import pickle as pkl

import numpy as np
import torch

import sklearn
import sklearn.metrics

import matplotlib
import matplotlib.pyplot as plt
import mplhep as hep

# Apply the CMS plotting style shipped with mplhep to every figure in this notebook,
# then override the default font size on top of that style.
plt.style.use(hep.style.CMS)
plt.rcParams["font.size"] = 20

In [2]:
# import relevant functions from mlpf.pyg
# The mlpf/pyg directory is appended to sys.path so that its `utils` module can be
# imported by bare name. The path is hard-coded for this environment; adjust it if
# the particleflow checkout lives somewhere else.
import sys
sys.path.append("/home/jovyan/particleflow/mlpf/pyg/")

from utils import PFDataset, InterleavedIterator
from utils import unpack_predictions, unpack_target, X_FEATURES, Y_FEATURES

In [3]:
# define the global base device
# world_size is handed to the data loaders built later in this notebook.
world_size = 1

# Always store a torch.device (never a bare string) so that `device` has the same
# type on both branches and attributes such as `device.type` work everywhere.
if torch.cuda.device_count() > 0:
    device = torch.device("cuda:0")
    print(f"Will use {torch.cuda.get_device_name(device)}")
else:
    device = torch.device("cpu")
    print("Will use cpu")

Will use NVIDIA GeForce GTX 1080 Ti


# CLIC dataset

In [4]:
! ls /pfvol/tensorflow_datasets/clic*

/pfvol/tensorflow_datasets/clic_edm_qq_pf:
1.5.0

/pfvol/tensorflow_datasets/clic_edm_ttbar_pf:
1.5.0

/pfvol/tensorflow_datasets/clic_edm_ttbar_pu10_pf:
1.5.0

/pfvol/tensorflow_datasets/clic_edm_ww_fullhad_pf:
1.5.0

/pfvol/tensorflow_datasets/clic_edm_zh_tautau_pf:
1.5.0


In [5]:
data_dir = "/pfvol/tensorflow_datasets/"

# Physics samples to load. For each sample choose the dataset version, the batch
# size of its loader, and the number of samples you want to use.
config = {
    "clic_edm_qq_pf": {"version": "1.5.0", "batch_size": 2, "num_samples": 10},
    "clic_edm_ttbar_pf": {"version": "1.5.0", "batch_size": 2, "num_samples": 10},
}

# Build one training loader per physics sample and accumulate the total dataset size.
train_loaders = []
dataset_size = 0
for physics_sample, sample_cfg in config.items():
    version = sample_cfg["version"]
    batch_size = sample_cfg["batch_size"]
    num_samples = sample_cfg["num_samples"]

    ds = PFDataset(data_dir, f"{physics_sample}:{version}", "train", ["X", "ygen"], num_samples)
    train_loaders.append(ds.get_loader(batch_size, world_size))
    dataset_size += len(ds)

# Merge the per-sample loaders (potentially of different batch sizes) into a single iterator.
train_loader = InterleavedIterator(train_loaders)

In [6]:
# dataset_size is the sum of len(ds) over every physics sample configured above
print(f"num of clic events {dataset_size}")

num of clic events 20


In [7]:
# Inspect only the first batch yielded by the interleaved loader, then stop iterating.
for batch in train_loader:
    print(f"A single batch: {batch}")
    break

A single batch: DataBatch(X=[185, 17], ygen=[185, 8], batch=[185], ptr=[3])


In [8]:
# we can see the 17 input features here (recall type is 1 for tracks and 2 for clusters)
# NOTE(review): names written as "a | b" presumably give the track | cluster meaning of that column - confirm
X_FEATURES["clic"]

['type',
 'pt | et',
 'eta',
 'sin_phi',
 'cos_phi',
 'p | energy',
 'chi2 | position.x',
 'ndf | position.y',
 'dEdx | position.z',
 'dEdxError | iTheta',
 'radiusOfInnermostHit | energy_ecal',
 'tanLambda | energy_hcal',
 'D0 | energy_other',
 'omega | num_hits',
 'Z0 | sigma_x',
 'time | sigma_y',
 'Null | sigma_z']

In [9]:
# we can see the 8 gen features per pf element here (notice jet_idx, which may be useful)
Y_FEATURES

['cls_id', 'charge', 'pt', 'eta', 'sin_phi', 'cos_phi', 'energy', 'jet_idx']

# Augmentations
Define several augmentations (start with tracks vs clusters)

# VICReg training
Show loss plots (invariance, variance, covariance)

# Downstream training
Train an MLPF on top of (1) a trained VICReg, and (2) a randomly initialized VICReg

# Results
Comparing the downstream results of randomly initialized VICReg against a trained VICReg