In [1]:
import json
import math
import os
import time
import pickle as pkl

import numpy as np
import torch

import sklearn
import sklearn.metrics

import matplotlib
import matplotlib.pyplot as plt
import mplhep as hep

# use the CMS plotting style shipped with mplhep, then set the font size to 20
plt.style.use(hep.style.CMS)
plt.rcParams.update({"font.size": 20})

class print_color:
    """ANSI escape sequences used to colour or emphasise text printed to the terminal.

    Wrap a string as ``print_color.BOLD + text + print_color.END``; ``END`` resets
    the formatting back to the default.
    """

    # foreground colours
    PURPLE = "\033[95m"
    CYAN = "\033[96m"
    DARKCYAN = "\033[36m"
    BLUE = "\033[94m"
    GREEN = "\033[92m"
    YELLOW = "\033[93m"
    RED = "\033[91m"

    # text attributes
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"

    # reset all formatting
    END = "\033[0m"

In [2]:
# enable IPython's autoreload extension so that edits to imported modules are picked up live
%load_ext autoreload
%autoreload 2

In [3]:
# import relevant functions from mlpf.pyg
# NOTE(review): the path appended below is an absolute, environment-specific location;
# adjust it to wherever the particleflow repository is checked out.
import sys
sys.path.append("/home/jovyan/particleflow/mlpf/pyg/")

from PFDataset import PFDataset, DataLoader, Collater, InterleavedIterator
from utils import unpack_predictions, unpack_target, X_FEATURES, Y_FEATURES

In [4]:
# define the global base device
world_size = 1
# `device` is always a torch.device (never a bare string), so later code can rely
# on attributes such as `device.type` whichever branch was taken
if torch.cuda.device_count():
    device = torch.device("cuda:0")
    print(f"Will use {torch.cuda.get_device_name(device)}")
else:
    device = torch.device("cpu")
    print("Will use cpu")

Will use NVIDIA RTX A4000


# CLIC dataset

In [5]:
! ls /pfvol/tensorflow_datasets/clic*

/pfvol/tensorflow_datasets/clic_edm_qq_pf:
1.5.0

/pfvol/tensorflow_datasets/clic_edm_ttbar_pf:
1.5.0

/pfvol/tensorflow_datasets/clic_edm_ttbar_pu10_pf:
1.5.0

/pfvol/tensorflow_datasets/clic_edm_ww_fullhad_pf:
1.5.0

/pfvol/tensorflow_datasets/clic_edm_zh_tautau_pf:
1.5.0


In [6]:
# root directory of the datasets listed above; get_dataloaders reads this as a global
data_dir = "/pfvol/tensorflow_datasets/"

In [7]:
def _make_sample_loader(physics_sample, split, start, stop, batch_size, num_workers, prefetch_factor):
    """
    Build a randomly-sampled DataLoader over events [start, stop) of one physics sample.

    Reads the dataset from the global `data_dir` (version 1.5.0) and prints the
    sample name together with the number of selected events.
    """
    dataset = PFDataset(
        data_dir,
        f"{physics_sample}:1.5.0",
        split,
        ["X", "ygen"],
    )
    ds = torch.utils.data.Subset(dataset.ds, range(start, stop))
    print(f"\t\t{physics_sample}: {len(ds)}")

    return DataLoader(
        ds,
        batch_size=batch_size,
        collate_fn=Collater(["X", "ygen"]),
        sampler=torch.utils.data.RandomSampler(ds),
        num_workers=num_workers,
        prefetch_factor=prefetch_factor,
    )


def get_dataloaders(dataconfig, batch_size=10, num_workers=0, prefetch_factor=None):
    """
    Takes a dataconfig [dict] and returns the respective dataloaders
    for both VICReg (train/test) and downstream (train/test).

    For every physics sample and split, VICReg uses the first N events and
    downstream uses the events that follow them, so the two selections never
    overlap (N is the VICReg count for that sample, or 0 if the sample is not
    used for VICReg).

    Args
        dataconfig [dict]: {"VICReg": {split: {physics_sample: num_events}},
                            "downstream": {split: {physics_sample: num_events}}}
                           with split in ("train", "test").
        batch_size [int]: batch size passed to every DataLoader.
        num_workers [int]: number of worker processes passed to every DataLoader.
        prefetch_factor [int or None]: passed to every DataLoader; defaults to 2
                                       when num_workers > 0 and no value is given.

    Returns
        loader [dict]: {
                        "VICReg": {"train": InterleavedIterator, "test": InterleavedIterator},
                        "downstream": {"train": InterleavedIterator, "test": InterleavedIterator},
                        }
        where each InterleavedIterator wraps one DataLoader per physics sample.
    """

    if (num_workers > 0) and (prefetch_factor is None):
        prefetch_factor = 2  # default prefetch_factor when num_workers>0

    loader = {"VICReg": {}, "downstream": {}}

    print(print_color.BOLD + "VICReg dataset:" + print_color.END)
    for split in ["train", "test"]:
        print(f"\t{split} dataset")
        loaders = [
            _make_sample_loader(
                physics_sample, split, 0, num_events, batch_size, num_workers, prefetch_factor
            )
            for physics_sample, num_events in dataconfig["VICReg"][split].items()
        ]
        loader["VICReg"][split] = InterleavedIterator(loaders)

    # for downstream
    print(print_color.BOLD + "Downstream dataset:" + print_color.END)
    for split in ["train", "test"]:
        print(f"\t{split} dataset")
        loaders = []
        # iterate over the downstream samples themselves (not the VICReg ones), so a
        # sample configured for only one of the two tasks is handled correctly
        for physics_sample, num_events in dataconfig["downstream"][split].items():
            # skip the events already reserved for VICReg (none if the sample is unused there)
            start = dataconfig["VICReg"][split].get(physics_sample, 0)
            loaders.append(
                _make_sample_loader(
                    physics_sample,
                    split,
                    start,
                    start + num_events,
                    batch_size,
                    num_workers,
                    prefetch_factor,
                )
            )
        loader["downstream"][split] = InterleavedIterator(loaders)

    return loader

In [8]:
### define the physics samples, batch_size and number of samples you want to use below
# Each entry maps a physics sample name to a number of events. For a given sample and
# split, get_dataloaders gives the first N events to VICReg and the events right after
# them to downstream, so the two selections do not overlap.

dataconfig = {
    ### for VICReg
    "VICReg": {
        "train": {
            "clic_edm_qq_pf": 1_000,
            "clic_edm_ttbar_pf": 1_000,
        },
        "test": {
            "clic_edm_qq_pf": 1_000,
            "clic_edm_ttbar_pf": 1_000,
        },    
    },
    ### for downstream
    "downstream": {
        "train": {
            "clic_edm_qq_pf": 1_000,
            "clic_edm_ttbar_pf": 1_000,
        },
        "test": {
            "clic_edm_qq_pf": 1_000,
            "clic_edm_ttbar_pf": 1_000,
        },   
    },    
}

loader = get_dataloaders(dataconfig, batch_size=10)

[1mVICReg dataset:[0m
	train dataset
		clic_edm_qq_pf: 1000
		clic_edm_ttbar_pf: 1000
	test dataset
		clic_edm_qq_pf: 1000
		clic_edm_ttbar_pf: 1000
[1mDownstream dataset:[0m
	train dataset
		clic_edm_qq_pf: 1000
		clic_edm_ttbar_pf: 1000
	test dataset
		clic_edm_qq_pf: 1000
		clic_edm_ttbar_pf: 1000


In [9]:
loader   # notice that it has the keys VICReg and downstream, each of which holds the keys train and test

{'VICReg': {'train': <PFDataset.InterleavedIterator at 0x7f4a3d8ade70>,
  'test': <PFDataset.InterleavedIterator at 0x7f4a3d8aefe0>},
 'downstream': {'train': <PFDataset.InterleavedIterator at 0x7f4a3d92e8f0>,
  'test': <PFDataset.InterleavedIterator at 0x7f4a3d9480d0>}}

In [10]:
# peek at one batch from the VICReg training loader, then stop iterating
vicreg_train_loader = loader["VICReg"]["train"]
for batch in vicreg_train_loader:
    print(f"A single batch: {batch}")
    break

A single batch: DataBatch(X=[598, 17], ygen=[598, 8], batch=[598], ptr=[11])


In [11]:
# we can see the 17 input features per pf element here (recall type is 1 for tracks and 2 for clusters)
X_FEATURES["clic"]

['type',
 'pt | et',
 'eta',
 'sin_phi',
 'cos_phi',
 'p | energy',
 'chi2 | position.x',
 'ndf | position.y',
 'dEdx | position.z',
 'dEdxError | iTheta',
 'radiusOfInnermostHit | energy_ecal',
 'tanLambda | energy_hcal',
 'D0 | energy_other',
 'omega | num_hits',
 'Z0 | sigma_x',
 'time | sigma_y',
 'Null | sigma_z']

In [12]:
# we can see the 8 gen features per pf element here (notice jet_idx, which may be useful)
Y_FEATURES

['cls_id', 'charge', 'pt', 'eta', 'sin_phi', 'cos_phi', 'energy', 'jet_idx']

# Augmentations
Define several augmentations (start with tracks vs clusters)

# VICReg training
Show loss plots (invariance, variance, covariance)

# Downstream training
Train an MLPF on top of (1) a trained VICReg, and (2) a randomly initialized VICReg

# Results
Comparing the downstream results of randomly initialized VICReg against a trained VICReg