In [None]:
# Remove input cells at runtime (nbsphinx)
from IPython.core.display import display_html

# The script toggles the input areas and prompts, but only when the page
# has no "body.notebook_app" element.
hide_inputs_script = '<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>'
display_html(hide_inputs_script, raw=True)

# Image intensity resolution (TRAINING)

**Recommended datasample(s):** ``gamma-1`` (used to build the energy model)

**Data level(s):** DL1b (telescope-wise image parameters) + DL2a (only shower geometry)

**Description:**

This benchmark checks how well the intensity parameter (total reconstructed charge in pixels surviving cleaning, i.e. the parameter traditionally known as image "Size") correlates with the true number of photoelectrons from Cherenkov photons in the whole camera (that is, in all pixels, regardless of whether or not they survive cleaning).

This correlation is relevant, above all, for the energy reconstruction.

We do this check for the images which survive the following cuts:
- intensity > 50 phe,
- image c.o.g. within 80% of the camera radius,
- maximum impact parameter of 200 m.

**Data level:** DL1a + DL1b (telescope-wise true/reconstructed images and image parameters)

**Notes:**

Any bias present at calibration level (due for example to CALIB_SCALE and/or charge integration correction) as well as the fact that the charge in pixels rejected by the cleaning is not counted in the calculation of intensity will cause the intensity parameter to be systematically smaller than the actual number of photoelectrons.

The impact parameter cut takes into account charges which arrive after the end of the readout window due to large impact parameters.

The stronger bias seen for intensity values of ~2e4 is due to saturation at the pixel level.

**Requirements and steps to reproduce**

This notebook can be used with any file produced by ``protopipe-TRAINING`` (or its equivalent from the DIRAC Grid interface) with recorded images.

To get a filled notebook and reproduce these results,

- get the necessary input files by running ``protopipe-TRAINING`` on the ``gamma-1`` sample and saving the images (see documentation),
- execute the notebook with ``protopipe-BENCHMARK``:

``protopipe-BENCHMARK launch --config_file benchmarks.yaml -n TRAINING/benchmarks_DL1_image_intensity_resolution``

To obtain the list of all available parameters add ``--help-notebook``.

**Comparison against CTAMARS:**

- use the whole ``gamma-1`` sample
- reference simtel-files, plots, values and settings can be found [here (please, always refer to the latest version)](https://forge.in2p3.fr/projects/benchmarks-reference-analysis/wiki/Comparisons_between_pipelines).

**Development and testing:**  

As with any other part of _protopipe_ and being part of the official repository, this notebook can be further developed by any interested contributor.   
The execution of this notebook is not currently automatic: it must be done locally by the user _before_ pushing a pull-request.  
Please, strip the output before pushing.

## Table of contents

- [Distributions](#Distributions)
- [Mean and RMS as error bar of intensity profile](#Mean-and-RMS-as-error-bar-of-intensity-profile)
- [RMS/Mean](#RMS/Mean)

## Imports

In [None]:
from pathlib import Path

import tables
import numpy as np
from scipy.stats import binned_statistic
from astropy.table import join
import astropy.units as u
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from matplotlib.pyplot import rc
import matplotlib.style as style
from cycler import cycler

from ctapipe.io import read_table

from protopipe.pipeline.utils import CTAMARS_radii, load_config
from protopipe.benchmarks.operations import sum_of_squares, OnlineBinnedStats

## Input data
[back to top](#Table-of-contents)

In [None]:
# Notebook parameters (the boolean ones may arrive as strings, see the next cell).
analyses_directory = ""  # path to the 'analyses' folder
output_directory = Path.cwd() # default output directory; plots are saved in its 'plots' sub-folder
analysis_name = "" # Name of the analysis stored in 'analyses_directory'
input_filename = "TRAINING_energy_tail_gamma_merged.h5"  # Name of the file produced with protopipe
nth_chunk = None # int; if set, reading of each camera stops once the chunk index exceeds n_chunks/nth_chunk, with chunks of 10k images (default: None, full dataset)
load_CTAMARS = True # Enable to compare with the CTAN analysis done with CTAMARS (Release 2019)
use_seaborn = True # If True, the seaborn theme defined in 'seaborn_settings' is applied

In [None]:
# Boolean parameters may reach the notebook as strings (papermill): convert them back.
# NOTE(review): string_to_boolean is not among the imports visible in this notebook —
# confirm where it is defined/imported.
boolean_flags = [load_CTAMARS, use_seaborn]
load_CTAMARS, use_seaborn = string_to_boolean(boolean_flags)

In [None]:
# Resolve the input file name: when none is given explicitly, fall back to
# the "TRAINING_energy_gamma" entry of `input_filenames` (if that mapping exists).
if not input_filename:
    try:
        input_filename = input_filenames["TRAINING_energy_gamma"]
    except (NameError, KeyError):
        raise ValueError("ERROR: input_filename undefined.")

# An empty path resolves to the current working directory, for which is_dir()
# is always True: the emptiness must be checked explicitly, otherwise an
# undefined analyses_directory would pass silently.
if not analyses_directory or not Path(analyses_directory).is_dir():
    raise ValueError("ERROR: analyses_directory undefined or not existent.")

# <analyses_directory>/<analysis_name>/data/TRAINING/for_energy_estimation/<input_filename>
input_directory = Path(analyses_directory) / analysis_name / Path("data/TRAINING/for_energy_estimation")
input_file = input_directory / input_filename

In [None]:
# Load the CTAMARS reference objects used for the comparison.
if load_CTAMARS:

    import uproot

    # Both parts of the CTAMARS input path must be defined
    if (input_directory_CTAMARS["parent_directory"] is None) or (input_directory_CTAMARS["TRAINING/DL1"] is None):
        raise ValueError("ERROR: CTAMARS data undefined. Please, check the documentation of protopipe-BENCHMARKS.")

    mars_dl1b_fileName = "check_dl1b.root"
    path_mars_dl1b = Path(input_directory_CTAMARS["parent_directory"]) / input_directory_CTAMARS["TRAINING/DL1"] / mars_dl1b_fileName

    # Store every object of the ROOT file in a dictionary keyed by its name.
    # The file is opened only once, inside a context manager, so that no
    # handle is left open; key names are requested without the ";<cycle>"
    # suffix instead of slicing it off by hand.
    CTAMARS_data = {}
    with uproot.open(path_mars_dl1b) as file:

        for key in file.keys(cycle=False):
            CTAMARS_data[key] = file[key]

In [None]:
# get camera names: one entry per node found directly under the root of the input file
with tables.open_file(input_file, mode='r') as h5file:
    cameras = []
    for node in h5file.root:
        cameras.append(node.name)

In [None]:
# setup the initial data
#
# For each camera this prepares
# - H[camera]: an empty 2D counts histogram (x: log10(intensity), y: tot_true_phe / intensity),
# - stats[camera]: an OnlineBinnedStats accumulator for the profiled histogram.
# NOTE: the bin edges variables are overwritten at each iteration, so the
# values left after the loop (used by the following cells) are those of the last camera.

H = dict.fromkeys(cameras)
stats = dict.fromkeys(cameras)

for i, camera in enumerate(cameras):

    # Counts histogram

    if load_CTAMARS:

        # to_numpy() is indexed as (counts, x edges, y edges)
        mars_counts_hist = CTAMARS_data[f"PheOverSize_vs_Size_type{i}"].to_numpy()
        x_bin_edges_counts = mars_counts_hist[1]
        y_bin_edges_counts = mars_counts_hist[2]

    else:

        x_bin_edges_counts = np.linspace(1.298, 5.298, 200)
        y_bin_edges_counts = np.linspace(0., 4., 200)

    # Allocate the empty histogram directly: histogramming dummy (0, 0) points
    # would leave spurious counts whenever both bin ranges contain 0.
    H[camera] = np.zeros((len(x_bin_edges_counts) - 1, len(y_bin_edges_counts) - 1))

    # Profiled histogram

    if load_CTAMARS:
        x_bin_edges_profile = CTAMARS_data[f"relative_rms_{i}"].to_numpy()[1]
    else:
        x_bin_edges_profile = np.arange(1.298, 5.498, 0.2)

    stats[camera] = OnlineBinnedStats(x_bin_edges_profile)

In [None]:
# Read the input file chunk by chunk and, for each camera, accumulate
# - the 2D counts histogram of tot_true_phe / intensity vs log10(intensity),
# - the binned statistics used for the profiled histogram.
chunksize = 10000  # number of rows read at each step

with tables.open_file(input_file, 'r') as h5file:

    n_rows = {}
    for cam in cameras:
        n_rows[cam] = len(h5file.root[cam])
    print(f"Total number of images at input per camera = {n_rows}")

    for camera in cameras:

        n_chunks = int(np.ceil(n_rows[camera] / chunksize))

        for chunk in range(n_chunks):

            first_row = chunk * chunksize
            images = read_table(h5file,
                                f'/{camera}',
                                start=first_row,
                                stop=first_row + chunksize)

            # The selection is applied only for the comparison with CTAMARS.
            # NOTE(review): without load_CTAMARS no cut is applied at all — confirm this is intended.
            if load_CTAMARS:
                passes_cuts = ((images["hillas_intensity"] > 50.) &
                               (images["hillas_r"] < 0.8 * CTAMARS_radii(camera)) &
                               (images["impact_dist"] < 200.) &
                               (images["image_extraction"] == 1))
                images = images[passes_cuts]

            hillas_intensity = images["hillas_intensity"]
            # total true charge of each image (sum over the second axis of "true_image")
            tot_true_phe = np.sum(images["true_image"], axis=1)

            log_intensity = np.log10(hillas_intensity)
            phe_over_intensity = tot_true_phe / hillas_intensity

            ###### 2D histogram

            chunk_counts = np.histogram2d(x=log_intensity,
                                          y=phe_over_intensity,
                                          bins=(x_bin_edges_counts, y_bin_edges_counts))[0]
            H[camera] += chunk_counts

            ###### Profiled histogram

            stats[camera].update(log_intensity, phe_over_intensity)

            # Optional early stop, evaluated after the current chunk has been processed
            if nth_chunk and (chunk > int(n_chunks / nth_chunk)):
                break

In [None]:
# Plots are saved in a "plots" folder inside the output directory:
# it is created (together with any missing parent) only if not already there.
plots_folder = Path(output_directory).joinpath("plots")
plots_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# Plot aesthetics settings

style.use(matplotlib_settings["style"])
cmap = matplotlib_settings["cmap"]

if matplotlib_settings["style"] == "seaborn-colorblind":

    # custom color cycle used together with the "seaborn-colorblind" style
    colors_order = ['#0072B2', '#D55E00', '#F0E442', '#009E73', '#CC79A7', '#56B4E9']
    rc('axes', prop_cycle=cycler(color=colors_order))

if use_seaborn:
    import seaborn as sns

    # Theme options are resolved once, each with its fallback value, and then
    # reused by all the seaborn calls below: indexing the settings directly
    # would raise a KeyError for the very keys that have a default.
    theme = seaborn_settings["theme"]
    theme_context = theme.get("context", "talk")
    theme_style = theme.get("style", "whitegrid")
    theme_font_scale = theme.get("font_scale", 1.0)

    sns.set_theme(context=theme_context,
                  style=theme_style,
                  palette=theme.get("palette", None),
                  font=theme.get("font", "Fira Sans"),
                  font_scale=theme_font_scale,
                  color_codes=theme.get("color_codes", True)
                  )

    sns.set_style(theme_style, rc=seaborn_settings.get("rc_style", None))
    sns.set_context(theme_context, font_scale=theme_font_scale)

## Distributions
[back to top](#Table-of-contents)

In [None]:
# Upper limit of the logarithmic color scale, one value per camera:
# the maximum of the CTAMARS histogram when comparing, a fixed value otherwise.
# (The if/else must live inside the loop: an `else` attached to the `for`
# runs once after the loop and only touches the last camera.)
max_counts = {}
for i, camera in enumerate(cameras):
    if load_CTAMARS:
        max_counts[camera] = CTAMARS_data[f"PheOverSize_vs_Size_type{i}"].to_numpy()[0].max()
    else:
        max_counts[camera] = 700

# One figure per camera: the protopipe 2D histogram and, when comparing,
# the CTAMARS one next to it with the same color scale limit.
for i, camera in enumerate(cameras):

    if load_CTAMARS:
        # indexed as (counts, x edges, y edges)
        mars_hist = CTAMARS_data[f"PheOverSize_vs_Size_type{i}"].to_numpy()

    print(f"Total number of SELECTED {camera} images:")
    print(f"protopipe = {np.sum(H[camera])}")
    if load_CTAMARS:
        print(f"CTAMARS = {np.sum(mars_hist[0])}")

    if not load_CTAMARS:
        plt.figure(figsize=(10,6))
        plt.title(camera)
    else:
        plt.figure(figsize=(10,4))
        plt.subplots_adjust(hspace=0.4)
        plt.subplot(1, 2, 1)
        plt.title(f"protopipe - {camera}")

    plt.pcolormesh(x_bin_edges_counts,
                   y_bin_edges_counts,
                   H[camera].T,
                   norm=LogNorm(vmax=max_counts[camera]),
                   cmap="viridis"
                   )
    cbar = plt.colorbar()
    cbar.set_label("# images")

    plt.xlabel("log10(intensity) [phe]")
    plt.ylabel("tot_true_phe / intensity")

    plt.grid(which="both", axis="both")

    if load_CTAMARS:
        plt.subplot(1, 2, 2)
        plt.title(f"CTAMARS - {camera}")

        # the reference histogram is drawn on its own bin edges
        plt.pcolormesh(mars_hist[1],
                       mars_hist[2],
                       mars_hist[0].T,
                       norm=LogNorm(vmax=max_counts[camera]),
                       cmap="viridis"
                       )

        cbar = plt.colorbar()
        cbar.set_label("# images")

        plt.xlabel("log10(intensity) [phe]")
        plt.ylabel("tot_true_phe / intensity")

        plt.grid(which="both", axis="both")

    plt.savefig(plots_folder / f"correlation_intensity_trueCharge_distribution_{camera}_protopipe_{analysis_name}.png")

    plt.show()

## Mean and RMS as error bar of intensity profile
[back to top](#Table-of-contents)

In [None]:
# One figure per camera: mean of tot_true_phe / intensity in each
# log10(intensity) bin, with the standard deviation as vertical error bar
# and half the bin width as horizontal one.
x_range = (np.min(x_bin_edges_counts), np.max(x_bin_edges_counts))
y_range = (np.min(y_bin_edges_counts), np.max(y_bin_edges_counts))

for camera in cameras:

    camera_stats = stats[camera]
    output_plot = plots_folder / f"correlation_intensity_trueCharge_mean+RMSerrorbar_{camera}_protopipe_{analysis_name}.png"

    plt.figure()

    plt.errorbar(x=camera_stats.bin_centers,
                 y=camera_stats.mean,
                 yerr=camera_stats.std,
                 xerr=camera_stats.bin_width / 2,
                 fmt='go',
                 ecolor="g",
                 markersize=5)

    plt.title(camera)
    # same axes ranges as the 2D counts histogram
    plt.xlim(x_range[0], x_range[1])
    plt.ylim(y_range[0], y_range[1])
    plt.xlabel("log10(intensity) [phe]")
    plt.ylabel("tot_true_phe / intensity")

    plt.grid(which="both", axis="both")

    plt.savefig(output_plot)

    plt.show()

## RMS/Mean
[back to top](#Table-of-contents)

In [None]:
# One figure per camera: relative RMS (std / mean) of tot_true_phe / intensity
# in each log10(intensity) bin. When the CTAMARS reference is loaded, it is
# overlaid and a second panel shows the protopipe/CTAMARS ratio; otherwise
# only the protopipe curve is drawn (the reference data does not exist then).
for i, camera in enumerate(cameras):

    bin_centers = stats[camera].bin_centers
    relative_rms = stats[camera].std/stats[camera].mean

    plt.figure(figsize=(12,5))
    plt.subplots_adjust(hspace=0.3)
    plt.suptitle(camera)

    if load_CTAMARS:
        relative_rms_CTAMARS = CTAMARS_data[f"relative_rms_{i}"].to_numpy()[0]
        plt.subplot(1,2,1)

    plt.plot(bin_centers, relative_rms, 'o', label="protopipe")

    if load_CTAMARS:
        plt.plot(bin_centers, relative_rms_CTAMARS, "o", label="CTAMARS")

    plt.xlabel("log10(intensity) [phe]")
    plt.ylabel("Relative RMS of (tot_true_phe / intensity)")
    plt.ylim(0,0.35)

    plt.grid(which="both", axis="both")

    plt.legend()

    if load_CTAMARS:

        plt.subplot(1,2,2)

        ratio = relative_rms / relative_rms_CTAMARS
        plt.plot(bin_centers, ratio)
        plt.xlabel("log10(intensity) [phe]")
        plt.ylabel("Ratio protopipe/CTAMARS")
        plt.ylim(0,2)
        plt.xlim(np.min(x_bin_edges_counts),np.max(x_bin_edges_counts))
        ax = plt.gca()
        xlims = ax.get_xlim()
        # reference line for a perfect agreement
        plt.hlines(1.0, xmin=xlims[0], xmax=xlims[1], ls="--", lw=2, color="green")

    plt.savefig(plots_folder / f"correlation_intensity_trueCharge_meanRMSratio_{camera}_protopipe_{analysis_name}.png")

    plt.show()