In [None]:
# Remove input cells at runtime (nbsphinx)
# The public ``IPython.display`` API is used instead of the internal
# ``IPython.core.display`` module.
from IPython.display import display_html

# The injected script toggles the input areas and prompts only when the page
# contains no ``body.notebook_app`` element.
display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)

# Energy reconstruction (MODEL)

**Recommended datasample(s):** model file, train and test data produced with ``protopipe-MODEL``

**Data level(s):** DL1b (telescope-wise image parameters) + DL2 (only shower geometry)

**Description:**

It should be used to test the performance of the trained model **before** using it to estimate the energy of DL2 events.

In fact, what happens in a *protopipe* analysis is that part of the TRAINING sample can be used for *testing* the models to get some preliminary diagnostics (i.e. before launching the much heavier DL2 production).

This notebook shows camera-wise preliminary diagnostics.

Settings and setup of the plots are done using the same configuration file used for training the model.

**Requirements and steps to reproduce:**

- produce the model with ``protopipe-MODEL``

- Execute the notebook with ``protopipe-BENCHMARK``,

``protopipe-BENCHMARK launch --config_file configs/benchmarks.yaml -n TRAINING/benchmarks_MODELS_energy``

To obtain the list of all available parameters add ``--help-notebook``.

**Developers**  

Please, if you have any contribution regarding this part, do it here and not in the relevant sections of the main code, which are now discontinued (they could be migrated back into ``protopipe.mva`` or in another place when more stable).

## Table of contents

* [Feature importance](#Feature-importance)
* [Feature distributions](#Feature-distributions)
* [Migration distribution](#Migration-distribution)
* [Energy resolution and bias](#Energy-resolution-and-bias)

## Imports
[back to top](#Table-of-contents)

In [None]:
import gzip
import glob
from pathlib import Path
import pickle
import joblib
import yaml

import numpy as np
import pandas as pd
import astropy.units as u
from scipy.stats import binned_statistic

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
plt.rcParams.update({'figure.max_open_warning': 0})
from matplotlib.pyplot import rc
import matplotlib.style as style
from cycler import cycler

from protopipe.pipeline.io import load_config, load_obj
from protopipe.benchmarks.utils import string_to_boolean
from protopipe.benchmarks.operations import get_evt_subarray_model_output
from protopipe.benchmarks.plot import plot_hist, plot_distributions, RegressorDiagnostic

## Load models
[back to top](#Table-of-contents)

In [None]:
# Notebook parameters.
# The ``None`` defaults are placeholders: the cells below build paths from
# ``analyses_directory`` and ``analysis_name``, so both must be set before running
# (presumably injected by papermill at launch time -- TODO confirm).
analyses_directory = None # Directory that is joined with analysis_name to locate the analysis
analysis_name = None # Name of the analysis; also embedded in the names of the saved plots
model_configuration_filename = None # Name of the configuration file of the model
output_directory = Path.cwd() # default output directory for plots
use_seaborn=False # When true, the seaborn theme settings are applied to the plots

In [None]:
# Boolean parameters arrive as strings when read by papermill, so the flag is
# passed through string_to_boolean before being used in conditionals.
(use_seaborn,) = string_to_boolean([use_seaborn])

In [None]:
# Check that the model configuration file has been defined
# either from the CLI or from the benchmarks configuration file (default)
if model_configuration_filename is None:
    try:
        model_configuration_filename = model_configuration_filenames["energy"]
    except (KeyError, NameError) as error:
        # KeyError: the mapping has no "energy" entry.
        # NameError: the mapping itself does not exist; no cell of this
        # notebook defines it, so it has to be provided from outside.
        raise ValueError("The name of the configuration file is undefined.") from error

In [None]:
# Both parameters default to None; fail early with a readable message
# instead of the TypeError that Path(None) would raise.
if analyses_directory is None or analysis_name is None:
    raise ValueError("Both 'analyses_directory' and 'analysis_name' must be defined.")

# Root folder of this analysis: configuration files and estimators are read from below it
analysis_directory = Path(analyses_directory) / analysis_name
analysis_configuration_path = analysis_directory / "configs" / "analysis.yaml"
model_configuration_path = analysis_directory / "configs" / model_configuration_filename
input_directory = analysis_directory / "estimators" / "energy_regressor"

In [None]:
# Read the analysis configuration and the model configuration
ana_cfg = load_config(analysis_configuration_path)
cfg = load_config(model_configuration_path)

# Settings extracted from the two configurations
method_settings = cfg["Method"]
model_type = "regressor"
# Only the last component of the dotted method name is kept
method_name = method_settings["name"].split(".")[-1]
is_target_log = method_settings["log_10_target"]
estimation_weight = ana_cfg["EnergyRegressor"]["estimation_weight"]

In [None]:
def _first_match(pattern):
    """Return the first file matching the glob ``pattern``.

    Raises
    ------
    FileNotFoundError
        If nothing matches, so that a missing input is reported by name
        instead of surfacing as an ``IndexError``.
    """
    matches = glob.glob(str(pattern))
    if not matches:
        raise FileNotFoundError(f"No file found matching '{pattern}'")
    return matches[0]


# The camera name is the second "_"-separated token of each model file name
# (the files are looked up below as <model_type>_<camera>_<method_name>.pkl.gz).
# Path(...).name extracts the file name independently of the path separator and
# the list is sorted because glob returns matches in arbitrary order.
cameras = sorted(
    Path(model_file).name.split("_")[1]
    for model_file in glob.glob(f"{input_directory}/{model_type}*.pkl.gz")
)
data = {camera: dict.fromkeys(["model", "data_scikit", "data_train", "data_test"]) for camera in cameras}

# NOTE: the files below are unpickled; only load files coming from a trusted analysis.
for camera in cameras:

    # Common tail of the names of the three data files of this camera
    data_file_suffix = f"{model_type}_{method_name}_{camera}.pkl.gz"

    data[camera]["data_scikit"] = load_obj(
        _first_match(f"{input_directory}/data_scikit_{data_file_suffix}")
    )
    data[camera]["data_train"] = pd.read_pickle(
        _first_match(f"{input_directory}/data_train_{data_file_suffix}")
    )
    data[camera]["data_test"] = pd.read_pickle(
        _first_match(f"{input_directory}/data_test_{data_file_suffix}")
    )

    data[camera]["model"] = joblib.load(
        _first_match(f"{input_directory}/{model_type}_{camera}_{method_name}.pkl.gz")
    )

## Settings and setup
[back to top](#Table-of-contents)

In [None]:
# Energy binning (used for both true and reconstructed energy):
# nbins + 1 logarithmically spaced edges between the configured min and max
energy_settings = cfg["Diagnostic"]["energy"]
nbins = energy_settings["nbins"]
log_energy_min = np.log10(energy_settings["min"])
log_energy_max = np.log10(energy_settings["max"])
energy_edges = np.logspace(log_energy_min, log_energy_max, num=nbins + 1, endpoint=True)

In [None]:
# Merge the basic and the derived feature names into a single sorted list
feature_lists = cfg["FeatureList"]
features_basic = feature_lists["Basic"]
features_derived = feature_lists["Derived"]
features = sorted([*features_basic, *features_derived])

In [None]:
# One RegressorDiagnostic per camera, built from that camera's model and
# from its train and test samples
diagnostic = {
    camera: RegressorDiagnostic(
        model=data[camera]["model"],
        feature_name_list=features,
        target_name="true_energy",
        is_target_log=is_target_log,
        data_train=data[camera]["data_train"],
        data_test=data[camera]["data_test"],
        output_name="reco_energy",
        estimation_weight=estimation_weight,
    )
    for camera in cameras
}

In [None]:
# Make sure that the folder receiving the plots exists
plots_folder = Path(output_directory) / "plots"
plots_folder.mkdir(parents=True, exist_ok=True)

# Plot aesthetics settings
# NOTE: matplotlib_settings and seaborn_settings are not defined by any cell
# of this notebook, so they have to be provided from outside.

style.use(matplotlib_settings["style"])
cmap = matplotlib_settings["cmap"]

# matplotlib >= 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-<name>";
# both spellings get the same custom color cycle.
if matplotlib_settings["style"] in ("seaborn-colorblind", "seaborn-v0_8-colorblind"):

    colors_order = ['#0072B2', '#D55E00', '#F0E442', '#009E73', '#CC79A7', '#56B4E9']
    rc('axes', prop_cycle=cycler(color=colors_order))

if use_seaborn:
    # seaborn is imported only on request, so it stays an optional dependency
    import seaborn as sns

    # Resolve every theme option once, so that all the seaborn calls below
    # share the same fallback values instead of failing on a missing key.
    seaborn_theme = seaborn_settings["theme"]
    seaborn_context = seaborn_theme.get("context", "talk")
    seaborn_style = seaborn_theme.get("style", "whitegrid")
    seaborn_font_scale = seaborn_theme.get("font_scale", 1.0)

    sns.set_theme(context=seaborn_context,
                  style=seaborn_style,
                  palette=seaborn_theme.get("palette", None),
                  font=seaborn_theme.get("font", "Fira Sans"),
                  font_scale=seaborn_font_scale,
                  color_codes=seaborn_theme.get("color_codes", True)
                  )

    sns.set_style(seaborn_style, rc=seaborn_settings.get("rc_style", None))
    sns.set_context(seaborn_context, font_scale=seaborn_font_scale)

## Feature importance
[back to top](#Table-of-contents)

In [None]:
# One feature-importance figure per camera, saved in the plots folder
for camera in cameras:
    fig, ax = plt.subplots(figsize=(6, 5))
    ax = diagnostic[camera].plot_feature_importance(
        ax,
        alpha=0.7,
        edgecolor="black",
        linewidth=2,
        color="darkgreen",
    )
    ax.set_ylabel("Feature importance")
    ax.grid()
    plt.title(camera)
    plt.tight_layout()
    output_file = plots_folder / f"energy_model_feature_importance_{camera}_protopipe_{analysis_name}.png"
    plt.savefig(output_file)

## Feature distributions
[back to top](#Table-of-contents)

In [None]:
# Compare, for each camera, the feature distributions of the train and test samples
for camera in cameras:

    samples = [data[camera]["data_train"], data[camera]["data_test"]]

    # Training sample: filled, semi-transparent histogram with a solid edge
    train_hist_style = dict(
        edgecolor="blue",
        color="blue",
        label="Gamma training",
        alpha=0.2,
        fill=True,
        ls="-",
        lw=2,
    )
    # Test sample: unfilled histogram with a dashed edge
    test_hist_style = dict(
        edgecolor="blue",
        color="blue",
        label="Gamma test",
        alpha=1,
        fill=False,
        ls="--",
        lw=2,
    )
    # Same error-bar style for both samples (one fresh dict per sample)
    error_bar_styles = [
        dict(ecolor="blue", lw=2, capsize=2, capthick=2, alpha=0.2)
        for _sample in samples
    ]

    fig, axes = diagnostic[camera].plot_features(
        suptitle=camera,
        data_list=samples,
        nbin=30,
        hist_kwargs_list=[train_hist_style, test_hist_style],
        error_kw_list=error_bar_styles,
        ncols=5,
    )
    output_file = plots_folder / f"energy_model_feature_distributions_{camera}_protopipe_{analysis_name}.png"
    plt.savefig(output_file)

## Migration distribution
[back to top](#Table-of-contents)

**WARNING:** here we naively apply a Gaussian fit to each slice in true energy, but this is not the best approach, especially for the lowest and highest bins (because of intrinsic tails in the distributions and lower statistics, respectively).

In [None]:
# Grid layout: one panel per true-energy bin, at most max_ncols panels per row
max_ncols = 5
n_energy_bins = len(energy_edges) - 1

for camera in cameras:

    # ceil(n_energy_bins / ncols) rows are needed so that every bin gets a panel
    ncols = max(1, min(max_ncols, n_energy_bins))
    nrows = max(1, int(np.ceil(n_energy_bins / ncols)))
    # squeeze=False always returns a 2D array of axes, even for a 1x1 grid,
    # so it can be flattened without special cases
    fig, axes = plt.subplots(
        nrows=nrows, ncols=ncols, figsize=(5 * 5, 10), squeeze=False
    )
    axes = axes.flatten()
    plt.suptitle(camera)

    # Panels beyond the last energy bin would stay empty: hide them
    for unused_ax in axes[n_energy_bins:]:
        unused_ax.set_visible(False)

    # Fitted mean and width of each bin, with the corresponding bin centre
    bias = []
    resolution = []
    energy_centres = []

    test_events = diagnostic[camera].data_test

    for ibin in range(n_energy_bins):
        ax = axes[ibin]
        low_edge = energy_edges[ibin]
        high_edge = energy_edges[ibin + 1]

        # Test events with true energy in [low_edge, high_edge)
        in_bin = (test_events["true_energy"] >= low_edge) & (
            test_events["true_energy"] < high_edge
        )
        test_data = test_events[in_bin]

        er = test_data["reco_energy_tel"]
        emc = test_data["true_energy"]

        opt_hist = {
            "edgecolor": "black",
            "color": "darkgreen",
            "label": "data",
            "alpha": 0.7,
            "fill": True,
        }
        opt_fit = {"c": "red", "lw": 2, "label": "Best fit"}
        ax, fit_param, cov = diagnostic[camera].plot_resolution_distribution(
            ax=ax,
            y_true=emc,
            y_reco=er,
            nbin=50,
            fit_range=[-2, 2],
            hist_kwargs=opt_hist,
            fit_kwargs=opt_fit,
        )
        # Negative values are allowed for the fit: keep the magnitude of the width
        fit_param[2] = abs(fit_param[2])

        label = "[{:.2f},{:.2f}] TeV\n#Evts={}\nmean={:.2f}\nstd={:.2f}".format(
            low_edge,
            high_edge,
            len(er),
            fit_param[1],
            fit_param[2],
        )

        ax.set_ylabel("# Events")
        ax.set_xlabel("(E_reco - E_true) / E_true")
        ax.set_xlim([-2, 2])
        ax.grid()

        # Legend handles use the same colors as the histogram and the fit curve
        evt_patch = mpatches.Patch(color="white", label=label)
        data_patch = mpatches.Patch(color=opt_hist["color"], label="data")
        fit_patch = mpatches.Patch(color=opt_fit["c"], label="best fit")
        ax.legend(loc="best", handles=[evt_patch, data_patch, fit_patch])

        bias.append(fit_param[1])
        resolution.append(fit_param[2])
        energy_centres.append((low_edge + high_edge) / 2.0)

    # Adjust the layout once, when all the panels of the figure are drawn
    plt.tight_layout()
    plt.savefig(plots_folder / f"energy_model_migration_distributions_{camera}_protopipe_{analysis_name}.png")

## Energy resolution and bias
[back to top](#Table-of-contents)

In [None]:
# Bin edges expressed in log10 of the true energy.
# They are kept as plain floats: the logarithm of an energy is not an energy,
# so no TeV unit is attached to them.
true_energy_bins_edges = np.linspace(
        np.log10(cfg["Diagnostic"]["energy"]["min"]),
        np.log10(cfg["Diagnostic"]["energy"]["max"]),
        nbins + 1,
    )

true_energy_bins_centers = 0.5 * (true_energy_bins_edges[1:]+true_energy_bins_edges[:-1])


def _binned_deviation(log_true_energy, relative_deviation, statistic):
    """Apply ``statistic`` to ``relative_deviation`` in each bin of ``log_true_energy``.

    The bins are ``true_energy_bins_edges``; the return value is the result
    of ``scipy.stats.binned_statistic``, whose first item holds the per-bin values.
    """
    return binned_statistic(log_true_energy,
                            relative_deviation,
                            statistic=statistic,
                            bins=true_energy_bins_edges)


for camera in cameras:

    plt.figure(figsize=(9,7))

    reco_energy = diagnostic[camera].data_test[diagnostic[camera].output_name_img]
    true_energy = diagnostic[camera].data_test[diagnostic[camera].target_estimation_name]

    # Quantities shared by all the statistics below, computed only once
    log_true_energy = np.log10(true_energy)
    relative_deviation = reco_energy/true_energy - 1

    # Resolution: 68th percentile of the absolute relative deviation, taken
    # as is or after subtracting the mean / median deviation of the bin
    resolution = _binned_deviation(
        log_true_energy, relative_deviation,
        lambda x: np.percentile(np.abs(x), 68))

    corr_resolution_mean = _binned_deviation(
        log_true_energy, relative_deviation,
        lambda x: np.percentile(np.abs(x-np.mean(x)), 68))

    corr_resolution_median = _binned_deviation(
        log_true_energy, relative_deviation,
        lambda x: np.percentile(np.abs(x-np.median(x)), 68))

    # Bias: mean / median of the relative deviation in each bin
    bias_mean = _binned_deviation(log_true_energy, relative_deviation, "mean")

    bias_median = _binned_deviation(log_true_energy, relative_deviation, "median")

    plt.plot(true_energy_bins_centers, resolution[0], label="resolution (bias included)")
    plt.plot(true_energy_bins_centers, corr_resolution_mean[0], label="resolution (bias mean corrected)")
    plt.plot(true_energy_bins_centers, corr_resolution_median[0], label="resolution (bias median corrected)")
    plt.plot(true_energy_bins_centers, bias_mean[0], label="bias (mean)")
    plt.plot(true_energy_bins_centers, bias_median[0], label="bias (median)")
    plt.title(camera)
    plt.legend()
    plt.grid()
    plt.ylim(-0.2, 1.2)
    plt.xlim(true_energy_bins_edges[0], true_energy_bins_edges[-1])
    plt.xlabel('log10(true energy) [TeV]')
    plt.ylabel('Energy resolution and bias')

    plt.savefig(plots_folder / f"energy_model_resolution_bias_{camera}_protopipe_{analysis_name}.png")