In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import glob
import astropy.units as u
from astropy.coordinates.erfa_astrom import ErfaAstromInterpolator, erfa_astrom
from lstchain.reco.utils import get_effective_time, extract_source_position, compute_theta2
from ctapipe.containers import EventType
import gc

%matplotlib inline

## DL2 exploration notebook

This notebook opens a set of LST-1 DL2 files, divides the data into two subsets according to reconstructed energy, and computes (for certain gammaness cuts) the theta2 plots with respect to a direction specified by the user (the candidate source).

The cuts (gammaness & theta2) used for computing significances are "reasonable" for a first attempt at source detection. The ones for the high-E subset are about optimal for sensitivity (at low zeniths, say below 40 deg, good observation conditions and a Crab-like spectrum). 
For low energies it is hard to say what "optimal cuts" would be - that is quite dependent on the source energy spectrum, and also more sensitive to zenith angle (via the energy threshold). Do **not** play with the cuts on a yet-undetected source! If custom optimization of cuts is necessary, that can be done on simulations (for an assumed spectrum) or on a confirmed and bright source.

NOTE: the notebook is quite slow (due mainly to data loading and coordinate transformations) and memory-hungry, so you *may* have trouble with datasets longer than several tens of hours. For very long datasets, if you have problems with this notebook, you can do the theta2 plots from DL3 FITS files (using Gammapy).

## USER INPUT: dataset and source name

In [None]:
# Glob pattern selecting the DL2 files to be analysed.
dl2_file_pattern = "/fefs/aswg/workspace/abelardo.moralejo/Crab_test_school_2024/DL2/dl2*.h5"
dataset = glob.glob(dl2_file_pattern)

# Candidate source: the theta2 plots are calculated w.r.t. its position.
# The name must be known to astropy, and the source must be in the FoV
# for the selected dataset.
source_name = "Crab"

# Loose pre-selection applied while loading the files, just to save memory:
# only events with gammaness above this value are kept.
lowest_gammaness = 0.3

In [None]:
# Put the file list in lexicographic order (in place), so that files are
# always processed in the same sequence.
dataset[:] = sorted(dataset)

In [None]:
# Load the DL2 files one by one, accumulate the observation times and build
# a single table with the (loosely pre-selected) events of the whole dataset.

# Key of the DL2 parameters table inside each HDF5 file.
tablename = "/dl2/event/telescope/parameters/LST_LSTCam"

# In order to save memory we keep only the necessary columns from the DL2 table.
# If you need to access other parameters, just add their names below.
needed_columns = ['dragon_time', 'alt_tel', 'az_tel',
                  'reco_src_x', 'reco_src_y', 'gammaness',
                  'intensity', 'reco_energy', 'event_type']

# Fail early, with an explicit message, if the glob pattern matched no file
# (pd.concat below would otherwise raise a much less informative ValueError).
if not dataset:
    raise ValueError("No DL2 files found: check the path / pattern used to define 'dataset'")

dummy = []  # per-file selected events, concatenated after the loop

# Running sums of the two times returned by get_effective_time for each file.
t_eff = 0
t_elapsed = 0

for dl2_file in dataset:
    print(dl2_file)
    tb = pd.read_hdf(dl2_file, tablename)

    # The times are computed from the full table, before any event selection.
    lt, et = get_effective_time(tb)
    t_eff += lt
    t_elapsed += et

    # Drop the columns we do not need before doing anything else, so that the
    # full table can be released as early as possible.
    tb = tb[needed_columns]

    # Reduce precision to save memory: float64 columns are stored as float32,
    # except 'dragon_time', for which float64 is needed.
    float32_columns = {colname: 'float32' for colname in needed_columns
                       if colname != 'dragon_time' and tb[colname].dtype == 'float64'}
    tb = tb.astype(float32_columns)

    # Keep only events above the loose gammaness pre-selection (the comparison
    # is done after the precision reduction).
    dummy.append(tb[tb['gammaness'] > lowest_gammaness])
    tb = None
    gc.collect()  # free memory

table = pd.concat(dummy)

In [None]:
# The events are now held by `table` (built with pd.concat), so the reference
# to the list of per-file tables can be dropped.
dummy = None
gc.collect() # free memory (in case of long table)

In [None]:
# Number of events that survived the loose gammaness pre-selection.
n_events = len(table)
print('Number of events:', n_events)

In [None]:
# Memory footprint of the merged table, obtained through the public pandas API
# (sum of the per-column usage, index included) instead of calling the
# __sizeof__ dunder method directly.
table_size_bytes = table.memory_usage(deep=True).sum()
print(f'Size of table (KB): {table_size_bytes/1024:.0f}')

In [None]:
# Show the columns kept in the merged table (the ones listed in needed_columns).
print(table.columns)

In [None]:
# Accumulated effective and elapsed times of the dataset, converted to hours.
t_eff_hours = t_eff.to(u.h)
t_elapsed_hours = t_elapsed.to(u.h)
print(f'Effective time: {t_eff_hours:.3f};  Elapsed time: {t_elapsed_hours:.3f}')

In [None]:
# Event selection cuts are set here.
# Two subsets are defined: Ereco < 0.2 TeV (index 0) and Ereco > 0.2 TeV (index 1).

# Note the table already has a prior cut gness>lowest_gammaness! (defined above)
min_gammaness_cut = [0.5, 0.95]
min_intensity_cut = [50, 50]  # p.e.
min_energy_cut = [0., 0.2]  # TeV
max_energy_cut = [0.2, 1e6]  # TeV

# deg2 - this cut is not part of the masks built below: it is applied later,
# by adding contents of theta2 histograms
theta2_cut = [0.04, 0.02]

# SUBARRAY is the event type for "cosmics" (i.e. "physics trigger", showers)
is_cosmic = table.event_type == EventType.SUBARRAY.value

# One boolean mask per subset: index 0 holds the low-E cuts, index 1 the high-E cuts
event_selection = [
    (table.gammaness > gammaness_min) &
    (table.intensity > intensity_min) &
    (table.reco_energy > energy_min) &
    (table.reco_energy < energy_max) &
    is_cosmic
    for gammaness_min, intensity_min, energy_min, energy_max
    in zip(min_gammaness_cut, min_intensity_cut, min_energy_cut, max_energy_cut)
]

In [None]:
# Apply each selection mask to the table: index 0 is the low-E subset,
# index 1 the high-E subset.
gamma_candidates = [table[selection] for selection in event_selection]

In [None]:
# EFFECTIVE focal length (i.e. accounts for coma aberration)
focal = 29.30565 * u.m

# Time step passed to the astropy interpolator used for the coordinate transformations
interpolation_step = 5 * u.min

# Source position for each subset of gamma candidates (index 0: low E, index 1: high E).
# Beware: this can be quite slow for long datasets (and the more so the softer
# the event selection cuts!)
source_position = []
for candidates in gamma_candidates:
    with erfa_astrom.set(ErfaAstromInterpolator(interpolation_step)):
        position = extract_source_position(candidates,
                                           source_name,
                                           equivalent_focal_length=focal)
    source_position.append(position)

In [None]:
# Theta2 plots, excess and Li & Ma significance for the two energy subsets.
#
# For each subset: theta2 is computed w.r.t. the source position (ON) and w.r.t.
# one or three OFF positions; the ON and (alpha-scaled) OFF theta2 histograms and
# their difference are plotted; finally the counts in the first histogram bins
# (up to the theta2 cut) are used to print excess, rates and significance.

# Imported once here, instead of on every iteration of the loop below.
from pyirf.statistics import li_ma_significance

nbins = [50, 100] # number of bins of theta2 plot for low E and high E
number_of_offs = [1, 3] # number of off regions (just one at low E, because of worse angular resolution)

theta_range = (0, 0.5)  # range of the theta2 histograms

for k in range(2): # index 0 is low E (defined above);  index 1 is high E
    print("\n\n\n")

    # Only the 1-off and 3-off configurations are implemented below; any other
    # value would give a background normalization (alpha) inconsistent with the
    # OFF counts actually summed.
    if number_of_offs[k] not in (1, 3):
        raise ValueError(f'number_of_offs must be 1 or 3, got {number_of_offs[k]}')

    src_x = source_position[k][0]
    src_y = source_position[k][1]

    # Off positions: the one opposite to the source w.r.t. the camera center, plus two more
    # at the same distance from the center "at right angles" w.r.t. the line source - camera center:
    off_180 = [-src_x, -src_y]
    off_90 = [-src_y, src_x]
    off_270 = [src_y, -src_x]

    theta2_on = np.array(compute_theta2(gamma_candidates[k], source_position[k]))
    theta2_off_180 = np.array(compute_theta2(gamma_candidates[k], off_180))

    # The theta2 cut is applied by summing histogram bins, so the effective cut
    # is theta2_cut[k] rounded to a whole number of bins.
    bin_width = (theta_range[1] - theta_range[0]) / nbins[k]
    nbinscut = int(np.round(theta2_cut[k] / bin_width))

    counts_on, bins = np.histogram(theta2_on, bins=nbins[k], range=theta_range)
    counts_off, _ = np.histogram(theta2_off_180, bins=bins)

    if number_of_offs[k] == 3:
        # The two additional off regions are only evaluated when they are used.
        theta2_off_90 = np.array(compute_theta2(gamma_candidates[k], off_90))
        theta2_off_270 = np.array(compute_theta2(gamma_candidates[k], off_270))
        counts_off_90, _ = np.histogram(theta2_off_90, bins=bins)
        counts_off_270, _ = np.histogram(theta2_off_270, bins=bins)
        counts_off += counts_off_90 + counts_off_270

    # Background normalization: counts_off is the sum over all off regions.
    alpha = 1/number_of_offs[k]

    bin_centers = 0.5*(bins[1:]+bins[:-1])

    fig = plt.figure(figsize=(16,6))

    # Left panel: ON and alpha-scaled OFF theta2 distributions
    ax_counts = fig.add_subplot(1, 2, 1)
    ax_counts.errorbar(bin_centers, counts_on, yerr=counts_on**0.5,
                       fmt='o', ms=3, label='ON-source')
    ax_counts.errorbar(bin_centers, alpha*counts_off, yerr=alpha*(counts_off**0.5),
                       fmt='o', ms=3, label='OFF-source')
    ax_counts.plot([theta2_cut[k], theta2_cut[k]], [0, counts_on.max()], linestyle='dashed',
                   color='tab:green', label='$\\theta^2$ cut')
    ax_counts.set_xlabel('$\\theta^2 (deg^2)$', fontsize=14)
    ax_counts.set_ylabel('Events', fontsize=14)
    ax_counts.tick_params(labelsize=14)
    ax_counts.set_ylim(0, counts_on.max()*1.15)
    ax_counts.legend(fontsize=14)
    ax_counts.grid(True)

    # Per-bin excess and its uncertainty
    excess = counts_on - alpha*counts_off
    err = (counts_on + alpha*alpha*counts_off)**0.5

    # Right panel: excess vs theta2
    ax_excess = fig.add_subplot(1, 2, 2)
    ax_excess.errorbar(bin_centers, excess, yerr=err, fmt='o', ms=3)
    ax_excess.plot([theta2_cut[k], theta2_cut[k]], [0, excess.max()], linestyle='dashed',
                   color='tab:green', label='$\\theta^2$ cut')
    ax_excess.set_xlabel('$\\theta^2 (deg^2)$', fontsize=14)
    ax_excess.set_ylabel('Excess', fontsize=14)
    ax_excess.tick_params(labelsize=14)
    ax_excess.legend(fontsize=14)  # the cut line has a label: show it
    ax_excess.grid(True)

    plt.show()

    # Counts within the theta2 cut. noff is the total over all off regions
    # (i.e. not yet scaled by alpha).
    non = counts_on[:nbinscut].sum()
    noff = counts_off[:nbinscut].sum()

    print(f'Energy range: {min_energy_cut[k]:.1f} - {max_energy_cut[k]:.1f} TeV')
    print(f'Excess: {non-alpha*noff:.3f}; Off: {noff}')
    print(f'Gamma rate: {(non-alpha*noff)/t_eff.to_value(u.min):.3f} events / minute')
    print(f'Off rate: {noff/t_eff.to_value(u.min):.3f} events / minute')
    print(f'alpha (backg normalization): {alpha:.3f}')

    print(f'Li & Ma Significance: {li_ma_significance(non, noff, alpha):.2f} standard deviations')

    # If source is Crab, check what significance one would get in 50 h for a weak source:

    if "Crab" in source_name:
        # Scale the observed counts to a 50 h observation
        scale_to_50h = 50. / t_eff.to_value(u.h)
        non_50h = scale_to_50h * non
        noff_50h = scale_to_50h * noff
        fraction = 0.01  # fraction of the observed excess assumed for the weak source
        print()
        print(f'Li & Ma Significance for {int(fraction*100)}% of excess in 50 h (useful if source is Crab): '
              f'{li_ma_significance((non_50h-alpha*noff_50h)*fraction+alpha*noff_50h, noff_50h, alpha):.2f} '
              'standard deviations')


## For good-quality, low-zenith Crab runs the high-E significance for 1% of Crab in 50 h should be around 5 sigma 