In [None]:
from pathlib import Path
import joblib
import pandas as pd
import numpy as np
from copy import deepcopy
import matplotlib.pyplot as plt

from lstchain.visualization import plot_dl2
from lstchain.io.config import read_configuration_file

In [None]:
# plt.style.use('seaborn-paper')
# Select the 'paper' style shipped with ctaplot instead of a matplotlib built-in style.
import ctaplot
ctaplot.set_style('paper')

# 1. Produce data table

This section is runnable only by LST members on the cluster at La Palma.    
To reproduce the feature importance plots, skip to section 2.

## Source-independent data

In [None]:
# Directory holding the trained source-independent models and their JSON config.
rf_models = Path('/fefs/aswg/data/models/AllSky/20221027_v0.9.9_crab_tuned/dec_2276/')
# Display the directory content.
[entry for entry in rf_models.iterdir()]

### Re-implement some functions to ease loading

In [None]:
# Config key under which the feature list of each model is stored.
# The insertion order matters: it defines the order of `models`.
features_config_keys = {
    'energy': 'energy_regression_features',
    'disp_norm': 'disp_regression_features',
    'disp_sign': 'disp_classification_features',
    'class': 'particle_classification_features',
}

# One (initially empty) container per model.
models = {model_name: {} for model_name in features_config_keys}

# The first JSON file found next to the models is used as configuration.
json_files = list(rf_models.glob('*.json'))
config_file = json_files[0]
config = read_configuration_file(config_file)

for model_name, config_key in features_config_keys.items():
    models[model_name]['features'] = config[config_key]

In [None]:
def features_names(models):
    """
    Collect the names of the features used by the four models.

    Parameters
    ----------
    models: dict
        must contain the keys 'energy', 'disp_norm', 'disp_sign' and 'class',
        each mapping to a dict with a 'features' entry (sequence of names)

    Returns
    -------
    set: union of the four feature lists
    """
    model_keys = ('energy', 'disp_norm', 'disp_sign', 'class')
    return {feature for key in model_keys for feature in models[key]['features']}

In [None]:
def load_models(indir):
    """
    Load the four trained models stored in a directory.

    Parameters
    ----------
    indir: str or pathlib.Path
        directory containing `reg_energy.sav`, `reg_disp_norm.sav`,
        `cls_disp_sign.sav` and `cls_gh.sav`

    Returns
    -------
    tuple: (energy, disp_norm, disp_sign, clf), the objects returned by `joblib.load`
    """
    # NOTE(review): joblib.load unpickles the files; only use it on trusted model files.
    model_dir = Path(indir)
    disp_norm_file = model_dir / 'reg_disp_norm.sav'

    print(model_dir)
    print(f"disp: {disp_norm_file}")
    disp_norm = joblib.load(disp_norm_file)
    disp_sign = joblib.load(model_dir / "cls_disp_sign.sav")

    print("class")
    clf = joblib.load(model_dir / "cls_gh.sav")

    print("energy")
    energy = joblib.load(model_dir / "reg_energy.sav")

    return energy, disp_norm, disp_sign, clf
    

In [None]:
# Load the four source-independent models from the `rf_models` directory.
energy, disp_norm, disp_sign, clf = load_models(rf_models)

In [None]:
# Store each loaded model next to its feature list.
models['energy'].update(model=energy)
models['disp_norm'].update(model=disp_norm)
models['disp_sign'].update(model=disp_sign)
models['class'].update(model=clf)


In [None]:
# Per-tree feature importances of the energy model: one row per tree, one column per feature.
est = np.array([tree.feature_importances_ for tree in models['energy']['model'].estimators_])

# Long-format table with one row per (feature, tree) pair, grouped by feature.
# The number of trees is read from the forest itself (it was hard-coded to 150 before),
# and the importances stay numeric instead of going through a string array.
data = pd.DataFrame({
    'feature': np.repeat(models['energy']['features'], len(est)),
    'est': est.T.ravel().astype(float),
})

In [None]:
def extract_importances(model, feature_names):
    """
    Build a table of feature importances for an ensemble model.

    Parameters
    ----------
    model: object
        must expose `feature_importances_` and `estimators_`, each estimator
        exposing its own `feature_importances_`
    feature_names: sequence of str
        names of the features, in the same order as `model.feature_importances_`

    Returns
    -------
    pandas.DataFrame with columns:
        - 'feature': feature name
        - 'importance': `model.feature_importances_` (float)
        - 'xerr': standard deviation of the importance across the estimators (float)
    """
    per_tree = [tree.feature_importances_ for tree in model.estimators_]
    # Build the frame from typed columns: stacking names and numbers in a single
    # array would turn every value into a string.
    return pd.DataFrame({
        'feature': list(feature_names),
        'importance': np.asarray(model.feature_importances_, dtype=float),
        'xerr': np.asarray(np.std(per_tree, axis=0), dtype=float),
    })

In [None]:
# Compute the importance table of every model, with model-specific column names
# so that the tables can later be merged on 'feature'.
model_names = list(models)
for name in model_names:
    print(name)
    d = models[name]
    renamed_columns = {'importance': f'importance_{name}', 'xerr': f'xerr_{name}'}
    d['importance'] = extract_importances(d['model'], d['features']).rename(columns=renamed_columns)

In [None]:
# Inspect the importance table of the energy model.
models['energy']['importance']

In [None]:
# Outer-join the per-model tables on the feature name: a feature that a model
# does not use gets NaN in that model's columns.
importances_df = deepcopy(models[model_names[0]]['importance'])
for name in model_names[1:]:
    importances_df = importances_df.merge(models[name]['importance'], on='feature', how='outer')

In [None]:
sorted_imp = importances_df.sort_values(by='importance_class')

n_rows = len(importances_df)
for name in model_names:
    print(name)
    importance_col = f'importance_{name}'
    xerr_col = f'xerr_{name}'
    # fraction of the rows for which this model has a finite importance
    scale = np.isfinite(sorted_imp[importance_col]).sum() / n_rows
    sorted_imp[importance_col] = sorted_imp[importance_col] * scale
    sorted_imp[xerr_col] = sorted_imp[xerr_col] * scale

    print(sorted_imp[importance_col].sum())
    
    

In [None]:
# Inspect the scaled, sorted table.
sorted_imp

In [None]:
# Save the table in the current working directory; it is read back in the plotting section.
sorted_imp.to_csv('source_indep_models_features_importances.csv')

## Source-dep

In [None]:
# indir_src_dep=Path('/fefs/aswg/workspace/seiya.nozaki/Crab_performance_paper/20221027_v0.9.9_crab_tuned/std/RF/trained_models/')

# Directory of the trained source-dependent models (path on the cluster).
indir_src_dep=Path('/fefs/aswg/workspace/seiya.nozaki/Crab_performance_paper/20221027_v0.9.9_crab_tuned/combined_off_axis_1deg/RF/trained_models/')
# Display the directory content.
list(indir_src_dep.iterdir())

In [None]:
# Config key under which the feature list of each model is stored.
srcdep_features_config_keys = {
    'energy': 'energy_regression_features',
    'disp_norm': 'disp_regression_features',
    'disp_sign': 'disp_classification_features',
    'class': 'particle_classification_features',
}

# One (initially empty) container per source-dependent model.
srcdep = {model_name: {} for model_name in srcdep_features_config_keys}

# config_file = '/fefs/aswg/workspace/seiya.nozaki/Crab_performance_paper/20220518_allsky_dec2276_tuned/zd_all_with_pointing_info/RF/lstchain_src_dep_config.json'
# The config sits one level above the trained models directory.
config_file = indir_src_dep.joinpath('../lstchain_src_dep_config.json')
config = read_configuration_file(config_file)

for model_name, config_key in srcdep_features_config_keys.items():
    srcdep[model_name]['features'] = config[config_key]

In [None]:
# Features used by the source-dependent energy and class models.
# Kept under its own name: binding this set to `features_names` would overwrite
# the `features_names` function defined earlier in the notebook.
srcdep_features_names = set(srcdep['energy']['features'] + srcdep['class']['features'])
srcdep_features_names

In [None]:
# Load the four source-dependent models from the `indir_src_dep` directory.
energy_srcdep, disp_norm_srcdep, disp_sign_srcdep, class_srcdep = load_models(indir_src_dep)

In [None]:
# Store each loaded source-dependent model next to its feature list.
srcdep['energy'].update(model=energy_srcdep)
srcdep['disp_norm'].update(model=disp_norm_srcdep)
srcdep['disp_sign'].update(model=disp_sign_srcdep)
srcdep['class'].update(model=class_srcdep)


In [None]:
# Only the energy and class models are considered for the source-dependent analysis.
model_names = ['energy', 'class']
for name in model_names:
    print(name)
    d = srcdep[name]
    renamed_columns = {'importance': f'importance_{name}', 'xerr': f'xerr_{name}'}
    d['importance'] = extract_importances(d['model'], d['features']).rename(columns=renamed_columns)

In [None]:
# Outer-join the per-model tables on the feature name: a feature that a model
# does not use gets NaN in that model's columns.
importances_srcdep_df = deepcopy(srcdep[model_names[0]]['importance'])
for name in model_names[1:]:
    importances_srcdep_df = importances_srcdep_df.merge(srcdep[name]['importance'], on='feature', how='outer')

In [None]:
sorted_srcdep_imp = importances_srcdep_df.sort_values(by='importance_class')

n_rows = len(importances_srcdep_df)
for name in model_names:
    print(name)
    # fraction of the rows for which this model has a finite importance
    scale = np.isfinite(sorted_srcdep_imp[f'importance_{name}']).sum() / n_rows
    sorted_srcdep_imp[f'importance_{name}'] *= scale
    # The errors are scaled with the same factor as the importances (as done for the
    # source-independent table); otherwise the plotted error bars do not match the bars.
    sorted_srcdep_imp[f'xerr_{name}'] *= scale
    print(sorted_srcdep_imp[f'importance_{name}'].sum())

In [None]:
# Save the table in the current working directory; it is read back in the plotting section.
sorted_srcdep_imp.to_csv('source_dep_models_features_importances.csv')

# 2. Plots

In [None]:
def plot_features_importance(importance_df, model_names, ax=None, **kwargs):
    """
    Draw grouped horizontal bars of the feature importances, one bar per model and feature.

    Parameters
    ----------
    importance_df: pandas.DataFrame
        must contain a 'feature' column and, for each name in `model_names`,
        the columns `importance_{name}` and `xerr_{name}`
    model_names: sequence of str
        models to draw, in drawing order
    ax: matplotlib.axes.Axes or None
        axes to draw on; the current axes are used if None
    kwargs: forwarded to `ax.barh` (`error_kw` defaults to `dict(lw=0.5)`)

    Returns
    -------
    matplotlib.axes.Axes
    """
    ax = plt.gca() if ax is None else ax
    x = np.arange(len(importance_df))

    kwargs.setdefault('error_kw', dict(lw=0.5))
    for i, name in enumerate(model_names):
        # Values and errors are both passed as plain arrays, so that they are matched
        # by position whatever the index of `importance_df` is (e.g. after sorting/dropping).
        ax.barh(x+i*0.2,
                importance_df[f'importance_{name}'].values,
                0.18,
                left=0.02,
                label=name,
                xerr=importance_df[f'xerr_{name}'].values,
                **kwargs,
               )
    ax.legend()
    # NOTE(review): the 0.32 offset looks tuned for four models per feature — confirm for other counts.
    ax.set_yticks(x+0.32)
    ax.set_yticklabels(importance_df['feature'])

    return ax

In [None]:
# Read the table back from the CSV file, so that the plots can be made without the models.
sorted_imp = pd.read_csv('source_indep_models_features_importances.csv')
# model_names = ['energy', 'disp_norm', 'disp_sign', 'class']
# Order in which the models are passed to the plotting function.
model_names = ['class', 'energy', 'disp_norm', 'disp_sign']

In [None]:
# Inspect the table as read from the CSV file.
sorted_imp

In [None]:
# Report the class-model values of the 'signed_time_gradient' row on the 'time_gradient' row.
tg_idx = sorted_imp[sorted_imp['feature'] == 'time_gradient'].index
stg_idx = sorted_imp[sorted_imp['feature'] == 'signed_time_gradient'].index

# A list (not a tuple) is the unambiguous way to select several columns with .loc.
cols = ['importance_class', 'xerr_class']
# Assign the raw values so that the differing row labels do not trigger index alignment.
sorted_imp.loc[tg_idx, cols] = sorted_imp.loc[stg_idx, cols].values

In [None]:
# Report the class-model values of the 'signed_skewness' row on the 'skewness' row.
sk_idx = sorted_imp[sorted_imp['feature'] == 'skewness'].index
ssk_idx = sorted_imp[sorted_imp['feature'] == 'signed_skewness'].index

# A list (not a tuple) is the unambiguous way to select several columns with .loc.
cols = ['importance_class', 'xerr_class']
# Assign the raw values so that the differing row labels do not trigger index alignment.
sorted_imp.loc[sk_idx, cols] = sorted_imp.loc[ssk_idx, cols].values

In [None]:
# Remove the two signed-feature rows, then sort by the class-model importance.
sorted_imp = (
    sorted_imp
    .drop(index=stg_idx)
    .drop(index=ssk_idx)
    .sort_values(by='importance_class')
)
sorted_imp

In [None]:
# Draw the source-independent figure and save it in the current working directory.
ax = plot_features_importance(sorted_imp, model_names)
ax.figure.tight_layout()
ax.legend(loc='lower right')
ax.figure.savefig('models_feature_importance_src_indep.png', dpi=250)
plt.show()

### Source-dep plots

In [None]:
# Read the source-dependent table back from the CSV file.
sorted_srcdep_imp = pd.read_csv('source_dep_models_features_importances.csv')
# Models to plot, in the order they are passed to the plotting function.
model_names = ['class', 'energy']

In [None]:
# getting the color cycle to apply the same color to class as in source-indep plot
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']
colors

In [None]:
# First and fourth colors of the cycle.
# NOTE(review): `colors_srcdep` is not passed to any call in the cells visible here — confirm whether it is still needed.
colors_srcdep = [colors[0], colors[3]]

In [None]:
# Draw the source-dependent figure and save it in the current working directory.
ax = plot_features_importance(sorted_srcdep_imp, model_names)
ax.figure.tight_layout()
ax.figure.savefig('models_feature_importance_srcdep.png', dpi=250)
plt.show()