# True Shower Regression Baseline

Create a baseline using a simple ML algorithm to predict/infer the true shower parameters using a simple ML algorithm
- using clean_image_\*\_m1 and clean_image_\*\_m2 independently
- by combining both clean_image_* features with "hillas" and/or "stereo"

Linear regression, decision tree regression and random forrest regression will be used.
Polynomial regression will not be used, because it takes too long on the CPU and takes up too much space on the GPU.

In [4]:
import pandas as pd
import torch
from torch.utils.data import random_split
import matplotlib.pyplot as plt
import numpy as np
import pyarrow.parquet as pq

import math
import sys
import os
import importlib

sys.path.append("../../../magic-ml-images")
from magicdl import magic

sys.path.append("../../..")

SEED = 42
gen = torch.Generator().manual_seed(SEED)

### Load and prepare data

In [None]:
import src.common as c
from src.common import datasets
importlib.reload(c)
gammas = datasets.read_gammas()
protons = datasets.read_protons()

In [20]:
from src.common import preprocessing
from src.common import PARAMS_HILLAS, PARAMS_TRUE_SHOWER, PARAMS_STEREO, PARAMS_CLEAN_IMAGE_M1, PARAMS_CLEAN_IMAGE_M2

protons_train, protons_val, protons_test = preprocessing.preprocess(
    protons,
    normalize_params=PARAMS_HILLAS + PARAMS_TRUE_SHOWER + PARAMS_STEREO + PARAMS_CLEAN_IMAGE_M1 + PARAMS_CLEAN_IMAGE_M2,
    train_portion=0.6,
    validation_portion=0.2,
)

gammas_train, gammas_val, gammas_test = preprocessing.preprocess(
    gammas,
    normalize_params=PARAMS_HILLAS + PARAMS_TRUE_SHOWER + PARAMS_STEREO + PARAMS_CLEAN_IMAGE_M1 + PARAMS_CLEAN_IMAGE_M2,
    train_portion=0.6,
    validation_portion=0.2,
)

import models


In [22]:
import linear_regression
importlib.reload(linear_regression)
from linear_regression import LinearRegression

import decision_tree_regression
importlib.reload(decision_tree_regression)
from decision_tree_regression import DecisionTreeRegression

import random_forest_regression
importlib.reload(random_forest_regression)
from random_forest_regression import RandomForestRegression

### Functions for establishing the baseline:

In [23]:
def mse(pred, actual):
    if type(pred) == torch.Tensor:
        p = pred.cpu().numpy()
    else:
        p = pred
    if type(actual) == torch.Tensor:
        a = actual.cpu().numpy()
    else:
        a = actual
    return np.mean((p - a)**2)

def r_sq(pred, actual):
    if type(pred) == torch.Tensor:
        p = pred.cpu().numpy()
    else:
        p = pred
    if type(actual) == torch.Tensor:
        a = actual.cpu().numpy()
    else:
        a = actual
    ss_res = np.sum((p - a)**2)
    ss_tot = np.sum((a - np.mean(a, axis=0))**2)
    return 1 - (ss_res / ss_tot)

In [24]:
def optimise(x_train, y_train, x_val, y_val, epsilon=2e-3, verbose=False):

    # Hyperparameter-Bereiche
    max_depths = [3, 5, 10, 20, 40, 75]
    n_trees_values = [50, 100, 200, 400, 800, 1500]
    
    best_params = {}
    
    model = DecisionTreeRegression()
    if verbose:
        print("Optimising DecisionTree...")
    best_r2 = -float('inf')
    best_depth = None
    for depth in max_depths:
        if verbose:
            print(f"Testing depth {depth}")
        model.set_params(max_depth=depth)
        model.fit(x_train, y_train)
        r2 = r_sq(model.predict(x_val), y_val)
        if r2 > best_r2:
            best_r2, best_depth = r2, depth
    best_params["DecisionTreeRegression"] = best_params["DecisionTreesRegression"] = {"max_depth": best_depth}
    
    model = RandomForestRegression()
    if verbose:
        print("Optimising RandomForest...")
    best_r2 = -float('inf')
    best_depth, best_n_trees = None, None
    for depth in max_depths:
        current_iter_best_r2 = -float('inf')
        current_iter_best_n_trees = None
        for n_trees in n_trees_values:
            if verbose:
                print(f"Testing depth {depth} and n_trees {n_trees}")
            model.set_params(max_depth=depth, n_trees=n_trees)
            model.fit(x_train, y_train)
            r2 = r_sq(model.predict(x_val), y_val)
            if verbose:
                print(f"R^2 Score: {r2}")
            if r2 - current_iter_best_r2 > epsilon:
                print(f"Improvement: {r2-best_r2}")
                current_iter_best_r2 = r2
                current_iter_best_n_trees = n_trees
            else:
                break 
        if current_iter_best_r2 > best_r2:
                best_r2 = current_iter_best_r2
                best_depth, best_n_trees = depth, current_iter_best_n_trees

    best_params["RandomForestRegression"] = best_params["RandomForestsRegression"] = {"max_depth": best_depth, "n_trees": best_n_trees}

    return best_params


_(Anmerkung: $\alpha$-Parameter hier weggelassen, da bei $\alpha = 0$ die besten Ergebnisse erzielt wurden)_

In [25]:
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

def compare_models(x_train, y_train, x_val, y_val, x_test, y_test, hyperparams, final=False, verbose=True):
    models = {
        "LinearRegression": LinearRegression(len(x_train[0])),
        "DecisionTreeRegression": DecisionTreeRegression(),
        "RandomForestRegression": RandomForestRegression(),
    }
    
    # Auswahl der Evaluationsdaten
    x_eval, y_eval = (x_test, y_test) if final else (x_val, y_val)
    
    results = {}
    
    for name, model in models.items():
        if verbose:
            print(f"Training {name}...")
    
        if "DecisionTree" in name or "RandomForest" in name:
            model.set_params(**(hyperparams[name]))
        
        model.fit(x_train, y_train)
        y_train_pred, y_eval_pred = model.predict(x_train), model.predict(x_eval)
        results[name] = {
            "MSE Train": mse(y_train_pred, y_train),
            "MSE Eval": mse(y_eval_pred, y_eval),
            "R^2 Train": r_sq(y_train_pred, y_train),
            "R^2 Eval": r_sq(y_eval_pred, y_eval)
        }
    
    df_results = pd.DataFrame(results).T
    display(df_results)
    if verbose:
        print("Vergleich abgeschlossen.")


## Using only cleaned image m1
### Protons

In [None]:
y_train = torch.tensor(protons_train[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)
y_val = torch.tensor(protons_val[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)
y_test = torch.tensor(protons_test[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)

X_train = torch.tensor(protons_train[PARAMS_CLEAN_IMAGE_M1].values, dtype=torch.float32)
X_val = torch.tensor(protons_val[PARAMS_CLEAN_IMAGE_M1].values, dtype=torch.float32)
X_test = torch.tensor(protons_test[PARAMS_CLEAN_IMAGE_M1].values, dtype=torch.float32)

In [None]:
# best_params = optimise(X_train, y_train, X_val, y_val, verbose=True)
# with open("./params.txt", "w") as f:
#     f.write(str(best_params))

best_params = {
    "DecisionTreeRegression": {"max_depth": 5},
    "RandomForestRegression": {"max_depth": 40, "n_trees": 800}
}


In [35]:
compare_models(X_train, y_train, X_val, y_val, X_test, y_test, best_params)

Training LinearRegression...
Training DecisionTreeRegression...
Training RandomForestRegression...


Unnamed: 0,MSE Train,MSE Eval,R^2 Train,R^2 Eval
LinearRegression,0.9767,0.981219,0.0233,0.016937
DecisionTreeRegression,0.968812,0.977295,0.031188,0.020868
RandomForestRegression,0.52197,0.938671,0.47803,0.059565


Vergleich abgeschlossen.


### Gammas


In [None]:
y_train = torch.tensor(gammas_train[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)
y_val = torch.tensor(gammas_val[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)
y_test = torch.tensor(gammas_test[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)

X_train = torch.tensor(gammas_train[PARAMS_CLEAN_IMAGE_M1].values, dtype=torch.float32)
X_val = torch.tensor(gammas_val[PARAMS_CLEAN_IMAGE_M1].values, dtype=torch.float32)
X_test = torch.tensor(gammas_test[PARAMS_CLEAN_IMAGE_M1].values, dtype=torch.float32)

In [None]:
compare_models(X_train, y_train, X_val, y_val, X_test, y_test, best_params)

Training LinearRegression...
Training DecisionTreeRegression...
Training RandomForestRegression...


Unnamed: 0,MSE Train,MSE Eval,R^2 Train,R^2 Eval
LinearRegression,0.945367,0.949454,0.054633,0.050092
DecisionTreeRegression,0.958387,0.96105,0.041613,0.038491
RandomForestRegression,0.451809,0.858126,0.548191,0.141464


Vergleich abgeschlossen.


: 

## Using only cleaned image m2
### Protons

In [None]:
y_train = torch.tensor(protons_train[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)
y_val = torch.tensor(protons_val[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)
y_test = torch.tensor(protons_test[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)

X_train = torch.tensor(protons_train[PARAMS_CLEAN_IMAGE_M2].values, dtype=torch.float32)
X_val = torch.tensor(protons_val[PARAMS_CLEAN_IMAGE_M2].values, dtype=torch.float32)
X_test = torch.tensor(protons_test[PARAMS_CLEAN_IMAGE_M2].values, dtype=torch.float32)

In [16]:
compare_models(X_train, y_train, X_val, y_val, X_test, y_test, best_params)

Training LinearRegression...
Training DecisionTreeRegression...
Training RandomForestRegression...


Unnamed: 0,MSE Train,MSE Eval,R^2 Train,R^2 Eval
LinearRegression,0.977307,0.980731,0.022693,0.017426
DecisionTreeRegression,0.968114,0.979066,0.031886,0.019094
RandomForestRegression,0.51703,0.939094,0.48297,0.059141


Vergleich abgeschlossen.


### Gammas

In [None]:
y_train = torch.tensor(gammas_train[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)
y_val = torch.tensor(gammas_val[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)
y_test = torch.tensor(gammas_test[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)

X_train = torch.tensor(gammas_train[PARAMS_CLEAN_IMAGE_M2].values, dtype=torch.float32)
X_val = torch.tensor(gammas_val[PARAMS_CLEAN_IMAGE_M2].values, dtype=torch.float32)
X_test = torch.tensor(gammas_test[PARAMS_CLEAN_IMAGE_M2].values, dtype=torch.float32)

In [19]:
compare_models(X_train, y_train, X_val, y_val, X_test, y_test, best_params)

Training LinearRegression...
Training DecisionTreeRegression...
Training RandomForestRegression...


Unnamed: 0,MSE Train,MSE Eval,R^2 Train,R^2 Eval
LinearRegression,0.944741,0.947654,0.055259,0.051893
DecisionTreeRegression,0.955784,0.958725,0.044216,0.040817
RandomForestRegression,0.458797,0.857775,0.541203,0.141815


Vergleich abgeschlossen.


## Using both cleaned images
### Protons

In [None]:
y_train = torch.tensor(protons_train[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)
y_val = torch.tensor(protons_val[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)
y_test = torch.tensor(protons_test[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)

X_train = torch.tensor(protons_train[PARAMS_CLEAN_IMAGE_M1 + PARAMS_CLEAN_IMAGE_M2].values, dtype=torch.float32)
X_val = torch.tensor(protons_val[PARAMS_CLEAN_IMAGE_M1 + PARAMS_CLEAN_IMAGE_M2].values, dtype=torch.float32)
X_test = torch.tensor(protons_test[PARAMS_CLEAN_IMAGE_M1 + PARAMS_CLEAN_IMAGE_M2].values, dtype=torch.float32)

In [29]:
# best_params = optimise(X_train, y_train, X_val, y_val, verbose=True)
# with open("./params.txt", "w") as f:
#     f.write(str(best_params))

best_params = {'DecisionTreeRegression': {'max_depth': 5}, 'DecisionTreesRegression': {'max_depth': 5}, 'RandomForestRegression': {'max_depth': 40, 'n_trees': 400}, 'RandomForestsRegression': {'max_depth': 40, 'n_trees': 400}}

In [13]:
compare_models(X_train, y_train, X_val, y_val, X_test, y_test, best_params)

Training LinearRegression...
Training DecisionTreeRegression...
Training RandomForestRegression...


Unnamed: 0,MSE Train,MSE Eval,R^2 Train,R^2 Eval
LinearRegression,0.965751,0.977358,0.034249,0.020805
DecisionTreeRegression,0.96634,0.978483,0.03366,0.019678
RandomForestRegression,0.501622,0.881047,0.498378,0.117297


Vergleich abgeschlossen.


### Gammas

In [None]:
y_train = torch.tensor(gammas_train[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)
y_val = torch.tensor(gammas_val[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)
y_test = torch.tensor(gammas_test[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)

X_train = torch.tensor(gammas_train[PARAMS_CLEAN_IMAGE_M1 + PARAMS_CLEAN_IMAGE_M2].values, dtype=torch.float32)
X_val = torch.tensor(gammas_val[PARAMS_CLEAN_IMAGE_M1 + PARAMS_CLEAN_IMAGE_M2].values, dtype=torch.float32)
X_test = torch.tensor(gammas_test[PARAMS_CLEAN_IMAGE_M1 + PARAMS_CLEAN_IMAGE_M2].values, dtype=torch.float32)

In [15]:
compare_models(X_train, y_train, X_val, y_val, X_test, y_test, best_params)

Training LinearRegression...
Training DecisionTreeRegression...
Training RandomForestRegression...


Unnamed: 0,MSE Train,MSE Eval,R^2 Train,R^2 Eval
LinearRegression,0.928915,0.937461,0.071085,0.062091
DecisionTreeRegression,0.954031,0.956765,0.045969,0.042778
RandomForestRegression,0.307876,0.724883,0.692124,0.274771


Vergleich abgeschlossen.


## Using both cleaned images and hillas
### Protons

In [None]:
y_train = torch.tensor(protons_train[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)
y_val = torch.tensor(protons_val[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)
y_test = torch.tensor(protons_test[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)

X_train = torch.tensor(protons_train[PARAMS_CLEAN_IMAGE_M1 + PARAMS_CLEAN_IMAGE_M2 + PARAMS_STEREO].values, dtype=torch.float32)
X_val = torch.tensor(protons_val[PARAMS_CLEAN_IMAGE_M1 + PARAMS_CLEAN_IMAGE_M2 + PARAMS_STEREO].values, dtype=torch.float32)
X_test = torch.tensor(protons_test[PARAMS_CLEAN_IMAGE_M1 + PARAMS_CLEAN_IMAGE_M2 + PARAMS_STEREO].values, dtype=torch.float32)

In [26]:
compare_models(X_train, y_train, X_val, y_val, X_test, y_test, best_params)

Training LinearRegression...
Training DecisionTreeRegression...
Training RandomForestRegression...


Unnamed: 0,MSE Train,MSE Eval,R^2 Train,R^2 Eval
LinearRegression,0.888752,0.896797,0.111248,0.101518
DecisionTreeRegression,0.920699,0.924327,0.079301,0.073936
RandomForestRegression,0.142819,0.833822,0.857181,0.164611


Vergleich abgeschlossen.


### Gammas


In [None]:
y_train = torch.tensor(gammas_train[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)
y_val = torch.tensor(gammas_val[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)
y_test = torch.tensor(gammas_test[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)

X_train = torch.tensor(gammas_train[PARAMS_CLEAN_IMAGE_M1 + PARAMS_CLEAN_IMAGE_M2 + PARAMS_STEREO].values, dtype=torch.float32)
X_val = torch.tensor(gammas_val[PARAMS_CLEAN_IMAGE_M1 + PARAMS_CLEAN_IMAGE_M2 + PARAMS_STEREO].values, dtype=torch.float32)
X_test = torch.tensor(gammas_test[PARAMS_CLEAN_IMAGE_M1 + PARAMS_CLEAN_IMAGE_M2 + PARAMS_STEREO].values, dtype=torch.float32)

In [28]:
compare_models(X_train, y_train, X_val, y_val, X_test, y_test, best_params)

Training LinearRegression...
Training DecisionTreeRegression...
Training RandomForestRegression...


Unnamed: 0,MSE Train,MSE Eval,R^2 Train,R^2 Eval
LinearRegression,0.788777,0.796603,0.211223,0.203016
DecisionTreeRegression,0.838014,0.838855,0.161986,0.160744
RandomForestRegression,0.083956,0.611935,0.916044,0.387773


Vergleich abgeschlossen.


## Using both cleaned images and stereo parameters
### Protons

In [None]:
y_train = torch.tensor(protons_train[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)
y_val = torch.tensor(protons_val[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)
y_test = torch.tensor(protons_test[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)

X_train = torch.tensor(protons_train[PARAMS_CLEAN_IMAGE_M1 + PARAMS_CLEAN_IMAGE_M2 + PARAMS_STEREO].values, dtype=torch.float32)
X_val = torch.tensor(protons_val[PARAMS_CLEAN_IMAGE_M1 + PARAMS_CLEAN_IMAGE_M2 + PARAMS_STEREO].values, dtype=torch.float32)
X_test = torch.tensor(protons_test[PARAMS_CLEAN_IMAGE_M1 + PARAMS_CLEAN_IMAGE_M2 + PARAMS_STEREO].values, dtype=torch.float32)

In [30]:
compare_models(X_train, y_train, X_val, y_val, X_test, y_test, best_params)

Training LinearRegression...
Training DecisionTreeRegression...
Training RandomForestRegression...


Unnamed: 0,MSE Train,MSE Eval,R^2 Train,R^2 Eval
LinearRegression,0.608647,0.620874,0.391353,0.377959
DecisionTreeRegression,0.531089,0.536102,0.468911,0.462891
RandomForestRegression,0.058658,0.430175,0.941342,0.569016


Vergleich abgeschlossen.


### Gammas

In [None]:
y_train = torch.tensor(gammas_train[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)
y_val = torch.tensor(gammas_val[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)
y_test = torch.tensor(gammas_test[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)

X_train = torch.tensor(gammas_train[PARAMS_CLEAN_IMAGE_M1 + PARAMS_CLEAN_IMAGE_M2 + PARAMS_STEREO].values, dtype=torch.float32)
X_val = torch.tensor(gammas_val[PARAMS_CLEAN_IMAGE_M1 + PARAMS_CLEAN_IMAGE_M2 + PARAMS_STEREO].values, dtype=torch.float32)
X_test = torch.tensor(gammas_test[PARAMS_CLEAN_IMAGE_M1 + PARAMS_CLEAN_IMAGE_M2 + PARAMS_STEREO].values, dtype=torch.float32)

In [32]:
compare_models(X_train, y_train, X_val, y_val, X_test, y_test, best_params)

Training LinearRegression...
Training DecisionTreeRegression...
Training RandomForestRegression...


Unnamed: 0,MSE Train,MSE Eval,R^2 Train,R^2 Eval
LinearRegression,0.540442,0.548472,0.459558,0.451265
DecisionTreeRegression,0.484432,0.485547,0.515568,0.514221
RandomForestRegression,0.035492,0.260787,0.964508,0.739089


Vergleich abgeschlossen.


## Using both cleaned images, hillas and stereo parameters
### Protons

In [None]:
y_train = torch.tensor(protons_train[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)
y_val = torch.tensor(protons_val[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)
y_test = torch.tensor(protons_test[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)

X_train = torch.tensor(protons_train[PARAMS_CLEAN_IMAGE_M1 + PARAMS_CLEAN_IMAGE_M2 + PARAMS_STEREO + PARAMS_HILLAS].values, dtype=torch.float32)
X_val = torch.tensor(protons_val[PARAMS_CLEAN_IMAGE_M1 + PARAMS_CLEAN_IMAGE_M2 + PARAMS_STEREO + PARAMS_HILLAS].values, dtype=torch.float32)
X_test = torch.tensor(protons_test[PARAMS_CLEAN_IMAGE_M1 + PARAMS_CLEAN_IMAGE_M2 + PARAMS_STEREO + PARAMS_HILLAS].values, dtype=torch.float32)

In [38]:
compare_models(X_train, y_train, X_val, y_val, X_test, y_test, best_params)

Training LinearRegression...
Training DecisionTreeRegression...
Training RandomForestRegression...


Unnamed: 0,MSE Train,MSE Eval,R^2 Train,R^2 Eval
LinearRegression,0.571968,0.579603,0.428032,0.419307
DecisionTreeRegression,0.527894,0.532212,0.472106,0.466788
RandomForestRegression,0.05256,0.387792,0.94744,0.61148


Vergleich abgeschlossen.


### Gammas

In [27]:
y_train = torch.tensor(gammas_train[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)
y_val = torch.tensor(gammas_val[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)
y_test = torch.tensor(gammas_test[PARAMS_TRUE_SHOWER].values, dtype=torch.float32)

X_train = torch.tensor(gammas_train[PARAMS_CLEAN_IMAGE_M1 + PARAMS_CLEAN_IMAGE_M2 + PARAMS_STEREO + PARAMS_HILLAS].values, dtype=torch.float32)
X_val = torch.tensor(gammas_val[PARAMS_CLEAN_IMAGE_M1 + PARAMS_CLEAN_IMAGE_M2 + PARAMS_STEREO + PARAMS_HILLAS].values, dtype=torch.float32)
X_test = torch.tensor(gammas_test[PARAMS_CLEAN_IMAGE_M1 + PARAMS_CLEAN_IMAGE_M2 + PARAMS_STEREO + PARAMS_HILLAS].values, dtype=torch.float32)

In [30]:
compare_models(X_train, y_train, X_val, y_val, X_test, y_test, best_params)

Training LinearRegression...
Training DecisionTreeRegression...
Training RandomForestRegression...


Unnamed: 0,MSE Train,MSE Eval,R^2 Train,R^2 Eval
LinearRegression,0.454845,0.460512,0.545155,0.539038
DecisionTreeRegression,0.466733,0.463519,0.533268,0.536028
RandomForestRegression,0.029058,0.213009,0.970942,0.786783


Vergleich abgeschlossen.
