**Results - Regression on experimental events**

This notebook is the primary source of plots and tables for the regression part of the thesis, 
with the goal of keeping every table and figure as standardized as possible. (And who has the time to update
90 tables one by one anyway).

**Questions:**
* Descriptive statistics
    - Should descriptive statistics of the simulated data be included?\
    If so, how much? And should it be included for each fold in the k-fold cross-validation?
* Classification results
    - Breakdown of results based on event type? Single, double, close double?
    Reasonable to include in order to confirm the assumption that close doubles are the
    most difficult event type to classify correctly in simulated data.
    The random state is included, so it should be simple to reproduce the indices.


**TODO**
* Implement reproducing the validation indices for each fold based on the random seed from config

**Handy links**
* [matplotlib-plots to latex](https://timodenk.com/blog/exporting-matplotlib-plots-to-latex/)
* [Robert's thesis df output](https://github.com/ATTPC/VAE-event-classification/blob/master/src/make_classification_table.py)

In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
from master_scripts.data_functions import *
from master_scripts.analysis_functions import *
import tensorflow as tf
import json
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import numpy as np

# Relative path to the thesis repository.
# NOTE(review): presumably the root that exported figures/tables are written under — confirm against the export cells.
THESIS_PATH = "../../../master_thesis/"
# Figure directory for the results chapter.
# NOTE(review): presumably joined onto THESIS_PATH when saving — confirm against the export cells.
section_path = "chapters/results/figures/"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Load experimental set and normalize
# NOTE(review): paths below are built by plain string concatenation, so
# get_git_root() presumably returns the repository root WITH a trailing
# separator — confirm.
repo_root = get_git_root()

# `events` is looked up per event id further down (events[event_id]['image_idx']),
# i.e. it maps event ids to per-event records that point into `images`.
events, images = import_real_energy_data(repo_root + "data/real/decaydata_100k.txt") # images not normalized
# Keep an untouched copy of the raw images before `images` is rebound to the
# normalized version on the next line.
images_non_normed = images.copy()
images = normalize_image_data(images)

In [5]:
def classify(model, images, events, threshold=0.5):
    """Label every event as "single" or "double" from the model output.

    The model output is compared against `threshold`; an event whose
    thresholded prediction equals 0 is labelled "single", anything else
    "double". The label is written to `events[event_id]['event_class']`.

    Parameters
    ----------
    model : object exposing `predict(images)`
    images : model input, passed to `model.predict` unchanged
    events : dict of per-event dicts; each must hold an 'image_idx' entry
        that indexes into the prediction array
    threshold : cut applied to the raw model output (default 0.5)

    Returns
    -------
    The same `events` dict that was passed in (it is modified in place).
    """
    thresholded = (model.predict(images) > threshold).astype(int)
    for record in events.values():
        record['event_class'] = (
            "single" if thresholded[record['image_idx']] == 0 else "double"
        )
    return events
    
def classification_metrics(model, images, events, name, threshold=0.5):
    """Tabulate predicted single/double counts per event descriptor.

    Every event is first labelled "single" or "double" by thresholding the
    model output (the label is written to `events[...]['event_class']`, so
    `events` is modified in place). The events are then grouped by their
    'event_descriptor' and, per descriptor, the number and fraction of
    events predicted as single and as double are reported.

    Parameters
    ----------
    model : object exposing `predict(images)`
    images : model input, passed to `model.predict` unchanged
    events : dict of per-event dicts holding 'image_idx' and
        'event_descriptor'
    name : unused; kept so existing callers keep working
    threshold : cut applied to the raw model output (default 0.5)

    Returns
    -------
    pandas.DataFrame with one row per descriptor present in `events`
    (ordered by descriptor value) and the columns "Predicted single",
    "Predicted double", "Normalized single", "Normalized double".
    Descriptors missing from the translation table get the row label
    "Unknown descriptor (<value>)" instead of raising KeyError.
    """
    # Label each event from the thresholded prediction.
    y_out = model.predict(images)
    y_pred = (y_out > threshold).astype(int)
    for event in events.values():
        if y_pred[event['image_idx']] == 0:
            event['event_class'] = "single"
        else:
            event['event_class'] = "double"

    translate_descriptor = {
        1: "Implant",
        2: "Decay",
        3: "implant + Decay",
        4: "Light ion",
        5: "Implant + Light Ion",
        6: "Decay + Light Ion",
        7: "Implant + Decay + Light Ion",
        8: "Double (time)",
        9: "Implant + Double (time)",
        10: "Decay + Double (time)",
        11: "Implant + Decay + Double (time)",
        12: "Light ion + Double (time)",
        13: "Implant + Light Ion + Double (time)",
        14: "Decay + Light ion + Double (time)",
        15: "Implant + Decay + Light Ion + Double (time)",
        16: "Double (space)",
        17: "Implant + Double (space)",
        18: "Decay + Double (space)"
    }

    # Count both classes per descriptor in a single pass over the events,
    # rather than rescanning the full list once per descriptor.
    counts = {}
    for event in events.values():
        per_class = counts.setdefault(
            event['event_descriptor'], {'single': 0, 'double': 0}
        )
        per_class[event['event_class']] += 1

    results = {}
    # Sorted so the table rows come out in a reproducible order.
    for d in sorted(counts):
        num_s = counts[d]['single']
        num_d = counts[d]['double']
        tot = num_s + num_d  # >= 1: a descriptor only appears if an event has it
        # The table above stops at 18; fall back to a visible placeholder
        # label for any other descriptor value instead of failing.
        label = translate_descriptor.get(d, "Unknown descriptor ({})".format(d))
        results[label] = [
            num_s,
            num_d,
            num_s / tot,
            num_d / tot
        ]

    cols = ["Predicted single", "Predicted double", "Normalized single", "Normalized double"]
    df = pd.DataFrame.from_dict(data=results, orient='index', columns=cols)
    return df


def regression_position(model, events, images, name):
    """Position regression on experimental data.

    Runs `model.predict(images)` and returns the first two prediction
    columns as a DataFrame indexed by event id.

    Parameters
    ----------
    model : object exposing `predict(images)`; the result must be
        indexable as `prediction[:, 0]` and `prediction[:, 1]`
    events : dict keyed by event id; its key order supplies the row index
    images : model input, passed to `model.predict` unchanged
    name : prefix for the column names "<name>_pos1" and "<name>_pos2"

    Returns
    -------
    pandas.DataFrame with columns "<name>_pos1", "<name>_pos2".

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of events.
        (Relabelling a default index would otherwise leave some rows with
        positional labels and hide the mismatch.)
    """
    prediction = model.predict(images)
    event_ids = list(events.keys())
    if len(event_ids) != len(prediction):
        raise ValueError(
            "Got {} predictions for {} events".format(len(prediction), len(event_ids))
        )

    # Index directly by event id, the same way regression_energy does.
    df = pd.DataFrame(
        data={
            name + "_pos1": prediction[:, 0],
            name + "_pos2": prediction[:, 1],
        },
        index=event_ids
    )
    return df

def regression_energy(model, events, images, name):
    """Energy regression on experimental data.

    Runs `model.predict(images)`, flattens the result to one value per
    row, and returns it as a single-column DataFrame ("<name>_e1") whose
    index is the keys of `events`.
    """
    energies = model.predict(images)
    n_rows = energies.shape[0]
    column = name + "_e1"
    return pd.DataFrame(
        data={column: energies.reshape(n_rows)},
        index=events.keys()
    )

# Pre-processed simulated data - no additional modifications
These are the basic metrics for all the models trained on simulated data.
The basic pre-processing includes formatting and min-max normalization.

## Single events

### Energy

#### Linear regression

In [6]:
# Linear regression: load the experiment record for this id
lin_ex_id = "87e8f4558d97"
lin_ex = load_experiment(lin_ex_id)

# Restore the saved model (compile=False: only predict() is called) and
# predict on the images flattened to 256 values per row
lin_model = tf.keras.models.load_model(
    "{}models/{}.h5".format(repo_root, lin_ex_id),
    compile=False,
)
lin_test_energy = regression_energy(
    lin_model,
    events,
    images.reshape((len(images), 256)),
    "lin_test_energy",
)
del lin_model  # drop the reference once the predictions are stored

#### Small dense network

In [7]:
# Small dense network: load the experiment record for this id
dense_ex_id = "4cab676db128"
dense_ex = load_experiment(dense_ex_id)

# Restore the saved model (compile=False: only predict() is called) and
# predict on the images flattened to 256 values per row
dense_model = tf.keras.models.load_model(
    "{}models/{}.h5".format(repo_root, dense_ex_id),
    compile=False,
)
dense_test_energy = regression_energy(
    dense_model,
    events,
    images.reshape((len(images), 256)),
    "dense_test_energy",
)
del dense_model  # drop the reference once the predictions are stored

#### Small CNN

In [8]:
# Small CNN: load the experiment record for this id
cnn_ex_id = "3a91fd0e74b5"
cnn_ex = load_experiment(cnn_ex_id)

# Restore the saved model (compile=False: only predict() is called) and
# predict on the normalized images as they are
cnn_model = tf.keras.models.load_model(
    "{}models/{}.h5".format(repo_root, cnn_ex_id),
    compile=False,
)
cnn_test_energy = regression_energy(
    cnn_model,
    events,
    images,
    "cnn_test_energy",
)
del cnn_model  # drop the reference once the predictions are stored

#### Pretrained - VGG16 

In [9]:
# Pretrained (VGG16) regression: load the experiment record for this id
pretrained_ex_id = "ea8d88850f6e"
pretrained_ex = load_experiment(pretrained_ex_id)

# Restore the saved model (compile=False: only predict() is called).
# The images are repeated three times along the last axis before predicting
# (presumably to match a 3-channel input — confirm).
pretrained_model = tf.keras.models.load_model(
    "{}models/{}.h5".format(repo_root, pretrained_ex_id),
    compile=False,
)
pretrained_test_energy = regression_energy(
    pretrained_model,
    events,
    np.concatenate([images] * 3, axis=-1),
    "pretrained_test_energy",
)
del pretrained_model  # drop the reference once the predictions are stored

#### Custom model

In [10]:
# Custom model: load the experiment record for this id
custom_ex_id = "3d45e6694b1d"
custom_ex = load_experiment(custom_ex_id)

# Restore the saved model (compile=False: only predict() is called) and
# predict on the normalized images as they are
custom_model = tf.keras.models.load_model(
    "{}models/{}.h5".format(repo_root, custom_ex_id),
    compile=False,
)
custom_test_energy = regression_energy(
    custom_model,
    events,
    images,
    "custom_test_energy",
)
del custom_model  # drop the reference once the predictions are stored

#### Output

In [11]:
# Side-by-side energy predictions: one column per model, rows aligned on event id
all_test_energy = pd.concat(
    [lin_test_energy, dense_test_energy, cnn_test_energy, pretrained_test_energy, custom_test_energy],
    axis="columns",
)
display(all_test_energy)

Unnamed: 0,lin_test_energy_e1,dense_test_energy_e1,cnn_test_energy_e1,pretrained_test_energy_e1,custom_test_energy_e1
2,2.934569,2.869032,2.985354,1.124930,2.844858
32,0.254878,0.253270,0.260531,0.149089,0.297324
144,3.285792,3.204214,3.310057,1.280607,3.163367
170,1.701646,1.671118,1.724325,0.698573,1.688902
287,0.110196,0.109042,0.107094,0.083020,0.137669
...,...,...,...,...,...
14537044,0.526288,0.533481,0.510279,0.302651,0.547553
14537248,1.952837,1.918687,2.006057,0.776299,1.916296
14537421,3.277253,3.202767,3.295325,1.399536,3.213811
14537498,1.334511,1.316020,1.348431,0.705720,1.325303


# Pre-processed simulated data - Pixel modified
These are the basic metrics for all the models trained on simulated data.
The basic pre-processing includes formatting and min-max normalization.
Additionally, the data has had the top and bottom lines of pixels set to 0, plus
one pixel inside the detector permanently 0 (which idx again?).

## Single events

### Energy

#### Linear regression

In [12]:
# Linear regression (pixel-modified training data): load the experiment record
lin_ex_id = "7dfe302a7c09"
lin_ex = load_experiment(lin_ex_id)

# Restore the saved model (compile=False: only predict() is called) and
# predict on the images flattened to 256 values per row
lin_model = tf.keras.models.load_model(
    "{}models/{}.h5".format(repo_root, lin_ex_id),
    compile=False,
)
lin_test_energy_pmod = regression_energy(
    lin_model,
    events,
    images.reshape((len(images), 256)),
    "lin_test_energy_pmod",
)
del lin_model  # drop the reference once the predictions are stored

#### Small dense network

In [13]:
# Small dense network (pixel-modified training data): load the experiment record
dense_ex_id = "2dbd6c697bc5"
dense_ex = load_experiment(dense_ex_id)

# Restore the saved model (compile=False: only predict() is called) and
# predict on the images flattened to 256 values per row
dense_model = tf.keras.models.load_model(
    "{}models/{}.h5".format(repo_root, dense_ex_id),
    compile=False,
)
dense_test_energy_pmod = regression_energy(
    dense_model,
    events,
    images.reshape((len(images), 256)),
    "dense_test_energy_pmod",
)
del dense_model  # drop the reference once the predictions are stored

#### Small CNN
The small CNN is really sensitive to pixel modifications.
It performs similarly to the other models if the pixel modifications are applied to the test data.

In [14]:
# Small CNN (pixel-modified training data): load the experiment record
cnn_ex_id = "fb0685871cf3"
cnn_ex = load_experiment(cnn_ex_id)

# Restore the saved model (compile=False: only predict() is called) and
# predict on the normalized images as they are
cnn_model = tf.keras.models.load_model(
    "{}models/{}.h5".format(repo_root, cnn_ex_id),
    compile=False,
)
cnn_test_energy_pmod = regression_energy(
    cnn_model,
    events,
    images,
    "cnn_test_energy_pmod",
)
del cnn_model  # drop the reference once the predictions are stored

#### Pretrained - VGG16 

In [15]:
# Pretrained (VGG16) regression, pixel-modified training data: load the experiment record
pretrained_ex_id = "8aa9f731b693"
pretrained_ex = load_experiment(pretrained_ex_id)

# Restore the saved model (compile=False: only predict() is called).
# The images are repeated three times along the last axis before predicting
# (presumably to match a 3-channel input — confirm).
pretrained_model = tf.keras.models.load_model(
    "{}models/{}.h5".format(repo_root, pretrained_ex_id),
    compile=False,
)
pretrained_test_energy_pmod = regression_energy(
    pretrained_model,
    events,
    np.concatenate([images] * 3, axis=-1),
    "pretrained_test_energy_pmod",
)
del pretrained_model  # drop the reference once the predictions are stored

#### Custom model

In [16]:
# Custom model (pixel-modified training data): load the experiment record
custom_ex_id = "02c59a04c095"
custom_ex = load_experiment(custom_ex_id)

# Restore the saved model (compile=False: only predict() is called) and
# predict on the normalized images as they are
custom_model = tf.keras.models.load_model(
    "{}models/{}.h5".format(repo_root, custom_ex_id),
    compile=False,
)
custom_test_energy_pmod = regression_energy(
    custom_model,
    events,
    images,
    "custom_test_energy_pmod",
)
del custom_model  # drop the reference once the predictions are stored

#### Output

In [17]:
# Side-by-side energy predictions (pixel-modified models): one column per model
all_test_energy_pmod = pd.concat(
    [
        lin_test_energy_pmod,
        dense_test_energy_pmod,
        cnn_test_energy_pmod,
        pretrained_test_energy_pmod,
        custom_test_energy_pmod,
    ],
    axis="columns",
)
display(all_test_energy_pmod)

Unnamed: 0,lin_test_energy_pmod_e1,dense_test_energy_pmod_e1,cnn_test_energy_pmod_e1,pretrained_test_energy_pmod_e1,custom_test_energy_pmod_e1
2,2.680125,2.586785,2.692647,0.893881,2.460164
32,0.244124,0.239669,0.237576,0.178002,0.252183
144,2.993863,2.914809,3.020898,1.133986,2.290029
170,1.497414,1.479739,1.512188,0.675219,1.289584
287,0.097030,0.092487,0.094405,0.083272,0.138935
...,...,...,...,...,...
14537044,0.457909,0.455921,0.454422,0.218624,0.459567
14537248,1.748004,1.709343,1.741887,0.599465,1.552250
14537421,2.943064,2.862007,2.946378,1.145308,2.465071
14537498,1.194920,1.172714,1.203017,0.624059,1.122276


# Pre-processed simulated data - Pixel modified and imbalanced
These are the basic metrics for all the models trained on simulated data.
The basic pre-processing includes formatting and min-max normalization.
Additionally, the data has had the top and bottom lines of pixels set to 0, plus
one pixel inside the detector permanently 0 (which idx again?).

This dataset has also been purposefully imbalanced to mimic the properties of experimental data
where doubles in space are expected to be rare.

## Single events

### Energy

#### Linear regression

In [18]:
# Linear regression (pixel-modified, imbalanced training data): load the experiment record
lin_ex_id = "9f256a4990c0"
lin_ex = load_experiment(lin_ex_id)

# Restore the saved model (compile=False: only predict() is called) and
# predict on the images flattened to 256 values per row
lin_model = tf.keras.models.load_model(
    "{}models/{}.h5".format(repo_root, lin_ex_id),
    compile=False,
)
lin_test_energy_imbalanced = regression_energy(
    lin_model,
    events,
    images.reshape((len(images), 256)),
    "lin_test_energy_imbalanced",
)
del lin_model  # drop the reference once the predictions are stored

#### Small dense network

In [19]:
# Small dense network (pixel-modified, imbalanced training data): load the experiment record
dense_ex_id = "29b1f98a4879"
dense_ex = load_experiment(dense_ex_id)

# Restore the saved model (compile=False: only predict() is called) and
# predict on the images flattened to 256 values per row
dense_model = tf.keras.models.load_model(
    "{}models/{}.h5".format(repo_root, dense_ex_id),
    compile=False,
)
dense_test_energy_imbalanced = regression_energy(
    dense_model,
    events,
    images.reshape((len(images), 256)),
    "dense_test_energy_imbalanced",
)
del dense_model  # drop the reference once the predictions are stored

#### Small CNN

In [20]:
# Small CNN (pixel-modified, imbalanced training data): load the experiment record
cnn_ex_id = "8422f85d6ff6"
cnn_ex = load_experiment(cnn_ex_id)

# Restore the saved model (compile=False: only predict() is called) and
# predict on the normalized images as they are
cnn_model = tf.keras.models.load_model(
    "{}models/{}.h5".format(repo_root, cnn_ex_id),
    compile=False,
)
cnn_test_energy_imbalanced = regression_energy(
    cnn_model,
    events,
    images,
    "cnn_test_energy_imbalanced",
)
del cnn_model  # drop the reference once the predictions are stored

#### Pretrained - VGG16 

In [21]:
# Pretrained (VGG16) regression, pixel-modified and imbalanced training data: load the experiment record
pretrained_ex_id = "73de75db91e4"
pretrained_ex = load_experiment(pretrained_ex_id)

# Restore the saved model (compile=False: only predict() is called).
# The images are repeated three times along the last axis before predicting
# (presumably to match a 3-channel input — confirm).
pretrained_model = tf.keras.models.load_model(
    "{}models/{}.h5".format(repo_root, pretrained_ex_id),
    compile=False,
)
pretrained_test_energy_imbalanced = regression_energy(
    pretrained_model,
    events,
    np.concatenate([images] * 3, axis=-1),
    "pretrained_test_energy_imbalanced",
)
del pretrained_model  # drop the reference once the predictions are stored

#### Custom model

In [22]:
# Custom model (pixel-modified, imbalanced training data): load the experiment record
custom_ex_id = "0071c04bef42"
custom_ex = load_experiment(custom_ex_id)

# Restore the saved model (compile=False: only predict() is called) and
# predict on the normalized images as they are
custom_model = tf.keras.models.load_model(
    "{}models/{}.h5".format(repo_root, custom_ex_id),
    compile=False,
)
custom_test_energy_imbalanced = regression_energy(
    custom_model,
    events,
    images,
    "custom_test_energy_imbalanced",
)
del custom_model  # drop the reference once the predictions are stored

#### Output

In [23]:
# Side-by-side energy predictions (imbalanced models): one column per model
all_test_energy_imbalanced = pd.concat(
    [
        lin_test_energy_imbalanced,
        dense_test_energy_imbalanced,
        cnn_test_energy_imbalanced,
        pretrained_test_energy_imbalanced,
        custom_test_energy_imbalanced,
    ],
    axis="columns",
)
display(all_test_energy_imbalanced)

Unnamed: 0,lin_test_energy_imbalanced_e1,dense_test_energy_imbalanced_e1,cnn_test_energy_imbalanced_e1,pretrained_test_energy_imbalanced_e1,custom_test_energy_imbalanced_e1
2,2.680125,2.583389,2.699759,0.893880,3.340349
32,0.244124,0.239231,0.244896,0.178002,0.372743
144,2.993863,2.914334,3.036983,1.133985,3.864175
170,1.497414,1.474914,1.511223,0.675218,2.145048
287,0.097030,0.092340,0.092328,0.083272,0.109037
...,...,...,...,...,...
14537044,0.457909,0.455190,0.466327,0.218623,0.405822
14537248,1.748005,1.708036,1.750977,0.599464,2.171010
14537421,2.943064,2.856131,2.959690,1.145307,3.594872
14537498,1.194920,1.172371,1.195298,0.624059,1.661558


# Combined tables

In [24]:
# Join the three per-section tables column-wise: one row per event id,
# one column per (model, training-set variant).
df_energy = pd.concat(
    [
        all_test_energy,
        all_test_energy_pmod,
        all_test_energy_imbalanced
    ],
    axis=1
)

# Overwrite the HDF5 file with the combined table under key "df".
# `key` and `mode` are passed by keyword because recent pandas versions
# deprecate passing them positionally to to_hdf.
df_energy.to_hdf("experimental_energy_100k.h5", key="df", mode="w")
print(df_energy.shape)

(100000, 15)
