# Feature extraction

In [16]:
# Imports
import numpy as np
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
import sys
import matplotlib.pyplot as plt
from master_data_functions.functions import import_data,save_feature_representation,load_feature_representation, event_indices
from master_models.pretrained import pretrained_model
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
# Load the event data (detector images, energies, positions, labels)

# Raw text files that can be parsed with import_data:
#   CeBr10kSingle_1.txt -> single events
#   CeBr10kSingle_2.txt -> single events
#   CeBr10k_1.txt       -> mixed single and double events
#   CeBr10.txt          -> small file of 10 samples
#   CeBr2Mil_Mix.txt    -> 2 million mixed samples of simulated events

# Set to True to parse the raw text file; False loads the cached 200k-event
# .npy arrays instead.
from_file = False
if from_file:
    folder = "simulated"
    filename = "CeBr2Mil_Mix.txt"
    num_samples = 2e5
    #folder = "sample"
    #filename = "CeBr10k_1.txt"
    #num_samples = 1e3

    data = import_data(folder=folder, filename=filename, num_samples=num_samples)
    file_data = data[filename]
    images, energies, positions = (
        file_data[field] for field in ("images", "energies", "positions")
    )
    # Labels from the text file are converted to one-hot vectors
    labels = to_categorical(file_data["labels"])
else:
    images, energies, positions, labels = (
        load_feature_representation(cached_name)
        for cached_name in (
            "images_200k.npy",
            "energies_200k.npy",
            "positions_200k.npy",
            "labels_200k.npy",
        )
    )

# One label column per class
n_classes = labels.shape[1]
print("Number of classes: {}".format(n_classes))
for shape_message, array in (
    ("Input Images shape: {}", images),
    ("Energies shape: {}", energies),
    ("Positions shape: {}", positions),
    ("Labels shape: {}", labels),
):
    print(shape_message.format(array.shape))

# The pretrained networks (e.g. VGG16) expect 3 input channels, so the image
# data is stacked with itself along the channel axis to give three identical
# channels.
images = np.concatenate([images] * 3, axis=3)
print("Reshaped Images data shape: {}".format(images.shape))

Number of classes: 2
Input Images shape: (200000, 16, 16, 1)
Energies shape: (200000, 2)
Positions shape: (200000, 4)
Labels shape: (200000, 2)
Reshaped Images data shape: (200000, 16, 16, 3)


## Save feature representations for all models

In [8]:
# Pretrained architectures to evaluate.
#   key   -> model name passed to pretrained_model(which_model=...)
#   value -> depth passed as output_depth; None is tagged "full" in the
#            feature filenames built by the cells below.
# The numbers in the trailing comments look like depths used in earlier
# experiments -- TODO confirm.
pretrained_models = {
    "DenseNet121": None,        # 8
    "DenseNet169": None,        # 8
    "DenseNet201": None,        # 8
    "InceptionResNetV2": None,  # 8
    "InceptionV3": None,        # 8
    "MobileNet": None,          # 8
    "MobileNetV2": None,        # 5
    "NASNetLarge": None,        # 4
    "NASNetMobile": None,       # 4
    "ResNet50": None,           # 8
    "VGG16": None,
    "VGG19": None,
    "Xception": None,           # 6
}

## Save feature representations

In [None]:

# For every pretrained network: build it, push all images through it and
# store the resulting feature matrix on disk for the analysis cells below.
for net, depth in pretrained_models.items():
    print("Running for:", net)
    # Build net at desired depth
    pretrained = pretrained_model(which_model=net, output_depth=depth)

    # Extract features for every image (the split into single and double
    # events is done later, when the features are loaded for testing)
    pretrained_features = pretrained.predict(images)

    # The loading cells look for "<net>_d<depth>_<n_samples>.npy", so the
    # extension must include the dot for the names to match.
    depth_tag = "full" if depth is None else str(depth)
    features_filename = "{}_d{}_{}.npy".format(
        net, depth_tag, pretrained_features.shape[0]
    )
    save_feature_representation(pretrained_features, features_filename)

    # Free the features and the model before building the next network;
    # clear_session releases the Keras state the model leaves behind.
    del pretrained_features
    del pretrained
    tf.keras.backend.clear_session()



## Test feature distribution

In [9]:
# Define Kolmogorov-Smirnov test
from scipy.stats import ks_2samp
from joblib import Parallel, delayed
# Check difference using Kolmogorov-Smirnov

def get_pval(i, single=None, double=None):
    """Return the two-sample Kolmogorov-Smirnov p-value for feature column ``i``.

    Parameters
    ----------
    i : int
        Column index of the feature to compare.
    single, double : 2-D array, optional
        Feature matrices (rows are samples, columns are features) for the two
        populations. When omitted, the notebook-level ``single_features`` and
        ``double_features`` variables are used, which is how the existing
        ``Parallel`` cells call this function.

    Returns
    -------
    float
        The ``pvalue`` attribute of ``scipy.stats.ks_2samp`` applied to
        column ``i`` of both matrices.
    """
    # Fall back to the notebook globals so existing get_pval(i) calls keep working
    if single is None:
        single = single_features
    if double is None:
        double = double_features
    return ks_2samp(single[:, i], double[:, i]).pvalue


### Kolmogorov-Smirnov 2-sample test for all networks

In [12]:
# Run the per-feature KS test (single vs. double events) on all pretrained nets.
# p_output maps network name -> list of p-values, one per feature column.
p_output = {}


for net, depth in pretrained_models.items():
    print("Running for:", net)

    # Load the features saved for this network
    depth_tag = "full" if depth is None else str(depth)
    features_filename = net + "_d" + depth_tag + "_" + str(labels.shape[0]) + ".npy"
    pretrained_features = load_feature_representation(features_filename)

    # Column 0 of the one-hot labels marks single events, column 1 double events.
    # get_pval reads these two arrays from the notebook namespace.
    single_features = pretrained_features[labels[:, 0] == 1]
    double_features = pretrained_features[labels[:, 1] == 1]

    # One KS test per feature column, spread over all available cores
    n = pretrained_features.shape[1]
    p_values = Parallel(n_jobs=-1, verbose=2)(delayed(get_pval)(i) for i in range(n))
    p_output[net] = p_values

    #plt.close()
    #plt.plot(range(len(p_values)), p_values, label=net)
    #plt.legend()
    #plt.savefig(net + "-p-vals.png")

    # Delete all arrays allocated in this iteration to save memory in the notebook
    del pretrained_features
    del single_features
    del double_features

Running for: DenseNet121


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 247 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 497 out of 512 | elapsed:    5.2s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 512 out of 512 | elapsed:    5.3s finished


Running for: DenseNet169


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 429 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 497 out of 512 | elapsed:    3.9s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 512 out of 512 | elapsed:    3.9s finished


Running for: DenseNet201


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 251 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 497 out of 512 | elapsed:    4.4s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 512 out of 512 | elapsed:    4.5s finished


Running for: InceptionResNetV2


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    2.1s finished


Running for: InceptionV3


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    2.0s finished


Running for: MobileNet


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   13.7s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:   20.2s
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:   28.5s
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:   39.4s
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed:   51.7s
[Parallel(n_jobs=-1)]: Done 2048 out of 2048 | elapsed:   53.6s finished


Running for: MobileNetV2


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 369 out of 384 | elapsed:    2.8s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 384 out of 384 | elapsed:    2.9s finished


Running for: NASNetLarge


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:   20.5s
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:   29.4s
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed:   39.2s
[Parallel(n_jobs=-1)]: Done 2058 out of 2058 | elapsed:   40.7s finished


Running for: NASNetMobile


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 432 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 524 out of 539 | elapsed:    3.7s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 539 out of 539 | elapsed:    3.8s finished


Running for: ResNet50


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   15.8s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:   25.5s
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:   37.6s
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:   52.5s
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 4096 out of 4096 | elapsed:  2.4min finished


Running for: VGG16


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 247 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 497 out of 512 | elapsed:    4.5s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 512 out of 512 | elapsed:    4.6s finished


Running for: VGG19


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 435 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 512 out of 512 | elapsed:    3.5s finished


Running for: Xception


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:   21.9s
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:   34.6s
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:   49.8s
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 3200 out of 3200 | elapsed:  1.8min finished


### Output table of KS 2-sample test results

In [13]:
# For each network, compute the fraction of features whose KS p-value lies
# below 0.01, 0.005 and 0.001, to judge whether rejecting the null hypothesis
# (no difference between the distributions) is reasonable.
ks_statistics = []
for net_name, net_pvalues in p_output.items():
    p_arr = np.array(net_pvalues)
    total = len(p_arr)
    # Row layout: [network, number of features, ratio per threshold]
    row = [net_name, total]
    for threshold in (0.01, 0.005, 0.001):
        row.append(int(np.count_nonzero(p_arr < threshold)) / total)
    ks_statistics.append(row)


In [15]:
from tabulate import tabulate

# Render the per-network summary rows as a LaTeX table
headers = [
    "Network",
    "num_features",
    "ratio p < 0.01",
    "ratio p < 0.005",
    "ratio p < 0.001",
]
latex_table = tabulate(ks_statistics, headers=headers, tablefmt="latex")
print(latex_table)

\begin{tabular}{lrrrr}
\hline
 Network           &   num\_features &   ratio p \ensuremath{<} 0.01 &   ratio p \ensuremath{<} 0.005 &   ratio p \ensuremath{<} 0.001 \\
\hline
 DenseNet121       &            512 &         1        &          1        &          1        \\
 DenseNet169       &            512 &         1        &          1        &          1        \\
 DenseNet201       &            512 &         1        &          1        &          1        \\
 InceptionResNetV2 &            320 &         0.96875  &          0.96875  &          0.965625 \\
 InceptionV3       &            320 &         0.98125  &          0.98125  &          0.98125  \\
 MobileNet         &           2048 &         0.227051 &          0.227051 &          0.227051 \\
 MobileNetV2       &            384 &         1        &          1        &          1        \\
 NASNetLarge       &           2058 &         0.510204 &          0.510204 &          0.510204 \\
 NASNetMobile      &            539 &    

#### Compare single events and close double events
'Close' double events are events separated by a distance of less than 3 mm.
This length is chosen because it is the width of one pixel in the image data, and
it is below this distance that the models seem to struggle the most.

In [17]:
# Run the per-feature KS test (single vs. close double events) on all pretrained nets.
# p_output maps network name -> list of p-values, one per feature column.
p_output = {}

# The event indices depend only on `positions`, which does not change inside
# the loop, so they are computed once instead of once per network.
single_indices, double_indices, close_indices = event_indices(positions)

for net, depth in pretrained_models.items():
    print("Running for:", net)

    # Load the features saved for this network
    depth_tag = "full" if depth is None else str(depth)
    features_filename = net + "_d" + depth_tag + "_" + str(images.shape[0]) + ".npy"
    pretrained_features = load_feature_representation(features_filename)

    # Column 0 of the one-hot labels marks single events; the comparison set
    # is restricted to the close double events.
    # get_pval reads these two arrays from the notebook namespace.
    single_features = pretrained_features[labels[:, 0] == 1]
    double_features = pretrained_features[close_indices]

    # One KS test per feature column, spread over all available cores
    n = pretrained_features.shape[1]
    p_values = Parallel(n_jobs=-1, verbose=2)(delayed(get_pval)(i) for i in range(n))
    p_output[net] = p_values

    #plt.close()
    #plt.plot(range(len(p_values)), p_values, label=net)
    #plt.legend()
    #plt.savefig(net + "_close" + "-p-vals.png")

    # Delete all arrays allocated in this iteration to save memory in the notebook
    del pretrained_features
    del single_features
    del double_features
    



Running for: DenseNet121


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 512 out of 512 | elapsed:    2.9s finished


Running for: DenseNet169


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 512 out of 512 | elapsed:    1.8s finished


Running for: DenseNet201


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 512 out of 512 | elapsed:    2.1s finished


Running for: InceptionResNetV2


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    1.4s finished


Running for: InceptionV3


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    1.3s finished


Running for: MobileNet


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 423 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 1235 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 2048 out of 2048 | elapsed:   10.3s finished


Running for: MobileNetV2


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 384 out of 384 | elapsed:    1.4s finished


Running for: NASNetLarge


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 423 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 1235 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 2043 out of 2058 | elapsed:   10.5s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 2058 out of 2058 | elapsed:   10.6s finished


Running for: NASNetMobile


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 539 out of 539 | elapsed:    1.8s finished


Running for: ResNet50


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:   28.3s
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed:   38.3s
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:   50.1s
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 4096 out of 4096 | elapsed:  1.3min finished


Running for: VGG16


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 512 out of 512 | elapsed:    2.2s finished


Running for: VGG19


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 512 out of 512 | elapsed:    1.8s finished


Running for: Xception


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:   18.2s
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:   26.6s
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed:   36.8s
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:   47.9s
[Parallel(n_jobs=-1)]: Done 3200 out of 3200 | elapsed:   59.2s finished


In [18]:
# For each network, compute the fraction of features whose KS p-value lies
# below 0.01, 0.005 and 0.001, to judge whether rejecting the null hypothesis
# (no difference between the distributions) is reasonable.
ks_statistics = []
for net_name, net_pvalues in p_output.items():
    p_arr = np.array(net_pvalues)
    total = len(p_arr)
    # Row layout: [network, number of features, ratio per threshold]
    row = [net_name, total]
    for threshold in (0.01, 0.005, 0.001):
        row.append(int(np.count_nonzero(p_arr < threshold)) / total)
    ks_statistics.append(row)

In [19]:
# Report how many close double events were compared, then render the
# per-network summary rows as a LaTeX table
print(len(close_indices))
headers = [
    "Network",
    "num_features",
    "ratio p < 0.01",
    "ratio p < 0.005",
    "ratio p < 0.001",
]
latex_table = tabulate(ks_statistics, headers=headers, tablefmt="latex")
print(latex_table)

11006
\begin{tabular}{lrrrr}
\hline
 Network           &   num\_features &   ratio p \ensuremath{<} 0.01 &   ratio p \ensuremath{<} 0.005 &   ratio p \ensuremath{<} 0.001 \\
\hline
 DenseNet121       &            512 &         1        &          1        &          1        \\
 DenseNet169       &            512 &         0.998047 &          0.998047 &          0.998047 \\
 DenseNet201       &            512 &         1        &          1        &          1        \\
 InceptionResNetV2 &            320 &         0.96875  &          0.96875  &          0.965625 \\
 InceptionV3       &            320 &         0.971875 &          0.96875  &          0.9625   \\
 MobileNet         &           2048 &         0.22168  &          0.221191 &          0.21875  \\
 MobileNetV2       &            384 &         1        &          1        &          1        \\
 NASNetLarge       &           2058 &         0.480564 &          0.474247 &          0.456754 \\
 NASNetMobile      &            539

In [20]:
# Number of single, double and close double events, one count per line
for index_set in (single_indices, double_indices, close_indices):
    print(len(index_set))

100000
100000
11006


## (Optional) Plot features for some samples

In [None]:
# Optional visual sanity check of the extracted features; disabled by default.
# NOTE(review): `reference_features` and `ref_vgg_features` are not defined
# anywhere in the visible notebook, and the KS cells above `del single_features`
# after their loops -- enabling this flag as-is looks like it would raise a
# NameError. Define/reload those arrays first; TODO confirm intended source.
manual_inspect = False
if manual_inspect:
    # Compare feature output for reference image with a single and double image
    # (plots the full feature vector of the first row of each array)
    plt.plot(range(len(reference_features[0])), reference_features[0], alpha=0.5, label='reference')
    plt.plot(range(len(single_features[0])), single_features[0], alpha=0.5, label='single')
    plt.plot(range(len(double_features[0])), double_features[0], alpha=0.5, label='double')
    plt.legend()
    plt.show()
    
    # Check distribution of features by inspection
    # `index` is the first feature column shown; the 3x3 grid covers
    # columns index .. index + 8, one histogram panel per column.
    index = 0 
    fig, ax = plt.subplots(3, 3, sharex='col', sharey='row', figsize=(12,12))
    for i in range(3):
        for j in range(3):
            # plot features
            ax[i, j].hist(single_features[:,index + i*3 + j], alpha=0.5, label='single')
            ax[i, j].hist(double_features[:,index + i*3 + j], alpha=0.5, label='double')
            # NOTE(review): the same `ref_vgg_features` data is drawn in every
            # panel, unlike the per-column single/double histograms -- confirm intent.
            ax[i, j].hist(ref_vgg_features, alpha=0.5, label='reference')
            ax[i, j].legend()
    plt.show()