In [None]:
import os
import typing

import numpy as np
import torch
from matplotlib import pyplot as plt
from torch import optim
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm

import anomalydetector
from anomalydetector.models.VAE import NFVAE, VAE
from anomalydetector.models.utils import get_latent_dists, summarise_model
from anomalydetector.models.NICE import NICEModel
from anomalydetector.processing import InMemoryND280EventDataset, nd280EventDataset
from anomalydetector.plotting import make_corner_plot

# Query CUDA availability once, report it, and derive the torch device from
# the same answer so the message and the selected device cannot disagree.
cuda_available = torch.cuda.is_available()
print("Found cuda device, will use GPU" if cuda_available else "No GPU :(")

device = torch.device('cuda' if cuda_available else 'cpu')

## Setting Up the Training Data

Here we set up two datasets. The first is our training data, for which we take non-SRC events that have a reconstructed proton.
The second is a dummy out-of-distribution (OOD) test dataset, for which we take true SRC events that have a reconstructed proton.

In [None]:
# Everything except the event filter is shared between the in-distribution
# (training) and out-of-distribution datasets. Defining it once guarantees
# both datasets are read from the same files with the same preprocessing.
FDS_INPUT_DIR = '/vols/t2k/nd280-OA2022/FDS-inputs'
FDS_RUNS = [
    'run2air', 'run2water',
    'run3air',
    'run4air', 'run4water',
    'run5water',
    'run6air',
    'run7water',
    'run8air', 'run8water',
    'run9water',
]
FDS_FILENAMES = [f'{FDS_INPUT_DIR}/FDS_{run}_v1.root' for run in FDS_RUNS]

# Branches read for every event: muon momentum + direction, then proton
# momentum + direction.
BRANCHES = [
    "Pmu", "RecoLepDirX", "RecoLepDirY", "RecoLepDirZ",
    "RecoProtonMom", "RecoProtonDirX", "RecoProtonDirY", "RecoProtonDirZ",
]

# Sentinel value marking a masked branch entry; the same value appears in
# the event filters below.
MASK_SENTINEL = -999.0


def make_event_dataset(event_filter: str) -> InMemoryND280EventDataset:
    """Build an event dataset that differs from its siblings only by its filter.

    Fresh list / array objects are created on every call so the returned
    datasets never share mutable configuration with each other.

    Args:
        event_filter: Selection expression forwarded as the dataset's
            ``filter`` argument.

    Returns:
        The constructed (not yet processed) dataset.
    """
    return InMemoryND280EventDataset(
        filenames=list(FDS_FILENAMES),
        branches=list(BRANCHES),
        # Per-branch multiplicative scaling: the two momentum branches are
        # scaled by 0.2e-3, the direction components are left unchanged.
        branch_scaling=np.array([0.2e-3, 1.0, 1.0, 1.0, 0.2e-3, 1.0, 1.0, 1.0], dtype=np.float32),
        # Per-branch mask value and the value substituted for it.
        branch_mask_vals=np.full(len(BRANCHES), MASK_SENTINEL),
        branch_mask_replace_vals=np.array([0.0, -2.0, -2.0, -2.0, 0.0, -2.0, -2.0, -2.0], dtype=np.float32),
        filter=event_filter,
    )


# Training (in-distribution) sample: non-SRC events with a reconstructed proton.
# (Alternative selection that was tried: "q0<2000.0")
train_ds = make_event_dataset("(isSRC!=1) & (RecoProtonMom!=-999.0)")

# Out-of-distribution sample: true SRC events with a reconstructed proton.
# (Alternative selection that was tried: "q0>= 2000.0")
ood_ds = make_event_dataset("(isSRC==1) & (RecoProtonMom!=-999.0)")

train_ds.dump_branches()

train_ds.process()
ood_ds.process()

Let's print some information about our datasets and plot the distributions of the input features.

In [None]:
# Report how many events ended up in each dataset.
n_train_events = len(train_ds)
n_ood_events = len(ood_ds)
print(f'Number of training examples: {n_train_events}')
print(f'Number of OOD examples:      {n_ood_events}')

# One (title, axis range) pair per input feature, in branch order.
feature_axes = [
    ("p mu",        (-0.2, 1.5)),
    ("lep dir [x]", (-1.1, 1.1)),
    ("lep dir [y]", (-1.1, 1.1)),
    ("lep dir [z]", (-1.1, 1.1)),
    ("p p",         (-0.2, 1.5)),
    ("p dir [x]",   (-1.1, 1.1)),
    ("p dir[y]",    (-1.1, 1.1)),
    ("p dir[z]",    (0.0, 1.1)),
]
feature_titles = [title for title, _axis_range in feature_axes]
feature_ranges = [axis_range for _title, axis_range in feature_axes]

# Overlay the training and OOD input distributions in a single corner plot.
input_corner_fig = make_corner_plot(
    train_ds.get_data().numpy(),
    ood_ds.get_data().numpy(),
    ranges=feature_ranges,
    titles=feature_titles,
)
input_corner_fig.show()

## Set up the Model

Now we create our model. Feel free to comment / uncomment and play around with parameters.

In [None]:
# Number of input features per event, as reported by the training dataset.
n_features = train_ds.get_n_features()

# ---------------------------------------------------------------------------
# Alternative architectures -- uncomment one of these (and comment out the
# NICE model below) to experiment.
# ---------------------------------------------------------------------------

# model = VAE(6, 2) 

# model = NFVAE(
#     n_bottleneck = 2,
#     hidden_units_encoder = [n_features, 8, 4, n_bottleneck*2],
#     hidden_units_decoder = [n_bottleneck, 4, 8, n_features],
#     n_flows = 0,
#     flow_type = "Planar",
#     device = device,
# )

# ---------------------------------------------------------------------------
# Default: NICE model
# ---------------------------------------------------------------------------
nice_config = dict(
    n_features=n_features,
    n_flows=5,
    n_hidden=[64, 128, 256, 128, 64],
)
model = NICEModel(**nice_config)

# Move to the selected device, compile, and switch to training mode, in that
# order, before printing a summary of the model.
model.to(device)
model.compile()
model.train()

summarise_model(model)

## Train the Model

In [None]:
n_epochs = 100

BATCH_SIZE = 100000                # events per batch, shared by both loaders
PLOT_DIR = "plots"                 # per-epoch latent-space plots are written here
LATENT_PLOT_RANGE = (-1.5, 1.5)    # axis range used for every latent dimension
N_LATENT_PLOT_DIMS = 8             # number of axis ranges passed to the corner plot

optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)

## make loaders for the two datasets
train_loader: DataLoader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
ood_loader: DataLoader   = DataLoader(ood_ds,   batch_size=BATCH_SIZE, shuffle=True)

# savefig() does not create missing directories; without this the first epoch
# fails with FileNotFoundError on a fresh checkout, after a full pass of training.
os.makedirs(PLOT_DIR, exist_ok=True)

epoch_progressbar = tqdm(range(n_epochs), total=n_epochs, desc="epoch")
for epoch_n in epoch_progressbar:

    epoch_loss = 0.0
    n_batches  = 0

    batch_progressbar = tqdm(train_loader, total=len(train_loader), leave=False)
    # Each batch is a pair; only the first element is used for training.
    for x, _unused in batch_progressbar:

        x = x.to(device)
        optimizer.zero_grad()
        # NOTE(review): train_batch presumably runs the forward and backward
        # pass, since this loop only steps the optimizer -- confirm.
        loss = model.train_batch(x)
        optimizer.step()

        # Read the scalar once and reuse it for the progress bar and the sum.
        batch_loss = loss.item()
        batch_progressbar.set_description(f"loss: {batch_loss:.4f}")

        # update running loss sum (turned into a mean below)
        epoch_loss += batch_loss
        n_batches += 1

    # make plot of the latent space
    # NOTE(review): the model is still in train() mode here; confirm that
    # get_latent_dists handles eval mode / gradient tracking itself.
    id_encoded,  id_llh  = get_latent_dists(model, train_loader, device, quiet=True)
    ood_encoded, ood_llh = get_latent_dists(model, ood_loader,   device, quiet=True)

    fig = make_corner_plot(
        id_encoded,
        ood_encoded,
        ranges=[LATENT_PLOT_RANGE] * N_LATENT_PLOT_DIMS,
        titles=None,
        n_bins=40,
    )

    fig.savefig(os.path.join(PLOT_DIR, f"latent_dist-epoch-{epoch_n:04}.png"))
    # Release the figure explicitly: one is created every epoch.
    fig.clear()
    plt.close(fig)

    # Mean loss over the epoch; guard against an empty loader.
    epoch_loss /= max(n_batches, 1)
    epoch_progressbar.set_description(f"epoch {epoch_n} loss: {epoch_loss:.4f}")

Now let's save the model so we can use it later or pick up the training where we left off.

In [None]:
# Store only the model's state_dict under a name that encodes the architecture.
checkpoint_path = "NICE_src_detector_5-layer-64_128_256_128_64.pt"
torch.save(model.state_dict(), checkpoint_path)

Now let's see what the latent distributions and likelihoods look like post-training.

In [None]:

id_encoded,  id_llh  = get_latent_dists(model, train_loader, device)
ood_encoded, ood_llh = get_latent_dists(model, ood_loader,   device)


def plot_llh_hist(id_values, ood_values, *, bins, title, density=False):
    """Overlay the ID and OOD log-likelihood histograms on a log-scaled y axis.

    Args:
        id_values: Log-likelihood values of the in-distribution events.
        ood_values: Log-likelihood values of the out-of-distribution events.
        bins: Number of histogram bins.
        title: Plot title.
        density: If True, normalise each histogram so the two shapes can be
            compared independently of the sample sizes.
    """
    fig, ax = plt.subplots()
    ax.hist(
        [ood_values, id_values],
        color=["tab:orange", "tab:green"],
        bins=bins,
        range=(-50, 5),
        histtype="step",
        label=["OOD", "ID"],
        density=density,
        fill=False,
    )
    ax.set_title(title)
    ax.set_xlabel("LLH")
    ax.set_yscale('log')
    ax.legend()
    plt.show()


# Raw counts, then the same comparison with each histogram normalised.
plot_llh_hist(id_llh[:, 0], ood_llh[:, 0], bins=100, title="ID vs OOD LLH Distribution")
plot_llh_hist(
    id_llh[:, 0], ood_llh[:, 0],
    bins=50, density=True, title="ID vs OOD LLH Distribution - Normalised",
)

# NOTE(review): this threshold equals the upper edge of the LLH range plotted
# above, so `< LLH_CUT` likely selects nearly every OOD event rather than only
# the far outliers -- confirm the intended value / sign of the cut.
LLH_CUT = 5.0
far_outliers = ood_llh[:, 0] < LLH_CUT

make_corner_plot(id_encoded, ood_encoded,
    ranges = [(-1.5, 1.5), (-1.5,1.5), (-1.5, 1.5), (-1.5,1.5), (-1.5, 1.5), (-1.5,1.5), (-1.5, 1.5), (-1.5,1.5)],
    titles = None,
    n_bins = 40
).show()
