# Sparse Submanifold Autoencoders

### Run this notebook from a directory that contains the `dlp_opendata_api` folder

In [1]:
# Import Dependencies

import numpy as np

import sys
sys.path.append("dlp_opendata_api")
from osf.image_api import image_reader_3d
from osf.particle_api import *
from osf.cluster_api import *

from torch.utils.data import Dataset, DataLoader

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import sparseconvnet as scn
import glob
import os.path as osp
import numpy as np

Welcome to JupyROOT 6.14/04


In [2]:
# Record once whether a CUDA device is usable; later cells branch on this flag.
use_cuda = torch.cuda.is_available()
# Device handle for the first GPU.
cuda0 = torch.device('cuda:0')
#ls /gpfs/slac/staas/fs1/g/neutrino/kterao/data/dlprod_ppn_v10

### Check if CUDA is working (GPU)

In [3]:
# Show the CUDA availability flag computed in the setup cell above.
print(use_cuda)

True


In [4]:
class ClusteringAEData(Dataset):
    """
    Dataset of sparse 3D images for clustering / autoencoder experiments.

    The images are read through `image_reader_3d` from a set of ROOT files.
    Each item is ``((voxel, energy), label)`` where `voxel` is a LongTensor,
    `energy` is a FloatTensor reshaped to a single column, and `label` is
    returned exactly as the reader provides it.
    """
    def __init__(self, root, numPixels=192, filenames=None):
        """
        Initialize Clustering Dataset

        Inputs:
            - root: root directory of dataset. It is searched for '*.root'
              files only when `filenames` is not given.
            - numPixels: image-size tag. When searching `root`, only files
              whose file name contains str(numPixels) are kept.
            - filenames: optional iterable of file paths to read. When given
              (and non-empty) it is used as-is and `root` is not searched.
        """
        self.root = root
        self.numPixels = str(numPixels)

        if filenames:
            candidates = filenames
        else:
            # Match the tag against the file name only, so that a directory
            # component that happens to contain the tag does not let every
            # file through the filter.
            candidates = [f for f in glob.glob(osp.join(root, '*.root'))
                          if self.numPixels in osp.basename(f)]
        # sorted() builds a new list: the caller's `filenames` is neither
        # aliased nor reordered.
        self.filenames = sorted(candidates)
        self.ireader = image_reader_3d(*self.filenames)
        self.len = self.ireader.entry_count()

    def __getitem__(self, index):
        """
        Get a sample from dataset.

        Returns ``((voxel, energy), label)`` for entry `index` of the reader.
        """
        voxel, energy, label = self.ireader.get_image(index)
        entry = (torch.LongTensor(voxel), torch.FloatTensor(energy).view(-1, 1))
        return entry, label

    def __len__(self):
        """
        Total number of samples in dataset.
        """
        return self.len

In [5]:
def ae_collate(batch):
    """
    Collate function for the autoencoder loaders.

    Splits a list of ``(data, target)`` samples into two parallel lists
    without stacking anything, and returns ``[data_list, target_list]``.
    """
    data, target = [], []
    for sample in batch:
        data.append(sample[0])
        target.append(sample[1])
    return [data, target]

### Get Train, Dev, and Test Set

In [6]:
root = '/gpfs/slac/staas/fs1/g/neutrino/kterao/data/dlprod_ppn_v10' #replace with your own path to root folder. 
# Files 00-07 are used for training, 08 for validation and 09 for testing.
trainset = [root + '/dlprod_192px_0{}.root'.format(i) for i in range(0, 8)]
devset = [root + '/dlprod_192px_0{}.root'.format(8)]
testset = [root + '/dlprod_192px_0{}.root'.format(9)]
for split_files in (trainset, devset, testset):
    print(split_files)
# From here on the three names refer to Dataset objects, not to path lists.
trainset, devset, testset = [
    ClusteringAEData(root, 192, filenames=split_files)
    for split_files in (trainset, devset, testset)
]
for message, split in (
        ('Number of entries in training set: {}', trainset),
        ('Number of entries in validation set: {}', devset),
        ('Number of entries in test set: {}', testset)):
    print(message.format(len(split)))

['/gpfs/slac/staas/fs1/g/neutrino/kterao/data/dlprod_ppn_v10/dlprod_192px_00.root', '/gpfs/slac/staas/fs1/g/neutrino/kterao/data/dlprod_ppn_v10/dlprod_192px_01.root', '/gpfs/slac/staas/fs1/g/neutrino/kterao/data/dlprod_ppn_v10/dlprod_192px_02.root', '/gpfs/slac/staas/fs1/g/neutrino/kterao/data/dlprod_ppn_v10/dlprod_192px_03.root', '/gpfs/slac/staas/fs1/g/neutrino/kterao/data/dlprod_ppn_v10/dlprod_192px_04.root', '/gpfs/slac/staas/fs1/g/neutrino/kterao/data/dlprod_ppn_v10/dlprod_192px_05.root', '/gpfs/slac/staas/fs1/g/neutrino/kterao/data/dlprod_ppn_v10/dlprod_192px_06.root', '/gpfs/slac/staas/fs1/g/neutrino/kterao/data/dlprod_ppn_v10/dlprod_192px_07.root']
['/gpfs/slac/staas/fs1/g/neutrino/kterao/data/dlprod_ppn_v10/dlprod_192px_08.root']
['/gpfs/slac/staas/fs1/g/neutrino/kterao/data/dlprod_ppn_v10/dlprod_192px_09.root']
Number of entries in training set: 80000
Number of entries in validation set: 10000
Number of entries in test set: 10000


In [7]:
# Both loaders use ae_collate, so a batch arrives as [list of (voxel, energy), list of labels].
trainloader = DataLoader(
    trainset,
    batch_size=32,
    shuffle=True,
    collate_fn=ae_collate,
    num_workers=1,
    pin_memory=False,
)
devloader = DataLoader(
    devset,
    batch_size=64,
    shuffle=True,
    collate_fn=ae_collate,
    num_workers=1,
    pin_memory=False,
)

In [17]:
# A dataset item is ((coordinates, values), labels); unpack the first one in a single read.
(x, y), labels = trainset[0]
print("X.shape = {}".format(x.shape))
print("y.shape = {}".format(y.shape))
print("labels.shape = {}".format(labels.shape))
# Distinct label values present in this sample.
print(np.unique(labels))

X.shape = torch.Size([3827, 3])
y.shape = torch.Size([3827, 1])
labels.shape = (3827,)
[0. 2.]


Here, each row of `x` (the coordinates, printed as `X`) holds the x, y, z coordinates of one active pixel of the 3D tensor, and the matching row of `y` holds the value of the pixel at those coordinates. For example, at (97, 42, 5) we have pixel value 0.0156. This format is called a sparse representation, since it lists only the active pixel sites of a large sparse tensor. In this example there are only two clusters: `np.unique(labels)` gives 0 and 2, which are the integer labels of the clusters. There is one label per pixel site.

## TODO: 
### 1. Make 3D Visualization for Sparse Matrices
### 2. Run DBSCAN on each cluster example and see how it performs. 

### Network Architectures for Autoencoders

In [None]:
class SparseUResNet(torch.nn.Module):
    """
    Sparse U-ResNet: an scn UNet with residual blocks followed by a
    per-site linear layer that maps `nFeatures` channels to `nClasses`.
    """
    def __init__(self, dimension=3, size=192, nFeatures=16, depth=5, nClasses=1):
        super(SparseUResNet, self).__init__()
        self.dimension, self.size = dimension, size
        self.nFeatures, self.depth, self.nClasses = nFeatures, depth, nClasses

        n_input_features = 1
        block_reps = 2  # Conv block repetition factor
        down_kernel, down_stride = 2, 2  # downsample = [filter size, filter stride]
        # UNet number of features per level: 1x, 2x, ..., depth x nFeatures
        planes_per_level = [level * nFeatures for level in range(1, depth + 1)]

        # From Submanifold Sparse Convnet Github Repo by Benjamin Graham.
        stages = (
            scn.InputLayer(dimension, size, mode=3),
            # Kernel size 3, no bias
            scn.SubmanifoldConvolution(dimension, n_input_features, nFeatures, 3, False),
            scn.UNet(dimension, block_reps, planes_per_level, residual_blocks=True,
                     downsample=[down_kernel, down_stride]),
            scn.BatchNormReLU(nFeatures),
            scn.OutputLayer(dimension),
        )
        self.sparseModel = scn.Sequential()
        for stage in stages:
            self.sparseModel.add(stage)
        self.linear = torch.nn.Linear(nFeatures, nClasses)

    def forward(self, x):
        """
        x is scn coordinate, feature input

        Runs the sparse model, then applies the linear layer to its output.
        """
        return self.linear(self.sparseModel(x))

In [None]:
class LArCVEncoder(nn.Module):
    """
    Sparse convolutional encoder.

    The network is an input layer, then `depth` + 1 stages of
    (convolution -> batch-norm/leaky-ReLU -> stride-2 convolution), then a
    final convolution. The channel count starts at `nFeatures` and doubles
    at every stage after the first.

    Inputs:
        - dim: number of spatial dimensions.
        - size: spatial size passed to the scn input layer.
        - nFeatures: channel count produced by the first convolution.
        - depth: number of channel-doubling stages after the first stage.
        - nClasses: accepted for signature compatibility; not used here.
        - leakiness: accepted for signature compatibility; not used here.
          NOTE(review): the activation layers are built with scn's own
          default leakiness. Decide whether this argument should be
          forwarded to them.
        - verbose: when True, print the intermediate tensor after every
          layer of every stage (debugging aid).
    """
    def __init__(self, dim=3, size=192, nFeatures=16, depth=5, nClasses=1,
                 leakiness=0, verbose=False):
        super(LArCVEncoder, self).__init__()
        self.verbose = verbose

        nIn = nFeatures
        nOut = nIn * 2
        # nn.ModuleList (not a plain list) so that every layer's parameters
        # are registered: they then show up in parameters()/state_dict() and
        # follow the module through .cuda()/.to().
        self.convLayers = nn.ModuleList()
        self.convPoolLayers = nn.ModuleList()
        self.bnReLUs = nn.ModuleList()

        self.inputLayer = scn.InputLayer(dim, size, mode=3)
        # First stage: 1 input channel -> nFeatures channels.
        self.convLayers.append(scn.Convolution(dim, 1, nIn, 3, 1, False))
        self.bnReLUs.append(scn.BatchNormLeakyReLU(nIn))
        self.convPoolLayers.append(scn.Convolution(dim, nIn, nIn, 2, 2, False))
        for _ in range(depth):
            self.convLayers.append(
                scn.SubmanifoldConvolution(dim, nIn, nOut, 3, False))
            self.bnReLUs.append(scn.BatchNormLeakyReLU(nOut))
            self.convPoolLayers.append(
                scn.Convolution(dim, nOut, nOut, 2, 2, False))
            nIn = nOut
            nOut = nIn * 2
        # Final convolution; it has no matching activation/downsampling layer.
        self.convLayers.append(scn.Convolution(dim, nIn, nOut, 3, 1, False))

    def _trace(self, x):
        """Print an intermediate tensor when verbose mode is on."""
        if self.verbose:
            print('-' * 20)
            print(x)

    def forward(self, x):
        """
        x is scn coordinate, feature input
        """
        x = self.inputLayer(x)
        # zip stops at the shortest list, so the last entry of convLayers is
        # left for the final convolution below.
        for conv, bn_relu, pool in zip(self.convLayers, self.bnReLUs,
                                       self.convPoolLayers):
            x = conv(x)
            self._trace(x)
            x = bn_relu(x)
            self._trace(x)
            x = pool(x)
            self._trace(x)
        x = self.convLayers[-1](x)
        return x

In [None]:
# Smoke test: build the encoder with its default arguments and run one input through it.
# NOTE(review): `data` is only assigned in a later cell (`data = (coords, values)`),
# so this cell fails when the notebook is run top to bottom on a fresh kernel.
model = LArCVEncoder()
model(data)

In [None]:
def train(model, loader, optimizer, epochs, loss_fn=nn.MSELoss(), use_cuda=False, log=2):
    """
    Function for training SparseUResNet.

    The model is trained as an autoencoder: the target of every forward
    pass is the tensor of input values itself.

    Inputs:
        - model: module called as ``model((coords, values))``.
        - loader: iterable yielding ``[data, label]`` batches, where `data`
          is a list of ``(voxel, energy)`` tensor pairs (one per sample) and
          `label` is a list with one entry per sample. It must also support
          ``len(loader)`` and expose ``loader.dataset`` for progress output.
        - optimizer: optimizer over the model parameters.
        - epochs: number of passes over `loader`.
        - loss_fn: loss called as ``loss_fn(output, values)``.
        - use_cuda: if True, move the batch tensors to the GPU.
        - log: print progress every `log` iterations.
    """
    model.train()
    iteration = 0
    for ep in range(epochs):
        for batch_idx, (data, label) in enumerate(loader):
            optimizer.zero_grad()
            # Append a batch-index column to each sample's coordinates. The
            # index is the sample's position inside this minibatch (0, 1, ...),
            # not the minibatch counter, so that the samples stay separate
            # once they are concatenated.
            coords = torch.cat([
                torch.cat([voxel,
                           torch.ones((voxel.shape[0], 1), dtype=torch.long) * sample_idx],
                          dim=1)
                for sample_idx, (voxel, _) in enumerate(data)
            ], dim=0)
            values = torch.cat([energy for _, energy in data], dim=0)
            if use_cuda:
                coords, values = coords.cuda(), values.cuda()
            out = model((coords, values))
            # Reconstruction loss against the input values.
            loss = loss_fn(out, values)
            loss.backward()
            optimizer.step()

            if iteration % log == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    ep, batch_idx * len(label), len(loader.dataset),
                    100. * batch_idx / len(loader), loss.item()))
            iteration += 1


In [None]:
# Hyper-parameters. Only initial_lr, momentum, weight_decay and use_cuda are read in this cell.
p = {
    'n_epochs': 100,
    'initial_lr': 1e-1,
    'lr_decay': 4e-2,
    'weight_decay': 1e-4,
    'momentum': 0.9,
    'check_point': False,
    'use_cuda': torch.cuda.is_available(),
}

model = SparseUResNet()
criterion = nn.MSELoss()

# Move model and loss to the GPU before the optimizer collects the parameters.
if p['use_cuda']:
    model.cuda()
    criterion.cuda()

optimizer = optim.SGD(
    model.parameters(),
    lr=p['initial_lr'],
    momentum=p['momentum'],
    weight_decay=p['weight_decay'],
    nesterov=True,
)

# Single-epoch run, logging every second iteration.
train(model, trainloader, optimizer, 1, loss_fn=criterion, use_cuda=use_cuda, log=2)

In [None]:
# Switch whichever `model` is currently bound to evaluation mode.
model.eval()

In [None]:
# Read entry 0 directly from an image reader.
# NOTE(review): no notebook-level `ireader` is defined in the visible cells (the only
# one shown is the `ireader` attribute of ClusteringAEData) — presumably leftover kernel state.
voxels, energy, labels = ireader.get_image(0)

In [None]:
# Scratch: a LongTensor column with one row per row of t[0], filled with 0.
# NOTE(review): `t` is not defined at notebook level in the visible cells.
torch.ones(t[0].shape[0], 1, dtype=torch.long) * 0

In [None]:
# Build one sparse batch by hand from `images`, a sequence of (coordinates, values) pairs.
# NOTE(review): `images` is not defined in the visible cells — confirm where it comes from.
images
# Append each image's position `i` in the sequence as an extra coordinate column.
coords = [torch.cat([t[0], torch.ones(t[0].shape[0], 1, dtype=torch.long) * i], dim=1) for i, t in enumerate(images)]
coords = torch.cat(coords, dim=0)
print(coords, coords.shape)
# Concatenate the per-image values in the same order as the coordinates.
values = torch.cat([t[1] for t in images], dim=0)
print(values, values.shape)

In [None]:
# Move the hand-built batch to the GPU and pack it as the (coords, values) pair passed to the model.
coords, values = coords.cuda(), values.cuda()
data = (coords, values)

In [None]:
# Forward pass of the current `model` on the hand-built batch.
out = model(data)

In [None]:
# Difference between the model output and the input values (data[1]).
d = out - data[1]

In [None]:
# Mean-squared error between the model output and the input values.
criterion = nn.MSELoss()
criterion.forward(out, data[1])

In [None]:
# Scratch: a column of batch index 1 with one row per row of `voxels`.
batch_id = torch.ones((voxels.shape[0], 1), dtype=torch.long) * 1
print(batch_id)
# NOTE(review): this rebinds `data` from the (coords, values) pair to the coordinates
# alone; `y` is presumably the value tensor from the sample-inspection cell — confirm.
data = coords
x = (data, y)
print(x)

In [None]:
# Fresh model plus a loader over every matching file found under `root`.
model = SparseUResNet(3, 192, 16, 5)
root = '/gpfs/slac/staas/fs1/g/neutrino/kterao/data/dlprod_ppn_v10'
dataset = ClusteringAEData(root, 192)
loader = DataLoader(
    dataset,
    batch_size=64,
    shuffle=True,
    collate_fn=ae_collate,
    num_workers=1,
)

criterion = nn.MSELoss()
# Hyper-parameters. Only initial_lr, momentum, weight_decay and use_cuda are read in this cell.
p = {
    'n_epochs': 100,
    'initial_lr': 1e-1,
    'lr_decay': 4e-2,
    'weight_decay': 1e-4,
    'momentum': 0.9,
    'check_point': False,
    'use_cuda': torch.cuda.is_available(),
}

# Move model and loss to the GPU before the optimizer collects the parameters.
if p['use_cuda']:
    model.cuda()
    criterion.cuda()

optimizer = optim.SGD(
    model.parameters(),
    lr=p['initial_lr'],
    momentum=p['momentum'],
    weight_decay=p['weight_decay'],
    nesterov=True,
)

# Tensor type names matching the selected device.
if p['use_cuda']:
    dtype = 'torch.cuda.FloatTensor'
    dtypei = 'torch.cuda.LongTensor'
else:
    dtype = 'torch.FloatTensor'
    dtypei = 'torch.LongTensor'