# Sparse Submanifold Autoencoders

### Run this notebook inside a directory that contains the dlp_opendata_api folder

In [2]:
# Import Dependencies

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
#%matplotlib notebook

import sys
sys.path.append("../new_notebooks/ipynb/dlp_opendata_api")
sys.path.append("../new_notebooks/ipynb")
from osf.image_api import image_reader_3d
from osf.particle_api import *
from osf.cluster_api import *

from torch.utils.data import Dataset, DataLoader

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import sparseconvnet as scn
import glob
import os.path as osp
import numpy as np

import os
os.environ["CUDA_VISIBLE_DEVICES"]="2"

Welcome to JupyROOT 6.14/04


In [None]:
#ls /gpfs/slac/staas/fs1/g/neutrino/kterao/data/dlprod_ppn_v10
# True when PyTorch reports a usable CUDA device; the training loop reads this
# flag to decide whether input tensors are moved to the GPU.
use_cuda = torch.cuda.is_available()

### Check if CUDA is working (GPU)

In [1]:
# NOTE(review): the NameError recorded below suggests this cell was executed
# before the cell that assigns use_cuda; run that cell first.
print(use_cuda)

NameError: name 'use_cuda' is not defined

In [None]:
class ClusteringAEData(Dataset):
    """
    Dataset that pairs per-voxel cluster labels with per-voxel energies.

    Items are read on demand from two readers: a ``cluster_reader`` built from
    the cluster files and an ``image_reader_3d`` built from the energy files.
    Both readers must report the same number of entries.
    """
    def __init__(self, root, numPixels=192, filenames=None):
        """
        Initialize Clustering Dataset

        Inputs:
            - root: root directory of dataset (stored on the instance only;
              the files themselves are taken from ``filenames``).
            - numPixels: image size; stored as a string in ``self.numPixels``.
            - filenames: optional pair ``[energy_filenames, cluster_filenames]``,
              each an iterable of file paths.
        """
        self.root = root
        self.numPixels = str(numPixels)

        if filenames:
            # sorted() returns new lists, so the caller's lists are never
            # reordered in place (the previous in-place .sort() mutated them).
            # NOTE(review): the two lists are sorted independently; this assumes
            # sorted order keeps energy and cluster files aligned -- confirm.
            self.energy_filenames = sorted(filenames[0])
            self.cluster_filenames = sorted(filenames[1])
            print(self.energy_filenames)
        else:
            self.energy_filenames = []
            self.cluster_filenames = []

        self.cluster_reader = cluster_reader(*self.cluster_filenames)
        self.energy_reader = image_reader_3d(*self.energy_filenames)
        self.len = self.energy_reader.entry_count()
        assert self.len == self.cluster_reader.entry_count(), \
            "energy and cluster readers report different entry counts"

    def __getitem__(self, index):
        """
        Get a sample from dataset.

        Returns ``((voxel, energy), label)`` as torch tensors. ``energy`` and
        ``label`` each get an extra axis inserted at dim 1, and ``label`` is
        cast to ``torch.LongTensor``.
        """
        voxel, label = self.cluster_reader.get_image(index)
        # Only the middle element of the energy reader's triple is used here.
        _, energy, _ = self.energy_reader.get_image(index)
        voxel, label = torch.from_numpy(voxel), torch.from_numpy(label)
        energy = torch.from_numpy(energy)
        energy = torch.unsqueeze(energy, dim=1)
        label = torch.unsqueeze(label, dim=1).type(torch.LongTensor)
        return (voxel, energy), label

    def __len__(self):
        """
        Total number of samples in dataset.
        """
        return self.len

In [None]:
def ae_collate(batch):
    """
    Collate function for the autoencoder loaders.

    Splits a batch of ``(data, target)`` samples into two parallel lists
    without stacking anything, and returns ``[data_list, target_list]``.
    """
    inputs, targets = [], []
    for sample in batch:
        inputs.append(sample[0])
        targets.append(sample[1])
    return [inputs, targets]

### Get Train, Dev, and Test Set

In [None]:
# Replace root with your own path to the data folder.
root = '/gpfs/slac/staas/fs1/g/neutrino/kterao/data/dlprod_ppn_v10'
cluster_pattern = root + '/cluster/dlprod_cluster_192px_0{}.root'
energy_pattern = root + '/dlprod_192px_0{}.root'

# Files 0-7 are used for training and file 8 for validation.
# File 9 is kept aside for a test set, which is currently disabled:
#testset_cluster = [cluster_pattern.format(9)]
#testset_energy = [energy_pattern.format(9)]
train_ids = range(8)
dev_ids = [8]

trainset_cluster = [cluster_pattern.format(k) for k in train_ids]
devset_cluster = [cluster_pattern.format(k) for k in dev_ids]
trainset_energy = [energy_pattern.format(k) for k in train_ids]
devset_energy = [energy_pattern.format(k) for k in dev_ids]

# Print each cluster file next to the energy file it is paired with.
for cluster_file, energy_file in zip(trainset_cluster, trainset_energy):
    print(cluster_file)
    print(energy_file)

for cluster_file, energy_file in zip(devset_cluster, devset_energy):
    print(cluster_file)
    print(energy_file)

trainset = ClusteringAEData(root, 192, filenames=[trainset_energy, trainset_cluster])
devset = ClusteringAEData(root, 192, filenames=[devset_energy, devset_cluster])
#testset = ClusteringAEData(root, 192, filenames=[testset_energy, testset_cluster])
print('Number of entries in training set: {}'.format(len(trainset)))
print('Number of entries in validation set: {}'.format(len(devset)))
#print('Number of entries in test set: {}'.format(len(testset)))

In [None]:
# One event per batch; ae_collate returns the samples as plain lists rather
# than stacked tensors.
# NOTE(review): devloader also shuffles -- confirm that is intended for validation.
trainloader = DataLoader(trainset, batch_size=1, shuffle=True, collate_fn=ae_collate, num_workers=0, pin_memory=False)
devloader = DataLoader(devset, batch_size=1, shuffle=True, collate_fn=ae_collate, num_workers=0, pin_memory=False)

In [None]:
# Load one event; coords, energy and labels are reused by later cells.
entry, labels = trainset[48]
coords, energy = entry
# coords refer to coordinates of each pixel
print(coords)
# energy refer to pixel energy values
print(energy)
# labels refer to cluster labels
print(labels)
# Convert to NumPy explicitly before np.unique, and report the count the
# message promises alongside the label values themselves.
unique_labels = np.unique(labels.numpy())
print("How many distinct clusters: {} (labels: {})".format(len(unique_labels), unique_labels))

## Visualize Dataset

In [None]:
from localutil.data import *
from localutil.visualization import *
from plotly.offline import init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)
#import plotly.io as pio

In [None]:
# Code by Brad Nelson, I redefine the localutil.visualization modules 
# for better pictures (I set size of markers = 1)

def scatter_energy(img_reader, n, threshold=0.0):
    """
    Creates graph object for energy scatter plot
    Args:
        img_reader (image_reader_3d)
        n (int): image to plot
        threshold (optional): only voxels with energy strictly above this
            value are plotted
    Returns:
        A ``go.Scatter3d`` trace coloured by the log of the voxel energy.
    """
    # NOTE(review): `go` is not imported explicitly in this notebook; it is
    # presumably plotly.graph_objs brought in by a star import -- confirm.
    voxels, energy, _ = img_reader.get_image(n)
    inds = energy > threshold
    kept_energy = energy[inds]
    trace = go.Scatter3d(x=voxels[inds,0], y=voxels[inds,1], z=voxels[inds,2],
                    mode='markers',
                    marker = dict(
                        size = 1,
                        color = np.log(kept_energy),
                        colorscale='Viridis',
                        opacity=0.8
                    ),
                    # Hover values must use the same mask as the coordinates,
                    # otherwise they shift whenever the threshold drops voxels.
                    hovertext=kept_energy)
    return trace

def scatter_classes(img_reader, n):
    """
    Creates graph object for a per-voxel class scatter plot
    Args:
        img_reader: reader whose get_image(n) returns a (voxels, types) pair
        n (int): image to plot
    Returns:
        A ``go.Scatter3d`` trace with one size-1 marker per voxel, coloured
        and hover-labelled by ``types``.
    """
    points, classes = img_reader.get_image(n)
    marker_style = dict(size=1, color=classes, colorscale='Viridis', opacity=0.8)
    return go.Scatter3d(
        x=points[:,0],
        y=points[:,1],
        z=points[:,2],
        mode='markers',
        marker=marker_style,
        hovertext=classes,
    )

def scatter_clusters(voxels, clusters):
    """
    Creates graph object for a cluster scatter plot
    Args:
        voxels: 2-D indexable of points; columns 0, 1, 2 are used as x, y, z
        clusters: per-point values used for both marker colour and hover text
    Returns:
        A ``go.Scatter3d`` trace with size-1 markers.
    """
    marker_style = dict(size=1, color=clusters, colorscale='Viridis', opacity=0.8)
    return go.Scatter3d(
        x=voxels[:,0],
        y=voxels[:,1],
        z=voxels[:,2],
        mode='markers',
        marker=marker_style,
        hovertext=clusters,
    )

In [None]:
# Build both traces for event 48 of the training set.
eventn = 48
trace1 = scatter_classes(trainset.cluster_reader, eventn)
trace2 = scatter_energy(trainset.energy_reader, eventn)
# Only trace1 is drawn; trace2 is built but not added to the figure.
fig = go.Figure(data=[trace1])
iplot(fig)
#pio.write_image(fig, 'images/fig1.pdf')

In [None]:
# Re-print the tensors of the sample loaded from trainset[48] earlier.
print(coords)
print(energy)
print(labels)

In [None]:
import pandas as pd
# labels carries an extra axis at dim 1 (added by torch.unsqueeze in
# ClusteringAEData.__getitem__), and pd.Series only accepts 1-D data, so
# flatten before building the Series. ravel() is a no-op on 1-D input.
labels_plotting = labels.numpy().astype(int).ravel()
labels_plotting = pd.Series(labels_plotting)
labels_plotting

In [None]:
# Fixed colour per integer cluster label.
# NOTE(review): Series.map yields NaN for labels that are not keys of this
# dict, so such points get no explicit colour -- extend the table if needed.
colorsIdx = {0: 'rgb(31, 119, 180)', 1: 'rgb(255, 127, 14)',
             2: 'rgb(44, 160, 44)', 3: 'rgb(214, 39, 40)',
             4: 'rgb(148, 103, 189)', 5:'rgb(140, 86, 75)',
             6: 'rgb(227, 119, 194)', 7: 'rgb(127, 127, 127)',
             8: 'rgb(188, 189, 34)', 9: 'rgb(23, 190, 207)',
             17: 'rgb(255,234,0)', 18: 'rgb(255,111,0)',
             24: 'rgb(150,0,90)', 42: 'rgb(0,0,200)'}
cols      = labels_plotting.map(colorsIdx)

# One size-1 marker per voxel, coloured by cluster; hovering shows the label.
trace = go.Scatter3d(x=coords[:,0], y=coords[:,1], z=coords[:,2],
                    mode='markers',
                    marker = dict(
                        size = 1,
                        color = cols,
                        opacity=0.8
                    ), hovertext=labels_plotting)
fig = go.Figure(data=[trace])
iplot(fig)

Here, each row of 'coords' holds the x, y, z coordinates of one active pixel of the 3D tensor, and the corresponding row of 'energy' holds the value of that pixel. For example, at (97, 42, 5) we have pixel value 0.0156. This notation is called a sparse representation, since it lists only the active pixel sites of a large sparse matrix. Here, there are only two clusters, since np.unique(labels) gives 0 and 2, which are the integer labels of the clusters. We have one label per pixel site.

## TODO: 
### 1. Make 3D Visualization for Sparse Matrices
### 2. Run DBSCAN on each cluster example and see how it performs. 

## Get Pretrained UResNet Module

In [None]:
class UResNet(torch.nn.Module):
    """
    Sparse U-ResNet followed by a linear head.

    ``sparseModel`` chains an input layer, a submanifold convolution, a
    residual UNet, batch-norm + ReLU and an output layer; ``linear`` maps the
    ``nFeatures`` outputs of that stack to ``nClasses`` values.
    """
    def __init__(self, dim=3, size=192, nFeatures=16, depth=5, nClasses=5):
        import sparseconvnet as scn
        super(UResNet, self).__init__()
        block_reps = 2          # Conv block repetition factor
        downsample_kernel = 2   # downsample filter size
        downsample_stride = 2   # downsample filter stride
        in_features = 1
        # UNet number of features per level: nFeatures, 2*nFeatures, ..., depth*nFeatures
        planes_per_level = [level * nFeatures for level in range(1, depth + 1)]

        backbone = scn.Sequential()
        backbone.add(scn.InputLayer(dim, size, mode=3))
        # Kernel size 3, no bias
        backbone.add(scn.SubmanifoldConvolution(dim, in_features, nFeatures, 3, False))
        backbone.add(scn.UNet(dim, block_reps, planes_per_level, residual_blocks=True,
                              downsample=[downsample_kernel, downsample_stride]))
        backbone.add(scn.BatchNormReLU(nFeatures))
        backbone.add(scn.OutputLayer(dim))

        self.sparseModel = backbone
        self.linear = torch.nn.Linear(nFeatures, nClasses)

    def forward(self, point_cloud):
        """
        Run the sparse stack and the linear head on one input.

        point_cloud is handed to the sparse model unchanged; in this notebook
        it is a (coords, energy) tuple for a single event.
        Returns the output of the linear head (last dimension nClasses).
        """
        return self.linear(self.sparseModel(point_cloud))

In [None]:
def get_unet(fname, dimension=3, size=192, nFeatures=16, depth=5, nClasses=5):
    """
    Build a UResNet, load weights from a checkpoint and return its sparse stack.

    The model is wrapped in nn.DataParallel before the strict load of
    checkpoint['state_dict'] (presumably because the checkpoint was saved from
    a DataParallel model -- confirm). The checkpoint is read onto the CPU.

    Returns:
        The ``sparseModel`` submodule only, i.e. without the linear head.
    """
    wrapped = nn.DataParallel(
        UResNet(dim=dimension, size=size, nFeatures=nFeatures, depth=depth, nClasses=nClasses)
    )
    saved = torch.load(fname, map_location='cpu')
    wrapped.load_state_dict(saved['state_dict'], strict=True)
    # just return the pre-trained unet
    return wrapped.module.sparseModel

In [None]:
fname = '/gpfs/slac/staas/fs1/g/neutrino/.scn_paper/new/sparse_is192_uns5_uf16_bs64/weights3/snapshot-29999.ckpt'
# The pretrained load is commented out, so fname is unused here and unet is a
# newly constructed UResNet with default arguments.
#unet = get_unet(fname)
unet = UResNet()
unet

In [None]:
def save_checkpoint(checkpoint_path, model, optimizer):
    """
    Write the model and optimizer state to ``checkpoint_path``.

    The file holds a dict with two entries:
      - 'state_dict': model.state_dict()
      - 'optimizer': optimizer.state_dict()
    """
    torch.save(
        {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()},
        checkpoint_path,
    )
    print('model saved to %s' % checkpoint_path)

In [None]:
def load_checkpoint(checkpoint_path, model, optimizer):
    """
    Restore model and optimizer state from a checkpoint file, in place.

    Expects the file to hold a dict with 'state_dict' (model) and 'optimizer'
    entries, as written by save_checkpoint.
    """
    # Read onto the CPU, as get_unet does, so a checkpoint saved on a GPU can
    # still be opened on a machine without one; load_state_dict then copies the
    # values into the model's and optimizer's existing tensors.
    state = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(state['state_dict'])
    optimizer.load_state_dict(state['optimizer'])
    print('model loaded from %s' % checkpoint_path)

In [None]:
from loss import DiscriminativeLoss

In [None]:
# Loss object from the local `loss` module, built with its default arguments.
criterion = DiscriminativeLoss()

In [None]:
# Move the model to the GPU only when CUDA is available, matching the
# use_cuda guard the training loop applies to its input tensors.
if use_cuda:
    unet = unet.cuda()

In [None]:
# Number of passes over the training set made by the training loop.
training_epochs=10
#training_epoch=scn.checkpoint_restore(unet,exp_name,'unet',use_cuda)
# Adam with its default hyperparameters over every parameter of unet.
optimizer = optim.Adam(unet.parameters())
# Total number of scalar parameters in the model.
print('#classifer parameters', sum([x.nelement() for x in unet.parameters()]))

In [None]:
import csv

In [None]:
# Open (and truncate) the CSV logs; the training loop writes one loss per row.
# NOTE(review): these handles are never closed in this notebook -- call
# f_loss.close() and f_acc.close() once training is finished. accWriter is
# created here but not written to by the training loop.
f_loss = open('loss.csv', 'w')
f_acc = open('acc.csv', 'w')
lossWriter = csv.writer(f_loss, delimiter=',')
accWriter = csv.writer(f_acc, delimiter=',')

In [None]:
import time

trainset_len = len(trainset)

for epoch in range(1, training_epochs+1):
    unet.train()
    stats = {}
    # Zero these counters at the start of every epoch so the summary printed
    # below covers this epoch only (presumably sparseconvnet updates them
    # during forward passes -- confirm).
    scn.forward_pass_multiplyAdd_count=0
    scn.forward_pass_hidden_states=0
    start = time.time()
    train_loss = 0.0
    n_trained = 0  # examples that completed forward, backward and optimizer step
    for i,batch in enumerate(trainloader):
        optimizer.zero_grad()
        # ae_collate returns [data, target] lists; the loader uses
        # batch_size=1, so element 0 is the only sample.
        data = batch[0]
        label = batch[1][0]
        coord, energy = data[0]
        if use_cuda:
            coord, energy = coord.cuda(), energy.cuda()
        try:
            out = unet((coord, energy))
            # The loss is evaluated on the CPU, where the label tensor lives.
            out = out.cpu()
            loss = criterion(out, label)
            loss.backward()
            optimizer.step()
        except Exception as err:
            # Best effort: skip the failing example, but say what went wrong.
            # KeyboardInterrupt is not an Exception, so Ctrl-C still stops training.
            print("Warning: Error Encountered!! ({}: {})".format(type(err).__name__, err))
            continue
        loss_value = loss.item()
        train_loss += loss_value
        n_trained += 1
        print("Examples = {}/{}, Loss = {}".format(i+1, trainset_len, loss_value))
        lossWriter.writerow([loss_value])
    # Average over the examples that actually trained; normalise the counters
    # by the dataset size (the old `len(data.train)` raised AttributeError
    # because `data` is a plain list). max(..., 1) avoids dividing by zero.
    per_example = max(trainset_len, 1)
    print(epoch,'Train loss',train_loss/max(n_trained, 1),
          'MegaMulAdd=',scn.forward_pass_multiplyAdd_count/per_example/1e6,
          'MegaHidden',scn.forward_pass_hidden_states/per_example/1e6,
          'time=',time.time() - start,'s')
    # Push buffered rows to disk so the log survives an interrupted run.
    f_loss.flush()
#scn.checkpoint_save(unet,exp_name,'unet',epoch, use_cuda)

In [None]:
# Standalone input layer built with the same arguments UResNet uses by
# default (dimension 3, size 192, mode 3), for inspection.
input_test = scn.InputLayer(3, 192, mode=3)

In [None]:
# Display the layer's repr.
input_test

In [None]:
# Pull a single batch for interactive debugging. Use the built-in next():
# iterators are not guaranteed to expose a .next() method in Python 3.
trainiter = iter(trainloader)
batch = next(trainiter)
print(batch)

In [None]:
# ae_collate returns [data, target]: x_batch is the list of (coords, energy)
# pairs and y_batch the list of label tensors.
x_batch = batch[0]
y_batch = batch[1]

In [None]:
# Forward pass on the first (coords, energy) pair exactly as the loader
# produced it.
# NOTE(review): these tensors have not been moved to the GPU, while unet was
# moved with .cuda() earlier -- this call may fail with a device mismatch.
unet(x_batch[0])

In [None]:
# Unpack the first sample and move both tensors to the GPU.
# NOTE(review): unconditional .cuda() -- requires a CUDA device.
coord, energy = x_batch[0]
coord = coord.cuda()
energy = energy.cuda()

In [None]:
# Forward pass on the GPU copies of the sample, passed as a (coords, energy) tuple.
out = unet((coord, energy))

In [None]:
# Bring the prediction back to the CPU before computing the loss, as the
# training loop does; the label tensor from the loader stays on the CPU.
criterion(out.cpu(), y_batch[0])

In [None]:
# Display the label list of the fetched batch.
y_batch