## General modules

In [1]:
__author__ = 'tkurth'
import sys
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.mlab as mlab
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from nbfinder import NotebookFinder
sys.meta_path.append(NotebookFinder())
%matplotlib inline
import time

## Theano modules

In [2]:
import theano
import theano.tensor as T
import theano.sandbox.rng_mrg
Trng = theano.sandbox.rng_mrg.MRG_RandomStreams(9)
import lasagne as ls

## ROOT stuff

In [3]:
sys.path.append('/global/homes/w/wbhimji/cori-envs/nersc-rootpy/lib/python2.7/site-packages/')
sys.path.append('/global/common/cori/software/root/6.06.06/lib/root')
import ROOT
import rootpy
import root_numpy as rnp

Welcome to ROOTaaS 6.06/06


## Useful functions

In [4]:
# Define a context manager to suppress stdout and stderr.
class suppress_stdout_stderr(object):
    '''
    Context manager that silences stdout and stderr at the file-descriptor
    level, so output written by compiled C/Fortran code to descriptors 1 and 2
    is discarded as well.

    Exceptions raised inside the ``with`` block are not swallowed: __exit__
    returns None, so they propagate to the caller once the original
    descriptors have been restored.

    All descriptors are acquired in __enter__ and released in __exit__, so an
    instance can be entered several times and never leaks file descriptors.
    '''
    def __init__(self):
        # Nothing is opened until the context is entered.
        self.null_fds = []
        self.save_fds = ()

    def __enter__(self):
        # Open a pair of null files
        self.null_fds = [os.open(os.devnull, os.O_RDWR) for _ in range(2)]
        # Save the actual stdout (1) and stderr (2) file descriptors.
        self.save_fds = (os.dup(1), os.dup(2))
        # Point stdout and stderr at the null files.
        os.dup2(self.null_fds[0], 1)
        os.dup2(self.null_fds[1], 2)
        return self

    def __exit__(self, *_):
        if not self.save_fds:
            # The context was never entered; there is nothing to restore.
            return
        try:
            # Re-assign the real stdout/stderr back to (1) and (2)
            os.dup2(self.save_fds[0], 1)
            os.dup2(self.save_fds[1], 2)
        finally:
            # Close the null files AND the saved duplicates; the duplicates
            # were previously left open, leaking two descriptors per use.
            for fd in list(self.null_fds) + list(self.save_fds):
                try:
                    os.close(fd)
                except OSError:
                    # Best-effort cleanup: a descriptor that is already
                    # closed must not mask an exception from the with-body.
                    pass
            self.null_fds = []
            self.save_fds = ()

## Data loader and preprocessor

In [154]:
def _read_file_list(cfg_file):
    '''Return the non-empty lines of a file-list config, trailing whitespace removed.'''
    with open(cfg_file) as handle:
        entries = [line.rstrip() for line in handle]
    # blank lines would otherwise be handed on as empty file names
    return [entry for entry in entries if entry]


def load_data(bg_cfg_file = '../config/BgFileListAug16.txt',
                sig_cfg_file='../config/SignalFileListAug16.txt',
                num_files=10,  
                group_name='CollectionTree',
                branches=['CaloCalTopoClustersAuxDyn.calPhi', \
                          'CaloCalTopoClustersAuxDyn.calEta', \
                          'CaloCalTopoClustersAuxDyn.calE'],
                dataset_name='histo',
                type_='root'):
    '''
    Read background and signal events from ROOT files into one labelled dataframe.

    Parameters:
        bg_cfg_file:  text file listing one background ROOT file per line
        sig_cfg_file: text file listing one signal ROOT file per line
        num_files:    only the first num_files entries of each list are read
        group_name:   name of the tree passed to root2array as treename
        branches:     branches to read from the tree
        dataset_name: accepted for interface compatibility; not used here
        type_:        accepted for interface compatibility; not used here

    Returns:
        DataFrame holding the background rows (label 0) followed by the signal
        rows (label 1). The two frames are concatenated as they are, so index
        values repeat between the background and the signal part.

    Raises:
        ValueError: if either file list selects no file.
    '''
    #get list of files (handles are closed right after reading)
    bg_files = _read_file_list(bg_cfg_file)[:num_files]
    sig_files = _read_file_list(sig_cfg_file)[:num_files]
    if not bg_files or not sig_files:
        raise ValueError('no input files selected from {} / {}'.format(bg_cfg_file, sig_cfg_file))

    #copy so the shared default list can never be modified downstream
    branches = list(branches)

    #so we don't have annoying stderr messages
    with suppress_stdout_stderr():

        #one record per event, one field per requested branch
        bgarray = rnp.root2array(bg_files,
                                 treename=group_name,
                                 branches=branches,
                                 start=0,
                                 warn_missing_tree=True)

        sigarray = rnp.root2array(sig_files,
                                  treename=group_name,
                                  branches=branches,
                                  start=0,
                                  warn_missing_tree=True)

    #store in dataframes and attach the class label
    bgdf = pd.DataFrame.from_records(bgarray)
    bgdf['label']=0
    sigdf = pd.DataFrame.from_records(sigarray)
    sigdf['label']=1

    #concat
    return pd.concat([bgdf,sigdf])


#preprocessor
def preprocess_data(df,eta_range,phi_range,eta_bins,phi_bins):
    '''
    Turn every event of df into an energy-weighted 2D (phi, eta) histogram.

    Parameters:
        df:        frame with the columns 'CaloCalTopoClustersAuxDyn.calPhi',
                   'CaloCalTopoClustersAuxDyn.calEta',
                   'CaloCalTopoClustersAuxDyn.calE' (one sequence per row)
                   and 'label'
        eta_range: [min, max] of the eta axis
        phi_range: [min, max] of the phi axis
        eta_bins:  number of bins along eta
        phi_bins:  number of bins along phi

    Returns:
        xvals: float32 array of shape (n_events, 1, phi_bins, eta_bins)
        yvals: int32 array of shape (n_events,) holding the labels
    '''
    num_events = df.shape[0]

    #empty arrays
    xvals = np.zeros((num_events, 1, phi_bins, eta_bins),dtype='float32')
    yvals = np.zeros((num_events,),dtype='int32')

    #fetch each column once; df.iloc[i][...] would rebuild a row Series
    #several times per event
    phis = df['CaloCalTopoClustersAuxDyn.calPhi'].values
    etas = df['CaloCalTopoClustersAuxDyn.calEta'].values
    energies = df['CaloCalTopoClustersAuxDyn.calE'].values
    yvals[:] = df['label'].values

    hist_range = [phi_range,eta_range]
    for i in range(num_events):
        #the (phi_bins, eta_bins) histogram broadcasts into the single channel
        xvals[i]=np.histogram2d(phis[i],etas[i],
                                bins=(phi_bins, eta_bins),
                                weights=energies[i],
                                range=hist_range)[0]

    return xvals, yvals

## Data iterator

In [163]:
class hep_data_iterator(object):
    '''
    Class-balanced mini-batch source built from a labelled event dataframe.

    On construction the events are balanced so that every label occurs equally
    often (the first min-count events of each label are kept), the largest
    absolute cluster energy is stored in max_abs, and the rows are optionally
    shuffled. next_batch() then yields histogrammed, rescaled batches.

    Attributes set by the constructor: shuffle, bin_size, eta_range, phi_range,
    eta_bins, phi_bins, df, num_classes, num_examples, max_abs, xshape.
    '''

    #column names expected in the input dataframe
    PHI_COLUMN = 'CaloCalTopoClustersAuxDyn.calPhi'
    ETA_COLUMN = 'CaloCalTopoClustersAuxDyn.calEta'
    ENERGY_COLUMN = 'CaloCalTopoClustersAuxDyn.calE'

    #class constructor
    def __init__(self,
                 datadf,
                 shuffle=True,
                 bin_size=0.025,
                 eta_range = [-5,5],
                 phi_range = [-3.14, 3.14]
                ):
        '''
        Parameters:
            datadf:    frame with the three cluster columns and a 'label'
                       column; it is not modified
            shuffle:   shuffle rows here and at the start of every next_batch pass
            bin_size:  bin width used for both the eta and the phi axis
            eta_range: [min, max] of the eta axis
            phi_range: [min, max] of the phi axis

        Raises:
            ValueError: if datadf contains no events.
        '''
        #set parameters (ranges are copied so the default lists are never shared)
        self.shuffle = shuffle
        self.bin_size = bin_size
        self.eta_range = list(eta_range)
        self.phi_range = list(phi_range)

        #compute bins
        self.eta_bins = int(np.floor((self.eta_range[1] - self.eta_range[0]) / self.bin_size))
        self.phi_bins = int(np.floor((self.phi_range[1] - self.phi_range[0]) / self.bin_size))

        #sorted copy: the caller's frame stays untouched, and the stable sort
        #keeps the original order of the events inside each label
        ordered = datadf.sort_values(by='label', kind='mergesort')

        #make class frequencies even:
        counts = ordered.groupby('label')[self.ENERGY_COLUMN].count()
        if counts.shape[0] == 0:
            raise ValueError('datadf contains no events')
        self.num_classes = counts.shape[0]

        #keep the first min_frequency events of every label
        min_frequency = int(counts.min())
        balanced = ordered.groupby('label').head(min_frequency)
        columns = ['label', self.PHI_COLUMN, self.ETA_COLUMN, self.ENERGY_COLUMN]
        self.df = balanced[columns].reset_index(drop=True)

        #compute max:
        self.compute_data_max()

        #shuffle if wanted (highly recommended)
        if self.shuffle:
            self.df=self.df.reindex(np.random.permutation(self.df.index))

        #number of examples
        self.num_examples=self.df.shape[0]

        #shapes:
        self.xshape=(1, self.phi_bins, self.eta_bins)


    #compute max over all data
    def compute_data_max(self):
        '''
        Store in max_abs the largest absolute energy over all events; it is
        the divisor applied to every batch, bringing values into [-1, 1].

        Events without any cluster are skipped. When no non-zero energy exists
        max_abs falls back to 1.0, so the rescaling becomes a no-op instead of
        a division by zero.
        '''
        peak = 0.
        for energies in self.df[self.ENERGY_COLUMN]:
            if np.size(energies) > 0:
                peak = max(peak, float(np.max(np.abs(energies))))
        self.max_abs = peak if peak > 0. else 1.


    #number of batches per pass
    def num_batches(self,batchsize):
        '''
        Number of full batches next_batch(batchsize) yields in one pass.

        Raises:
            ValueError: if batchsize is smaller than 1.
        '''
        if batchsize < 1:
            raise ValueError('batchsize must be a positive integer')
        return self.num_examples // batchsize


    #this is the batch iterator:
    def next_batch(self,batchsize):
        '''
        Generator over one pass of the data.

        Yields (x, y) where x is the float32 histogram batch divided by max_abs
        and y holds the int32 labels. Only full batches are produced; a
        trailing remainder smaller than batchsize is dropped.
        '''

        #shuffle:
        if self.shuffle:
            self.df=self.df.reindex(np.random.permutation(self.df.index))

        #iterate over every full batch; the previous stop value of
        #num_examples-batchsize skipped the last batch whenever num_examples
        #was an exact multiple of batchsize
        for batch_index in range(self.num_batches(batchsize)):
            start = batch_index * batchsize
            x,y=preprocess_data(self.df.iloc[start:start+batchsize,:],
                                self.eta_range,
                                self.phi_range,
                                self.eta_bins,self.phi_bins)
            #rescale x:
            x/=self.max_abs

            #return result
            yield x,y

## Construct data iterator

In [164]:
#tunable settings for this cell
train_fraction=0.8
binsize=0.1
numfiles=2

#read background and signal events into one labelled frame
datadf=load_data(num_files=numfiles)

#per-class selections (label 0: background, label 1: signal)
bgdf=datadf.loc[datadf['label']==0]
sigdf=datadf.loc[datadf['label']==1]

#the leading train_fraction of each class goes to training, the rest to validation
num_bg_train=int(np.floor(len(bgdf)*train_fraction))
num_sig_train=int(np.floor(len(sigdf)*train_fraction))
train_parts=[bgdf.iloc[:num_bg_train], sigdf.iloc[:num_sig_train]]
valid_parts=[bgdf.iloc[num_bg_train:], sigdf.iloc[num_sig_train:]]
traindf=pd.concat(train_parts)
validdf=pd.concat(valid_parts)

#one iterator per split, both histogrammed with the same bin width
hditer_train=hep_data_iterator(traindf,bin_size=binsize)
hditer_validation=hep_data_iterator(validdf,bin_size=binsize)

#validation batches must be rescaled with the training normalisation
hditer_validation.max_abs=hditer_train.max_abs

In [165]:
#number of balanced examples in each split
#(parenthesised form prints the same on Python 2 and also parses on Python 3)
print(hditer_train.num_examples)
print(hditer_validation.num_examples)

31390
7848


# Classifier

## Construct classification network

In [166]:
#some parameters
keep_prob=0.5
num_filters=128
num_units_dense=1024
initial_learning_rate=0.001

#DropoutLayer's `p` is the probability of zeroing a value, i.e. the complement
#of the keep probability (identical only for keep_prob=0.5)
drop_prob=1.-keep_prob

#Glorot gain matched to the leaky rectifier (leakiness 0.01) of the dense layers
dense_gain=np.sqrt(2./(1+0.01**2))


def _conv_block(incoming):
    '''3x3 leaky-ReLU convolution -> dropout -> 2x2 max-pool; returns (conv, drop, pool).'''
    conv = ls.layers.Conv2DLayer(incoming=incoming,
                                 num_filters=num_filters,
                                 filter_size=3,
                                 stride=(1,1),
                                 pad=0,
                                 W=ls.init.HeUniform(),
                                 b=ls.init.Constant(0.),
                                 nonlinearity=ls.nonlinearities.LeakyRectify()
                                )
    drop = ls.layers.DropoutLayer(incoming=conv,
                                  p=drop_prob,
                                  rescale=True
                                 )
    pool = ls.layers.MaxPool2DLayer(incoming=drop,
                                    pool_size=(2,2),
                                    stride=2,
                                    pad=0
                                   )
    return conv, drop, pool


def _dense_layer(incoming, num_units, nonlinearity):
    '''Fully connected layer with Glorot-uniform weights and zero bias.'''
    return ls.layers.DenseLayer(incoming=incoming,
                                num_units=num_units,
                                W=ls.init.GlorotUniform(dense_gain),
                                b=ls.init.Constant(0.0),
                                nonlinearity=nonlinearity
                               )


#input layer
l_inp_data = ls.layers.InputLayer((None,hditer_train.xshape[0],hditer_train.xshape[1],hditer_train.xshape[2]))
l_inp_label = ls.layers.InputLayer((None,1))

#conv layers: four identical conv -> dropout -> pool stages
l_conv1, l_drop1, l_pool1 = _conv_block(l_inp_data)
l_conv2, l_drop2, l_pool2 = _conv_block(l_pool1)
l_conv3, l_drop3, l_pool3 = _conv_block(l_pool2)
l_conv4, l_drop4, l_pool4 = _conv_block(l_pool3)

#flatten
l_flat = ls.layers.FlattenLayer(incoming=l_pool4, 
                                outdim=2)

#fully connected layers, each followed by dropout
l_fc1 = _dense_layer(l_flat, num_units_dense, ls.nonlinearities.LeakyRectify())
l_drop5 = ls.layers.DropoutLayer(incoming=l_fc1,
                                 p=drop_prob,
                                 rescale=True
                                )

l_fc2 = _dense_layer(l_drop5, num_units_dense, ls.nonlinearities.LeakyRectify())
l_drop6 = ls.layers.DropoutLayer(incoming=l_fc2,
                                 p=drop_prob,
                                 rescale=True
                                )

#output layer: one softmax unit per class
l_out = _dense_layer(l_drop6, hditer_train.num_classes, ls.nonlinearities.softmax)

#network
network = [l_inp_data, l_inp_label,
           l_conv1, l_pool1, l_drop1,
           l_conv2, l_pool2, l_drop2,
           l_conv3, l_pool3, l_drop3,
           l_conv4, l_pool4, l_drop4,
           l_flat, 
           l_fc1, l_drop5,
           l_fc2, l_drop6,
           l_out
          ]

#variables
inp = l_inp_data.input_var
lab = T.ivector('lab')

#output: stochastic (dropout active) for training, deterministic for evaluation
lab_pred = ls.layers.get_output(l_out, {l_inp_data: inp})
lab_pred_det = ls.layers.get_output(l_out, {l_inp_data: inp}, deterministic=True)

#loss functions:
loss = ls.objectives.categorical_crossentropy(lab_pred,lab).mean()
loss_det = ls.objectives.categorical_crossentropy(lab_pred_det,lab).mean()

#accuracy
acc_det = ls.objectives.categorical_accuracy(lab_pred_det, lab, top_k=1).mean()

#parameters
params = ls.layers.get_all_params(network, trainable=True)

#updates
updates = ls.updates.adam(loss, params, learning_rate=initial_learning_rate)

#compile network function; predictions use the deterministic graph so that
#dropout noise does not leak into inference
fnn = theano.function([inp], lab_pred_det)
#training function to minimize
fnn_train = theano.function([inp,lab], loss, updates=updates)
#validation function with accuracy
fnn_validate = theano.function([inp,lab], [loss_det,acc_det])

## Training

In [None]:
num_epochs=10
batchsize=128


def _safe_mean(total, count):
    '''Average of an accumulated sum; nan when no batch was processed.'''
    return total / count if count else float('nan')


for epoch in range(num_epochs):
    # In each epoch, we do a full pass over the training data:
    train_err = 0.
    train_acc = 0.
    train_batches = 0.
    start_time = time.time()
    for inputs, targets in hditer_train.next_batch(batchsize):
        train_err += fnn_train(inputs, targets)
        train_batches += 1.

        # accuracy on the training batch, evaluated with the deterministic
        # network after the parameter update
        _, acc = fnn_validate(inputs, targets)
        train_acc += acc

        #debugging output: running mean loss and accuracy of this epoch
        print('train:  {} {}'.format(train_err/train_batches, train_acc/train_batches*100.))

    # And a full pass over the validation data:
    val_err = 0.
    val_acc = 0.
    val_batches = 0.
    for inputs, targets in hditer_validation.next_batch(batchsize):
        err, acc = fnn_validate(inputs, targets)
        val_err += err
        val_acc += acc
        val_batches += 1.

    # Then we print the results for this epoch; the means are guarded because
    # an iterator holding fewer examples than batchsize yields no batch at all
    print("Epoch {} of {} took {:.3f}s".format(epoch + 1, num_epochs, time.time() - start_time))
    print("  training loss:\t\t{:.6f}".format(_safe_mean(train_err, train_batches)))
    print("  training accuracy:\t\t{:.2f} %".format(_safe_mean(train_acc, train_batches) * 100.))
    print("  validation loss:\t\t{:.6f}".format(_safe_mean(val_err, val_batches)))
    print("  validation accuracy:\t\t{:.2f} %".format(_safe_mean(val_acc, val_batches) * 100.))

train:  0.822142342674 44.53125
train:  0.80257628073 47.265625
train:  0.69352667745 64.84375
train:  0.6290173058 73.046875
train:  0.590914222838 77.03125
train:  0.553971015908 80.3385416667
train:  0.514055979484 83.0357142857
train:  0.475199687026 85.15625
train:  0.42902398588 86.71875
train:  0.387121941983 88.046875
train:  0.352257142271 89.1335227273
train:  0.322909133736 90.0390625
train:  0.298070236123 90.8052884615
train:  0.276907051216 91.40625
train:  0.258450926035 91.7708333333
train:  0.242330665132 91.9921875
train:  0.228075932513 92.2334558824
train:  0.2154067823 92.3177083333
train:  0.204070536372 92.3930921053
train:  0.201177040918 92.734375
train:  0.191597181826 93.0803571429
train:  0.323997310285 93.3238636364
train:  0.309910470707 92.6970108696
train:  0.337635214 91.1458333333
train:  0.519341107137 90.25
train:  0.532386507261 90.234375
train:  0.525015335051 90.3356481481
train:  0.506264787392 90.6808035714
train:  0.48880738093 91.0021551724
tr

In [None]:
def plot_example(x, extent=(-3.15, 3.15, -5, 5)):
    '''
    Show one event histogram on a log10 colour scale.

    Parameters:
        x:      2D array indexed as (phi bin, eta bin); it is transposed so
                that phi runs along the horizontal and eta along the vertical axis
        extent: (phi_min, phi_max, eta_min, eta_max) used to label the axes;
                pass the ranges the histogram was built with for exact labels
    '''
    # masked log: empty or negative bins are left blank instead of turning
    # into -inf/nan, which would break the colour normalisation
    image = np.ma.log10(x).T
    plt.imshow(image, extent=list(extent), interpolation='none', aspect='auto', origin='lower')
    plt.colorbar()

#plot the first example of a fresh pass; the iterator object itself is not
#iterable, batches come from next_batch(). A batch size of 1 histograms only
#the event that is shown. Use hditer_validation to inspect validation data.
for inputs, targets in hditer_train.next_batch(1):
    plot_example(inputs[0,0,:,:])
    break