In [1]:
__author__ = 'tkurth'
import sys
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.mlab as mlab
from nbfinder import NotebookFinder
sys.meta_path.append(NotebookFinder())
%matplotlib inline
import time
from tqdm import tqdm
import re
#sys.path.append('/global/homes/w/wbhimji/cori-envs/nersc-rootpy/lib/python2.7/site-packages/')
#sys.path.append('/global/common/cori/software/root/6.06.06/lib/root')
#import ROOT
#import rootpy
#import root_numpy as rnp
import h5py as h5

## Useful functions

In [2]:
# Define a context manager to suppress stdout and stderr.
class suppress_stdout_stderr(object):
    '''
    A context manager for doing a "deep suppression" of stdout and stderr in 
    Python, i.e. will suppress all print, even if the print originates in a 
    compiled C/Fortran sub-function.
       This will not suppress raised exceptions, since exceptions are printed
    to stderr just before a script exits, and after the context manager has
    exited (at least, I think that is why it lets exceptions through).      

    '''
    def __init__(self):
        # Open a pair of null files
        self.null_fds =  [os.open(os.devnull,os.O_RDWR) for x in range(2)]
        # Save the actual stdout (1) and stderr (2) file descriptors.
        self.save_fds = (os.dup(1), os.dup(2))

    def __enter__(self):
        # Assign the null pointers to stdout and stderr.
        os.dup2(self.null_fds[0],1)
        os.dup2(self.null_fds[1],2)

    def __exit__(self, *_):
        # Re-assign the real stdout/stderr back to (1) and (2)
        os.dup2(self.save_fds[0],1)
        os.dup2(self.save_fds[1],2)
        # Close the null files
        os.close(self.null_fds[0])
        os.close(self.null_fds[1])

In [3]:
def merge_dicts(dict1,dict2):
    tmp = dict1.copy()
    tmp.update(dict2)
    return tmp

In [4]:
#file string parser
def parse_filename(fname,directory='.'):
    directory=re.sub(r'^(.*?)(/+)$',r'\1',directory)
    
    #signal file?
    smatch=re.compile(r'^GG_RPV(.*?)_(.*?)_(.*?)\.h5')
    tmpres=smatch.findall(fname)
    if tmpres:
        tmpres=tmpres[0]
        return {'rpv':int(tmpres[0]), 'mglu':int(tmpres[1]), 'mneu':int(tmpres[2]), 'name':directory+'/'+fname}

    #background file?
    smatch=re.compile(r'^jetjet_JZ(.*?)\.h5')
    tmpres=smatch.findall(fname)
    if tmpres:
        return {'jz':int(tmpres[0]), 'name':directory+'/'+fname}

    #nothing at all
    return {}

In [5]:
def load_data(filelists,
                group_name='CollectionTree',
                dataset_name='histo',
                type_='hdf5'):
    
    #iterate over elements in the filelists
    records=[]
    
    for fname in tqdm(filelists):
        #read specifics of that list
        masterrec=parse_filename(fname.split('/')[-1])
        #determine if it is label or background
        if 'jz' in masterrec.keys():
            masterrec['label']=0
        else:
            masterrec['label']=1
        
        #open the hdf5 file
        #we don't want annoying stderr messages
        try:
            reclist=[]
            f= h5.File(fname,'r')
            for event in f.items():
                if event[0].startswith('event'):
                    datarec={}
                    
                    #calorimeter:
                    #azimuth
                    datarec['calPhi']=event[1]['clusPhi'].value
                    #rapidity
                    datarec['calEta']=event[1]['clusEta'].value
                    #energy deposit
                    datarec['calE']=event[1]['clusE'].value
                    #EM fraction
                    datarec['calEM']=event[1]['clusEM'].value
                    
                    #tracks:
                    #azimuth
                    datarec['trackPhi']=event[1]['trackPhi'].value
                    #rapidity
                    datarec['trackEta']=event[1]['trackEta'].value
                    
                    #weight
                    datarec['weight']=event[1]['weight'].value
                    
                    #passes standard regression?
                    datarec['passSR']=event[1]['passSR'].value
                    
                    #SUSY theory masses
                    if masterrec['label']==1:
                        datarec['mGlu']=event[1]['mGlu'].value
                        datarec['mNeu']=event[1]['mNeu'].value
                    else:
                        datarec['mGlu']=0.
                        datarec['mNeu']=0.
                    
                    reclist.append(merge_dicts(masterrec,datarec))
            
            #close file
            f.close()
            
        except:
            continue
            
        #append to records
        records+=reclist
            
    #return dataframe
    return pd.DataFrame(records)


#data augmentation
def augment_data(xarr,roll_angle):
    #flip in x:
    if np.random.random_sample()>=0.5:
        xarr=np.fliplr(xarr)
    #flip in y:
    if np.random.random_sample()>=0.5:
        xarr=np.flipud(xarr)
    #roll in x with period 2pi/8
    randroll=np.random.randint(0,8,size=1)[0]
    #determine granularity:
    rollunit=randroll*roll_angle
    xarr=np.roll(xarr, shift=rollunit, axis=1)
    
    return xarr
    
    
#preprocessor
def preprocess_data(df,eta_range,phi_range,eta_bins,phi_bins):
    #empty array
    xvals = np.zeros((df.shape[0], 3, phi_bins, eta_bins ),dtype='float32')
    yvals = np.zeros((df.shape[0],),dtype='int32')
    wvals = np.zeros((df.shape[0],),dtype='float32')
    pvals = np.zeros((df.shape[0],),dtype='int32')
    mgvals = np.zeros((df.shape[0],),dtype='float32')
    mnvals = np.zeros((df.shape[0],),dtype='float32')
    
    for i in range(df.shape[0]):
        calPhi   = df.iloc[i]['calPhi']
        calEta   = df.iloc[i]['calEta']
        calE     = df.iloc[i]['calE']
        calEM    = df.iloc[i]['calEM']
        trackPhi = df.iloc[i]['trackPhi']
        trackEta = df.iloc[i]['trackEta']
        w        = df.iloc[i]['weight']
        psr      = df.iloc[i]['passSR']
        mg       = df.iloc[i]['mGlu']
        mn       = df.iloc[i]['mNeu']                                       
        
        #data
        xvals[i,0,:,:]=np.histogram2d(calPhi,calEta,
                                        bins=(phi_bins, eta_bins),
                                        weights=calE,
                                        range=[phi_range,eta_range])[0]
        xvals[i,1,:,:]=np.histogram2d(calPhi,calEta,
                                        bins=(phi_bins, eta_bins),
                                        weights=calEM,
                                        range=[phi_range,eta_range])[0]
        xvals[i,2,:,:]=np.histogram2d(trackPhi,trackEta,
                                        bins=(phi_bins, eta_bins),
                                        range=[phi_range,eta_range])[0]
        
        #obtain the rest
        wvals[i]=w
        pvals[i]=psr
        mgvals[i]=mg
        mnvals[i]=mn
        yvals[i]=df.iloc[i]['label']
        
    return xvals, yvals, wvals, pvals, mgvals, mnvals

In [33]:
class hep_data_iterator:
    
    #class constructor
    def __init__(self,
                 datadf,
                 max_frequency=None,
                 even_frequencies=True,
                 shuffle=True,
                 nbins=(100,100),
                 eta_range = [-5,5],
                 phi_range = [-3.1416, 3.1416],
                 augment=False,
                 compute_max=True
                ):

        #set parameters
        self.shuffle = shuffle
        self.nbins = nbins
        self.eta_range = eta_range
        self.phi_range = phi_range

        #even frequencies?
        self.even_frequencies=even_frequencies
        self.augment=augment
        self.compute_max=compute_max
        
        #compute bins depending on total range
        #eta
        #eta_step=(self.eta_range[1]-self.eta_range[0])/float(self.nbins[0]-1)
        #self.eta_bins = np.arange(self.eta_range[0],self.eta_range[1]+eta_step,eta_step)
        self.eta_bins=self.nbins[0]
        #phi
        #phi_step=(self.phi_range[1]-self.phi_range[0])/float(self.nbins[1]-1)
        #self.phi_bins = np.arange(self.phi_range[0],self.phi_range[1]+phi_step,phi_step)
        self.phi_bins=self.nbins[1]
        
        #dataframe
        self.df = datadf
        self.df.sort_values(by='label',inplace=True)
        
        #make class frequencies even:
        tmpdf=self.df.groupby('label').count().reset_index()
        self.num_classes=tmpdf.shape[0]
        
        #determine minimum frequency
        min_frequency=tmpdf['calE'].min()
        if max_frequency:
            min_frequency=np.min([min_frequency,max_frequency])
        elif not self.even_frequencies:
            min_frequency=-1
        
        tmpdf=self.df.groupby(['label']).apply(lambda x: x[['calPhi',
                                                            'calEta',
                                                            'calE',
                                                            'calEM',
                                                            'trackPhi',
                                                            'trackEta',
                                                            'weight',
                                                            'passSR',
                                                            'mGlu',
                                                            'mNeu'
                                                           ]].iloc[:min_frequency,:]).copy()
        
        tmpdf.reset_index(inplace=True)
        del tmpdf['level_1']
        
        #copy tmpdf into self.df:
        self.df=tmpdf.copy()
        
        #compute maxima:
        if self.compute_max:
            self.compute_data_max()
            self.compute_weight_max()
        
        #shuffle if wanted (highly recommended)
        if self.shuffle:
            self.df=self.df.reindex(np.random.permutation(self.df.index))
        
        #number of examples
        self.num_examples=self.df.shape[0]
        
        #shapes:
        self.xshape=(3, self.phi_bins, self.eta_bins)
        
    
    #compute max over all data
    def compute_data_max(self):
        '''compute the maximum over all event entries for rescaling data between -1 and 1'''
        #initialize
        self.max_abs=np.zeros(3)
        #fill
        self.max_abs[0]=self.df[['calPhi','calEta','calE']].apply(lambda x: np.max(np.histogram2d(x['calPhi'],x['calEta'],
                                                                            bins=(self.phi_bins, self.eta_bins),
                                                                            weights=x['calE'],
                                                                            range=[self.phi_range,self.eta_range])[0]),
                                                                  axis=1).max()
        self.max_abs[1]=self.df[['calPhi','calEta','calEM']].apply(lambda x: np.max(np.histogram2d(x['calPhi'],x['calEta'],
                                                                            bins=(self.phi_bins, self.eta_bins),
                                                                            weights=x['calEM'],
                                                                            range=[self.phi_range,self.eta_range])[0]),
                                                                  axis=1).max()
        self.max_abs[2]=self.df[['trackPhi','trackEta']].apply(lambda x: np.max(np.histogram2d(x['trackPhi'],x['trackEta'],
                                                                            bins=(self.phi_bins, self.eta_bins),
                                                                            range=[self.phi_range,self.eta_range])[0]),
                                                                  axis=1).max()
    
    #compute maximum of weights
    def compute_weight_max(self):
        '''compute the maximum over all event weight entries for rescaling data between 0 and 1. Take abs to be safe'''
        self.wmax=(self.df['weight'].abs()).apply(lambda x: np.max(x)).max()

## Curate file list

In [7]:
directory='/project/projectdirs/dasrepo/atlas_rpv_susy/hdf5/prod005_2017_01_11'
filelists=[parse_filename(x,directory) for x in os.listdir(directory) if x.endswith('h5')]
filenamedf=pd.DataFrame(filelists)

## Select signal configuration

In [8]:
trainselect=[{'mglu':1400, 'mneu': 850}]

In [9]:
#select signal configuration
sig_cfg_files=[]
for item in trainselect:
    sig_cfg_files+=list(filenamedf[ (filenamedf['mglu']==item['mglu']) & (filenamedf['mneu']==item['mneu']) ]['name'])

#select background configuration
jzmin=3
jzmax=11
bg_cfg_files=list(filenamedf[ (filenamedf['jz']>=jzmin) & (filenamedf['jz']<=jzmax) ]['name'])

In [10]:
#load additional signal files:
other_sig_cfg_files=list( filenamedf[ (filenamedf['mglu']>0.) | (filenamedf['mneu']>0.) ]['name'])
other_sig_cfg_files=[x for x in other_sig_cfg_files if x not in sig_cfg_files]

## Load data

In [11]:
#load background files
bgdf=load_data(bg_cfg_files)
#shuffle
np.random.seed(13)
bgdf=bgdf.reindex(np.random.permutation(bgdf.index))

In [12]:
#load signal data
sigdf=load_data(sig_cfg_files)
#shuffle
np.random.seed(13)
sigdf=sigdf.reindex(np.random.permutation(sigdf.index))

In [None]:
#load additional signal data
othersigdf=load_data(other_sig_cfg_files)
#shuffle
#np.random.seed(13)
#sigdf=sigdf.reindex(np.random.permutation(sigdf.index))

## Parameters

In [13]:
#parameters
train_fraction=0.75
validation_fraction=0.05
nbins=(224,224)
#nbins=(64,64)
nsig_augment=24

## Split Ensemble

In [34]:
#compute sizes:
#total
num_sig_total=sigdf.shape[0]
num_bg_total=bgdf.shape[0]

#group sigdf according to mGlu and mNeu:
siggroup=sigdf.groupby(['mGlu','mNeu'])

#training
#for signal, group according to masses and take the fraction for every theory:
trainsigdf=siggroup.apply(lambda x: x.iloc[:int(np.floor(x.shape[0]*train_fraction))])
trainsigdf.reset_index(drop=True,inplace=True)
num_sig_train=trainsigdf.shape[0]
num_bg_train=nsig_augment*num_sig_train

#validation
valsigdf=siggroup.apply(lambda x: x.iloc[int(np.floor(x.shape[0]*train_fraction))
                                        :int(np.floor(x.shape[0]*train_fraction))+int(np.floor(x.shape[0]*validation_fraction))])
valsigdf.reset_index(drop=True,inplace=True)
num_sig_validation=valsigdf.shape[0]
num_bg_validation=int(np.floor(bgdf.shape[0]*validation_fraction))

#test
testsigdf=siggroup.apply(lambda x: x.iloc[int(np.floor(x.shape[0]*train_fraction))+int(np.floor(x.shape[0]*validation_fraction)):])
testsigdf.reset_index(drop=True,inplace=True)
num_sig_test=testsigdf.shape[0]
num_bg_test=bgdf.iloc[num_bg_train+num_bg_validation:].shape[0]



#split the sets and rescale the weights
#training
traindf=pd.concat([bgdf.iloc[:num_bg_train],trainsigdf])
#rescale signal
traindf['weight'].ix[ traindf.label==1 ]/=( np.float(num_sig_train*nsig_augment)/np.float(num_sig_total) )
#rescale background
traindf['weight'].ix[ traindf.label==0 ]/= ( np.float(num_bg_train)/np.float(num_bg_total) )

#validation
validdf=pd.concat([bgdf.iloc[num_bg_train:num_bg_train+num_bg_validation],valsigdf])
#rescale signal
validdf['weight'].ix[ validdf.label==1 ]/=( np.float(num_sig_validation)/np.float(num_sig_total) )
#rescale background
validdf['weight'].ix[ validdf.label==0 ]/=( np.float(num_bg_validation)/np.float(num_bg_total) )

#test: first work on splitted sets and rescale those accordingly
testdf=pd.concat([bgdf.iloc[num_bg_train+num_bg_validation:],testsigdf])
#rescale signal
testdf['weight'].ix[ testdf.label==1 ]/=( np.float(num_sig_test)/np.float(num_sig_total) )
#rescale background
testdf['weight'].ix[ testdf.label==0 ]/=( np.float(num_bg_test)/np.float(num_bg_total) )
#now, append the rest of the test samples and sort the whole thing, because order does not matter here:
#testdf=pd.concat([testdf,othersigdf])
testdf.sort_values(by=["mGlu","mNeu"],inplace=True)
testdf.reset_index(drop=True,inplace=True)

#create iterators
hditer_train=hep_data_iterator(traindf,nbins=nbins,even_frequencies=False,augment=True,compute_max=True)
hditer_validation=hep_data_iterator(validdf,nbins=nbins,even_frequencies=False,compute_max=False)
hditer_test=hep_data_iterator(testdf,nbins=nbins,even_frequencies=False,compute_max=False)

#the preprocessing for the validation iterator has to be taken from the training iterator
#validation
hditer_validation.max_abs=hditer_train.max_abs
hditer_validation.wmax=hditer_train.wmax
#test
hditer_test.max_abs=hditer_train.max_abs
hditer_test.wmax=hditer_train.wmax

In [40]:
print "Max Weight: ",hditer_train.df['weight'].max()
print "Min Weight: ",hditer_train.df['weight'].min()
print "Median Weight: ",hditer_train.df['weight'].median()
print "Mean Weight: ",hditer_train.df['weight'].mean()
print "Sum Weight: ",hditer_train.df['weight'].sum()

Max Weight:  46786.3890602
Min Weight:  2.86535372051e-08
Median Weight:  0.0254081439478
Mean Weight:  17.3370874518
Sum Weight:  6046274.57462


## Preprocess Data

In [41]:
datadir="/global/cscratch1/sd/tkurth/atlas_dl/data_preselect_augmented"
numnodes=128

In [42]:
#print ensemble sizes and determine the chunk size
chunksize_train=int(np.ceil(2.*hditer_train.df.ix[ hditer_train.df.label==0 ].shape[0]/numnodes))
chunksize_train=int(np.floor(chunksize_train/(2*nsig_augment)))*2*nsig_augment
print "Training size: ",int(np.ceil(2.*hditer_train.df.ix[ hditer_train.df.label==0 ].shape[0])),' chunk size: ',chunksize_train
chunksize_validation=int(np.ceil(hditer_validation.num_examples/numnodes))
print "Validation size: ",hditer_validation.num_examples,' chunk size: ',chunksize_validation
chunksize_test=np.min([int(np.ceil(hditer_test.num_examples/numnodes)),60000])
print "Test size: ",hditer_test.num_examples,' chunk size: ',chunksize_test

Training size:  669598  chunk size:  5232
Validation size:  128630  chunk size:  1004
Test size:  2095260  chunk size:  16369


### Training

In [44]:
#here we have to treat background and signal separately
bgtrain=hditer_train.df.ix[ hditer_train.df.label==0 ]
sigtrain=hditer_train.df.ix[ hditer_train.df.label==1 ]
upper=int(np.floor(numnodes*chunksize_train/2))

for i in range(0,numnodes):
    
    #get background
    ilow=i*chunksize_train/2
    iup=np.min([(i+1)*chunksize_train/2,upper])
    #preprocess
    xbg,ybg,wbg,pbg,mgbg,mnbg=preprocess_data(bgtrain.iloc[ilow:iup], \
                        hditer_train.eta_range, \
                        hditer_train.phi_range, \
                        hditer_train.eta_bins, \
                        hditer_train.phi_bins)
    for c in range(3):
        xbg[:,c,:,:]/=hditer_train.max_abs[c]
    
    #get signal
    ilow=i*chunksize_train/(2*nsig_augment)
    iup=np.min([(i+1)*chunksize_train/(2*nsig_augment),upper])
    #preprocess
    xsg,ysg,wsg,psg,mgsg,mnsg=preprocess_data(sigtrain.iloc[ilow:iup], \
                        hditer_train.eta_range, \
                        hditer_train.phi_range, \
                        hditer_train.eta_bins, \
                        hditer_train.phi_bins)
    for c in range(3):
        xsg[:,c,:,:]/=hditer_train.max_abs[c]
    
    #tile the arrays
    xsg=np.tile(xsg,(nsig_augment,1,1,1))
    ysg=np.tile(ysg,(nsig_augment))
    wsg=np.tile(wsg,(nsig_augment))
    psg=np.tile(psg,(nsig_augment))
    mgsg=np.tile(mgsg,(nsig_augment))
    mnsg=np.tile(mnsg,(nsig_augment))
    #augment the x-values
    for k in range(0,xsg.shape[0]):
        xsg[k][0]=augment_data(xsg[k][0],int(np.round(hditer_train.phi_bins/8.)))
    
    #stack them together
    x=np.concatenate([xbg,xsg])
    y=np.concatenate([ybg,ysg])
    w=np.concatenate([wbg,wsg])
    p=np.concatenate([pbg,psg])
    mg=np.concatenate([mgbg,mgsg])
    mn=np.concatenate([mnbg,mnsg])
    
    #write file
    f = h5.File(datadir+'/hep_training_chunk'+str(i)+'.hdf5','w')
    f['data']=x
    f['label']=y
    f['weight']=w
    #normalize those weights for training
    f['normweight']=w/hditer_train.wmax
    f['psr']=p
    f['mg']=mg
    f['mn']=mn
    f.close()

### Test

In [48]:
#chunk it to fit it into memory
for idx,i in enumerate(range(0,hditer_test.num_examples,chunksize_test)):
    iup=np.min([i+chunksize_test,hditer_test.num_examples])
    
    #preprocess
    x,y,w,p,mg,mn=preprocess_data(hditer_test.df.iloc[i:iup], \
                        hditer_test.eta_range, \
                        hditer_test.phi_range, \
                        hditer_test.eta_bins, \
                        hditer_test.phi_bins)
    for c in range(3):
        x[:,c,:,:]/=hditer_train.max_abs[c]
    
    #write file
    f = h5.File(datadir+'/hep_test_chunk'+str(idx)+'.hdf5','w')
    f['data']=x
    f['label']=y
    f['weight']=w
    f['normweight']=w/hditer_train.wmax
    f['psr']=p
    f['mg']=mg
    f['mn']=mn
    f.close()

KeyboardInterrupt: 

### Validation

In [46]:
for idx,i in enumerate(range(0,hditer_validation.num_examples,chunksize_validation)):
    iup=np.min([i+chunksize_validation,hditer_validation.num_examples])
    
    #preprocess
    x,y,w,p,mg,mn=preprocess_data(hditer_validation.df.iloc[i:iup], \
                    hditer_validation.eta_range, \
                    hditer_validation.phi_range, \
                    hditer_validation.eta_bins, \
                    hditer_validation.phi_bins)
    for c in range(3):
        x[:,c,:,:]/=hditer_train.max_abs[c]
    
    #write the file
    f = h5.File(datadir+'/hep_validation_chunk'+str(idx)+'.hdf5','w')
    f['data']=x
    f['label']=y
    f['weight']=w
    #normalize those weights for validation to compare with training loss
    f['normweight']=w/hditer_train.wmax
    f['psr']=p
    f['mg']=mg
    f['mn']=mn
    f.close()