## Jupyter notebook illustrating routines for processing HDP-SLDS segmented trajectory data for MLE on segments  
### (create segmented data via something like "pullCPdataAndSave2019.m" after an HDP-SLDS run on the full trajectory)

Copyright 2018 Ursa Analytics, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0

 

In [4]:
%matplotlib inline 
import matplotlib.font_manager as font_manager
import matplotlib.pyplot as plt
import numpy as np
import commands,glob2,os
import re,h5py

In [2]:
#define aux functions
def rle(inarray):
    """Run-length encode a sequence.

    The input is converted with ``np.asarray``, so plain Python lists work
    as well as numpy arrays, and the labels may be of any comparable type
    (booleans, numbers, strings, ...).

    Returns a tuple ``(runlengths, startpositions, values)``; for an empty
    input the tuple is ``(None, None, None)``.

    Sample call
    >>> length,pos,val = rle([True, True, True, False, True, False, False])
    Out: length <- array([3, 1, 1, 2])
         pos    <- array([0, 3, 4, 5])
         val    <- array([ True, False,  True, False], dtype=bool)

    Compared with just locating changepoints, the encoding also gives the
    length and label of every run, e.g.
    rle(["foo", "bar", "bar", "blah", "blah", "blah", "bleh"])
    """
    values = np.asarray(inarray)
    total = len(values)
    if total == 0:
        return (None, None, None)

    # True wherever an element differs from its predecessor (string safe).
    changed = np.array(values[1:] != values[:-1])
    # Index of the last element of every run; the final element always
    # closes a run, so it is appended explicitly.
    run_ends = np.append(np.where(changed), total - 1)
    # Prepending -1 gives the "end" of an imaginary run before the data, so
    # consecutive differences are the run lengths and each run starts one
    # past the previous end.
    boundaries = np.append(-1, run_ends)
    run_lengths = np.diff(boundaries)
    run_starts = boundaries[:-1] + 1
    # The element at each run end carries the label of that run.
    return (run_lengths, run_starts, values[run_ends])

def loadh5(floc,dsetnameIndex=0,printFile=False,printDsets=False):
    """Load numeric data from the top level of an HDF5 file.

    Parameters
    ----------
    floc : str
        Path of the HDF5 file to read.  A leading '~' is expanded to the
        current user's home directory.
    dsetnameIndex : int or list/tuple of int
        Position(s) of the dataset(s) to read within the list of top-level
        keys of the file.  Passing a list reads a whole batch with a single
        open and a single key lookup (the key lookup is costly for big
        files).
    printFile : bool
        If True, print the file name and the name(s) of the selected
        dataset(s).
    printDsets : bool
        If True, no data is read; the number of top-level datasets is
        returned instead.  The dataset names are only listed when
        printFile is also True.

    Returns
    -------
    int
        Number of top-level keys, when printDsets is True.
    list
        One entry per requested index, when dsetnameIndex is a list/tuple.
    numpy.ndarray
        The selected dataset, otherwise (get its shape with g.shape).
    """
    # Expand the standard unix home shortcut for whoever runs the code
    # instead of assuming one particular login.
    floc = os.path.expanduser(floc)
    batch = isinstance(dsetnameIndex, (list, tuple))

    # The context manager closes the file on every exit path, including the
    # early return below and any exception raised while reading.
    with h5py.File(floc, 'r') as f:
        # Materialise the keys once: a list is indexable on both Python 2
        # and Python 3 (where keys() is only a view).
        dsetNames = list(f.keys())

        if printFile:
            if batch:
                selected = ', '.join(dsetNames[k] for k in dsetnameIndex)
            else:
                selected = dsetNames[dsetnameIndex]
            print('Read file: ' + floc)
            print('Pulling Dataset index ' + str(dsetnameIndex) + ' Named: ' + selected)

        if printDsets:
            if printFile:
                print('Displaying top level dsets in hdf5 file since printDset set to True')
                print('Found N=  ' + str(len(dsetNames)) + ' top level dsets (returning integer vs. any data since printDset=True)')
                for name in dsetNames:
                    print(name)
            return len(dsetNames)

        if batch:
            # dset[()] reads the full dataset into memory; it replaces the
            # ".value" accessor that newer h5py releases no longer provide.
            g = [f[dsetNames[k]][()] for k in dsetnameIndex]
        else:
            # Copy into a numpy array before the file is closed.
            g = np.array(f[dsetNames[dsetnameIndex]])
    return g

def wouth5(floc,DATAMAT,dset='dataset0'):
    """Write one array into an HDF5 file.

    Parameters
    ----------
    floc : str
        Path of the HDF5 file.  It is opened in append mode ('a'), so
        repeated calls with different names collect several datasets in
        the same file.  A leading '~' is expanded to the current user's
        home directory.
    DATAMAT : numpy.ndarray
        Data to store (other datatypes are fairly easy to handle with
        h5py, but a numpy array is assumed here).
    dset : str
        Name of the dataset to create.  NOTE(review): h5py is expected to
        raise if a dataset with this name already exists in the file, e.g.
        when a run is repeated on the same output file - confirm.

    Returns
    -------
    int
        Always 0.
    """
    # Expand the standard unix home shortcut for whoever runs the code
    # instead of assuming one particular login.
    floc = os.path.expanduser(floc)
    # The context manager closes the file even when create_dataset raises.
    with h5py.File(floc, 'a') as f:
        f.create_dataset(dset, data=DATAMAT)
    return 0

In [11]:


# Collect the per-trajectory segmentation files.  (Earlier versions of this
# cell were run from inside the output folder of pullCPdataAndSave.m and
# globbed './*.h5'; the folder is now explicit to make this more general.)
segmentFolder = '../MaySegmentation/output'
datadir= segmentFolder + '/*.h5'
output = glob2.glob(datadir)
# Sort numerically (1, 2, ..., 10 rather than 1, 10, 2, ...) on the first
# number in the file name itself, so digits in a folder name cannot
# influence the order.
output=sorted(output, key=lambda x:float(re.findall(r"(\d+)",os.path.basename(x))[0]))
# NOTE(review): no path separator is inserted here, so the results go to
# '../MaySegmentation/outputcp2020.h5' (next to the folder, not inside it).
# Left unchanged because a file inside the folder would be picked up by the
# glob above on a re-run - confirm the intended location.
h5outname = segmentFolder + 'cp2020.h5'
sizeCut = 70 #only output segments longer than this parameter for MLE analysis

print(output)
traj=0
for i in output:

    X=loadh5(i)
    traj+=1
    nsegsi=0
    # The last column holds the segment label of each row; consecutive rows
    # with the same label form one segment.
    length,pos,val = rle(X[:,-1])
    if length is None: #rle returns None for a file without rows: nothing to export
        continue
    for tx,leni in enumerate(length):
        if leni>sizeCut:
            segi=X[pos[tx]:pos[tx]+leni,0:-1] #truncate off the label info since routine expects frame,time,x,y info
            nsegsi+=1
            # Dataset name: <trajectory counter>_<kept-segment counter>
            tagi=str(traj)+'_'+str(nsegsi)
            print('processing',i,tagi)
            wouth5(h5outname,segi,dset=tagi)
            
        
    

['../MaySegmentation/output/1.h5', '../MaySegmentation/output/2.h5', '../MaySegmentation/output/3.h5', '../MaySegmentation/output/4.h5', '../MaySegmentation/output/5.h5', '../MaySegmentation/output/6.h5', '../MaySegmentation/output/7.h5', '../MaySegmentation/output/8.h5', '../MaySegmentation/output/9.h5', '../MaySegmentation/output/10.h5', '../MaySegmentation/output/11.h5', '../MaySegmentation/output/12.h5', '../MaySegmentation/output/13.h5', '../MaySegmentation/output/14.h5', '../MaySegmentation/output/15.h5', '../MaySegmentation/output/16.h5', '../MaySegmentation/output/17.h5', '../MaySegmentation/output/18.h5', '../MaySegmentation/output/19.h5', '../MaySegmentation/output/20.h5', '../MaySegmentation/output/21.h5', '../MaySegmentation/output/22.h5', '../MaySegmentation/output/23.h5', '../MaySegmentation/output/24.h5', '../MaySegmentation/output/25.h5', '../MaySegmentation/output/26.h5', '../MaySegmentation/output/27.h5', '../MaySegmentation/output/28.h5', '../MaySegmentation/output/2