In [1]:
%load_ext Cython

In [238]:
%%cython
from libc.math cimport abs, sqrt, ceil, floor, log, log2, acos, cos
from numpy cimport ndarray
import numpy as np
from numpy import ones, zeros, int32, float32, uint8, fromstring
from numpy import sort, empty, array, arange, concatenate, searchsorted
from numpy import minimum, maximum,divide, std, mean
from numpy import min as nmin
from numpy import max as nmax
import h5py as h

def binrvps_constantbins(ndarray[int, ndim=2] regions,
                    ndarray[double, ndim=1] values,
                    int binSize=1000,
                    int start=0,
                    int end=-1,
                    double dataMax=0.0,
                    double scale=1.0,
                    double threshold=0.0):
    
    '''
    Bin a region-valued data track into regular, fixed-width bins.

    Each row of `regions` is treated as a half-open interval [a, b) carrying the
    corresponding entry of `values`. Every bin accumulates value * overlap for
    each region that overlaps it, and the total is divided by the width of the
    bin, so each bin holds the overlap-weighted sum of the region values per
    unit length. Progress/debug information is printed for every region.

    Arguments:

    - regions: (M,2) shape array where each row details a region. The two
               columns may be given in either order; rows with equal
               endpoints are skipped.
    - values: (M,) shape array detailing the data track value for each of the
              given regions.
    - binSize: Positive integer. The size of each regular bin.
    - start: Coordinate at which the first bin begins. Regions lying entirely
             before start are excluded; regions straddling start are clipped
             so that only the part inside [start, end) contributes.
    - end: Coordinate (exclusive) at which the last bin ends. Regions lying
           entirely at or after end are excluded; regions straddling end are
           clipped. If negative, end is set to the largest coordinate in
           `regions` rounded down to a multiple of binSize. The last bin is
           narrower than binSize when (end - start) is not a multiple of
           binSize, and is normalised by its own width.
    - dataMax: If non-zero, the binned values are divided by
               max(|smallest bin|, largest bin, dataMax), i.e. by dataMax
               unless a binned value has a larger magnitude, in which case
               that magnitude is used instead.
    - scale: Factor by which all binned values are multiplied at the end.
    - threshold: Regions with abs(value) < threshold are excluded from the
                 binning.

    Returns:

    - hist: (nBins,) shape float array of binned values, where
            nBins = ceil((end - start) / binSize).

    Raises:

    - Exception: if `regions` and `values` have different lengths.
    - ValueError: if binSize is not positive, or if end <= start once end has
                  been resolved.
    '''

    cdef int i, p1, p2, b1, b2, b3, e, binLo, lastWidth
    cdef int nBins, n = len(values)
    cdef double f, v, vMin, vMax

    if len(regions) != n:
        data = (len(regions), n)
        raise Exception('Number of regions (%d) does not match number of values (%d)' % data)

    if binSize <= 0:
        raise ValueError('binSize must be a positive integer (got %d)' % binSize)

    if end < 0:
        end = binSize * int32(regions.max() / binSize)

    if end <= start:
        # Without this check the final-bin normalisation below divides by zero
        # (or the histogram is allocated with a non-positive length).
        raise ValueError('end (%d) must be greater than start (%d)' % (end, start))

    # Index of the last bin; end is exclusive, hence the -1.
    e = (end - start - 1) // binSize
    #Number of bins Must be at least one if e-s < binSize
    nBins = e+1
    print("##################################")
    print("##################################")
    print("Number of bins: {}".format(nBins))
    print("Bins (inclusive) = {}".format(["{}-{}".format(start + binSize*k, start + binSize*(k+1) -1) for k in range(nBins-1)] + ["{}-{}".format(start + binSize*(nBins-1), end-1)]))

    cdef ndarray[double, ndim=1] hist = zeros(nBins, float)

    for i in range(n):
        v = values[i]
        if abs(v) < threshold:
            continue

        # Accept region endpoints in either order.
        if regions[i,0] > regions[i,1]:
            p1 = regions[i,1]
            p2 = regions[i,0]

        else:
            p1 = regions[i,0]
            p2 = regions[i,1]
        
        if p1 == p2:
            continue
        
        print("##################################")
        print("Region: [{},{})".format(p1,p2))
        
        #If the start of the pair is greater than or equal to our
        #end then we dont care about it
        if end <= p1:
            continue

        # Regions are half-open, so one that ends exactly at start has no
        # overlap with [start, end) either.
        if start >= p2:
            continue
            
        if p1 < start:
            p1 = start
    
        if p2 >= end:
            p2 = end

        print("Clipped Region: [{},{})".format(p1,p2))
        # After clipping start <= p1 < p2 <= end, so both offsets are >= 0.
        b1 = (p1 - start) // binSize
        b2 = (p2 - start - 1) // binSize
    
        print("Clipped Region bins: {}-{} inclusive".format(b1,b2))
        if b1 == b2:
            # Region lies inside a single bin: its whole length counts.
            f = <double> (p2-p1)
            hist[b1] += v*f
            print("************")
            print("bin={} ({}-{} inclusive)".format(b1,start + (b1)*binSize,
                                                    start + (b1+1)*binSize - 1))
            
            print("Overlaps = {}\n".format(list(range(p1, p2))))
        else:
            for b3 in range(b1, b2+1):
                if b3 > nBins-1:
                    break

                # Absolute coordinate at which bin b3 begins.
                binLo = start + b3*binSize

                if b3 * binSize < p1 - start:
                    # Leading partial bin: from p1 up to the end of the bin.
                    f = <double> (binLo + binSize - p1)
                    print("************")
                    print("bin={} ({}-{} inclusive)\nregion = [{},{})\nf={}".format(b3,
                                                                                    binLo,
                                                                                    binLo + binSize - 1,
                                                                                    p1,
                                                                                    p2,
                                                                                    f))
                    print("Overlaps = {}\n".format(list(range(p1, p1 + <int> f))))

                elif (b3+1) * binSize > p2 - start:
                    # Trailing partial bin. The comparison must be made in
                    # start-relative coordinates; comparing against the
                    # absolute p2 over-counts this bin whenever start != 0.
                    f = <double> (p2 - binLo)
                    print("************")
                    print("bin={} ({}-{} inclusive)\nregion = [{},{})\nf={}\n".format(b3,
                                                                                      binLo,
                                                                                      binLo + binSize - 1,
                                                                                      p1,
                                                                                      p2,
                                                                                      f))
                    print("Overlaps = {}\n".format(list(range(binLo, binLo + <int> f))))
                else:
                    # Bin fully covered by the region.
                    f = <double> binSize
                    print("************")
                    print("bin={} ({}-{} inclusive)\nregion = ({},{})\nf={}\n".format(b3,
                                                                                      binLo,
                                                                                      binLo + binSize - 1,
                                                                                      p1,
                                                                                      p2,
                                                                                      f))
                    print("Overlaps = {}\n".format(list(range(binLo, binLo + <int> f))))
                hist[b3] += v * f

    hist /= binSize
    # The last bin may be narrower than binSize; rescale it so that it is
    # normalised by its own width. The cast forces floating-point division
    # regardless of the Cython language level.
    lastWidth = end - start - binSize*(nBins-1)
    hist[nBins-1] *= (<double> binSize) / lastWidth
    
    if dataMax != 0.0:
        vMin = hist[0]
        vMax = hist[0]

        for i in range(1, nBins):
            if hist[i] < vMin:
                vMin = hist[i]

            elif hist[i] > vMax:
                vMax = hist[i]

        vMax = max(abs(vMin), vMax, dataMax)

        if vMax > 0.0:
            for i in range(0, nBins):
                hist[i] = hist[i]/vMax

    for i in range(0, nBins):
        hist[i] = hist[i] * scale

    return hist

In [239]:
# Example input: four regions (one row each), every one carrying a value of 1.
regions = np.array([[2, 7], [10, 22], [7, 17], [21, 100]], dtype='int32')
values = np.ones(4, dtype='double')

In [241]:
# Bin the example regions into bins of width 5 between 0 and 57.
binrvps_constantbins(regions, values, binSize=5, start=0, end=57)

##################################
##################################
Number of bins: 12
Bins (inclusive) = ['0-4', '5-9', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-56']
##################################
Region: [2,7)
Clipped Region: [2,7)
Clipped Region bins: 0-1 inclusive
************
bin=0 (0-4 inclusive)
region = [2,7)
f=3.0
Overlaps = [2, 3, 4]

************
bin=1 (5-9 inclusive)
region = [2,7)
f=2.0

Overlaps = [5, 6]

##################################
Region: [10,22)
Clipped Region: [10,22)
Clipped Region bins: 2-4 inclusive
************
bin=2 (10-14 inclusive)
region = (10,22)
f=5.0

Overlaps = [10, 11, 12, 13, 14]

************
bin=3 (15-19 inclusive)
region = (10,22)
f=5.0

Overlaps = [15, 16, 17, 18, 19]

************
bin=4 (20-24 inclusive)
region = [10,22)
f=2.0

Overlaps = [20, 21]

##################################
Region: [7,17)
Clipped Region: [7,17)
Clipped Region bins: 1-3 inclusive
************
bin=1 (5-9 inclusive)
regi

array([0.6, 1. , 2. , 1.4, 1.2, 1. , 1. , 1. , 1. , 1. , 1. , 1. ])

In [237]:
# Scratch cell: the integers 1..11 written out in rows of five. Each line is a
# bare expression, so only the value of the last one (11) is displayed.
1,2,3,4,5
6,7,8,9,10
11

11