In [1]:
# Importing packages
import sys

sys.path.append('..')

import numpy as np
import numba as nb
import pandas as pd

from src.pychange.costs.normal import NormalMeanCost, NormalVarCost, NormalMeanVarCost
# from src.pychange.costs.poisson import PoissonMeanVarCost
# from src.pychange.costs.exp import ExponentialMeanVarCost
# from src.pychange.costs.gamma import GammaMeanVarCost
# from src.pychange.costs.emp import EmpiricalCost
# from src.pychange.seg.amoc import AmocSeg
# from src.pychange.seg.binseg import BinSeg
from src.pychange.seg.pelt import PeltSeg
from src.pychange.penalties import mbic_penalty
from src.pychange.seg.base import BaseSeg, seg_sig

from src.pychange.r import ROfflineChangepoint

Unable to determine R home: [WinError 2] The system cannot find the file specified
Unable to determine R library path: Command '('C:\\Program Files\\R\\R-4.1.1\\bin\\Rscript', '-e', 'cat(Sys.getenv("LD_LIBRARY_PATH"))')' returned non-zero exit status 1.


In [18]:
data = np.hstack([
    np.random.normal(0, 1, (1000,)),
    np.random.poisson(10, (1000,)),
    np.random.normal(20, 1, (1000,)),
    np.random.poisson(1, (1000,)),
    np.random.normal(4, 1, (1000,)),
])

In [3]:
# Importing packages
import math

import numpy as np
import numba as nb

from src.pychange.costs.base import BaseCost, preprocess_sig, cost_sig

In [26]:
@nb.njit(cost_sig, fastmath=True)
def empirical_cost(s, e, y, cost_args):
    cost = 0.0
    k = int(cost_args[0])
    for j in range(k):
        a_sum = y[e, j] - y[s, j]
        if a_sum != 0.0:
            n = e - s
            a_half = 0.5 * a_sum
            if a_half != n:
                f = a_half / n
                fi = 1.0 - f
                l = f * math.log(f) + fi * math.log(fi)
                cost += n * l
    cost *= cost_args[1]
    return cost


@nb.njit(cost_sig, fastmath=True)
def empirical_cost_mbic(s, e, y, cost_args):

    cost = empirical_cost(s, e, y, cost_args)
    return cost + math.log(e - s)


class EmpiricalCost(BaseCost):
    
    n_params = 1
    
    def __init__(self, *args, k: int=10, **kwargs):
        super().__init__(*args, **kwargs)
        self.args = np.float64([k])
    
    @staticmethod
    @nb.njit(preprocess_sig, fastmath=True)
    def preprocess(y, args):

        # Getting some needed scalars
        n = y.shape[0]
        k = int(args[0])
        yK = -1 + (2 * (np.arange(k) -1) / k)
        c = -1.0 * np.log(2 * n - 1)
        pK = 1 / (1 + np.exp(c * yK))

        # Initializing array to hold partial sum values
        sumstats = np.zeros(shape=(n + 1, k), dtype=np.float64)
        y_sort = np.sort(y)

        # Iterating over quantiles
        for i in range(k):
            j = int((n - 1) * pK[i] + 1)
            sumstats[1:, i] = np.cumsum(y < y_sort[j]) + 0.5 * np.cumsum(y == y_sort[j])
    
        return sumstats, np.float64([k, 2 * c / k])

    cost_fn = staticmethod(empirical_cost)
    cost_fn_mbic = staticmethod(empirical_cost_mbic)


In [20]:
%timeit PeltSeg(EmpiricalCost(k=10), mbic_penalty, min_len=10, max_cps=10).fit(data).predict()

254 ms ± 171 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
%timeit ROfflineChangepoint('np', penalty='MBIC', method='PELT', nquantiles=10, minseglen=10).fit(data).predict()

1.38 s ± 6.78 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [27]:
PeltSeg(EmpiricalCost(k=10), mbic_penalty, min_len=10, max_cps=10).fit(data).predict()

array([1000, 2000, 3000, 4001], dtype=int64)

In [28]:
ROfflineChangepoint('np', penalty='MBIC', method='PELT', nquantiles=10, minseglen=10).fit(data).predict()

array([1000, 2000, 3000, 4000], dtype=int64)

In [5]:
import timeit

In [6]:
py_times = timeit.repeat('PeltSeg(NormalMeanVarCost(), bic_penalty, min_len=10, max_cps=10).fit(data).predict()', repeat=10, number=20, globals=globals())

In [7]:
r_times = timeit.repeat("ROfflineChangepoint('meanvar', penalty='BIC', method='PELT', test_stat='Normal', minseglen=10, Q=10).fit(data).predict().astype(np.int64)", repeat=10, number=20, globals=globals())

In [9]:
for i in ['Mean', 'Var', 'MeanVar']:

    

    py_times = timeit.repeat(f'PeltSeg(Normal{i}Cost(), bic_penalty, min_len=10, max_cps=10).fit(data).predict()', repeat=10, number=1, globals=globals())
    r_times = timeit.repeat(f"ROfflineChangepoint('{i.lower()}', penalty='BIC', method='PELT', test_stat='Normal', minseglen=10, Q=10).fit(data).predict().astype(np.int64)", repeat=10, number=20, globals=globals())

    py_mean = np.mean(py_times)
    r_mean = np.mean(r_times)
    print(i)
    print(py_mean)
    print(r_mean)
    print(r_mean / py_mean)

Mean
5.001938950000005
1.4876493899999956
0.2974145436141307


KeyboardInterrupt: 

In [6]:
%timeit PeltSeg(NormalMeanVarCost(), bic_penalty, min_len=10, max_cps=10).fit(data).predict()

64.4 ms ± 3.58 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [6]:
%timeit ROfflineChangepoint('meanvar', penalty='BIC', method='PELT', test_stat='Normal', minseglen=10, Q=10).fit(data).predict().astype(np.int64)

143 ms ± 8.71 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [3]:
data_name = 'wave_c44137'
data = pd.read_csv(f'../data/{data_name}.csv').iloc[:, -1].values

In [228]:
data = np.hstack([
    np.random.normal(0, 1, size=(200,)),
    np.random.normal(10, 2, size=(100,))
]).astype(np.float64)

In [14]:
cps1 = PeltSeg(NormalMeanVarCost(), bic_penalty, min_len=1000, max_cps=10).fit(data).predict()
cps2 = ROfflineChangepoint('meanvar', penalty='BIC', method='PELT', test_stat='Normal', minseglen=1000, Q=10).fit(data).predict().astype(np.int64)
print(len(cps1))
print(len(cps2))
if len(cps1) == len(cps2):
    print(np.all(cps1 == cps2))
print(cps1, cps2)

47
48
[ 2230  3506  4791  5793  6800  7962  9843 11567 12569 14415 15713 16897
 17952 19176 20198 21373 22558 23800 24928 25930 27403 29342 30949 32121
 33879 35145 36406 37746 38762 40040 41049 42873 43915 45433 46435 48166
 49468 50943 51952 52988 54008 55162 56507 57567 59563 61021 62023] [ 1236  2236  3506  4793  5793  6800  7962  9843 11569 12569 14415 15713
 16713 17952 19176 20198 21373 22558 23800 24928 25928 27403 29342 30949
 32121 33879 35145 36406 37746 38762 40040 41040 42873 43915 45435 46435
 48166 49468 50943 51952 52988 54008 55162 56507 57567 59563 61021 62021]


In [18]:
%timeit BinSeg(PoissonMeanVarCost(), bic_penalty, min_len=5, max_cps=100).fit(data).predict()

169 ms ± 19.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [19]:
%timeit ROfflineChangepoint('meanvar', penalty='BIC', method='BinSeg', test_stat='Poisson', minseglen=5, Q=100).fit(data).predict()

247 ms ± 8.51 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [1]:
# Importing packages
import sys

sys.path.append('..')

import numpy as np
import pandas as pd
import numba as nb

from src.pychange.costs.normal import NormalMeanCost
from src.pychange.r import ROfflineChangepoint

from src.pychange.seg.base import seg_sig

Unable to determine R home: [WinError 2] The system cannot find the file specified
Unable to determine R library path: Command '('C:\\Program Files\\R\\R-4.0.5\\bin\\Rscript', '-e', 'cat(Sys.getenv("LD_LIBRARY_PATH"))')' returned non-zero exit status 1.


In [2]:
data_name = 'wave_c44137'
data = pd.read_csv(f'../data/{data_name}.csv').iloc[:, -1].values

data = np.hstack([
    np.random.normal(0, 1, (200,)),
    np.random.normal(5, 1, (200,)),
    np.random.normal(10, 1, (200,)),
    np.random.normal(-2, 1, (200,)),
])

max_cps = 5
min_len = 5
n = data.shape[0]
cost = NormalMeanCost().fit(data)
pen = 10.0

In [3]:
ROfflineChangepoint('mean', penalty='Manual', method='BinSeg', test_stat='Normal', minseglen=5, pen_value=10.0, Q=5).fit(data).predict()

array([200., 400., 600.])

In [7]:
def seg_fn(cost, sumstats, cost_args, penalty, min_len, max_cps, n):

    # Initializing our array of changepoint candidates and costs
    tau = np.empty((max_cps + 2,), dtype=np.int64)
    tau[:2] = np.int64([0, n])
    costs = np.empty((n,), dtype=np.float64)
    
    # Initializing array of found changepoints and their associated costs
    cps = np.empty((max_cps,), dtype=np.int64)
    cps_costs = np.zeros((max_cps,), dtype=np.float64)
    
    # Iterating to a max depth of our max changepoint limit
    for q in range(max_cps):
        
        # Setting all costs back to zero for this run
        costs[:] = 0.0
        
        # Iterating over each of the current segments
        for ind in range(q + 1):
            
            # Setting start and end
            start = tau[ind] + 1
            end = tau[ind + 1]
            
            # Adjusting for min len
            start_ind = start + min_len
            end_ind = end - min_len + 1
            
            # Getting our null cost
            null = cost(start, end, sumstats, cost_args)
            costs[start_ind: end_ind] = null
            
            # Iterating over candidate points
            for j in range(start_ind, end_ind):
                costs[j] -= cost(start, j, sumstats, cost_args)
                costs[j] -= cost(j, end, sumstats, cost_args)
            
            # Halving vlaues
            costs[start_ind: end_ind] *= -0.5

        # Finding the best changepoint candidate from this run
        best_ind = np.argmin(costs)
        best_cost = costs[best_ind]
        cps[q] = best_ind
        
        # If better than the previous best cost, add to change point list
        if best_cost < cps_costs[q]:
            cps_costs[q] = best_cost
            
        # Adding changepoint to our list of endpoints
        tau[q + 2] = best_ind
        tau[: q + 3] = np.sort(tau[: q + 3])

    # Pruning changepoints by penalty
    n_cps = np.sum(cps_costs <= -1 * penalty)
    if n_cps == 0:
        cps = np.int64([-1])
    else:
        cps = np.sort(cps[:n_cps])
    return cps

In [8]:
seg_fn(cost.cost_fn, cost.sumstats, cost.cost_args, pen, min_len, max_cps, n)

array([200, 400, 600], dtype=int64)

In [16]:
%timeit binary_segment(cost.cost_fn, cost.sumstats, cost.cost_args, pen, min_len, max_cps, n)

270 µs ± 7.81 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [9]:
%timeit ROfflineChangepoint('mean', penalty='Manual', method='BinSeg', test_stat='Normal', minseglen=5, pen_value=10.0, Q=5).fit(data).predict()

1.76 ms ± 369 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
