# Test on ```rpy2```

In [None]:
import os
import time

import numpy as np
import json

import utils as my_ut
import result_related as my_res

from math import floor
from datetime import datetime
from typing import List
from tqdm import tqdm
from sklearn.covariance import log_likelihood

## ```rpy2``` utils

In [None]:
import rpy2
import rpy2.robjects as robjects

## To aid in printing HTML in notebooks
import rpy2.ipython.html
rpy2.ipython.html.init_printing()

In [None]:
from rpy2.robjects.packages import importr

r_base = importr('base')
r_utils = importr('utils')
r_cp = importr('changepoints')
r_covcp = importr('covcp')
r_mass = importr('MASS')
r_domc = importr('doMC')
r_parallel = importr('parallel')
r_RcppCNPy = importr('RcppCNPy')
r_reticulate = importr('reticulate')
r_glasso = importr('glasso')

rprint = robjects.globalenv.find("print")

In [None]:
def get_nested_named_element_from_R_list(obj, named_elem:str):
    """Follow a ``$``-separated path through nested R list components.

    Each path component is looked up with ``rx2`` on the object reached so
    far. A component made only of digits is converted to ``int`` before the
    lookup; any other component is used as a string.

    Args:
        obj: object exposing an ``rx2`` method (e.g. an rpy2 list vector).
        named_elem: path such as ``'statistics$window2statistics$1'``.

    Returns:
        Whatever the last ``rx2`` lookup returns.
    """
    current = obj
    for component in named_elem.split('$'):
        key = int(component) if component.isdigit() else component
        current = current.rx2(key)
    return current

In [None]:
def turn_r_bool_in_py_bool(r_bool):
    """Convert an R boolean into a Python ``bool`` through its printed form.

    The first element (``rx2(1)``) is turned into a string and characters
    4 to 8 of that string are compared with ``'FALSE'``: the result is
    ``False`` only when they match exactly, ``True`` in every other case.
    """
    printed_value = str(r_bool.rx2(1))
    return printed_value[4:9] != 'FALSE'

## Sandbox on CPD algorithms

### Experiments with ```glasso```

Je peux bien travailler avec la fonction loglikelihood de scikit learn, quitte à la ré-implémenter de mon côté (bien faire attention à ce que les signes soient les bons, qu'on ne maximise pas à l'envers). Ce qu'il reste donc à faire c'est de faire les calculs de matrices de précision avec la librairie R.

In [None]:
# Load a sub-signal stored as a .npy file straight into R (RcppCNPy) and print
# the dimensions of the resulting R object.
some_signal_path = 'data_1/.temp/left_sub_signal.npy'
r_signal = r_RcppCNPy.npyLoad(some_signal_path)
print(r_base.dim(r_signal))

In [None]:
# One-off Graphical Lasso fit on the signal loaded above: the penalty comes
# from covcp's chooseRho, the covariance from covcp's Cov, and the 'wi'
# component of the glasso result is kept (named here as the inverse covariance).
pen_coef = r_covcp.chooseRho(r_signal)
emp_cov = r_covcp.Cov(r_signal)
glasso_invcov_mat = r_glasso.glasso(s=emp_cov, rho=pen_coef).rx2('wi')

In [None]:
def glasso_cost_func(start, end, signal, pen_mult_coef):
    """Graphical Lasso cost of the segment ``signal[start:end, :]``.

    The segment is handed to R through a temporary ``.npy`` file. The L1
    penalty is the first value returned by covcp's ``chooseRho`` multiplied
    by ``pen_mult_coef``; the glasso fit is run with ``maxit=10``.

    Returns:
        The opposite of scikit-learn's ``log_likelihood`` evaluated on the
        covcp covariance of the segment and the glasso ``'wi'`` matrix.
    """
    tmp_npy_path = "data_1/.temp/glasso_sub_signal_1.npy"
    # Python -> R hand-over of the segment goes through the file system
    np.save(tmp_npy_path, signal[start:end, :])
    r_segment = r_RcppCNPy.npyLoad(tmp_npy_path)
    # penalty: covcp rule scaled by the multiplicative coefficient
    base_rho = r_covcp.chooseRho(r_segment)
    scaled_rho = r_base.c(pen_mult_coef * base_rho[0])
    # R implementation of the Graphical Lasso on the segment covariance
    r_segment_cov = r_covcp.Cov(r_segment)
    r_fit = r_glasso.glasso(s=r_segment_cov, rho=scaled_rho, maxit=10)
    precision = np.asarray(r_fit.rx2('wi'))
    # sign flipped so that the value can be minimised
    return - log_likelihood(np.asarray(r_segment_cov), precision)

In [1]:
def rglasso_cpd_dynprog(n_bkps:int, min_size:int, signal, pen_mult_coef):
    """Dynamic-programming change-point detection with the Graphical Lasso cost.

    Finds the ``n_bkps`` change-points that minimise the sum of
    ``glasso_cost_func`` over the resulting segments, every segment holding at
    least ``min_size`` samples.

    Indexing convention: row ``n`` of the internal tables refers to the prefix
    ``[y_0, ..., y_{n-1}]`` of the signal and column ``K`` to a number of
    change-points; segment costs are computed over ``[start, end[``.

    Args:
        n_bkps: number of change-points to place (>= 0).
        min_size: minimal number of samples per segment (>= 1).
        signal: array whose first axis indexes the samples.
        pen_mult_coef: multiplicative coefficient forwarded to ``glasso_cost_func``.

    Returns:
        Integer array of length ``n_bkps + 1``: the change-points in increasing
        order followed by ``n_samples``.

    Raises:
        ValueError: if the arguments do not admit any segmentation, or if no
            segmentation with a finite cost exists.
    """
    n_samples = signal.shape[0]
    # Fail before the (very expensive) cost pre-computation instead of
    # backtracking through cells that were never filled.
    if n_bkps < 0:
        raise ValueError(f"n_bkps must be non-negative, got {n_bkps}")
    if min_size < 1:
        raise ValueError(f"min_size must be at least 1, got {min_size}")
    if (n_bkps + 1) * min_size > n_samples:
        raise ValueError(
            f"cannot fit {n_bkps + 1} segments of at least {min_size} samples "
            f"in a signal of {n_samples} samples"
        )

    # initialization
    # path_mat[n, K]: position of the last change-point of the best segmentation
    # of the first n samples with K change-points (-1 means "none found")
    path_mat = np.full((n_samples+1, n_bkps+1), fill_value=-1, dtype=np.int32)
    path_mat[:, 0] = 0
    path_mat[0, :] = -1
    # sum_of_cost_mat[n, K]: best cost for the signal until sample n with K bkps
    sum_of_cost_mat = np.full((n_samples+1, n_bkps+1),  fill_value=np.inf, dtype=np.float64)
    sum_of_cost_mat[0, :] = 0

    # pre-computation of the cost of every admissible segment [start, end[
    statio_segment_cost = np.full((n_samples+1, n_samples+1), fill_value=np.inf, dtype=np.float64)
    for start in tqdm(range(0, n_samples-min_size+1), desc='Looping over the segments start'):
        for end in range(start+min_size, n_samples+1):
            statio_segment_cost[start, end] = glasso_cost_func(start, end, signal, pen_mult_coef)

    # forward computation
    for end in range(min_size, n_samples+1):
        sum_of_cost_mat[end, 0] = statio_segment_cost[0, end]
        # consistent because our cost functions compute the costs over [start, end[
        max_admissible_n_bkp = floor(end/min_size) - 1
        for k_bkps in range(1, min(max_admissible_n_bkp+1, n_bkps+1)):
            soc_optim = np.inf
            soc_argmin = -1
            for mid in range(min_size*k_bkps, end - min_size + 1):
                soc = sum_of_cost_mat[mid, k_bkps-1] + statio_segment_cost[mid, end]
                # strict comparison: the first minimiser wins, NaN/inf candidates are skipped
                if soc < soc_optim:
                    soc_argmin = mid
                    soc_optim = soc
            sum_of_cost_mat[end, k_bkps] = soc_optim
            path_mat[end, k_bkps] = soc_argmin

    # backtracking
    bkps = np.full((n_bkps+1), fill_value=n_samples)
    for k_bkps in range(n_bkps, 0, -1):
        previous_bkp = path_mat[bkps[k_bkps], k_bkps]
        if previous_bkp < 0:
            # a negative index would silently wrap around to the end of the table
            raise ValueError("no segmentation with a finite cost was found")
        bkps[k_bkps-1] = previous_bkp

    return bkps

In [None]:
def run_r_glasso_cpd_algo(signal, gt_bkps: List[int], rglasso_results: dict, exp_id: int, pen_mult_coef: float):
    """Run the Graphical Lasso CPD on one signal and log the outcome in a dict.

    The number of change-points searched is ``len(gt_bkps) - 1`` and the
    minimal segment length is ``signal.shape[1]``. The entry
    ``rglasso_results[exp_id]`` receives the running time (rounded to 3
    digits), the sorted predicted change-points, the ground truth and the
    number of change-points.
    """
    n_target_bkps = len(gt_bkps) - 1
    # running CPD algorithm
    tic = time.perf_counter()
    raw_bkps = rglasso_cpd_dynprog(n_bkps=n_target_bkps, min_size=signal.shape[1], signal=signal, pen_mult_coef=pen_mult_coef)
    toc = time.perf_counter()
    raw_bkps.sort()
    # logging (plain Python ints for the predicted change-points)
    rglasso_results[exp_id] = {
        "time": round(toc - tic, ndigits=3),
        "pred": [int(bkp) for bkp in raw_bkps],
        "gt": gt_bkps,
        "n_bkps": n_target_bkps,
    }

In [None]:
def run_r_glasso_cpd_algo_and_store(signal, gt_bkps: List[int], json_path: str, exp_id: str, pen_mult_coef: float):
    """Run the Graphical Lasso CPD on one signal and write the outcome to a JSON file.

    The number of change-points searched is ``len(gt_bkps) - 1`` and the
    minimal segment length is ``signal.shape[1]``. The record (running time
    rounded to 3 digits, sorted predictions, ground truth, number of
    change-points) is passed to ``my_ut.load_and_write_json`` under the key
    ``exp_id``.
    """
    n_target_bkps = len(gt_bkps) - 1
    # running CPD algorithm
    tic = time.perf_counter()
    raw_bkps = rglasso_cpd_dynprog(n_bkps=n_target_bkps, min_size=signal.shape[1], signal=signal, pen_mult_coef=pen_mult_coef)
    toc = time.perf_counter()
    raw_bkps.sort()
    # logging (plain Python ints for the predicted change-points)
    record = {
        "time": round(toc - tic, ndigits=3),
        "pred": [int(bkp) for bkp in raw_bkps],
        "gt": gt_bkps,
        "n_bkps": n_target_bkps,
    }
    my_ut.load_and_write_json(json_path, exp_id, my_ut.turn_all_list_of_dict_into_str(record), indent=4)

In [None]:
### SYNTHETIC DATA ADAPTED
##########################
# Grid search over the multiplicative coefficient applied to the covcp L1 penalty.
# For each coefficient, every experiment of the subset is run and its result is
# written to a per-coefficient JSON file by run_r_glasso_cpd_algo_and_store.

NAME =  "ER_20_nodes_deg_10_bandwidth_0.4"
GRAPH_PATH = "data_1/synthetic_data/graphs/clean_ER_with_bandwidth"
GRAPH_NAME =  NAME  #+ f"_edge_prop_{EDGE_PROP}" #"exp_geo_20_nodes_av_deg_10" #"ER_20_nodes_deg_10_bandwidth_0.4_edge_prop_0.05" 
SIGNAL_PATH = "data_1/synthetic_data/signal/within_hyp/SNR_20_graph_lasso_calibration" #"data_1/signal/within_hyp/SNR_20_varying_segment_length"
MAX_ID_SUBSET = 12
RESULT_DIR = "results_1/synthetic/within_hypothesis_noisy/glasso_experiments/SNR_20_large_x0.4_1000_samples"
LASSO_ALPHA = "from_covcp_paper"
MIN_SEGMENT_LENGTH_COEF = 0.4
SNR = 20
BKPS_GAP_CONSTRAINT = "large"
N_BKPS = 4
# NB_BREAKDOWN = 6
# BREAKDOWN_LENGTH_LIST = [50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
# SNR_LIST = [-5, 5, 15]
# SEG_LENGTH_COEF_LIST = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
# EDGE_PROP_MODIF_LIST = [0.03, 0.05, 0.08, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
OTHER_GRAPH_SEED = 1
PEN_MULT_COEF_LIST = [2, 4]

now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# logging
graph_path = os.path.join(GRAPH_PATH, GRAPH_NAME)
graph_metadata = my_ut.open_json(f"{graph_path}/00_graphs_metadata.json")
seg_length_hyp = "minimal"
graph_rng = np.random.default_rng(OTHER_GRAPH_SEED)

for PEN_COEF in PEN_MULT_COEF_LIST:

    print(f"\n\n \t Working with multiplicative coefficient: {PEN_COEF} \n")

    # SIGNAL_NAME = f"{BKPS_GAP_CONSTRAINT}_x{MIN_SEGMENT_LENGTH_COEF}_SNR_{round(SNR, 4)}" + "_" + GRAPH_NAME 
    SIGNAL_NAME = f"{N_BKPS}_bkps_{BKPS_GAP_CONSTRAINT}_x{MIN_SEGMENT_LENGTH_COEF}_SNR_{round(SNR, 4)}_1000_samples" + "_" + GRAPH_NAME     #  _NBbd_{NB_BREAKDOWN}_bklength_{BREAKDOWN_LENGTH}"    #
    signal_path = os.path.join(SIGNAL_PATH, SIGNAL_NAME)
    signal_metadata = my_ut.open_json(f"{signal_path}/00_signal_metadata.json")

    RESULT_NAME = f"pen_coef_gridsearch_{LASSO_ALPHA}"
    final_name = SIGNAL_NAME + "_" + RESULT_NAME
    results_dir = os.path.join(RESULT_DIR, final_name)

    exp_desc = "Test of the graphical Lasso cost function: grid seach over the L1 penalty coefficient. Its value is fixed according to the fromula from the covcp paper and a multiplicative coefficient."
    experiment_metadata = {"datetime": now, "description": exp_desc, "commit hash": my_ut.get_git_head_short_hash(), "graph folder": graph_path, "graph metadata": graph_metadata, "signal folder": SIGNAL_PATH + '/' + SIGNAL_NAME, "signal metadata": signal_metadata, "min segment length hypothesis": seg_length_hyp, "max id experiment subset": MAX_ID_SUBSET, "lasso penalty coefficient": LASSO_ALPHA}

    # output formatting: the per-coefficient file is first created from an empty
    # dict, then receives one entry per experiment in the loop below
    # statio_results = {}
    # normal_results = {}
    lasso_results = {}
    name = f"lasso_pred_pencoefmult_{PEN_COEF}.json"
    json_path = os.path.join(results_dir, name)
    my_ut.create_parent_and_dump_json(results_dir, name, my_ut.turn_all_list_of_dict_into_str(lasso_results), indent=4)

    # running CPD algorithms
    # for exp_id in tqdm(range(MAX_ID_SUBSET), desc='Running experiment...'):
    for exp_id in range(MAX_ID_SUBSET):
        exp_id = str(exp_id)
        G, signal, gt_bkps, min_size = my_ut.load_data(graph_path, signal_path, exp_id, seg_length_hyp)
        # results are written to json_path inside the call; nothing is returned
        run_r_glasso_cpd_algo_and_store(signal, gt_bkps, json_path, exp_id, PEN_COEF)

        # run_r_glasso_cpd_algo(signal, gt_bkps, lasso_results, exp_id, PEN_COEF)

    # No final dump of lasso_results here: that dict is never filled by the loop
    # above, and dumping it to the same file would replace the stored
    # predictions with an empty JSON object.
my_ut.create_parent_and_dump_json(results_dir, "experiment_metadata.json", my_ut.turn_all_list_of_dict_into_str(experiment_metadata), indent=4)


In [None]:
### REAL DATA ADAPTED
##########################
# Runs the covcp-based detection (run_r_covcp_algo, defined in a later cell of
# this notebook) on one real recording and dumps the predictions to JSON.
# The Graphical Lasso run and the metadata dump are currently commented out.

# NAME =  "ER_20_nodes_deg_10_bandwidth_0.4"
GRAPH_PATH = "data/real_data/test_filtered_0.5-40_order_3_subsampled_8"
GRAPH_NAME =  "KNN_4_64_ch_graph_mat_adj_order_signal_header"
SIGNAL_PATH = "data/real_data/test_filtered_0.5-40_order_3_subsampled_8" 
RESULT_DIR = "results_1/real_data/test/filtered_0.5-40_order_3_subsampled_8"
LASSO_ALPHA = "from_covcp_paper"
seg_length_hyp = "minimal"

# GRAPH LASSO METHOD
PEN_MULT_COEF_LIST = [4]

# R COVCP METHOD
R_COVCP_SEED = 42
NB_CORES = 2
LEVEL_ALPHA = 0.3
STABLE_SET_LENGTH = 80
WINDOWS_SIZE = [80]


now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# logging
graph_path = os.path.join(GRAPH_PATH, GRAPH_NAME)
# graph_metadata = my_ut.open_json(f"{graph_path}/00_graphs_metadata.json")
# graph_rng = np.random.default_rng(OTHER_GRAPH_SEED)

# NOTE(review): PEN_COEF only enters the result directory name here; the covcp
# call below does not use it.
for PEN_COEF in PEN_MULT_COEF_LIST:

    print(f"\n\n \t Working with multiplicative coefficient: {PEN_COEF} \n")
    
    # SIGNAL_NAME = f"{BKPS_GAP_CONSTRAINT}_x{MIN_SEGMENT_LENGTH_COEF}_SNR_{round(SNR, 4)}" + "_" + GRAPH_NAME 
    SIGNAL_NAME = "volunteerS007_exp04"
    signal_path = os.path.join(SIGNAL_PATH, SIGNAL_NAME)
    # signal_metadata = my_ut.open_json(f"{signal_path}/00_signal_metadata.json")
    
    RESULT_NAME = f"running_time_test_with_mult_coef_{PEN_COEF}_{LASSO_ALPHA}"
    final_name = SIGNAL_NAME + "_" + RESULT_NAME
    results_dir = os.path.join(RESULT_DIR, final_name)

    exp_desc = "Test of the graphical Lasso cost function."
    experiment_metadata = {"datetime": now, "description": exp_desc, "commit hash": my_ut.get_git_head_short_hash(), "graph folder": "no graph", "graph metadata": "no graph", "signal folder": SIGNAL_PATH + '/' + SIGNAL_NAME}

    # output formatting
    # statio_results = {}
    # normal_results = {}
    lasso_results = {}
    covcp_results = {}

    # running CPD algorithms
    # for exp_id in tqdm(range(MAX_ID_SUBSET), desc='Running experiment...'):
    # G, signal, gt_bkps, min_size = my_ut.load_data(graph_path, signal_path, exp_id, seg_length_hyp)
    # signal = np.load(f"{signal_path}_signal.npy", allow_pickle=False).T
    # bkps = my_ut.open_json(f"{signal_path}_bkps.json")
    # run_r_glasso_cpd_algo(signal, gt_bkps, lasso_results, exp_id, PEN_COEF)
    # the stored array is transposed in R before being handed to the detector
    r_signal = r_base.t(r_RcppCNPy.npyLoad(signal_path + '_signal.npy'))
    gt_bkps = my_ut.open_json(signal_path + '_bkps.json')
    min_size = 64
    # the last argument (0) is the experiment id used as key in covcp_results
    run_r_covcp_algo(r_signal, gt_bkps, covcp_results, STABLE_SET_LENGTH, min_size, WINDOWS_SIZE, LEVEL_ALPHA, 0)

    my_ut.create_parent_and_dump_json(results_dir, F"r_covcp_level_{LEVEL_ALPHA}_stableset_{STABLE_SET_LENGTH}_adapted_windowsize_{WINDOWS_SIZE[0]}_pred.json", my_ut.turn_all_list_of_dict_into_str(covcp_results), indent=4)
    # my_ut.create_parent_and_dump_json(results_dir, F"lasso_pred_pencoefmult_{PEN_COEF}.json", my_ut.turn_all_list_of_dict_into_str(lasso_results), indent=4)

# my_ut.create_parent_and_dump_json(results_dir, "experiment_metadata.json", my_ut.turn_all_list_of_dict_into_str(experiment_metadata), indent=4)

In [None]:
### SYNTHETIC DATA ADAPTED
###########################
# Post-processing of the Graphical Lasso grid search: for every prediction
# folder and every penalty coefficient, reads the stored predictions, computes
# the metrics through my_res, and dumps them to 'metrics_<margin>.json'.

PRECI_RECALL_MARGIN = 5
res_folder_root = "results_1/synthetic/within_hypothesis_noisy/glasso_experiments/SNR_20_large_x0.4_1000_samples/4_bkps_large_x0.4_SNR_20_1000_samples_ER_20_nodes_deg_10_bandwidth_0.4_pen_coef_gridsearch_from_covcp_paper"
# file_names is only referenced by the commented-out PRED_FOLDER line below
file_names = os.listdir(res_folder_root)
# PRED_FOLDER = [os.path.join(res_folder_root, file_name) for file_name in file_names]  #'.txt' not in file_name]
PRED_FOLDER = [res_folder_root]

# NOTE(review): one 'lasso_pred_pencoefmult_<value>.json' file must exist in the
# folder for each value listed here; the grid-search cell of this notebook
# currently uses [2, 4] -- confirm the list matches the files on disk.
# pencoef_mult_list =  [1, 2, 4]
# pencoef_mult_list =  [0.6, 0.8, 1.0, 1.2, 1.4, 6, 8, 10, 12, 14, 20, 30]
# pencoef_mult_list =  [1, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 25]
# pencoef_mult_list =  [0.001, 0.01, 0.1, 1, 10, 100]
pencoef_mult_list =  [0.001, 0.5, 1, 5, 10, 20, 100]

for pred_dir in PRED_FOLDER:

    # fetching predictions
    data_stats = my_ut.open_json(f"{pred_dir}/experiment_metadata.json")

    # output formatting
    metrics_dic = {}
    metrics_dic["pred_path"] = pred_dir
    metrics_dic["hyper-parameters"] = data_stats
    metrics_dic["hyper-parameters"]["metrics_margin"] = PRECI_RECALL_MARGIN
    full_results = {}

    for pencoefmult in pencoef_mult_list:

        lasso_pencoef_pred_dic = my_ut.open_json(f"{pred_dir}/lasso_pred_pencoefmult_{pencoefmult}.json")
        # one list of raw values per metric, filled experiment by experiment
        pencoef_glasso_results = {"recall": {'raw': []}, "precision": {'raw': []}, "f1_score": {'raw': []}, "hausdorff": {'raw': []}, "assignement_cost": {'raw': []}, "time": {"raw": []}}

        for exp_id in lasso_pencoef_pred_dic.keys():
            # compute metrics
            lasso_pred_bkps = my_ut.turn_str_of_list_into_list_of_int(lasso_pencoef_pred_dic[exp_id]["pred"])
            gt_bkps = my_ut.turn_str_of_list_into_list_of_int(lasso_pencoef_pred_dic[exp_id]["gt"])
            my_res.compute_and_update_metrics(gt_bkps, lasso_pred_bkps, pencoef_glasso_results, PRECI_RECALL_MARGIN)
            my_res.compute_assignement_cost(gt_bkps, lasso_pred_bkps, pencoef_glasso_results)
            pencoef_glasso_results["time"]["raw"].append(lasso_pencoef_pred_dic[exp_id]["time"])

        full_results[pencoefmult] = pencoef_glasso_results
        
    # results post-processing and saving
    full_results = my_res.compute_and_add_stat_on_metrics(full_results)
    full_results["metadata"] = metrics_dic
    full_results = my_ut.turn_all_list_of_dict_into_str(full_results)
    my_ut.create_parent_and_dump_json(pred_dir, f'metrics_{PRECI_RECALL_MARGIN}.json', full_results, indent=4)

### Experiments with ```akopich/covcp```

Compréhension de la méthode : 

- un cpd est déclaré si, pour une des tailles de fenêtre, on atteint pour un des points centraux (de la fenêtre) une valeur de la statistique supérieure à la valeur critique (le seuil) calculée pour cette taille de fenêtre. 
- a priori les valeurs critiques sont calculées grâce au bootstrap ('data-driven approach')
- pour confirmer mon intuition sur la compréhension de la méthode :
    - aller voir dans la résolution de l'algo si je trouve d'où viennent les "critical values" pour confirmer qu'elles sont bien calculées grâce au bootstrap
    - a priori dans mon expérience on ne détecte le cp que pour la plus grande taille de fenêtre (parce que meilleure estimation ?)
        - aller voir comment évoluent les valeurs de la critical value en fonction de la taille de la fenêtre
        --> **les valeurs critiques diminuent systématiquement avec la taille des fenêtres considérées**
        - éventuellement faire des plot pour assess tout ça, et comprendre un peu mieux grâce à l'article comment fonctionne le bootstrap
    - concernant le niveau alpha :
        - comment influence les résultats : revenir à la définition pour voir si cohérent
        - voir si intervient bien dans le bootstrap au moment du calcul des valeurs critiques : censé les diminuer ou augmenter
    - comment influence la taille de l'échantillon 'stable' pour le bootstrap ?
        --> **jusque là, les expériences réalisées montrent que lorsqu'on diminue la taille du stableset, on diminue aussi les valeurs critiques**
    - comment influence la taille des segments de stationarité ?

A faire en code : 

- transformer les données python en données R (voir doc rpy2 sur le paragraphe associé)
- implémenter la détection de plusieurs ruptures (relire ce qu'ils disent dans l'article à ce sujet)

In [None]:
def get_covcp_localization(covcp_results, window_sizes):
    """Localize the change-point reported by a covcp test result, if any.

    Args:
        covcp_results: R object returned by ``r_covcp.covTest``.
        window_sizes: window sizes given to the test, in the same order.

    Returns:
        ``None`` when the test is not rejected or when no window has a
        statistic above its critical value. Otherwise a tuple
        ``(central_point, (central_point - window_size, central_point + window_size))``
        for the first window size, in the order given, whose statistic exceeds
        its critical value; ``central_point`` is the central point where the
        window's 'distances' are maximal.
    """
    # check if there is actually a cp to localize
    # A non-empty rpy2 vector is truthy whatever it contains, so the flag has
    # to be read from its first element.
    is_there_cp = r_covcp.isRejected(covcp_results)
    if not bool(is_there_cp[0]):
        return None
    # if so, return the first window in which the cp can be localized
    critical_values = covcp_results.rx2('criticalValue')
    for i, window_size in enumerate(window_sizes):
        # R lists are 1-indexed, hence i+1 in the path
        window_stat_results = get_nested_named_element_from_R_list(covcp_results, f'statistics$window2statistics${i+1}')
        max_stat = window_stat_results.rx2('statistics')[0]
        if max_stat > critical_values[i]:
            central_points = window_stat_results.rx2('centralPoints')
            stat_values_arr = np.asarray(window_stat_results.rx2('distances'))
            stats_argmx = int(np.argmax(stat_values_arr))
            central_point_argmax = central_points[stats_argmx]
            return central_point_argmax, (central_point_argmax - window_size, central_point_argmax + window_size)
    return None

In [None]:
def get_r_left_subsignal(signal: np.ndarray, split_id):
    """Return, as an R object, the rows of ``signal`` located before ``split_id``.

    A copy of the signal is sliced, saved to a temporary ``.npy`` file and
    loaded back on the R side with RcppCNPy.
    """
    tmp_npy_path = "data_1/.temp/left_sub_signal.npy"
    head_rows = np.copy(signal)[:split_id, :]
    np.save(tmp_npy_path, head_rows)
    return r_RcppCNPy.npyLoad(tmp_npy_path)


def get_r_right_subsignal(signal: np.ndarray, split_id):
    """Return, as an R object, the rows of ``signal`` from ``split_id`` onwards.

    A copy of the signal is sliced, saved to a temporary ``.npy`` file and
    loaded back on the R side with RcppCNPy.
    """
    tmp_npy_path = "data_1/.temp/right_sub_signal.npy"
    tail_rows = np.copy(signal)[split_id:, :]
    np.save(tmp_npy_path, tail_rows)
    return r_RcppCNPy.npyLoad(tmp_npy_path)

In [None]:
def detect_multiple_bkps(n_bkps, signal, stable_set_length, min_seg_length, window_sizes, alpha, bkps, left_offset=0):
    """Recursively detect up to ``n_bkps`` change-points with the covcp test.

    The covcp test is run on ``signal``; when a change-point is localized it is
    appended to ``bkps`` (shifted by ``left_offset``) and the search goes on in
    the left and right sub-signals, as long as the budget ``n_bkps`` is not
    exhausted and the sub-signal holds more than ``2*min_seg_length`` rows.

    Args:
        n_bkps: maximal number of change-points to collect in ``bkps``.
        signal: R matrix holding the samples as rows.
        stable_set_length: the bootstrap stable set is ``seq(1, stable_set_length)``.
        min_seg_length: a sub-signal is searched only if it has more than
            twice this number of rows.
        window_sizes: window sizes for the test; only the first one is used
            when deciding whether to shrink the window for a sub-signal.
        alpha: level passed to the covcp test.
        bkps: list collecting the detections; modified in place.
        left_offset: offset added to the position found in ``signal``.

    Returns:
        The ``bkps`` list received as argument.
    """
    # Budget already spent: do not run the (expensive) test at all. Without
    # this guard a sibling branch could push len(bkps) above n_bkps.
    if len(bkps) >= n_bkps:
        return bkps

    bootstrap_stableSet = r_base.seq(1, stable_set_length)
    print('Looking for a cp')
    cov_cp_stat_test = r_covcp.covTest(window_sizes, alpha, signal, r_covcp.noPattern, r_covcp.infNorm, bootstrap_stableSet)
    cov_cp_loc = get_covcp_localization(cov_cp_stat_test, window_sizes)
    if cov_cp_loc is None:
        return bkps

    # add current bkp (cast so that it can safely be used as a slice bound)
    cp = int(cov_cp_loc[0])
    bkps.append(left_offset + cp)
    if len(bkps) >= n_bkps:
        return bkps

    print(f"Detected cp: {cp}")
    # apply to left and right subsignal
    signal_arr = np.copy(np.asarray(signal))

    if cp - 1 > 2*min_seg_length:
        r_left_subsignal = get_r_left_subsignal(signal_arr, cp-1)
        # adapting window size, for this branch only
        n_left = r_base.dim(r_left_subsignal)[0]
        left_window_sizes = window_sizes
        if n_left < 2*window_sizes[0]:
            left_window_sizes = [n_left//2 - 1]
        print(f"Left window_size: {left_window_sizes}")
        detect_multiple_bkps(n_bkps, r_left_subsignal, stable_set_length, min_seg_length, left_window_sizes, alpha, bkps, left_offset=left_offset)

    # the left branch may have used up the budget in the meantime
    if len(bkps) < n_bkps and signal_arr.shape[0] - cp + 1 > 2*min_seg_length:
        r_right_subsignal = get_r_right_subsignal(signal_arr, cp-1)
        # adapting window size from the caller's value, not from the value
        # possibly shrunk for the left branch
        n_right = r_base.dim(r_right_subsignal)[0]
        right_window_sizes = window_sizes
        if n_right < 2*window_sizes[0]:
            right_window_sizes = [n_right//2 - 1]
        print(f"Right window_size: {right_window_sizes}")
        # NOTE(review): the right sub-signal starts at 0-based row cp-1 while the
        # offset is left_offset+cp -- confirm the intended indexing convention.
        detect_multiple_bkps(n_bkps, r_right_subsignal, stable_set_length, min_seg_length, right_window_sizes, alpha, bkps, left_offset=left_offset+cp)

    return bkps

In [None]:
def run_r_covcp_algo(r_signal, gt_bkps: List[int], covcp_results: dict, stable_set_length:int, min_seg_length:int, window_sizes, alpha, exp_id):
    """Run the covcp-based multiple change-point detection and log the outcome.

    ``len(gt_bkps) - 1`` change-points are searched with
    ``detect_multiple_bkps``. The entry ``covcp_results[exp_id]`` receives the
    running time (rounded to 3 digits), the sorted detections followed by the
    first dimension of ``r_signal``, the ground truth and the number of
    change-points.
    """
    n_target_bkps = len(gt_bkps) - 1
    # running CPD algorithm
    tic = time.perf_counter()
    detected = detect_multiple_bkps(n_bkps=n_target_bkps, signal=r_signal, stable_set_length=stable_set_length, min_seg_length=min_seg_length, window_sizes=window_sizes, alpha=alpha, bkps=[])
    toc = time.perf_counter()
    detected.sort()
    # the number of rows of the signal closes the list of predictions
    n_rows = r_base.dim(r_signal)[0]
    predictions = [int(bkp) for bkp in detected] + [n_rows]
    # logging
    covcp_results[exp_id] = {
        "time": round(toc - tic, ndigits=3),
        "pred": predictions,
        "gt": gt_bkps,
        "n_bkps": n_target_bkps,
    }

In [None]:
# Signal generation
# Two blocks of zero-mean multivariate normal samples (MASS::mvrnorm) drawn with
# two different covariance matrices from changepoints::gen_cov_mat, stacked
# row-wise with rbind; the printed value is the first dimension of the result.
p = 20
n_samples_1 = 400
n_samples_2 = 400
n_samples = n_samples_1 + n_samples_2
A1 = r_cp.gen_cov_mat(p, 10, "equal")
A2 = r_cp.gen_cov_mat(p, 10, "diagonal")
X = r_base.rbind(r_mass.mvrnorm(n = n_samples_1, mu = r_base.rep(0, p), Sigma = A1),
    r_mass.mvrnorm(n = n_samples_2, mu = r_base.rep(0, p), Sigma = A2)
)
print(r_base.dim(X)[0])

In [None]:
# Signal generation
# Three blocks of zero-mean multivariate normal samples stacked row-wise with
# rbind; the first and third blocks both use an "equal" covariance matrix, the
# second one a "diagonal" one.
p = 20
n_samples_1 = 400
n_samples_2 = 400
n_samples_3 = 200
n_samples = n_samples_1 + n_samples_2 + n_samples_3
A1 = r_cp.gen_cov_mat(p, 1, "equal")
A2 = r_cp.gen_cov_mat(p, 1, "diagonal")
A3 = r_cp.gen_cov_mat(p, 1, "equal")
X_mult = r_base.rbind(r_mass.mvrnorm(n = n_samples_1, mu = r_base.rep(0, p), Sigma = A1),
    r_mass.mvrnorm(n = n_samples_2, mu = r_base.rep(0, p), Sigma = A2), 
    r_mass.mvrnorm(n = n_samples_3, mu = r_base.rep(0, p), Sigma = A3)
)
print(r_base.dim(X_mult))

In [None]:
# Hyper-parameters
# windows: R vector holding the window size(s); alpha: level given to the test;
# len_stable_set: length used for the stable set in the detection cell below.
windows = r_base.c(50) 
alpha = 0.3
len_stable_set = 190

In [None]:
# Register the doMC backend with 5 cores and fix the R random seed.
r_domc.registerDoMC(cores = 5)
r_base.set_seed(42)

In [None]:
# Detection on the generated three-segment signal.
# detect_multiple_bkps requires stable_set_length in addition to min_seg_length;
# leaving it out raises a TypeError. len_stable_set is used for both, as this
# cell previously passed it as min_seg_length.
bkps = detect_multiple_bkps(n_bkps=2, signal=X_mult, stable_set_length=len_stable_set, min_seg_length=len_stable_set, window_sizes=windows, alpha=alpha, bkps=[])
bkps.sort()
print(bkps)

In [None]:
# Detection on one stored synthetic signal (experiment id 2).
path_signal_to_load = "data_1/signal/within_hyp/noisy_varying_segment_length/large_x1.0_SNR_10ER_20_nodes_deg_10_bandwidth_0.4"

r_signal = r_RcppCNPy.npyLoad(path_signal_to_load + '/2_signal.npy')
print(r_base.dim(r_signal))
gt_bkps = my_ut.open_json(path_signal_to_load + '/2_bkps.json')
n_bkps = len(gt_bkps) - 1
min_seg_length = 190
windows_sizes = r_base.c(100) 
alpha = 0.3

# detect_multiple_bkps requires stable_set_length; leaving it out raises a
# TypeError. The same length (190) is used for the stable set and for the
# minimal segment length.
pred_bkps = detect_multiple_bkps(n_bkps=n_bkps, signal = r_signal, stable_set_length=min_seg_length, min_seg_length=min_seg_length, window_sizes=windows_sizes, alpha=alpha, bkps=[])
print(pred_bkps)

In [None]:
# Sweep over the covcp window size: for each window size, every experiment of
# the subset is run with run_r_covcp_algo and the run metadata is dumped.
WINDOWS_SIZES_LIST = [[30], [50], [100], [150]]
for WINDOWS_SIZES in WINDOWS_SIZES_LIST:
    NAME =  "ER_20_nodes_deg_10_bandwidth_0.4"
    GRAPH_NAME =  NAME #"exp_geo_20_nodes_av_deg_10" #"ER_20_nodes_deg_10_bandwidth_0.4_edge_prop_0.05" 
    GRAPH_PATH =   "data_1/graphs/clean_ER_with_bandwidth"
    SIGNAL_PATH = "data_1/signal/within_hyp/noisy_varying_segment_length"
    SIGNAL_NAME =  "large_x1.0_SNR_10" + '_' + NAME
    MAX_ID_SUBSET = 0
    RESULT_DIR =  "results_1/synthetic/within_hypothesis_noisy/varying_segment_length/r_covcp_experiments" 

    R_COVCP_SEED = 42
    NB_CORES = 2
    LEVEL_ALPHA = 0.3
    STABLE_SET_LENGTH = (20*19)//2
    # run_r_covcp_algo also needs a minimal segment length; the stable set
    # length is reused, as in the single-signal test on this data set
    # (min_seg_length = 190).
    MIN_SEG_LENGTH = STABLE_SET_LENGTH
    # WINDOWS_SIZES = [100]
    RESULT_NAME = f"test_alpha{LEVEL_ALPHA}_stablesetlength{STABLE_SET_LENGTH}_windows{'-'.join([str(w_s) for w_s in WINDOWS_SIZES])}" 

    now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    final_name = SIGNAL_NAME + "_" + RESULT_NAME
    results_dir = os.path.join(RESULT_DIR, final_name)

    # logging
    graph_path = os.path.join(GRAPH_PATH, GRAPH_NAME)
    signal_path = os.path.join(SIGNAL_PATH, SIGNAL_NAME)
    graph_metadata = my_ut.open_json(f"{graph_path}/00_graphs_metadata.json")
    signal_metadata = my_ut.open_json(f"{signal_path}/00_signal_metadata.json")
    seg_length_hyp = "large"

    covcp_description = "Test on the different hyper-parameters to improve results understanding and parametrization"
    covcp_metadata = {"datetime": now, "description": covcp_description, "commit hash": my_ut.get_git_head_short_hash(), "graph folder": graph_path, "graph metadata": graph_metadata, "signal folder": SIGNAL_PATH + '/' + SIGNAL_NAME, "signal metadata": signal_metadata, "r_covcp seed": R_COVCP_SEED, "level alpha": LEVEL_ALPHA, "windows sizes": WINDOWS_SIZES, "length of the stable set": STABLE_SET_LENGTH, "nb_cores": NB_CORES, 'cpd algo func': detect_multiple_bkps.__name__}
    covcp_results = {}

    r_domc.registerDoMC(cores = NB_CORES)
    r_base.set_seed(R_COVCP_SEED)
    r_windows_sizes = r_base.c(WINDOWS_SIZES[0]) 

    # running CPD algorithms
    for exp_id in tqdm(range(MAX_ID_SUBSET), desc='Running experiment...'):
        exp_id = str(exp_id)
        r_signal = r_RcppCNPy.npyLoad(signal_path + f'/{exp_id}_signal.npy')
        gt_bkps = my_ut.open_json(signal_path + f'/{exp_id}_bkps.json')
        # the call previously lacked min_seg_length, so every later argument
        # was shifted by one position and the call raised a TypeError
        run_r_covcp_algo(r_signal, gt_bkps, covcp_results, STABLE_SET_LENGTH, MIN_SEG_LENGTH, r_windows_sizes, LEVEL_ALPHA, exp_id)

    my_ut.create_parent_and_dump_json(results_dir, "covcp_metadata.json", my_ut.turn_all_list_of_dict_into_str(covcp_metadata), indent=4)
    # my_ut.create_parent_and_dump_json(results_dir, "r_covcp_pred.json", my_ut.turn_all_list_of_dict_into_str(covcp_results), indent=4)

### Experiments with ```HaotianXu/changepoints```

In [None]:
# Inspect what changepoints::gen_cov_mat returns through rpy2: type and content
# of the whole object, then of its first element.
p = 4
A1 = r_cp.gen_cov_mat(p, 1, "equal")
print(type(A1))
print(A1)
print(type(A1[0]))
print(A1[0])

In [None]:
# Four blocks of zero-mean multivariate normal samples, each drawn with its own
# covariance matrix. Each block is transposed before the column-wise binding
# (cbind) -- presumably so that X has one column per sample; the dimensions are
# printed in the next cell.
p = 10
n_samples_1 = 300
n_samples_2 = 300
n_samples_3 = 300
n_samples_4 = 300
n_samples = n_samples_1 + n_samples_2 + n_samples_3 + n_samples_4
A1 = r_cp.gen_cov_mat(p, 10, "equal")
A2 = r_cp.gen_cov_mat(p, 10, "diagonal")
A3 = r_cp.gen_cov_mat(p, 10, "power")
A4 = r_cp.gen_cov_mat(p, 10, "power")
X = r_base.cbind(r_base.t(r_mass.mvrnorm(n = n_samples_1, mu = r_base.rep(0, p), Sigma = A1)),
r_base.t(r_mass.mvrnorm(n = n_samples_2, mu = r_base.rep(0, p), Sigma = A2)),
r_base.t(r_mass.mvrnorm(n = n_samples_3, mu = r_base.rep(0, p), Sigma = A3)),
r_base.t(r_mass.mvrnorm(n = n_samples_4, mu = r_base.rep(0, p), Sigma = A4)))

In [None]:
# Compare the dimensions of one freshly drawn block with those of the bound signal X.
print(r_base.dim(r_mass.mvrnorm(n = n_samples_1, mu = r_base.rep(0, p), Sigma = A1)))
print(r_base.dim(X))

In [None]:
# Run changepoints::BS_cov on X over the range 1..n_samples and inspect the
# returned object: its type, content and length, then the type and content of
# its first four elements.
simple_BS_cov = r_cp.BS_cov(X, 1, n_samples)

print(type(simple_BS_cov))
print(simple_BS_cov)
print('length of the BS object:', r_base.length(simple_BS_cov))
print(type(simple_BS_cov[0]))
print(simple_BS_cov[0])
print(type(simple_BS_cov[1]))
print(simple_BS_cov[1])
print(type(simple_BS_cov[2]))
print(simple_BS_cov[2])
print(type(simple_BS_cov[3]))
print(simple_BS_cov[3])

In [None]:
# Threshold the BS_cov output with changepoints::thresholdBS and inspect the
# result: its type, content and length, then its first two elements.
# NOTE(review): `threshold` is assigned but the call below passes the literal 10
# -- confirm which value is intended.
threshold = 4
threshoded_dBS = r_cp.thresholdBS(simple_BS_cov, 10)

print(type(threshoded_dBS))
print(threshoded_dBS)
print('length of the thresholded BS object:', r_base.length(threshoded_dBS))
print(type(threshoded_dBS[0]))
print(threshoded_dBS[0])
print(type(threshoded_dBS[1]))
print(threshoded_dBS[1])