# Test on ```rpy2```

In [1]:
import os
import time

import numpy as np
import json

import utils as my_ut
import result_related as my_res

from datetime import datetime
from typing import List
from tqdm import tqdm

## ```rpy2``` utils

In [2]:
import rpy2
import rpy2.robjects as robjects

## To aid in printing HTML in notebooks
import rpy2.ipython.html
rpy2.ipython.html.init_printing()

- Project '~/dev/graph-signals-change-point-detection' loaded. [renv 1.0.7]

NOTE: Dependency discovery took 41 seconds during snapshot.
Consider using .renvignore to ignore files, or switching to explicit snapshots.
See `?renv::dependencies` for more information.



In [3]:
from rpy2.robjects.packages import importr

# Load the R packages this notebook drives through rpy2. Each `importr` call
# returns a Python handle through which the package's R functions are called.
r_base = importr('base')
r_utils = importr('utils')
r_cp = importr('changepoints')  # used below for gen_cov_mat, BS_cov, thresholdBS
r_covcp = importr('covcp')  # used below for covTest, isRejected, noPattern, infNorm
r_mass = importr('MASS')  # used below for mvrnorm
r_domc = importr('doMC')  # used below for registerDoMC
r_parallel = importr('parallel')
r_RcppCNPy = importr('RcppCNPy')  # used below for npyLoad (reads .npy files into R objects)
r_reticulate = importr('reticulate')

# Handle on R's `print`, looked up from the R global environment.
rprint = robjects.globalenv.find("print")


    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    

In [4]:
def get_nested_named_element_from_R_list(obj, named_elem:str):
    """Follow a `$`-separated path through nested R list-like objects.

    Parameters
    ----------
    obj
        Object exposing an `rx2` accessor (e.g. an rpy2 list vector).
    named_elem
        Path such as ``'statistics$window2statistics$1'``. Components made only
        of digits are converted to `int` before being handed to `rx2`; every
        other component is passed as a string.

    Returns
    -------
    The object reached after applying `rx2` once per path component.
    """
    current = obj
    for key in named_elem.split('$'):
        # digit-only components are positional selectors, the others are names
        selector = int(key) if key.isdigit() else key
        current = current.rx2(selector)
    return current

In [5]:
def turn_r_bool_in_py_bool(r_bool):
    """Convert an R logical (accessed through `rx2`) into a Python bool.

    The first element, `r_bool.rx2(1)`, is converted to its string form and
    characters 4 to 8 of that string are compared with ``'FALSE'``.
    Anything other than ``'FALSE'`` at that position yields True.

    NOTE(review): assumes the string form looks like ``[1] FALSE`` — confirm
    against the rpy2 version in use.
    """
    printed = str(r_bool.rx2(1))
    flag_text = printed[4:9]
    return flag_text != 'FALSE'

## Sandbox on CPD algorithms

### Experiments with ```akopich/covcp```

Compréhension de la méthode :

- un point de rupture (cp) est déclaré si, pour une des tailles de fenêtre, on atteint, pour un des points centraux (de la fenêtre), une valeur de la statistique supérieure à la valeur critique (le seuil) calculée pour cette taille de fenêtre.
- a priori les valeurs critiques sont calculées grâce au bootstrap ('data-driven approach')
- pour confirmer mon intuition sur la compréhension de la méthode :
    - aller voir dans la résolution de l'algo si je trouve d'où viennent les "critical values" pour confirmer qu'elles sont bien calculées grâce au bootstrap
    - a priori, dans mon expérience, on ne détecte le cp que pour la plus grande taille de fenêtre (parce que meilleure estimation ?)
        - aller voir comment évoluent les valeurs de la critical value en fonction de la taille de la fenêtre
        --> **les valeurs critiques diminuent systématiquement avec la taille des fenêtres considérées**
        - éventuellement faire des plot pour assess tout ça, et comprendre un peu mieux grâce à l'article comment fonctionne le bootstrap
    - concernant le niveau alpha :
        - comment influence les résultats : revenir à la définition pour voir si cohérent
        - voir si intervient bien dans le bootstrap au moment du calcul des valeurs critiques : censé les diminuer ou augmenter
    - comment influence la taille de l'échantillon 'stable' pour le bootstrap ?
        --> **jusque là, les expériences réalisées montrent que lorsqu'on diminue la taille du stableset, on diminue aussi les valeurs critiques**
    - comment influence la taille des segments de stationarité ?

A faire en code : 

- transformer les données python en données R (voir doc rpy2 sur le paragraphe associé)
- implémenter la détection de plusieurs ruptures (relire ce qu'ils disent dans l'article à ce sujet)

In [6]:
def get_covcp_localization(covcp_results, window_sizes):
    """Locate the change point reported by a covcp test result, if any.

    Parameters
    ----------
    covcp_results
        R object returned by `r_covcp.covTest`, read through `rx2`.
    window_sizes
        Iterable of window sizes. The i-th size is matched with entry ``i+1``
        of ``statistics$window2statistics`` and with ``criticalValue[i]``
        (assumes covcp stores both in the order the sizes were given — TODO confirm).

    Returns
    -------
    tuple or None
        ``(cp, (cp - window_size, cp + window_size))`` for the first window size
        whose statistic exceeds its critical value, where `cp` is the central
        point with the largest distance. None when the test is not rejected or
        when no window exceeds its critical value.
    """
    # `isRejected` comes back as an R logical vector. An rpy2 vector is truthy as
    # soon as it is non-empty, so `not vector` never fires: test its first element.
    rejected = r_covcp.isRejected(covcp_results)
    if not rejected[0]:
        return None
    # Scan the windows in the given order and stop at the first one that rejects.
    critical_values = covcp_results.rx2('criticalValue')
    for i, window_size in enumerate(window_sizes):
        window_stats = get_nested_named_element_from_R_list(covcp_results, f'statistics$window2statistics${i+1}')
        max_stat = window_stats.rx2('statistics')[0]
        if max_stat > critical_values[i]:
            central_points = window_stats.rx2('centralPoints')
            distances = np.asarray(window_stats.rx2('distances'))
            best = int(np.argmax(distances))
            cp = central_points[best]
            return cp, (cp - window_size, cp + window_size)
    # Rejected overall but no individual window exceeded its critical value.
    return None

In [7]:
def get_r_left_subsignal(signal: np.ndarray, split_id):
    """Return, as an R object, the rows of `signal` located before `split_id`.

    The slice is written to a fixed temporary .npy file with numpy and read
    back with RcppCNPy's `npyLoad`; the file is overwritten on every call.
    """
    tmp_path = "data_1/.temp/left_sub_signal.npy"
    np.save(tmp_path, np.copy(signal)[:split_id, :])
    return r_RcppCNPy.npyLoad(tmp_path)


def get_r_right_subsignal(signal: np.ndarray, split_id):
    """Return, as an R object, the rows of `signal` from `split_id` onwards.

    The slice is written to a fixed temporary .npy file with numpy and read
    back with RcppCNPy's `npyLoad`; the file is overwritten on every call.
    """
    tmp_path = "data_1/.temp/right_sub_signal.npy"
    np.save(tmp_path, np.copy(signal)[split_id:, :])
    return r_RcppCNPy.npyLoad(tmp_path)

In [8]:
def detect_multiple_bkps(n_bkps, signal, min_seg_length, window_sizes, alpha, bkps, left_offset=0):
    """Recursively detect at most `n_bkps` change points with covcp.

    A covcp test is run on `signal`. If a change point is localized it is
    recorded, the signal is split at that point, and the procedure is applied
    again to each side that is long enough.

    Parameters
    ----------
    n_bkps
        Maximum number of change points to collect in `bkps`.
    signal
        R matrix (rows are samples) passed to `r_covcp.covTest`.
    min_seg_length
        Length of the bootstrap stable set (rows ``1..min_seg_length``); a side
        is explored only if it has more than ``2*min_seg_length`` rows.
    window_sizes
        R vector of window sizes passed to `r_covcp.covTest`.
    alpha
        Level passed to `r_covcp.covTest`.
    bkps
        List collecting the detected change points; it is mutated in place.
    left_offset
        Number of rows of the original signal located before the first row of
        `signal`; added to local positions to express them in original coordinates.

    Returns
    -------
    list
        The `bkps` list (same object), unsorted.
    """
    # Stop as soon as the budget is spent. The original only checked it once
    # before both recursive calls, so the right-hand call could exceed n_bkps.
    if len(bkps) >= n_bkps:
        return bkps

    bootstrap_stable_set = r_base.seq(1, min_seg_length)
    test_result = r_covcp.covTest(window_sizes, alpha, signal, r_covcp.noPattern, r_covcp.infNorm, bootstrap_stable_set)
    localization = get_covcp_localization(test_result, window_sizes)
    if localization is None:
        return bkps

    cp, _ = localization
    bkps.append(left_offset + cp)

    # `cp` is a 1-based row position: the left part is rows 1..cp-1 (Python
    # slice [:cp-1]) and the right part starts at row cp (Python slice [cp-1:]).
    split_id = cp - 1
    signal_arr = np.asarray(signal)
    n_rows = signal_arr.shape[0]

    if len(bkps) < n_bkps and split_id > 2*min_seg_length:
        r_left_subsignal = get_r_left_subsignal(signal_arr, split_id)
        detect_multiple_bkps(n_bkps, r_left_subsignal, min_seg_length, window_sizes, alpha, bkps, left_offset=left_offset)
    if len(bkps) < n_bkps and n_rows - split_id > 2*min_seg_length:
        r_right_subsignal = get_r_right_subsignal(signal_arr, split_id)
        # Row 1 of the right sub-signal is row `cp` of the current signal, so
        # exactly `split_id` rows precede it (the original used `cp`, shifting
        # every right-hand change point by one).
        detect_multiple_bkps(n_bkps, r_right_subsignal, min_seg_length, window_sizes, alpha, bkps, left_offset=left_offset + split_id)
    return bkps

In [9]:
def run_r_covcp_algo(r_signal, gt_bkps: List[int], covcp_results: dict, min_seg_length:int, window_sizes, alpha, exp_id):
    """Run the recursive covcp detection on one signal and log the outcome.

    The number of change points requested is ``len(gt_bkps) - 1``. The sorted
    predictions, followed by the number of rows of `r_signal`, are stored
    together with the ground truth and the elapsed time under
    ``covcp_results[exp_id]``. Nothing is returned.
    """
    n_target_bkps = len(gt_bkps) - 1

    # running CPD algorithm (timed)
    start = time.perf_counter()
    detected = detect_multiple_bkps(n_bkps=n_target_bkps, signal=r_signal, min_seg_length=min_seg_length, window_sizes=window_sizes, alpha=alpha, bkps=[])
    stop = time.perf_counter()

    detected.sort()
    pred = [int(bkp) for bkp in detected]
    # close the prediction list with the number of rows of the signal
    pred.append(r_base.dim(r_signal)[0])

    # logging
    covcp_results[exp_id] = {
        "time": round(stop - start, ndigits=3),
        "pred": pred,
        "gt": gt_bkps,
        "n_bkps": n_target_bkps,
    }

In [10]:
# Signal generation: one covariance change.
# n_samples_1 zero-mean rows drawn with covariance A1 are stacked (row-wise) on top
# of n_samples_2 zero-mean rows drawn with covariance A2, using MASS::mvrnorm.
p = 20  # dimension of each sample (length of mu, first argument of gen_cov_mat)
n_samples_1 = 400
n_samples_2 = 400
n_samples = n_samples_1 + n_samples_2
# NOTE(review): the meaning of the second argument (10) is not visible here — see changepoints::gen_cov_mat.
A1 = r_cp.gen_cov_mat(p, 10, "equal")
A2 = r_cp.gen_cov_mat(p, 10, "diagonal")
X = r_base.rbind(r_mass.mvrnorm(n = n_samples_1, mu = r_base.rep(0, p), Sigma = A1),
    r_mass.mvrnorm(n = n_samples_2, mu = r_base.rep(0, p), Sigma = A2)
)
# Sanity check on the stacked matrix dimensions (rows, columns)
print(r_base.dim(X)[:])

[1] 800  20



In [11]:
# Signal generation: two covariance changes.
# Three zero-mean blocks (covariances A1, A2, A3) are drawn with MASS::mvrnorm and
# stacked row-wise; A1 and A3 are both built with the "equal" option.
# Note: p, n_samples*, A1 and A2 from the previous cell are overwritten here.
p = 20
n_samples_1 = 400
n_samples_2 = 400
n_samples_3 = 200
n_samples = n_samples_1 + n_samples_2 + n_samples_3
A1 = r_cp.gen_cov_mat(p, 1, "equal")
A2 = r_cp.gen_cov_mat(p, 1, "diagonal")
A3 = r_cp.gen_cov_mat(p, 1, "equal")
X_mult = r_base.rbind(r_mass.mvrnorm(n = n_samples_1, mu = r_base.rep(0, p), Sigma = A1),
    r_mass.mvrnorm(n = n_samples_2, mu = r_base.rep(0, p), Sigma = A2), 
    r_mass.mvrnorm(n = n_samples_3, mu = r_base.rep(0, p), Sigma = A3)
)
# Sanity check on the stacked matrix dimensions (rows, columns)
print(r_base.dim(X_mult))

[1] 1000   20



In [12]:
# Hyper-parameters for the covcp run on X_mult below
windows = r_base.c(50)  # R vector of window sizes handed to covTest
alpha = 0.3  # level handed to covTest
len_stable_set = 190  # passed as min_seg_length to detect_multiple_bkps

In [None]:
# Register the doMC backend with 5 cores and seed R's random number generator.
# NOTE(review): the signal-generation cells above call mvrnorm before this seed
# is set, so the generated signals are not covered by it — confirm this is intended.
r_domc.registerDoMC(cores = 5)
r_base.set_seed(42)

In [None]:
# Recursive covcp detection on the three-block signal, asking for two change points;
# the detected positions are displayed in increasing order.
bkps = sorted(
    detect_multiple_bkps(
        n_bkps=2,
        signal=X_mult,
        min_seg_length=len_stable_set,
        window_sizes=windows,
        alpha=alpha,
        bkps=[],
    )
)
print(bkps)

In [None]:
# Run the recursive covcp detection on one stored signal (files prefixed with "2_").
path_signal_to_load = "data_1/signal/within_hyp/noisy_varying_segment_length/large_x1.0_SNR_10ER_20_nodes_deg_10_bandwidth_0.4"

r_signal = r_RcppCNPy.npyLoad(path_signal_to_load + '/2_signal.npy')
print(r_base.dim(r_signal))
# `open_json` is provided by the local `utils` module (imported as `my_ut`);
# the bare name is not defined in this notebook and raised a NameError.
gt_bkps = my_ut.open_json(path_signal_to_load + '/2_bkps.json')
# The ground-truth list is taken to contain one more entry than the number of
# change points, hence the "- 1".
n_bkps = len(gt_bkps) - 1
min_seg_length = 190
windows_sizes = r_base.c(100) 
alpha = 0.3  # note: rebinds the `alpha` defined in the hyper-parameter cell (same value)

pred_bkps = detect_multiple_bkps(n_bkps=n_bkps, signal = r_signal, min_seg_length=min_seg_length, window_sizes=windows_sizes, alpha=alpha, bkps=[])
print(pred_bkps)

In [13]:
# Sweep over window-size configurations: for each one, run covcp on the first
# MAX_ID_SUBSET stored signals and write the run metadata to its own result folder.
WINDOWS_SIZES_LIST = [[30], [50], [100], [150]]

# --- configuration shared by every run of the sweep ---
NAME =  "ER_20_nodes_deg_10_bandwidth_0.4"
GRAPH_NAME =  NAME  # alternatives tried: "exp_geo_20_nodes_av_deg_10", "ER_20_nodes_deg_10_bandwidth_0.4_edge_prop_0.05"
GRAPH_PATH =   "data_1/graphs/clean_ER_with_bandwidth"
SIGNAL_PATH = "data_1/signal/within_hyp/noisy_varying_segment_length"
SIGNAL_NAME =  "large_x1.0_SNR_10" + '_' + NAME
MAX_ID_SUBSET = 0  # number of experiment ids processed; with 0 only the metadata is written
RESULT_DIR =  "results_1/synthetic/within_hypothesis_noisy/varying_segment_length/r_covcp_experiments"

R_COVCP_SEED = 42
NB_CORES = 2
LEVEL_ALPHA = 0.3
STABLE_SET_LENGTH = (20*19)//2  # = 190
seg_length_hyp = "large"

for WINDOWS_SIZES in WINDOWS_SIZES_LIST:
    RESULT_NAME = f"test_alpha{LEVEL_ALPHA}_stablesetlength{STABLE_SET_LENGTH}_windows{'-'.join([str(w_s) for w_s in WINDOWS_SIZES])}"

    now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    final_name = SIGNAL_NAME + "_" + RESULT_NAME
    results_dir = os.path.join(RESULT_DIR, final_name)

    # logging
    graph_path = os.path.join(GRAPH_PATH, GRAPH_NAME)
    signal_path = os.path.join(SIGNAL_PATH, SIGNAL_NAME)
    graph_metadata = my_ut.open_json(f"{graph_path}/00_graphs_metadata.json")
    signal_metadata = my_ut.open_json(f"{signal_path}/00_signal_metadata.json")

    covcp_description = "Test on the different hyper-parameters to improve results understanding and parametrization"
    covcp_metadata = {"datetime": now, "description": covcp_description, "commit hash": my_ut.get_git_head_short_hash(), "graph folder": graph_path, "graph metadata": graph_metadata, "signal folder": SIGNAL_PATH + '/' + SIGNAL_NAME, "signal metadata": signal_metadata, "r_covcp seed": R_COVCP_SEED, "level alpha": LEVEL_ALPHA, "windows sizes": WINDOWS_SIZES, "length of the stable set": STABLE_SET_LENGTH, "nb_cores": NB_CORES, 'cpd algo func': detect_multiple_bkps.__name__}
    covcp_results = {}

    # The parallel backend and the R seed are reset for every configuration.
    r_domc.registerDoMC(cores = NB_CORES)
    r_base.set_seed(R_COVCP_SEED)
    # Pass every window size of the configuration to R. The original only sent
    # WINDOWS_SIZES[0] although the metadata and RESULT_NAME record the full list.
    r_windows_sizes = r_base.c(*WINDOWS_SIZES)

    # running CPD algorithms
    for exp_id in tqdm(range(MAX_ID_SUBSET), desc='Running experiment...'):
        exp_id = str(exp_id)
        r_signal = r_RcppCNPy.npyLoad(signal_path + f'/{exp_id}_signal.npy')
        gt_bkps = my_ut.open_json(signal_path + f'/{exp_id}_bkps.json')
        run_r_covcp_algo(r_signal, gt_bkps, covcp_results, STABLE_SET_LENGTH, r_windows_sizes, LEVEL_ALPHA, exp_id)

    my_ut.create_parent_and_dump_json(results_dir, "covcp_metadata.json", my_ut.turn_all_list_of_dict_into_str(covcp_metadata), indent=4)
    # NOTE(review): the predictions collected in `covcp_results` are currently not saved; the dump below is disabled.
    # my_ut.create_parent_and_dump_json(results_dir, "r_covcp_pred.json", my_ut.turn_all_list_of_dict_into_str(covcp_results), indent=4)

Running experiment...: 0it [00:00, ?it/s]
Running experiment...: 0it [00:00, ?it/s]
Running experiment...: 0it [00:00, ?it/s]
Running experiment...: 0it [00:00, ?it/s]


### Experiments with ```HaotianXu/changepoints```

In [None]:
# Inspect what changepoints::gen_cov_mat returns through rpy2: the Python type of
# the wrapper, its printed form, and the type/value of its first element.
# Note: this rebinds the globals `p` and `A1` used by earlier cells.
p = 4
A1 = r_cp.gen_cov_mat(p, 1, "equal")
print(type(A1))
print(A1)
print(type(A1[0]))
print(A1[0])

In [None]:
# Signal for the `changepoints` experiments: four zero-mean blocks of 300 samples.
# Each mvrnorm draw is transposed before the blocks are bound column-wise, so X
# has one column per sample (the covcp cells above used one row per sample).
# Note: this rebinds `p`, `n_samples*`, `A1`..`A3` and `X` from earlier cells.
p = 10
n_samples_1 = 300
n_samples_2 = 300
n_samples_3 = 300
n_samples_4 = 300
n_samples = n_samples_1 + n_samples_2 + n_samples_3 + n_samples_4
A1 = r_cp.gen_cov_mat(p, 10, "equal")
A2 = r_cp.gen_cov_mat(p, 10, "diagonal")
# NOTE(review): A3 and A4 are built with identical arguments — confirm whether a
# change between the third and fourth blocks is actually intended.
A3 = r_cp.gen_cov_mat(p, 10, "power")
A4 = r_cp.gen_cov_mat(p, 10, "power")
X = r_base.cbind(r_base.t(r_mass.mvrnorm(n = n_samples_1, mu = r_base.rep(0, p), Sigma = A1)),
r_base.t(r_mass.mvrnorm(n = n_samples_2, mu = r_base.rep(0, p), Sigma = A2)),
r_base.t(r_mass.mvrnorm(n = n_samples_3, mu = r_base.rep(0, p), Sigma = A3)),
r_base.t(r_mass.mvrnorm(n = n_samples_4, mu = r_base.rep(0, p), Sigma = A4)))

In [None]:
# Compare the dimensions of a single mvrnorm draw with those of the assembled X.
# Note: the first line draws a fresh sample only to print its dimensions.
print(r_base.dim(r_mass.mvrnorm(n = n_samples_1, mu = r_base.rep(0, p), Sigma = A1)))
print(r_base.dim(X))

In [None]:
# Run changepoints::BS_cov on X with arguments 1 and n_samples
# (presumably the first and last sample indices — TODO confirm), then inspect the result.
simple_BS_cov = r_cp.BS_cov(X, 1, n_samples)

print(type(simple_BS_cov))
print(simple_BS_cov)
print('length of the BS object:', r_base.length(simple_BS_cov))
# Type and content of the first four components of the returned object
for component in (simple_BS_cov[k] for k in range(4)):
    print(type(component))
    print(component)

In [None]:
# Apply changepoints::thresholdBS to the BS_cov result and inspect what comes back.
# NOTE(review): `threshold` is assigned but never used — the call below passes the
# literal 10. Confirm which value is intended.
threshold = 4
threshoded_dBS = r_cp.thresholdBS(simple_BS_cov, 10)

print(type(threshoded_dBS))
print(threshoded_dBS)
print('length of the thresholded BS object:', r_base.length(threshoded_dBS))
# Type and content of the first two components of the thresholded object
print(type(threshoded_dBS[0]))
print(threshoded_dBS[0])
print(type(threshoded_dBS[1]))
print(threshoded_dBS[1])