# Test on ```rpy2```

## Utils

In [None]:
import numpy as np
import json

In [None]:
def open_json(path):
    """Load and return the JSON document stored at ``path``.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The decoded content, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened for reading.
        json.JSONDecodeError: If the file does not hold valid JSON.
    """
    # Read-only mode: the previous 'r+' also required write permission, so
    # loading from a read-only file failed although nothing is ever written.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

## ```rpy2``` utils

In [None]:
import rpy2
import rpy2.robjects as robjects

## To aid in printing HTML in notebooks
import rpy2.ipython.html
rpy2.ipython.html.init_printing()

In [None]:
# Ask R for the install location of the 'changepoints' package and print it.
# NOTE(review): R's system.file is documented to return "" for a package that
# is not installed, so an empty result presumably means "missing" — confirm.
is_r_cp_installed = robjects.r("system.file(package='changepoints')")
print(is_r_cp_installed)

In [None]:
from rpy2.robjects.packages import importr

# Handles on the R packages used throughout the notebook, obtained with
# rpy2's importr. Each one must be installed on the R side.
r_base = importr('base')
r_utils = importr('utils')
r_cp = importr('changepoints')
r_covcp = importr('covcp')
r_mass = importr('MASS')
r_domc = importr('doMC')
r_parallel = importr('parallel')
r_RcppCNPy = importr('RcppCNPy')
r_reticulate = importr('reticulate')

# R's `print`, looked up starting from the R global environment.
rprint = robjects.globalenv.find("print")

In [None]:
# Sanity check: build a vector with R's c() and show both its printed form
# and the Python type rpy2 wraps it in.
print(r_base.c(1, 3, 3, 4, 5))
print(type(r_base.c(1, 3, 3, 4, 5)))

In [None]:
# Create a named vector `z` inside the R global environment by evaluating R
# code, then fetch it back into Python by name and print it.
robjects.r('z <- setNames(1:3, c("a", "b", "c"))')
z = robjects.globalenv['z']
print(z)

In [None]:
def get_nested_named_element_from_R_list(obj, named_elem:str):
    """Follow a ``$``-separated path through nested objects using ``rx2``.

    ``named_elem`` is split on ``'$'`` and each component is handed to the
    ``rx2`` method of the object reached so far. A component made only of
    digits is converted to ``int`` first, so it is used as an integer key
    instead of a name.

    Args:
        obj: Starting object; it and every intermediate result must expose
            an ``rx2`` method.
        named_elem: Path such as ``'statistics$window2statistics$1'``.

    Returns:
        The value returned by the last ``rx2`` call.
    """
    current = obj
    for component in named_elem.split('$'):
        key = int(component) if component.isdigit() else component
        current = current.rx2(key)
    return current

In [None]:
def turn_r_bool_in_py_bool(r_bool):
    """Convert an R logical into a Python ``bool`` via its printed form.

    The text of ``r_bool.rx2(1)`` is sliced on characters 4 to 8 and compared
    with ``'FALSE'``.

    Args:
        r_bool: Object exposing ``rx2``; presumably a one-element R logical
            vector whose element prints like ``"[1] FALSE"``.

    Returns:
        ``False`` when the slice reads ``'FALSE'``, ``True`` for anything else.
    """
    printed = str(r_bool.rx2(1))
    # The slice starts after the four-character "[1] " prefix of the printout.
    return printed[4:9] != 'FALSE'

## Sandbox on CPD algorithms

### Experiments with ```akopich/covcp```

Compréhension de la méthode : 

- un cpd est déclaré si, pour une des tailles de fenêtre, on atteint, pour un des points centraux (de la fenêtre), une valeur de la statistique supérieure à la valeur critique (le seuil) calculée pour cette taille de fenêtre. 
- a priori les valeurs critiques sont calculées grâce au bootstrap ('data-driven approach')
- pour confirmer mon intuition sur la compréhension de la méthode :
    - aller voir dans la résolution de l'algo si je trouve d'où viennent les "critical values" pour confirmer qu'elles sont bien calculées grâce au bootstrap
    - a priori dans mon expérience on ne détecte le cp que pour la plus grande taille de fenêtre (parce que meilleure estimation ?)
        - aller voir comment évoluent les valeurs de la critical value en fonction de la taille de la fenêtre
        --> **les valeurs critiques diminuent systématiquement avec la taille des fenêtres considérées**
        - éventuellement faire des plot pour assess tout ça, et comprendre un peu mieux grâce à l'article comment fonctionne le bootstrap
    - concernant le niveau alpha :
        - comment influence les résultats : revenir à la définition pour voir si cohérent
        - voir si intervient bien dans le bootstrap au moment du calcul des valeurs critiques : censé les diminuer ou augmenter
    - comment influence la taille de l'échantillon 'stable' pour le bootstrap ?
        --> **jusque là, les expériences réalisées montrent que lorsqu'on diminue la taille du stableset, on diminue aussi les valeurs critiques**
    - comment influence la taille des segments de stationnarité ?

A faire en code : 

- transformer les données python en données R (voir doc rpy2 sur le paragraphe associé)
- implémenter la détection de plusieurs ruptures (relire ce qu'ils disent dans l'article à ce sujet)

In [None]:
def get_covcp_localization(covcp_results, window_sizes):
    """Locate the change point reported by a ``covcp`` test result.

    Args:
        covcp_results: R object produced by ``r_covcp.covTest``.
        window_sizes: Window sizes the test was run with. Entry ``i`` is
            matched with ``covcp_results$criticalValue[i]`` and with
            ``covcp_results$statistics$window2statistics[[i + 1]]``, so the
            order must be the one given to the test. The first matching
            window is returned; it is the smallest one only if the sizes are
            sorted in increasing order.

    Returns:
        ``None`` when the test does not reject or when no window statistic
        exceeds its critical value. Otherwise a tuple
        ``(central_point, (central_point - window_size,
        central_point + window_size))`` where ``central_point`` is the
        central point at which the ``distances`` entry is largest.
    """
    # The R result comes back wrapped in a vector object, and such an object
    # is truthy as soon as it is non-empty, whatever it contains. Read the
    # element itself, otherwise this early exit never triggers.
    is_there_cp = bool(r_covcp.isRejected(covcp_results)[0])
    if not is_there_cp:
        return None
    critical_values = covcp_results.rx2('criticalValue')
    for i, window_size in enumerate(window_sizes):
        # R lists are 1-indexed, hence i + 1 in the path.
        window_stat_results = get_nested_named_element_from_R_list(covcp_results, f'statistics$window2statistics${i+1}')
        max_stat = window_stat_results.rx2('statistics')[0]
        if max_stat > critical_values[i]:
            central_points = window_stat_results.rx2('centralPoints')
            stat_values_arr = np.asarray(window_stat_results.rx2('distances'))
            stats_argmx = int(np.argmax(stat_values_arr))
            central_point_argmax = central_points[stats_argmx]
            return central_point_argmax, (central_point_argmax - window_size, central_point_argmax + window_size)
    # No window exceeded its critical value.
    return None

In [None]:
def get_r_left_subsignal(signal: np.ndarray, split_id):
    """Return the rows of ``signal`` before ``split_id`` as an R object.

    The slice is written to ``data_1/.temp/left_sub_signal.npy`` and read
    back with ``r_RcppCNPy.npyLoad``; the file is overwritten on every call
    and left on disk.

    Args:
        signal: Two-dimensional array; rows are split, columns are kept.
        split_id: Number of leading rows to keep (0-based exclusive end).

    Returns:
        Whatever ``r_RcppCNPy.npyLoad`` returns for the saved slice.
    """
    import os

    temp_path = "data_1/.temp/left_sub_signal.npy"
    # np.save does not create missing folders: make sure the scratch folder
    # exists so the first call in a fresh working directory does not fail.
    os.makedirs(os.path.dirname(temp_path), exist_ok=True)
    left_sub_signal_arr = np.copy(signal)[:split_id, :]
    np.save(temp_path, left_sub_signal_arr)
    return r_RcppCNPy.npyLoad(temp_path)


def get_r_right_subsignal(signal: np.ndarray, split_id):
    """Return the rows of ``signal`` from ``split_id`` onwards as an R object.

    The slice is written to ``data_1/.temp/right_sub_signal.npy`` and read
    back with ``r_RcppCNPy.npyLoad``; the file is overwritten on every call
    and left on disk.

    Args:
        signal: Two-dimensional array; rows are split, columns are kept.
        split_id: 0-based index of the first row to keep.

    Returns:
        Whatever ``r_RcppCNPy.npyLoad`` returns for the saved slice.
    """
    import os

    temp_path = "data_1/.temp/right_sub_signal.npy"
    # np.save does not create missing folders: make sure the scratch folder
    # exists so the first call in a fresh working directory does not fail.
    os.makedirs(os.path.dirname(temp_path), exist_ok=True)
    right_sub_signal_arr = np.copy(signal)[split_id:, :]
    np.save(temp_path, right_sub_signal_arr)
    return r_RcppCNPy.npyLoad(temp_path)

In [None]:
def detect_multiple_bkps(n_bkps, signal, min_seg_length, window_sizes, alpha, bkps, left_offset=0):
    """Recursively collect at most ``n_bkps`` change points with ``covcp``.

    One test is run on ``signal``. If it locates a change point, that point
    is recorded and the search continues on the part of the signal before it
    and then on the part after it, as long as the quota is not reached and
    the part has more than ``2 * min_seg_length`` rows.

    Args:
        n_bkps: Maximum number of change points to collect.
        signal: R matrix (rows are split when recursing) given to ``covTest``.
        min_seg_length: Size of the bootstrap stable set (``seq(1,
            min_seg_length)``); also drives the minimum size of a sub-signal
            worth searching.
        window_sizes: Window sizes handed to ``covTest``.
        alpha: Level handed to ``covTest``.
        bkps: List the detected positions are appended to. It is modified in
            place and also returned.
        left_offset: Amount added to a position found in ``signal`` to
            express it in the coordinates of the outermost signal.

    Returns:
        ``bkps``, in detection order (not sorted).
    """
    print('bkps: ', bkps)
    # Quota already met: do not run another (costly) test nor add a point.
    if len(bkps) >= n_bkps:
        return bkps
    bootstrap_stableSet = r_base.seq(1, min_seg_length)
    cov_cp_stat_test = r_covcp.covTest(window_sizes, alpha, signal, r_covcp.noPattern, r_covcp.infNorm, bootstrap_stableSet)
    cov_cp_loc = get_covcp_localization(cov_cp_stat_test, window_sizes)
    if cov_cp_loc is None:
        return bkps

    # add current bkp
    cp, _ = cov_cp_loc
    print('cp: ', cp)
    bkps.append(left_offset + cp)
    if len(bkps) >= n_bkps:
        return bkps

    # apply to left and right subsignal
    signal_arr = np.copy(np.asarray(signal))
    if cp - 1 > 2*min_seg_length:
        r_left_subsignal = get_r_left_subsignal(signal_arr, cp-1)
        print('r_left_subsignal.shape: ', np.asarray(r_left_subsignal).shape)
        detect_multiple_bkps(n_bkps, r_left_subsignal, min_seg_length, window_sizes, alpha, bkps, left_offset=left_offset)
    # The left-hand search may have filled the quota in the meantime; check
    # again so the result never holds more than n_bkps points.
    if len(bkps) < n_bkps and signal_arr.shape[0] - cp + 1 > 2*min_seg_length:
        r_right_subsignal = get_r_right_subsignal(signal_arr, cp-1)
        print('r_right_subsignal.shape: ', np.asarray(r_right_subsignal).shape)
        # The right part starts at row cp - 1 of this signal, so a position j
        # found in it maps back to (cp - 1) + j, not cp + j.
        detect_multiple_bkps(n_bkps, r_right_subsignal, min_seg_length, window_sizes, alpha, bkps, left_offset=left_offset + cp - 1)
    return bkps

In [None]:
# Signal generation: two consecutive segments of zero-mean Gaussian samples
# in dimension p, drawn with MASS::mvrnorm and stacked by rows with rbind.
# The first segment uses covariance A1, the second A2.
# NOTE(review): the meaning of gen_cov_mat's second argument (10) and of the
# "equal"/"diagonal" types is defined by the changepoints package — confirm.
p = 20
n_samples_1 = 400
n_samples_2 = 400
n_samples = n_samples_1 + n_samples_2
A1 = r_cp.gen_cov_mat(p, 10, "equal")
A2 = r_cp.gen_cov_mat(p, 10, "diagonal")
X = r_base.rbind(r_mass.mvrnorm(n = n_samples_1, mu = r_base.rep(0, p), Sigma = A1),
    r_mass.mvrnorm(n = n_samples_2, mu = r_base.rep(0, p), Sigma = A2)
)
# Dimensions of the stacked signal as reported by R.
print(r_base.dim(X))

In [None]:
# Signal generation: three consecutive segments (400, 400 and 200 samples) of
# zero-mean Gaussian data in dimension p, stacked by rows with rbind.
# This cell overwrites p, n_samples_*, n_samples, A1 and A2 from the previous
# generation cell.
# NOTE(review): A1 and A3 are built with identical arguments; whether the
# first and third segments share the same covariance depends on gen_cov_mat
# being deterministic — confirm.
p = 20
n_samples_1 = 400
n_samples_2 = 400
n_samples_3 = 200
n_samples = n_samples_1 + n_samples_2 + n_samples_3
A1 = r_cp.gen_cov_mat(p, 1, "equal")
A2 = r_cp.gen_cov_mat(p, 1, "diagonal")
A3 = r_cp.gen_cov_mat(p, 1, "equal")
X_mult = r_base.rbind(r_mass.mvrnorm(n = n_samples_1, mu = r_base.rep(0, p), Sigma = A1),
    r_mass.mvrnorm(n = n_samples_2, mu = r_base.rep(0, p), Sigma = A2), 
    r_mass.mvrnorm(n = n_samples_3, mu = r_base.rep(0, p), Sigma = A3)
)
# Dimensions of the stacked signal as reported by R.
print(r_base.dim(X_mult))

In [None]:
# Hyper-parameters for the covcp run on the synthetic signal below.
# `windows` is an R vector holding the single window size 50, `alpha` is the
# level passed to covTest, and `len_stable_set` is passed as min_seg_length
# (it sizes the bootstrap stable set inside detect_multiple_bkps).
windows = r_base.c(50) 
alpha = 0.3
len_stable_set = 190

In [None]:
r_domc.registerDoMC(cores = 4) # set the multithreaded environment up
# Seed R's random number generator (rpy2 exposes R's set.seed as set_seed).
# NOTE(review): the signal-generation cells appear before this call, so when
# the cells are run top to bottom their random draws are not covered by this
# seed — confirm whether that is intended.
r_base.set_seed(42)

In [None]:
# Search for two change points in the three-segment synthetic signal, starting
# from an empty accumulator, then sort the positions since the recursion
# appends them in detection order.
bkps = detect_multiple_bkps(n_bkps=2, signal=X_mult, min_seg_length=len_stable_set, window_sizes=windows, alpha=alpha, bkps=[]) 
bkps.sort()
print(bkps)

In [None]:
# Folder holding a saved signal and its ground-truth breakpoints.
path_signal_to_load = "data_1/signal/within_hyp/noisy_varying_segment_length/large_x1.0_SNR_10ER_20_nodes_deg_10_bandwidth_0.4"

# Load the .npy signal straight into R and show its dimensions.
r_signal = r_RcppCNPy.npyLoad(path_signal_to_load + '/2_signal.npy')
print(r_base.dim(r_signal))
gt_bkps = open_json(path_signal_to_load + '/2_bkps.json')
# NOTE(review): the -1 presumably discards a final end-of-signal marker stored
# with the ground-truth breakpoints — confirm against the data format.
n_bkps = len(gt_bkps) - 1
min_seg_length = 190
windows_sizes = r_base.c(100) 
# Overwrites the `alpha` set in the hyper-parameter cell (same value).
alpha = 0.3

# Predicted positions are printed in detection order (not sorted).
pred_bkps = detect_multiple_bkps(n_bkps=n_bkps, signal = r_signal, min_seg_length=min_seg_length, window_sizes=windows_sizes, alpha=alpha, bkps=[])
print(pred_bkps)

### Experiments with ```HaotianXu/changepoints```

In [None]:
# Inspect what gen_cov_mat returns on a small case: the Python type of the
# wrapped R object, its printed form, and the type and value of its first
# element when indexed from Python (0-based).
p = 4
A1 = r_cp.gen_cov_mat(p, 1, "equal")
print(type(A1))
print(A1)
print(type(A1[0]))
print(A1[0])

In [None]:
# Four consecutive segments of 300 zero-mean Gaussian samples in dimension p.
# Each mvrnorm draw is transposed before the segments are joined with cbind,
# so here the segments are concatenated along columns (unlike the covcp cells,
# which stack them by rows). This overwrites the earlier `X`.
# NOTE(review): A3 and A4 are built with identical arguments; whether a change
# actually occurs between the last two segments depends on gen_cov_mat —
# confirm.
p = 10
n_samples_1 = 300
n_samples_2 = 300
n_samples_3 = 300
n_samples_4 = 300
n_samples = n_samples_1 + n_samples_2 + n_samples_3 + n_samples_4
A1 = r_cp.gen_cov_mat(p, 10, "equal")
A2 = r_cp.gen_cov_mat(p, 10, "diagonal")
A3 = r_cp.gen_cov_mat(p, 10, "power")
A4 = r_cp.gen_cov_mat(p, 10, "power")
X = r_base.cbind(r_base.t(r_mass.mvrnorm(n = n_samples_1, mu = r_base.rep(0, p), Sigma = A1)),
r_base.t(r_mass.mvrnorm(n = n_samples_2, mu = r_base.rep(0, p), Sigma = A2)),
r_base.t(r_mass.mvrnorm(n = n_samples_3, mu = r_base.rep(0, p), Sigma = A3)),
r_base.t(r_mass.mvrnorm(n = n_samples_4, mu = r_base.rep(0, p), Sigma = A4)))

In [None]:
# Compare the dimensions of a single (fresh) mvrnorm draw with those of the
# assembled matrix X, to check the orientation of the data.
print(r_base.dim(r_mass.mvrnorm(n = n_samples_1, mu = r_base.rep(0, p), Sigma = A1)))
print(r_base.dim(X))

In [None]:
# Run BS_cov from the changepoints package on X with arguments 1 and
# n_samples.
# NOTE(review): these presumably are the first and last time indices of the
# interval to segment — confirm against the package documentation.
simple_BS_cov = r_cp.BS_cov(X, 1, n_samples)

# Explore the returned object: its type, printed form, length, and the type
# and content of its first four components (0-based indexing from Python).
print(type(simple_BS_cov))
print(simple_BS_cov)
print('length of the BS object:', r_base.length(simple_BS_cov))
print(type(simple_BS_cov[0]))
print(simple_BS_cov[0])
print(type(simple_BS_cov[1]))
print(simple_BS_cov[1])
print(type(simple_BS_cov[2]))
print(simple_BS_cov[2])
print(type(simple_BS_cov[3]))
print(simple_BS_cov[3])

In [None]:
# NOTE(review): `threshold` is assigned here but never used — the call below
# passes the literal 10. Confirm which value is intended.
threshold = 4
threshoded_dBS = r_cp.thresholdBS(simple_BS_cov, 10)

# Explore the thresholded result: type, printed form, length, and the type
# and content of its first two components.
print(type(threshoded_dBS))
print(threshoded_dBS)
print('length of the thresholded BS object:', r_base.length(threshoded_dBS))
print(type(threshoded_dBS[0]))
print(threshoded_dBS[0])
print(type(threshoded_dBS[1]))
print(threshoded_dBS[1])