# Test on ```rpy2```

In [None]:
import os
import time

import numpy as np

import utils as my_ut
import result_related as my_res
import running_cpd as my_cpd

from math import floor
from datetime import datetime
from typing import List
from tqdm import tqdm
from sklearn.covariance import log_likelihood

## ```rpy2``` utils

In [None]:
import rpy2
import rpy2.robjects as robjects

# Notebook display helper: switch on rpy2's HTML printing so that R objects
# are rendered as HTML when shown in notebook cells.
import rpy2.ipython.html
rpy2.ipython.html.init_printing()

In [None]:
from rpy2.robjects.packages import importr

# Load the R packages used by this notebook. Each handle exposes the
# package's R functions as Python callables; the trailing comments list the
# functions that the cells below actually call through each handle.
r_base = importr('base')  # dim, t, rbind, cbind, c, rep, length, set_seed
r_utils = importr('utils')  # not called in the visible cells
r_cp = importr('changepoints')  # gen_cov_mat, BS_cov, thresholdBS
r_covcp = importr('covcp')  # chooseRho, Cov
r_mass = importr('MASS')  # mvrnorm
r_domc = importr('doMC')  # registerDoMC
r_parallel = importr('parallel')  # not called in the visible cells
r_RcppCNPy = importr('RcppCNPy')  # npyLoad
r_reticulate = importr('reticulate')  # not called in the visible cells
r_glasso = importr('glasso')  # glasso

# Handle on R's `print`, looked up by name from the R global environment.
rprint = robjects.globalenv.find("print")

## Sandbox on CPD algorithms

### Experiments with ```glasso```

Je peux bien travailler avec la fonction `log_likelihood` de scikit-learn, quitte à la ré-implémenter de mon côté (bien faire attention à ce que les signes soient les bons, pour ne pas maximiser à l'envers). Il reste donc à faire les calculs de matrices de précision avec la librairie R.

In [None]:
# Load a signal stored as .npy straight into an R object with RcppCNPy's
# npyLoad, then print the second entry of its R `dim` vector.
some_signal_path = 'data_1/.temp/1_left_sub_signal.npy'
r_signal = r_RcppCNPy.npyLoad(some_signal_path)
# Python index 1 of dim(); presumably the number of columns — TODO confirm
# the orientation (samples x variables?) of the saved array.
print(r_base.dim(r_signal)[1])

In [None]:
# Graphical-lasso fit on the loaded signal: the penalty comes from covcp's
# chooseRho, the covariance from covcp's Cov, and both are fed to glasso.
pen_coef = r_covcp.chooseRho(r_signal)
emp_cov = r_covcp.Cov(r_signal)
glasso_fit = r_glasso.glasso(s=emp_cov, rho=pen_coef)
# Keep only the 'wi' component of the fit (the inverse-covariance estimate
# according to the glasso package documentation).
glasso_invcov_mat = glasso_fit.rx2('wi')

In [None]:
### SYNTHETIC DATA ADAPTED
##########################
# Runs the R covcp change-point detection over a list of synthetic
# experiments: builds the graph / signal / result paths from the constants
# below, initialises the covcp prediction JSON file, then calls
# my_cpd.run_r_covcp_cpd_algo_and_store once per experiment id.

NAME =  "ER_20_nodes_deg_10_bandwidth_0.4"
GRAPH_PATH = "data_1/synthetic_data/graphs/clean_ER_with_bandwidth"
GRAPH_NAME =  NAME  #+ f"_edge_prop_{EDGE_PROP}" #"exp_geo_20_nodes_av_deg_10" #"ER_20_nodes_deg_10_bandwidth_0.4_edge_prop_0.05" 
# SIGNAL_PATH = "sandbox/test/" #"data_1/signal/within_hyp/SNR_20_varying_segment_length"
SIGNAL_PATH = "sandbox/test"
# SIGNAL_PATH = "data_1/synthetic_data/signal/within_hyp/SNR_20_censor_breakdown/varying_nb_breakdowns" #"data_1/signal/within_hyp/SNR_20_varying_segment_length"
EXP_ID_LIST = list(range(0, 4))
RESULT_DIR = "sandbox/test/results"
LASSO_ALPHA = "from_covcp_paper"
# NOTE(review): the six constants below are only referenced by the
# commented-out SIGNAL_NAME f-string further down (N_BKPS not at all in this
# cell) — confirm whether they are still needed.
MIN_SEGMENT_LENGTH_COEF = 0.4
SNR = 20
BKPS_GAP_CONSTRAINT = "large"
N_BKPS = 4
NB_BREAKDOWN = 2
BREAKDOWN_LENGTH = 300


OTHER_GRAPH_SEED = 1

# covcp hyper-parameters, forwarded to run_r_covcp_cpd_algo_and_store below.
NB_CORES = 1
LEVEL_ALPHA = 0.3
STABLE_SET_LENGTH = 80
WINDOWS_SIZE = [80]
COVCP_TEMP_ID = 1  # not referenced again in this cell
COVCP_SEED = 1

# Graphical-lasso settings; the lasso run is commented out, so these are not
# referenced again in this cell.
LASSO_TEMP_ID = 1
PEN_MULT_COEF_LIST = [7]

now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# logging
graph_path = os.path.join(GRAPH_PATH, GRAPH_NAME)
graph_metadata = my_ut.open_json(f"{graph_path}/00_graphs_metadata.json")
seg_length_hyp = "minimal"
graph_rng = np.random.default_rng(OTHER_GRAPH_SEED)  # not referenced again in this cell

    
# SIGNAL_NAME = f"{BKPS_GAP_CONSTRAINT}_x{MIN_SEGMENT_LENGTH_COEF}_SNR_{round(SNR, 4)}" +  f"_NBbd_{NB_BREAKDOWN}_bklength_{BREAKDOWN_LENGTH}" + "_" + GRAPH_NAME 
SIGNAL_NAME = f"SNR_20_and_fixed_min_size_900_samples_and_bkdwn_ER_20_nodes_deg_10_bandwidth_0.4"
signal_path = os.path.join(SIGNAL_PATH, SIGNAL_NAME)
signal_metadata = my_ut.open_json(f"{signal_path}/00_signal_metadata.json")

# The results folder is named after the signal only (the RESULT_NAME suffix
# is disabled).
RESULT_NAME = ""
final_name = SIGNAL_NAME #+ "_" + RESULT_NAME
results_dir = os.path.join(RESULT_DIR, final_name)

# Metadata describing the run. NOTE(review): the dump of experiment_metadata
# at the end of this cell is commented out, so this dict is built but never
# written here.
exp_desc = "Test of the graphical Lasso cost function: grid seach over the L1 penalty coefficient. Its value is fixed according to the fromula from the covcp paper and a multiplicative coefficient."
experiment_metadata = {"datetime": now, "description": exp_desc, "commit hash": my_ut.get_git_head_short_hash(), "graph folder": graph_path, "graph metadata": graph_metadata, "signal folder": SIGNAL_PATH + '/' + SIGNAL_NAME, "signal metadata": signal_metadata, "min segment length hypothesis": seg_length_hyp, "lasso penalty coefficient": LASSO_ALPHA, "level_alpha": LEVEL_ALPHA, "stable_set_length": STABLE_SET_LENGTH, "windows_size": WINDOWS_SIZE, "nb_cores": NB_CORES, "r_covcp_seed": COVCP_SEED}

# output formatting
# lasso_results = {"0": 'INIT'}
# lasso_name = f"lasso_pred_pencoefmult_{PEN_COEF}.json"
# lasso_json_path = os.path.join(results_dir, lasso_name)
# my_ut.create_parent_and_dump_json(results_dir, lasso_name, my_ut.turn_all_list_of_dict_into_str(lasso_results), indent=4)
# Create the covcp prediction file with a placeholder entry before the runs;
# its path is then handed to run_r_covcp_cpd_algo_and_store.
covcp_results = {"0": 'INIT'}
covcp_name = f"covcp_pred_windsize_{WINDOWS_SIZE[0]}.json"
covcp_json_path = os.path.join(results_dir, covcp_name)
my_ut.create_parent_and_dump_json(results_dir, covcp_name, my_ut.turn_all_list_of_dict_into_str(covcp_results), indent=4)

# running CPD algorithms
# for exp_id in tqdm(range(MAX_ID_SUBSET), desc='Running experiment...'):
for exp_id in EXP_ID_LIST:
    print(f"\n\tOver exp_id={exp_id}...")
    exp_id = str(exp_id)
    # Of the loaded values only gt_bkps and min_size are used below; the
    # signal itself is passed to the covcp runner as a file-path prefix.
    G, signal, gt_bkps, min_size = my_ut.load_data(graph_path, signal_path, exp_id, seg_length_hyp)
    signal_file_path = f"{signal_path}/{exp_id}"
    # my_cpd.run_r_glasso_cpd_algo_and_store(signal, gt_bkps, lasso_json_path, exp_id, PEN_COEF, lasso_intermediate_temp_path)
    my_cpd.run_r_covcp_cpd_algo_and_store(signal_file_path, gt_bkps, covcp_json_path, STABLE_SET_LENGTH, min_size, WINDOWS_SIZE, LEVEL_ALPHA, exp_id, NB_CORES, COVCP_SEED)
    

    # my_ut.create_parent_and_dump_json(results_dir, F"lasso_pred_pencoefmult_{PEN_COEF}.json", my_ut.turn_all_list_of_dict_into_str(lasso_results), indent=4)
# my_ut.create_parent_and_dump_json(results_dir, "experiment_metadata.json", my_ut.turn_all_list_of_dict_into_str(experiment_metadata), indent=4)


In [None]:
### REAL DATA ADAPTED
##########################
# Runs the covcp change-point detection on one real recording
# ("volunteerS007_exp04") and dumps the predictions to JSON. The loop runs
# over the lasso multiplicative coefficients, but the lasso run itself is
# commented out: PEN_COEF only appears in the progress message and in the
# result folder name.

# NAME =  "ER_20_nodes_deg_10_bandwidth_0.4"
GRAPH_PATH = "data/real_data/test_filtered_0.5-40_order_3_subsampled_8"
GRAPH_NAME =  "KNN_4_64_ch_graph_mat_adj_order_signal_header"
SIGNAL_PATH = "data/real_data/test_filtered_0.5-40_order_3_subsampled_8" 
RESULT_DIR = "results_1/real_data/test/filtered_0.5-40_order_3_subsampled_8"
LASSO_ALPHA = "from_covcp_paper"
seg_length_hyp = "minimal"

# GRAPH LASSO METHOD
PEN_MULT_COEF_LIST = [4]

# R COVCP METHOD
# NOTE(review): R_COVCP_SEED and NB_CORES are not passed to the
# run_r_covcp_algo call below — confirm whether seeding / core registration
# is expected to have happened in another cell.
R_COVCP_SEED = 42
NB_CORES = 2
LEVEL_ALPHA = 0.3
STABLE_SET_LENGTH = 80
WINDOWS_SIZE = [80]


now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# logging
graph_path = os.path.join(GRAPH_PATH, GRAPH_NAME)
# graph_metadata = my_ut.open_json(f"{graph_path}/00_graphs_metadata.json")
# graph_rng = np.random.default_rng(OTHER_GRAPH_SEED)

for PEN_COEF in PEN_MULT_COEF_LIST:

    print(f"\n\n \t Working with multiplicative coefficient: {PEN_COEF} \n")
    
    # SIGNAL_NAME = f"{BKPS_GAP_CONSTRAINT}_x{MIN_SEGMENT_LENGTH_COEF}_SNR_{round(SNR, 4)}" + "_" + GRAPH_NAME 
    SIGNAL_NAME = "volunteerS007_exp04"
    signal_path = os.path.join(SIGNAL_PATH, SIGNAL_NAME)
    # signal_metadata = my_ut.open_json(f"{signal_path}/00_signal_metadata.json")
    
    RESULT_NAME = f"running_time_test_with_mult_coef_{PEN_COEF}_{LASSO_ALPHA}"
    final_name = SIGNAL_NAME + "_" + RESULT_NAME
    results_dir = os.path.join(RESULT_DIR, final_name)

    # Run metadata; its dump at the end of the cell is commented out, so it
    # is built but not written here.
    exp_desc = "Test of the graphical Lasso cost function."
    experiment_metadata = {"datetime": now, "description": exp_desc, "commit hash": my_ut.get_git_head_short_hash(), "graph folder": "no graph", "graph metadata": "no graph", "signal folder": SIGNAL_PATH + '/' + SIGNAL_NAME}

    # output formatting
    # statio_results = {}
    # normal_results = {}
    lasso_results = {}
    covcp_results = {}

    # running CPD algorithms
    # for exp_id in tqdm(range(MAX_ID_SUBSET), desc='Running experiment...'):
    # G, signal, gt_bkps, min_size = my_ut.load_data(graph_path, signal_path, exp_id, seg_length_hyp)
    # signal = np.load(f"{signal_path}_signal.npy", allow_pickle=False).T
    # bkps = my_ut.open_json(f"{signal_path}_bkps.json")
    # run_r_glasso_cpd_algo(signal, gt_bkps, lasso_results, exp_id, PEN_COEF)
    # The stored array is loaded on the R side and transposed before use
    # (the commented-out NumPy loader above applies the same `.T`).
    r_signal = r_base.t(r_RcppCNPy.npyLoad(signal_path + '_signal.npy'))
    gt_bkps = my_ut.open_json(signal_path + '_bkps.json')
    min_size = 64
    # NOTE(review): run_r_covcp_algo is not defined in the visible part of
    # this notebook; presumably it fills covcp_results in place, since that
    # dict is what gets dumped just below — confirm against its definition.
    run_r_covcp_algo(r_signal, gt_bkps, covcp_results, STABLE_SET_LENGTH, min_size, WINDOWS_SIZE, LEVEL_ALPHA, 0)

    my_ut.create_parent_and_dump_json(results_dir, F"r_covcp_level_{LEVEL_ALPHA}_stableset_{STABLE_SET_LENGTH}_adapted_windowsize_{WINDOWS_SIZE[0]}_pred.json", my_ut.turn_all_list_of_dict_into_str(covcp_results), indent=4)
    # my_ut.create_parent_and_dump_json(results_dir, F"lasso_pred_pencoefmult_{PEN_COEF}.json", my_ut.turn_all_list_of_dict_into_str(lasso_results), indent=4)

# my_ut.create_parent_and_dump_json(results_dir, "experiment_metadata.json", my_ut.turn_all_list_of_dict_into_str(experiment_metadata), indent=4)

#### Results computation

In [None]:
### SYNTHETIC DATA ADAPTED
###########################
# Metric computation for the graphical-lasso grid search: for each prediction
# folder and each penalty multiplier, load the stored predictions, score them
# against the ground truth, then dump all metrics next to the predictions.

PRECI_RECALL_MARGIN = 5
res_folder_root = "results_1/synthetic/within_hypothesis_noisy/glasso_experiments/SNR_20_large_x0.4_1000_samples/4_bkps_large_x0.4_SNR_20_1000_samples_ER_20_nodes_deg_10_bandwidth_0.4_pen_coef_gridsearch_from_covcp_paper"
file_names = os.listdir(res_folder_root)
# Alternative: one prediction folder per entry of the root folder.
# PRED_FOLDER = [os.path.join(res_folder_root, file_name) for file_name in file_names]
PRED_FOLDER = [res_folder_root]

# Grids used in earlier runs:
#   [1, 2, 4]
#   [0.6, 0.8, 1.0, 1.2, 1.4, 6, 8, 10, 12, 14, 20, 30]
#   [1, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 25]
#   [0.001, 0.01, 0.1, 1, 10, 100]
pencoef_mult_list = [0.001, 0.5, 1, 5, 10, 20, 100]

for pred_dir in PRED_FOLDER:

    # Experiment metadata stored alongside the predictions; it is embedded in
    # the output together with the margin used for the metrics.
    data_stats = my_ut.open_json(f"{pred_dir}/experiment_metadata.json")
    metrics_dic = {"pred_path": pred_dir, "hyper-parameters": data_stats}
    metrics_dic["hyper-parameters"]["metrics_margin"] = PRECI_RECALL_MARGIN
    full_results = {}

    for pencoefmult in pencoef_mult_list:

        lasso_pencoef_pred_dic = my_ut.open_json(f"{pred_dir}/lasso_pred_pencoefmult_{pencoefmult}.json")
        # One empty "raw" list per metric, filled experiment by experiment.
        pencoef_glasso_results = {
            metric_name: {"raw": []}
            for metric_name in ("recall", "precision", "f1_score", "hausdorff", "assignement_cost", "time")
        }

        for exp_id, exp_pred in lasso_pencoef_pred_dic.items():
            # Breakpoints are stored as strings; turn them back into lists.
            lasso_pred_bkps = my_ut.turn_str_of_list_into_list_of_int(exp_pred["pred"])
            gt_bkps = my_ut.turn_str_of_list_into_list_of_int(exp_pred["gt"])
            my_res.compute_and_update_metrics(gt_bkps, lasso_pred_bkps, pencoef_glasso_results, PRECI_RECALL_MARGIN)
            my_res.compute_assignement_cost(gt_bkps, lasso_pred_bkps, pencoef_glasso_results)
            pencoef_glasso_results["time"]["raw"].append(exp_pred["time"])

        full_results[pencoefmult] = pencoef_glasso_results

    # Results post-processing and saving.
    full_results = my_res.compute_and_add_stat_on_metrics(full_results)
    full_results["metadata"] = metrics_dic
    full_results = my_ut.turn_all_list_of_dict_into_str(full_results)
    my_ut.create_parent_and_dump_json(pred_dir, f'metrics_{PRECI_RECALL_MARGIN}.json', full_results, indent=4)

### Experiments with ```akopich/covcp```

Compréhension de la méthode : 

- un cpd est déclaré si, pour une des tailles de fenêtre, on atteint pour un des points centraux (de la fenêtre) une valeur de la statistique supérieure à la valeur critique (le seuil) calculée pour cette taille de fenêtre. 
- a priori les valeurs critiques sont calculées grâce au bootstrap ('data-driven approach')
- pour confirmer mon intuition sur la compréhension de la méthode :
    - aller voir dans la résolution de l'algo si je trouve d'où viennent les "critical values" pour confirmer qu'elles sont bien calculées grâce au bootstrap
    - a priori dans mon expérience on ne détecte le cp que pour la plus grande taille de fenêtre (parce que meilleure estimation ?)
        - aller voir comment évoluent les valeurs de la critical value en fonction de la taille de la fenêtre
        --> **les valeurs critiques diminuent systématiquement avec la taille des fenêtres considérées**
        - éventuellement faire des plot pour assess tout ça, et comprendre un peu mieux grâce à l'article comment fonctionne le bootstrap
    - concernant le niveau alpha :
        - comment influence les résultats : revenir à la définition pour voir si cohérent
        - voir si intervient bien dans le bootstrap au moment du calcul des valeurs critiques : censé les diminuer ou augmenter
    - comment influence la taille de l'échantillon 'stable' pour le bootstrap ?
        --> **jusque là, les expériences réalisées montrent que lorsqu'on diminue la taille du stableset, on diminue aussi les valeurs critiques**
    - comment influence la taille des segments de stationarité ?

A faire en code : 

- transformer les données python en données R (voir doc rpy2 sur le paragraphe associé)
- implémenter la détection de plusieurs ruptures (relire ce qu'ils disent dans l'article à ce sujet)

In [None]:
# Signal generation: two consecutive segments of multivariate normal samples
# (zero mean), the first drawn with covariance A1 and the second with A2,
# stacked row-wise into X.
p = 20
n_samples_1 = 400
n_samples_2 = 400
n_samples = n_samples_1 + n_samples_2
A1 = r_cp.gen_cov_mat(p, 10, "equal")
A2 = r_cp.gen_cov_mat(p, 10, "diagonal")
first_segment = r_mass.mvrnorm(n=n_samples_1, mu=r_base.rep(0, p), Sigma=A1)
second_segment = r_mass.mvrnorm(n=n_samples_2, mu=r_base.rep(0, p), Sigma=A2)
X = r_base.rbind(first_segment, second_segment)
# First entry of dim(X).
print(r_base.dim(X)[0])

In [None]:
# Signal generation with two change points: three zero-mean multivariate
# normal segments (covariances A1, A2, A3) stacked row-wise into X_mult.
p = 20
n_samples_1 = 400
n_samples_2 = 400
n_samples_3 = 200
n_samples = n_samples_1 + n_samples_2 + n_samples_3
A1 = r_cp.gen_cov_mat(p, 1, "equal")
A2 = r_cp.gen_cov_mat(p, 1, "diagonal")
A3 = r_cp.gen_cov_mat(p, 1, "equal")
# Segments are drawn in order, one (length, covariance) pair at a time.
segments = [
    r_mass.mvrnorm(n=segment_length, mu=r_base.rep(0, p), Sigma=segment_cov)
    for segment_length, segment_cov in (
        (n_samples_1, A1),
        (n_samples_2, A2),
        (n_samples_3, A3),
    )
]
X_mult = r_base.rbind(*segments)
print(r_base.dim(X_mult))

In [None]:
# Hyper-parameters
windows = r_base.c(50)  # R vector holding a single window size
alpha = 0.3  # same value as LEVEL_ALPHA in the experiment cells
len_stable_set = 190  # equals (20*19)//2, the STABLE_SET_LENGTH of the window sweep cell

In [None]:
# Register 5 cores with doMC and fix the seed on the R side (seed 42).
r_domc.registerDoMC(cores = 5)
r_base.set_seed(42)

In [None]:
# Sweep over covcp window sizes: for each entry of WINDOWS_SIZES_LIST, build
# the result folder name, collect the run metadata, register cores / seed on
# the R side, run covcp on the selected experiments and dump the metadata.
WINDOWS_SIZES_LIST = [[30], [50], [100], [150]]
for WINDOWS_SIZES in WINDOWS_SIZES_LIST:
    NAME =  "ER_20_nodes_deg_10_bandwidth_0.4"
    GRAPH_NAME =  NAME #"exp_geo_20_nodes_av_deg_10" #"ER_20_nodes_deg_10_bandwidth_0.4_edge_prop_0.05" 
    GRAPH_PATH =   "data_1/graphs/clean_ER_with_bandwidth"
    SIGNAL_PATH = "data_1/signal/within_hyp/noisy_varying_segment_length"
    SIGNAL_NAME =  "large_x1.0_SNR_10" + '_' + NAME
    # NOTE(review): with MAX_ID_SUBSET = 0 the experiment loop below iterates
    # over an empty range, so no covcp run happens and only the metadata file
    # is written.
    MAX_ID_SUBSET = 0
    RESULT_DIR =  "results_1/synthetic/within_hypothesis_noisy/varying_segment_length/r_covcp_experiments" 

    R_COVCP_SEED = 42
    NB_CORES = 2
    LEVEL_ALPHA = 0.3
    STABLE_SET_LENGTH = (20*19)//2  # = 190
    # WINDOWS_SIZES = [100]
    RESULT_NAME = f"test_alpha{LEVEL_ALPHA}_stablesetlength{STABLE_SET_LENGTH}_windows{'-'.join([str(w_s) for w_s in WINDOWS_SIZES])}" 

    now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    final_name = SIGNAL_NAME + "_" + RESULT_NAME
    results_dir = os.path.join(RESULT_DIR, final_name)

    # logging
    graph_path = os.path.join(GRAPH_PATH, GRAPH_NAME)
    signal_path = os.path.join(SIGNAL_PATH, SIGNAL_NAME)
    graph_metadata = my_ut.open_json(f"{graph_path}/00_graphs_metadata.json")
    signal_metadata = my_ut.open_json(f"{signal_path}/00_signal_metadata.json")
    seg_length_hyp = "large"

    covcp_description = "Test on the different hyper-parameters to improve results understanding and parametrization"
    # NOTE(review): detect_multiple_bkps is not defined in the visible part of
    # this notebook; this line raises NameError unless it is defined elsewhere.
    covcp_metadata = {"datetime": now, "description": covcp_description, "commit hash": my_ut.get_git_head_short_hash(), "graph folder": graph_path, "graph metadata": graph_metadata, "signal folder": SIGNAL_PATH + '/' + SIGNAL_NAME, "signal metadata": signal_metadata, "r_covcp seed": R_COVCP_SEED, "level alpha": LEVEL_ALPHA, "windows sizes": WINDOWS_SIZES, "length of the stable set": STABLE_SET_LENGTH, "nb_cores": NB_CORES, 'cpd algo func': detect_multiple_bkps.__name__}
    covcp_results = {}

    r_domc.registerDoMC(cores = NB_CORES)
    r_base.set_seed(R_COVCP_SEED)
    # Only the first window size of the current entry is sent to R, although
    # RESULT_NAME joins all of them.
    r_windows_sizes = r_base.c(WINDOWS_SIZES[0]) 

    # running CPD algorithms
    for exp_id in tqdm(range(MAX_ID_SUBSET), desc='Running experiment...'):
        exp_id = str(exp_id)
        r_signal = r_RcppCNPy.npyLoad(signal_path + f'/{exp_id}_signal.npy')
        gt_bkps = my_ut.open_json(signal_path + f'/{exp_id}_bkps.json')
        # NOTE(review): run_r_covcp_algo is not defined in the visible source,
        # and it is called here with 7 positional arguments whereas the
        # real-data cell passes 8 (with a min_size) — confirm the signature.
        run_r_covcp_algo(r_signal, gt_bkps, covcp_results, STABLE_SET_LENGTH, r_windows_sizes, LEVEL_ALPHA, exp_id)

    my_ut.create_parent_and_dump_json(results_dir, "covcp_metadata.json", my_ut.turn_all_list_of_dict_into_str(covcp_metadata), indent=4)
    # my_ut.create_parent_and_dump_json(results_dir, "r_covcp_pred.json", my_ut.turn_all_list_of_dict_into_str(covcp_results), indent=4)

### Experiments with ```HaotianXu/changepoints```

In [None]:
# Look at how rpy2 exposes the object returned by gen_cov_mat: print the type
# and value of the whole object, then of its first element.
p = 4
A1 = r_cp.gen_cov_mat(p, 1, "equal")
for inspected in (A1, A1[0]):
    print(type(inspected))
    print(inspected)

In [None]:
# Four consecutive zero-mean multivariate normal segments of 300 samples
# each. Every draw is transposed before the segments are bound column-wise,
# so X is laid out the other way round compared with the rbind-based cells.
p = 10
n_samples_1 = 300
n_samples_2 = 300
n_samples_3 = 300
n_samples_4 = 300
n_samples = sum((n_samples_1, n_samples_2, n_samples_3, n_samples_4))
A1 = r_cp.gen_cov_mat(p, 10, "equal")
A2 = r_cp.gen_cov_mat(p, 10, "diagonal")
A3 = r_cp.gen_cov_mat(p, 10, "power")
A4 = r_cp.gen_cov_mat(p, 10, "power")
# Segments are drawn in order, one (length, covariance) pair at a time.
transposed_segments = [
    r_base.t(r_mass.mvrnorm(n=segment_length, mu=r_base.rep(0, p), Sigma=segment_cov))
    for segment_length, segment_cov in (
        (n_samples_1, A1),
        (n_samples_2, A2),
        (n_samples_3, A3),
        (n_samples_4, A4),
    )
]
X = r_base.cbind(*transposed_segments)

In [None]:
# Compare the dimensions of a single (untransposed) mvrnorm draw with those
# of the assembled X. Note that this draws a fresh sample just to read its
# shape.
print(r_base.dim(r_mass.mvrnorm(n = n_samples_1, mu = r_base.rep(0, p), Sigma = A1)))
print(r_base.dim(X))

In [None]:
# Run changepoints' BS_cov on X over the index range [1, n_samples], then
# inspect the returned R object: its type, its printed form, its length and
# its first four components.
simple_BS_cov = r_cp.BS_cov(X, 1, n_samples)

print(type(simple_BS_cov))
print(simple_BS_cov)
print('length of the BS object:', r_base.length(simple_BS_cov))
for component_idx in range(4):
    bs_component = simple_BS_cov[component_idx]
    print(type(bs_component))
    print(bs_component)

In [None]:
# Threshold the BS_cov output and inspect the result: type, printed form,
# length and first two components.
# NOTE(review): `threshold` is assigned but never used; the call below
# hard-codes 10 instead — confirm which value is intended.
threshold = 4
threshoded_dBS = r_cp.thresholdBS(simple_BS_cov, 10)

print(type(threshoded_dBS))
print(threshoded_dBS)
print('length of the thresholded BS object:', r_base.length(threshoded_dBS))
for component_idx in range(2):
    thresholded_component = threshoded_dBS[component_idx]
    print(type(thresholded_component))
    print(thresholded_component)