# Convergencia Tipo X

In [1]:
import numpy as np
import pandas as pd

import sys, os

from matplotlib.patches import Ellipse
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
import sys, os; sys.path.append(os.path.dirname(os.getcwd()))
from pyfrechet.metric_spaces import MetricData, LogCholesky, spd_to_log_chol, log_chol_to_spd
from pyfrechet.regression.bagged_regressor import BaggedRegressor
from pyfrechet.regression.trees import Tree
from sklearn.model_selection import train_test_split
from pyfrechet.metric_spaces import MetricData, LogEuclidean, CustomAffineInvariant, CustomLogEuclidean, AffineInvariant, LogCholesky, log_chol_to_spd, spd_to_log_chol

from scipy.special import digamma
from scipy.stats import wishart

from typing import Union
import random

INFO: Using numpy backend


## Functions

In [2]:
def generate_random_spd_matrix(q_array, limits_unif = 30, seed = 1):
    """Generate random symmetric positive (semi-)definite matrices.

    For every size ``q`` in ``q_array`` a ``q x q`` matrix ``G`` is drawn with
    i.i.d. entries uniform on ``[-limits_unif/2, limits_unif/2)`` and
    ``G @ G.T`` is returned. Such a product is always symmetric positive
    semi-definite, and positive definite whenever ``G`` is non-singular.

    Parameters
    ----------
    q_array : int or array-like of int
        Size of each matrix to generate; one matrix is produced per entry.
    limits_unif : float, default 30
        Width of the uniform interval (centred at zero) for the entries of ``G``.
    seed : int or None, default 1
        Seed of the private random generator. The same seed always yields the
        same matrices; ``None`` seeds from the operating system.

    Returns
    -------
    list of numpy.ndarray
        One ``q x q`` matrix per entry of ``q_array``.
    """
    # Keep the generator: the original built a RandomState and discarded it,
    # so `seed` had no effect and the global numpy RNG was used instead.
    rng = np.random.RandomState(seed)

    sizes = np.atleast_1d(np.asarray(q_array, dtype = int))
    factors = [(rng.rand(q, q) - 1/2) * limits_unif for q in sizes]
    return [np.dot(G, G.T) for G in factors]

# Anchor matrices of the regression curve: Sigma_t moves from the first to
# the second and then on to the third as t goes from 0 to 1.
Sigma_1 = np.array(((1, -0.6),
                    (-0.6, 0.5)))
Sigma_2 = np.eye(2, dtype=int)
Sigma_3 = np.array(((0.5, 0.4),
                    (0.4, 1)))

Sigmas = (Sigma_1, Sigma_2, Sigma_3)

def Sigma_t(t_array, Sigma_array):
    """Evaluate the matrix-valued regression curve at the given times.

    The curve interpolates between THREE matrices: it equals
    ``Sigma_array[0]`` at ``t = 0``, ``Sigma_array[1]`` at ``t = 0.5`` and
    ``Sigma_array[2]`` at ``t = 1``, using squared-cosine weights:

    * ``t < 0.5``:  ``cos(pi t)^2 * Sigma_array[0] + (1 - cos(pi (1 - t))^2) * Sigma_array[1]``
    * ``t >= 0.5``: ``(1 - cos(pi t)^2) * Sigma_array[1] + cos(pi (1 - t))^2 * Sigma_array[2]``

    Parameters
    ----------
    t_array : float or array-like of float
        Time points at which to evaluate the curve.
    Sigma_array : sequence of three (q, q) arrays
        The matrices to interpolate between.

    Returns
    -------
    numpy.ndarray of shape (len(t_array), q, q)
        The interpolated matrix for every time point.
    """
    # Shape (n, 1, 1) so the scalar weights broadcast against the (q, q) matrices.
    t = np.atleast_1d(np.asarray(t_array, dtype=float))[:, None, None]

    weight_t = np.cos(np.pi * t) ** 2
    weight_1mt = np.cos(np.pi * (1 - t)) ** 2

    first_half = weight_t * Sigma_array[0] + (1 - weight_1mt) * Sigma_array[1]
    second_half = (1 - weight_t) * Sigma_array[1] + weight_1mt * Sigma_array[2]

    # A single select replaces the original sum of two zero-padded np.where calls.
    return np.where(t < 0.5, first_half, second_half)


def sim_regression_matrices(Sigmas: tuple,
                            t: np.array,
                            df: int=2,
                            random_state=None):
    """Simulate Wishart-distributed responses around the curve ``Sigma_t``.

    For every time ``t[k]`` one matrix is drawn from a Wishart distribution
    with ``df`` degrees of freedom and scale ``Sigma_t(t[k]) / c_dq``, where
    ``c_dq = 2 * exp(mean_j digamma((df - j + 1) / 2))`` for ``j = 1..q``.

    Parameters
    ----------
    Sigmas : tuple of (q, q) arrays
        Matrices handed to ``Sigma_t`` to build the regression curve.
    t : float or array-like of float
        Covariate (time) values at which responses are simulated.
    df : float, default 2
        Degrees of freedom of the Wishart distribution.
    random_state : None, int, numpy.random.RandomState or numpy.random.Generator
        Source of randomness. ``None`` (default) keeps the original behaviour
        of drawing from numpy's global random state; an int seeds a private
        generator shared by all draws, making the simulation reproducible.

    Returns
    -------
    dict
        ``{'t': <array of times>, 'y': <list with one sampled matrix per time>}``.
    """
    t = np.atleast_1d(np.asarray(t))

    # Turn an integer seed into ONE generator shared by every draw; passing the
    # bare int to each rvs call would restart the stream for every time point.
    if random_state is None or isinstance(random_state, (np.random.RandomState, np.random.Generator)):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    q = Sigmas[0].shape[0]

    # Normalising constant of the Wishart scale (depends only on df and q).
    c_dq = 2 * np.exp((1 / q) * np.sum(digamma((df - np.arange(1, q + 1) + 1) / 2)))
    sigma_t = Sigma_t(t, Sigmas)
    sample_Y = [wishart(df=df, scale=sigma_t[k] / c_dq).rvs(size=1, random_state=rng)
                for k in range(t.shape[0])]
    return {'t': t, 'y': sample_Y}


def plot_ellipse(mat: np.ndarray, ax, 
                 xy: tuple=(0,0),
                 scale_factor=1,
                 edgecolor='red',
                 facecolor='None',
                 linewidth=2,
                 alpha=1):
    """Draw a 2 x 2 symmetric matrix as an ellipse on ``ax``.

    The ellipse axes point along the eigenvectors of ``mat`` and their full
    lengths are ``scale_factor * sqrt(eigenvalue)``.

    Parameters
    ----------
    mat : numpy.ndarray of shape (2, 2)
        Symmetric positive (semi-)definite matrix to draw.
    ax : matplotlib axes
        Axes that receive the ellipse patch.
    xy : tuple, default (0, 0)
        Centre of the ellipse.
    scale_factor : float, default 1
        Multiplier applied to both axis lengths.
    edgecolor, facecolor, linewidth, alpha
        Forwarded to ``matplotlib.patches.Ellipse`` (``linewidth`` as ``lw``).
    """
    # eigh is the symmetric solver: it always returns real eigenvalues and
    # orthonormal eigenvectors, whereas eig can return complex output when
    # rounding makes the input slightly non-symmetric.
    eigenvalues, eigenvectors = np.linalg.eigh(mat)
    # Guard against tiny negative eigenvalues produced by rounding, which
    # would turn into NaN under the square root.
    eigenvalues = np.clip(eigenvalues, 0, None)

    # Orientation of the first eigenvector; `width` is measured along it.
    first_axis = eigenvectors[:, 0]
    theta = np.degrees(np.arctan2(first_axis[1], first_axis[0]))
    ellipse = Ellipse(xy=xy,
                  width=scale_factor*np.sqrt(eigenvalues[0]),
                  height=scale_factor*np.sqrt(eigenvalues[1]),
                  angle=theta,
                  edgecolor=edgecolor,
                  facecolor=facecolor,
                  lw=linewidth,
                  alpha=alpha)
    ax.add_patch(ellipse)


def plot_OOB_balls_SPD( predictions: np.ndarray,
                        indices_to_plot: list[int],
                        Ralpha: float,
                        ax,
                        alpha: float = 0.05,
                        reference: Union[np.ndarray, None]=None,
                        scale_factor: float=1/10,
                        xy_factor: float=50,
                        df: int=5,
                        MC_samples: int=100,
                        edge_color='deepskyblue',
                        dist : str = 'LC',
                        limits_unif : int = 30
                        ) -> None:
    """Visualise prediction balls of radius ``Ralpha`` around predicted 2 x 2 SPD matrices.

    For every index in ``indices_to_plot`` a Monte Carlo sample of random
    2 x 2 SPD matrices is generated; the ones whose distance to
    ``predictions[index]`` is at most ``Ralpha`` are drawn as faint ellipses.
    The prediction itself is drawn in black and, when ``reference`` is given,
    ``reference[index]`` is drawn in red. All ellipses of one index are
    centred at ``(index / xy_factor, 0)``.

    Parameters
    ----------
    predictions : indexable
        Predicted points, in the representation used by the chosen metric
        (for ``dist='LC'`` they are passed through ``log_chol_to_spd`` before
        plotting; otherwise they are plotted directly as matrices).
    indices_to_plot : list of int
        Positions in ``predictions`` (and ``reference``) to draw.
    Ralpha : float
        Radius of the ball in the chosen metric.
    ax : matplotlib axes
        Axes that receive the ellipses.
    alpha : float, default 0.05
        Transparency of the sampled ellipses (not a significance level).
    reference : indexable or None
        Optional reference points, same representation as ``predictions``.
    scale_factor : float
        Size multiplier forwarded to ``plot_ellipse``.
    xy_factor : float
        Horizontal spacing divisor: the ellipse for index ``i`` sits at ``i / xy_factor``.
    df : int
        Unused; kept for backward compatibility. The original passed it as the
        matrix dimension of the Monte Carlo sample when ``reference`` was
        ``None``, which produced ``df x df`` matrices incompatible with the
        dimension-2 metric.
    MC_samples : int
        Number of random matrices generated per index.
    edge_color : colour
        Edge colour of the sampled ellipses.
    dist : {'LC', 'AI', other}
        Metric: Log-Cholesky, affine-invariant, or Log-Euclidean for any other value.
    limits_unif : int
        Forwarded to ``generate_random_spd_matrix``.
    """
    def _identity(A):
        return A

    # Choose the metric and the maps between SPD matrices and the
    # representation that metric works in.
    if dist == 'LC':
        M = LogCholesky(dim = 2)
        to_metric, to_spd = spd_to_log_chol, log_chol_to_spd
    elif dist == 'AI':
        M = CustomAffineInvariant(dim = 2)
        to_metric, to_spd = _identity, _identity
    else:
        M = LogEuclidean(dim = 2)
        to_metric, to_spd = _identity, _identity

    for index_to_plot in indices_to_plot:
        centre = (index_to_plot/xy_factor, 0)
        prediction = predictions[index_to_plot]

        # Always sample 2 x 2 matrices: both the metric and the ellipse plot are two-dimensional.
        sample = generate_random_spd_matrix(q_array=np.repeat(2, MC_samples), limits_unif = limits_unif, seed=4)
        for A in sample:
            A = to_metric(A)
            if M.d(A, prediction) <= Ralpha:
                plot_ellipse(to_spd(A), ax=ax, xy=centre, scale_factor=scale_factor,
                             edgecolor=edge_color, alpha=alpha)

        plot_ellipse(to_spd(prediction), ax=ax, xy=centre,
                     scale_factor=scale_factor, edgecolor='black', alpha=1)

        if reference is not None:
            plot_ellipse(to_spd(reference[index_to_plot]), ax=ax, xy=centre,
                         scale_factor=scale_factor, edgecolor='red', alpha=1)

In [3]:
dfs_names = [2, 2.5, 3, 3.5, 4, 5, 6]

# Obtain coverage results dataframe from the results files
def coverage_results(dfs: list, dist: str= 'LC') -> pd.DataFrame:
    """Collect the stored simulation results of one distance into a DataFrame.

    Every ``.npy`` file in ``<cwd>/results`` whose name starts with ``dist``
    contributes one row. File names are split on ``'_'`` and read as:
    part 0 = distance, part 2 = ``'samp<k>'`` (sample index),
    part 3 = ``'N<size>'`` (training size), part 4 = ``'df<j>'`` where ``j``
    is a 1-based position in ``dfs``.

    Parameters
    ----------
    dfs : list
        Degrees-of-freedom values; the file's ``df<j>`` token selects ``dfs[j - 1]``.
    dist : str, default 'LC'
        Distance prefix of the files to load.

    Returns
    -------
    pandas.DataFrame
        One row per file with columns ``sample_index``, ``train_size``, ``df``
        (all categorical), ``y_train_data``, ``train_predictions``,
        ``OOB_quantile``, ``OOB_errors``, ``forest`` and ``distance``.
    """
    columns = ['sample_index', 'train_size', 'df', 'y_train_data', 'train_predictions',
               'OOB_quantile', 'OOB_errors', 'forest', 'distance']
    results_dir = os.path.join(os.getcwd(), 'results')

    # Collect plain records and build the frame once: repeated pd.concat is
    # quadratic and concatenating onto an empty frame is deprecated in pandas.
    records = []
    # Sorted so the row order does not depend on the file system's listing order.
    for file in sorted(os.listdir(results_dir)):
        name_parts = file.split('_')
        if not (file.endswith('.npy') and name_parts[0] == dist):
            continue
        # SECURITY: allow_pickle=True can execute arbitrary code while loading;
        # only read result files produced by trusted runs.
        with open(os.path.join(results_dir, file), 'rb') as infile:
            result = np.load(infile, allow_pickle=True).item()
        records.append({'sample_index': int(name_parts[2][4:]),
                        'train_size': int(name_parts[3][1:]),
                        'df': dfs[int(name_parts[4][2:])-1],
                        'y_train_data': result['y_train_data'],
                        'train_predictions': result['train_predictions'],
                        'OOB_quantile': result['OOB_quantile'],
                        'OOB_errors': result['OOB_errors'],
                        'forest': result['forest'],
                        'distance': dist})

    coverage_df = pd.DataFrame(records, columns=columns)
    for column in ('train_size', 'sample_index', 'df'):
        coverage_df[column] = coverage_df[column].astype('category')
    return coverage_df

# Load the stored results for the affine-invariant ('AI') distance only.
# The LC / LE loads and the combined frame below are commented out, so
# coverage_df_LC, coverage_df_LE and coverage_df_combined are NOT defined here.
coverage_df_AI=coverage_results(dfs = dfs_names, dist = 'AI')
#coverage_df_LC=coverage_results(dfs = dfs_names, dist = 'LC')
#coverage_df_LE=coverage_results(dfs = dfs_names, dist = 'LE')
#
#coverage_df_combined = pd.concat([coverage_df_AI, coverage_df_LC, coverage_df_LE], ignore_index=True)
#print(coverage_df_AI.info())
#print(coverage_df_LC.info())
#print(coverage_df_LE.info())

In [4]:
# Monte Carlo settings: every coverage estimate averages over m fitted
# forests, and n_estimations such estimates are produced per configuration.
m = 20
n_estimations = 10

zeros_init = np.zeros(shape = (n_estimations, 3))
cov = np.zeros(shape = (n_estimations, 3))

# diccionario['df_<df>'][<distance>][<N>] holds an (n_estimations, 3) array of
# coverage estimates. Each slot gets its OWN zero array; the original put the
# same array object in every slot, so an in-place write would have leaked
# into all of them.
# NOTE(review): the following cell re-creates `diccionario`, so the results
# stored here are discarded when the notebook is run top to bottom.
diccionario = {
    'df_5': {dist_name: {str(size): zeros_init.copy() for size in (50, 100, 200, 500)}
             for dist_name in ('AI', 'LC', 'LE')}}

# Obtain n_estimations estimates of the coverage for each distance and N, to
# calculate the mean of the estimations and their variability.
for df in [5]:
    for dist in ['AI']:
        # Only the AI results are loaded (coverage_df_AI). To analyse 'LC' or
        # 'LE', load coverage_df_LC / coverage_df_LE first and select the
        # matching frame and metric (LogCholesky / LogEuclidean) here.
        coverage_df = coverage_df_AI[coverage_df_AI['df'] == df]
        M = CustomAffineInvariant(dim = 2)
        
        for N in [50, 100, 200, 500]:
            # Select the size of the training set
            coverage_df_N = coverage_df[coverage_df['train_size'] == N]
            for estimation in range(n_estimations):
                # Draw the m test covariates and responses up front, then
                # randomly pick m stored forests (rows) without replacement.
                new_ts = np.random.uniform(size = m)
                new_ys = sim_regression_matrices(Sigmas = (Sigma_1, Sigma_2, Sigma_3), 
                                                t = new_ts,  
                                                df = df)
                lns = coverage_df_N.sample(n=m, replace=False)

                # One boolean row per forest: is the new response inside the
                # prediction ball for each stored OOB quantile?
                inside = []
                for new_t, new_y, (_, ln) in zip(new_ts, new_ys['y'], lns.iterrows()):
                    # Predict the new observation with this row's forest
                    new_pred = ln['forest'].predict_matrix(new_t.reshape(-1,1))
                    inside.append(M.d(new_pred, new_y) <= ln['OOB_quantile'])
                yesno = np.vstack(inside)
                cov[estimation, :] = yesno.sum(axis=0) / m
                
            # Copy, because `cov` is overwritten for the next N.
            diccionario['df_'+str(df)][dist][str(N)] = np.copy(cov)

In [5]:
# Monte Carlo settings: every coverage estimate averages over m fitted
# forests, and n_estimations such estimates are produced per configuration.
# NOTE(review): no random seed is set, so the numbers change between runs.
m = 20
n_estimations = 10

zeros_init = np.zeros(shape = (n_estimations, 3))
cov = np.zeros(shape = (n_estimations, 3))

# diccionario['df_<df>'][<distance>][<N>] holds an (n_estimations, 3) array of
# coverage estimates. Each slot gets its OWN zero array; the original put the
# same array object in every slot, so an in-place write would have leaked
# into all of them. Slots that the loop below never fills stay at zero.
diccionario = {
    'df_' + str(dof): {dist_name: {str(size): zeros_init.copy() for size in (50, 100, 200, 500)}
                       for dist_name in ('AI', 'LC', 'LE')}
    for dof in (2, 5)}

# Obtain n_estimations estimates of the coverage for each distance and N, to
# calculate the mean of the estimations and their variability.
for df in [2, 5]:
    for dist in ['AI']:
        # Only the AI results are loaded (coverage_df_AI). To analyse 'LC' or
        # 'LE', load coverage_df_LC / coverage_df_LE first and select the
        # matching frame and metric (LogCholesky / LogEuclidean) here.
        coverage_df = coverage_df_AI[coverage_df_AI['df'] == df]
        M = CustomAffineInvariant(dim = 2)
        
        for N in [50, 100, 200, 500]:
            # Select the size of the training set
            coverage_df_N = coverage_df[coverage_df['train_size'] == N]
            for estimation in range(n_estimations):
                # Randomly select m stored forests (rows) without replacement
                lns = coverage_df_N.sample(n=m, replace=False)

                # One boolean row per forest: is the new response inside the
                # prediction ball for each stored OOB quantile?
                inside = []
                for _, ln in lns.iterrows():
                    # Generate one random point to test if it belongs to the prediction ball
                    new_t = np.random.uniform(size = 1)
                    # Predict the new observation
                    new_pred = ln['forest'].predict_matrix(new_t.reshape(-1,1))
                    new_y = sim_regression_matrices(Sigmas = (Sigma_1, Sigma_2, Sigma_3), 
                                t = new_t,  
                                df = df)['y'][0]
                    inside.append(M.d(new_pred, new_y) <= ln['OOB_quantile'])
                yesno = np.vstack(inside)
                cov[estimation, :] = yesno.sum(axis=0) / m
                
            # Copy, because `cov` is overwritten for the next N.
            diccionario['df_'+str(df)][dist][str(N)] = np.copy(cov)



In [6]:
# Report the mean and the standard deviation of the coverage estimates for
# every (degrees of freedom, N) pair of the AI runs.
for df in [2, 5]:
    for dist in ['AI']:
        for N in [50, 100, 200, 500]:
            estimates = diccionario['df_'+str(df)][dist][str(N)]
            header = f"{df} degrees of freedom, N = {N}, {dist} distance, "
            print(header + "mean of Type I coverage estimates: ", np.mean(estimates, axis = 0))
            print(header + "standard deviation of Type I coverage estimates: ", np.std(estimates, axis = 0))

2 degrees of freedom, N = 50, AI distance, mean of Type I coverage estimates:  [0.995 0.94  0.895]
2 degrees of freedom, N = 50, AI distance, standard deviation of Type I coverage estimates:  [0.015      0.05385165 0.07566373]
2 degrees of freedom, N = 100, AI distance, mean of Type I coverage estimates:  [0.985 0.96  0.93 ]
2 degrees of freedom, N = 100, AI distance, standard deviation of Type I coverage estimates:  [0.02291288 0.04358899 0.04      ]
2 degrees of freedom, N = 200, AI distance, mean of Type I coverage estimates:  [0.99  0.95  0.905]
2 degrees of freedom, N = 200, AI distance, standard deviation of Type I coverage estimates:  [0.02       0.03872983 0.04153312]
2 degrees of freedom, N = 500, AI distance, mean of Type I coverage estimates:  [0.995 0.95  0.885]
2 degrees of freedom, N = 500, AI distance, standard deviation of Type I coverage estimates:  [0.015      0.03162278 0.06344289]
5 degrees of freedom, N = 50, AI distance, mean of Type I coverage estimates:  [0.97  

In [7]:
# Build the summary table: one row per (df, N), one "mean (std)" cell per
# (distance, level) combination.
# NOTE(review): distances that the simulation cell never fills keep their
# zero initialisation in `diccionario` and therefore show up here as
# "0.0000 (0.0000)" - those cells are placeholders, not results.
rows = []
index = []

for dof in (2, 5):
    for N in (50, 100, 200, 500):
        row = []
        for dist in ('AI', 'LC', 'LE'):
            estimates = diccionario[f'df_{dof}'][dist][str(N)]
            means = np.mean(estimates, axis=0)
            stds = np.std(estimates, axis=0)
            # Format as "mean (std)"
            row += [f"{means[k]:.4f} ({stds[k]:.4f})" for k in range(3)]
        rows.append(row)
        index.append((f"df={dof}", f"N={N}"))

# Hierarchical labels for the rows (df, N) and the columns (distance, level)
row_index = pd.MultiIndex.from_tuples(index, names=["df", "N"])
col_index = pd.MultiIndex.from_product(
    [["AI", "LC", "LE"], ["0.01", "0.05", "0.1"]],
    names=["Distance", "Significance Level"]
)

# Assemble the table and show it as the cell output
df = pd.DataFrame(rows, index=row_index, columns=col_index)

df


Unnamed: 0_level_0,Distance,AI,AI,AI,LC,LC,LC,LE,LE,LE
Unnamed: 0_level_1,Significance Level,0.01,0.05,0.1,0.01,0.05,0.1,0.01,0.05,0.1
df,N,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
df=2,N=50,0.9950 (0.0150),0.9400 (0.0539),0.8950 (0.0757),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000)
df=2,N=100,0.9850 (0.0229),0.9600 (0.0436),0.9300 (0.0400),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000)
df=2,N=200,0.9900 (0.0200),0.9500 (0.0387),0.9050 (0.0415),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000)
df=2,N=500,0.9950 (0.0150),0.9500 (0.0316),0.8850 (0.0634),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000)
df=5,N=50,0.9700 (0.0332),0.9250 (0.0559),0.9100 (0.0700),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000)
df=5,N=100,0.9850 (0.0320),0.9450 (0.0522),0.8900 (0.0735),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000)
df=5,N=200,0.9650 (0.0391),0.9500 (0.0387),0.8800 (0.0510),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000)
df=5,N=500,0.9950 (0.0150),0.9750 (0.0335),0.9250 (0.0680),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000),0.0000 (0.0000)


In [8]:
import pandas as pd
def format_cell(value):
    """Re-render a ``"mean (std)"`` string with three decimals for both numbers."""
    mean_text, std_text = value.split(" ")
    std_value = float(std_text.strip("()"))
    return f"{float(mean_text):.3f} ({std_value:.3f})"

# Apply formatting to all cells.
# DataFrame.applymap is deprecated (pandas 2.1 renamed it to DataFrame.map),
# so use the new name when this pandas version provides it.
_format_all_cells = df.map if hasattr(pd.DataFrame, "map") else df.applymap
df = _format_all_cells(format_cell)

# Render the formatted table as LaTeX source (multi-row / multi-column headers).
latex = df.to_latex(index=True, multirow=True, multicolumn=True, multicolumn_format='c', bold_rows=True, float_format= "%.3f" , caption='Type I error coverage for different distances, degrees of freedom and sample sizes', label='tab:typeIerrorcoverage')

  df = df.applymap(format_cell)


In [9]:
# Print the LaTeX source held in `latex` (the output of df.to_latex).
print(latex)

\begin{table}
\caption{Type I error coverage for different distances, degrees of freedom and sample sizes}
\label{tab:typeIerrorcoverage}
\begin{tabular}{lllllllllll}
\toprule
 & Distance & \multicolumn{3}{c}{AI} & \multicolumn{3}{c}{LC} & \multicolumn{3}{c}{LE} \\
 & Significance Level & 0.01 & 0.05 & 0.1 & 0.01 & 0.05 & 0.1 & 0.01 & 0.05 & 0.1 \\
df & N &  &  &  &  &  &  &  &  &  \\
\midrule
\multirow[t]{4}{*}{\textbf{df=2}} & \textbf{N=50} & 0.995 (0.015) & 0.940 (0.054) & 0.895 (0.076) & 0.000 (0.000) & 0.000 (0.000) & 0.000 (0.000) & 0.000 (0.000) & 0.000 (0.000) & 0.000 (0.000) \\
\textbf{} & \textbf{N=100} & 0.985 (0.023) & 0.960 (0.044) & 0.930 (0.040) & 0.000 (0.000) & 0.000 (0.000) & 0.000 (0.000) & 0.000 (0.000) & 0.000 (0.000) & 0.000 (0.000) \\
\textbf{} & \textbf{N=200} & 0.990 (0.020) & 0.950 (0.039) & 0.905 (0.042) & 0.000 (0.000) & 0.000 (0.000) & 0.000 (0.000) & 0.000 (0.000) & 0.000 (0.000) & 0.000 (0.000) \\
\textbf{} & \textbf{N=500} & 0.995 (0.015) & 0.950 (0.032)