# Convergencia Tipo X

In [1]:
import numpy as np
import pandas as pd

import sys, os

from matplotlib.patches import Ellipse
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
import sys, os; sys.path.append(os.path.dirname(os.getcwd()))
from pyfrechet.metric_spaces import MetricData, LogCholesky, spd_to_log_chol, log_chol_to_spd
from pyfrechet.regression.bagged_regressor import BaggedRegressor
from pyfrechet.regression.trees import Tree
from sklearn.model_selection import train_test_split
from pyfrechet.metric_spaces import MetricData, LogEuclidean, CustomAffineInvariant, CustomLogEuclidean, AffineInvariant, LogCholesky, log_chol_to_spd, spd_to_log_chol

from scipy.special import digamma
from scipy.stats import wishart

from typing import Union
import random
from pyfrechet.metric_spaces import wasserstein_1d as ws
from pyfrechet.metric_spaces import MetricData, Wasserstein1D
from scipy import stats 
from typing import Union

INFO: Using numpy backend


## Functions

In [2]:
# Probability levels at which every quantile function in this notebook is evaluated:
# 100 equally spaced points from 0.01 to 0.99 (both endpoints included).
GRID = np.linspace(start=0.01, stop=0.99, num=100)

# Quantile function of the standard normal distribution evaluated on GRID.
STD_NORMAL_Q = stats.norm.ppf(GRID, loc=0.0, scale=1.0)


def sample_linear_transport(x, sig=None, gam=None):
    """Draw one random quantile curve for covariate ``x``.

    The curve is ``gam - log(1 + x) + (sig + x**2) * STD_NORMAL_Q``. When
    ``sig + x**2 > 0`` this is the quantile function of a normal distribution
    with mean ``gam - log(1 + x)`` and standard deviation ``sig + x**2``,
    evaluated at the probability levels used to build ``STD_NORMAL_Q``.

    Parameters
    ----------
    x : float
        Covariate value. ``1 + x`` must be positive for the logarithm to be finite.
    sig : float, optional
        Scale perturbation. If ``None`` (default) it is drawn with
        ``np.random.exponential(0.5)``.
    gam : float, optional
        Location perturbation. If ``None`` (default) it is drawn with
        ``np.random.gamma(0.5, 0.5)``.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``STD_NORMAL_Q``.
    """
    # Previously both arguments were overwritten unconditionally, so values passed
    # by the caller were silently ignored. They are now honoured when given; calls
    # that omit them behave exactly as before. The draw order (gam first, then sig)
    # is preserved so that seeded runs produce the same stream of random numbers.
    if gam is None:
        gam = np.random.gamma(0.5, 0.5)
    if sig is None:
        sig = np.random.exponential(0.5)
    return gam - np.log(1 + x) + (sig + x**2) * STD_NORMAL_Q
    
def gen_data(N):
    """Simulate ``N`` covariate/response pairs.

    Covariates are drawn with ``np.random.uniform(0, 1, N)``. Each response is
    the array returned by ``sample_linear_transport`` for the matching covariate.

    Returns
    -------
    dict
        ``{'x': covariates, 'y': responses}``, in that key order, where
        ``responses`` stacks the ``N`` response arrays along the first axis.
    """
    covariates = np.random.uniform(0, 1, N)
    responses = np.array([sample_linear_transport(x_i) for x_i in covariates])
    return dict(x=covariates, y=responses)

In [3]:
# Obtain coverage results dataframe from the results files
def coverage_results() -> pd.DataFrame:
    """Collect the stored simulation results into a single dataframe.

    Every ``*.npy`` file in ``<current working directory>/wass_results`` is
    loaded and contributes one row. Files are processed in sorted name order so
    the row order does not depend on the file system.

    Returns
    -------
    pd.DataFrame
        One row per result file with columns

        - ``sample_index``: categorical; the integer parsed from the file name
          (second ``_``-separated token with its first four characters dropped),
        - ``y_train_data``, ``train_predictions``, ``forest``: the entries of the
          same name stored in the file,
        - ``OOB_quantile``: the stored entry of that name, or NaN when the file
          does not contain one.

        An empty frame with these columns is returned when no ``.npy`` file exists.

    Raises
    ------
    FileNotFoundError
        If the ``wass_results`` directory does not exist.
    KeyError
        If a file lacks ``y_train_data``, ``train_predictions`` or ``forest``.
    """
    columns = ['sample_index', 'y_train_data', 'train_predictions', 'OOB_quantile', 'forest']
    results_dir = os.path.join(os.getcwd(), 'wass_results')

    # Accumulate plain records and build the frame once; concatenating inside the
    # loop copies the whole frame on every iteration.
    records = []
    for file in sorted(os.listdir(results_dir)):
        if not file.endswith('.npy'):
            continue
        # NOTE(security): allow_pickle=True executes pickled code on load.
        # Only point this at result files produced by a trusted run.
        # Passing the path lets numpy open and close the file itself, so the
        # handle is released even if loading fails.
        result = np.load(os.path.join(results_dir, file), allow_pickle=True).item()
        records.append({
            'sample_index': int(file.split('_')[1][4:]),
            'y_train_data': result['y_train_data'],
            'train_predictions': result['train_predictions'],
            # This column was declared but never filled, leaving it all-NaN even
            # though downstream code compares distances against it.
            'OOB_quantile': result.get('OOB_quantile', np.nan),
            'forest': result['forest'],
        })

    coverage_df = pd.DataFrame(records, columns=columns)
    coverage_df['sample_index'] = coverage_df['sample_index'].astype('category')
    return coverage_df

# Load every stored result once; the coverage loop further down samples rows from this frame.
coverage_df=coverage_results()
# NOTE(review): the commented-out calls below pass `dfs`/`dist` arguments, which the
# `coverage_results()` defined in this notebook does not accept. They look like
# leftovers from an earlier multi-distance version - confirm and delete if obsolete.
#coverage_df_LC=coverage_results(dfs = dfs_names, dist = 'LC')
#coverage_df_LE=coverage_results(dfs = dfs_names, dist = 'LE')
#
#coverage_df_combined = pd.concat([coverage_df, coverage_df_LC, coverage_df_LE], ignore_index=True)
#print(coverage_df.info())
#print(coverage_df_LC.info())
#print(coverage_df_LE.info())

: 

In [7]:
# Number of independent coverage estimates; their spread gives the sampling variability.
n_estimations = 100

# Number of fresh test points used for each coverage estimate.
MC = 1000

# Number of entries per row of `cov`.
# NOTE(review): presumably one per nominal level stored in `OOB_quantile` - confirm.
N_LEVELS = 3

# Not used in this cell; kept because other cells may rely on it.
zeros_init = np.zeros(shape = (n_estimations, N_LEVELS))
cov = np.zeros(shape = (n_estimations, N_LEVELS))

# Obtain `n_estimations` estimates of the empirical coverage, so that their mean
# and sample variance can be computed afterwards.
# NOTE(review): no random seed is set, so the numbers change on every run.
M = Wasserstein1D()

for estimation in range(n_estimations):
    # Fresh test sample. Index by key rather than relying on the dict's value order.
    new_data = gen_data(MC)
    new_ts, new_ys = new_data['x'], new_data['y']

    # Pair every test point with a stored forest, drawn with replacement.
    lns = coverage_df.sample(n=MC, replace = True)

    # hits[i, k] is 1 when test point i falls inside the prediction ball whose radius
    # is the k-th entry of that row's OOB_quantile. Preallocated instead of growing an
    # array with np.vstack on every iteration (quadratic, and needed a dummy first row).
    hits = np.zeros(shape = (MC, N_LEVELS))
    for i, (_, ln) in enumerate(lns.iterrows()):
        # Predict the response at the new covariate with this row's forest
        new_pred = ln['forest'].predict(new_ts[i].reshape(-1,1))
        hits[i, :] = M.d(new_pred, new_ys[i]) <= ln['OOB_quantile']
    cov[estimation, :] = hits.sum(axis=0) / MC

cov

array([[0.974, 0.942, 0.894],
       [0.978, 0.937, 0.883],
       [0.973, 0.92 , 0.866],
       [0.98 , 0.939, 0.888],
       [0.975, 0.932, 0.886],
       [0.971, 0.925, 0.87 ],
       [0.972, 0.918, 0.857],
       [0.974, 0.917, 0.86 ],
       [0.981, 0.934, 0.884],
       [0.976, 0.935, 0.879],
       [0.963, 0.922, 0.861],
       [0.974, 0.93 , 0.876],
       [0.977, 0.924, 0.874],
       [0.964, 0.919, 0.874],
       [0.977, 0.931, 0.872],
       [0.984, 0.926, 0.869],
       [0.974, 0.924, 0.853],
       [0.98 , 0.921, 0.858],
       [0.98 , 0.917, 0.851],
       [0.979, 0.935, 0.89 ],
       [0.97 , 0.923, 0.863],
       [0.971, 0.926, 0.857],
       [0.97 , 0.928, 0.877],
       [0.976, 0.935, 0.887],
       [0.985, 0.94 , 0.879],
       [0.972, 0.924, 0.871],
       [0.972, 0.932, 0.868],
       [0.977, 0.931, 0.883],
       [0.97 , 0.912, 0.856],
       [0.971, 0.923, 0.875],
       [0.97 , 0.93 , 0.874],
       [0.975, 0.928, 0.874],
       [0.962, 0.901, 0.849],
       [0.