In [None]:
# ---------------- Different bandwidth choices to try out ----------------
# Every entry is computed from `raw_data`, which must already exist in the
# kernel when this cell runs (it is not defined in this cell).
dict_bandwith = {
    # (4 / (3 n))^(1/5) scaled by the standard deviation.
    'normal_mean': np.std(raw_data) * (4 / (3 * len(raw_data))) ** (1 / 5),
    # Same factor, scaled by the interquartile range instead.
    'normal_mean_iqr': (
        (4 / (3 * len(raw_data))) ** (1 / 5)
        * (np.percentile(raw_data, 75) - np.percentile(raw_data, 25))
    ),
    # 1.06 * sigma * n^(-1/5).
    "scott": np.std(raw_data) * 1.06 * (len(raw_data) ** (-1 / 5)),
}
# -------------------------------------------------------------------------

In [None]:
# ejemplo internet
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KernelDensity
from scipy.stats import norm, lognorm, gamma, poisson, chisquare, kstest, pearsonr
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Seed used by every synthetic-data generator so repeated calls return the same sample.
rand_seed = 100

def make_data_normal(data_count=100):
    """Draw a sample from N(0, 1) and return it with its true density.

    Args:
        data_count (int): Number of observations to draw.

    Returns:
        tuple: ``(x, dist)`` where ``x`` is the sample and ``dist(z)`` evaluates
        the N(0, 1) probability density at ``z``.
    """
    np.random.seed(rand_seed)
    x = np.random.normal(0, 1, data_count)

    def dist(z):
        # Use `norm`, which this cell imports by name from scipy.stats; the
        # cell's import list does not bring a `stats` module into scope.
        return norm(0, 1).pdf(z)

    return x, dist

def make_data_binormal(data_count=100):
    """Draw a sample from the mixture 0.3 * N(-1, 2) + 0.7 * N(5, 1).

    Args:
        data_count (int): Total number of observations to draw.

    Returns:
        tuple: ``(x, dist)`` where ``x`` holds the first component's draws
        followed by the second's, and ``dist(z)`` evaluates the mixture density.
    """
    alpha = 0.3
    # Give the second component the remainder so the two sizes always add up
    # to data_count (two independent int() truncations can lose an observation).
    n_first = int(data_count * alpha)
    n_second = data_count - n_first

    np.random.seed(rand_seed)
    x = np.concatenate([
        np.random.normal(-1, 2, n_first),
        np.random.normal(5, 1, n_second),
    ])

    def dist(z):
        # `norm` is imported by name in this cell; no `stats` module is.
        return alpha * norm(-1, 2).pdf(z) + (1 - alpha) * norm(5, 1).pdf(z)

    return x, dist

def make_data_exp(data_count=100):
    """Draw a sample from a mixture of two unit-scale exponentials.

    The first component (weight 0.3) starts at 0 and the second (weight 0.7)
    is the same distribution shifted right by 1.

    Args:
        data_count (int): Total number of observations to draw.

    Returns:
        tuple: ``(x, dist)`` where ``x`` is the sample and ``dist(z)`` evaluates
        the mixture density.
    """
    # Local import: `expon` is not among the names imported at the top of the cell.
    from scipy.stats import expon

    alpha = 0.3
    # The second component takes the remainder so the sizes sum to data_count.
    n_first = int(data_count * alpha)
    n_second = data_count - n_first

    np.random.seed(rand_seed)
    x = np.concatenate([
        np.random.exponential(1, n_first),
        np.random.exponential(1, n_second) + 1,
    ])

    def dist(z):
        # expon(loc): loc=0 is the unshifted component, loc=1 the shifted one.
        return alpha * expon(0).pdf(z) + (1 - alpha) * expon(1).pdf(z)

    return x, dist

def make_data_uniform(data_count=100):
    """Draw a sample from the mixture 0.3 * U(-1, 1) + 0.7 * U(0, 1).

    Args:
        data_count (int): Total number of observations to draw.

    Returns:
        tuple: ``(x, dist)`` where ``x`` is the sample and ``dist(z)`` evaluates
        the mixture density.
    """
    # Local import: `uniform` is not among the names imported at the top of the cell.
    from scipy.stats import uniform

    alpha = 0.3
    # The second component takes the remainder so the sizes sum to data_count.
    n_first = int(data_count * alpha)
    n_second = data_count - n_first

    np.random.seed(rand_seed)
    x = np.concatenate([
        np.random.uniform(-1, 1, n_first),
        np.random.uniform(0, 1, n_second),
    ])

    def dist(z):
        # scipy's uniform is parameterised as (loc, scale) and covers
        # [loc, loc + scale]; U(-1, 1) therefore needs scale=2, not 1.
        return (alpha * uniform(loc=-1, scale=2).pdf(z)
                + (1 - alpha) * uniform(loc=0, scale=1).pdf(z))

    return x, dist

# Build one sample and one true-density callable per synthetic distribution.
x_norm, dist_norm = make_data_normal()
x_binorm, dist_binorm = make_data_binormal()
x_exp, dist_exp = make_data_exp()
x_uni, dist_uni = make_data_uniform()

# One panel per distribution, showing the true density on a common [-8, 8] axis.
fig, ax = plt.subplots(1, 4, figsize=(12, 3))
names = ['Normal', 'Bi-normal', 'Exponential', 'Bi-Uniform']
for i, d in enumerate([dist_norm, dist_binorm, dist_exp, dist_uni]):
    # Evaluation grid; note that it rebinds the global name `x` on every pass.
    x = np.linspace(-8, 8, 100)
    ax[i].fill(x, d(x), color='C0', alpha=0.5)
    ax[i].set_ylim(0, 1)
    ax[i].set_xlim(-8, 8)
    ax[i].set_xlabel('x')
    ax[i].set_ylabel('p(x)')
    ax[i].set_title(names[i])
fig.tight_layout()
fig.show()

def kernel(k: str):
    """Return a kernel function by name.

    Ref: https://en.wikipedia.org/wiki/Kernel_(statistics)

    Args:
        k (str): Kernel name. One of 'gaussian', 'epanechnikov', 'cosine', 'linear'.

    Returns:
        callable: ``K(u)``. It accepts a scalar or a NumPy array; the three
        compact-support kernels evaluate to 0 wherever ``|u| > 1``.

    Raises:
        ValueError: If ``k`` is not one of the supported names.
    """
    if k not in ('gaussian', 'epanechnikov', 'cosine', 'linear'):
        raise ValueError('Unknown kernel.')

    def bounded(f):
        def _f(x):
            # np.where applies the support restriction element-wise, so arrays
            # work as well as scalars; indexing with [()] turns the 0-d result
            # of a scalar call back into a scalar and leaves arrays untouched.
            return np.where(np.abs(x) <= 1, f(x), 0.0)[()]
        return _f

    kernels = {
        'gaussian': lambda u: 1 / np.sqrt(2 * np.pi) * np.exp(-1 / 2 * u * u),
        'epanechnikov': bounded(lambda u: (3 / 4 * (1 - u * u))),
        'cosine': bounded(lambda u: np.pi / 4 * np.cos(np.pi / 2 * u)),
        'linear': bounded(lambda u: 1 - np.abs(u)),
    }
    return kernels[k]
    
def bw_scott(data: np.ndarray):
    """Normal-reference (Scott) bandwidth for a 1-D kernel density estimate.

    Computes ``1.06 * sigma * n^(-1/5)`` with ``sigma`` the sample standard
    deviation (ddof=1). The previous constant pair, ``3.49 * sigma * n^(-1/3)``,
    is Scott's rule for the bin width of a histogram, not a KDE bandwidth.

    Args:
        data (np.ndarray): Sample.

    Returns:
        float: Bandwidth.
    """
    std_dev = np.std(data, axis=0, ddof=1)
    n = len(data)
    return 1.06 * std_dev * n ** (-1 / 5)

def bw_silverman(data: np.ndarray):
    """Silverman's rule-of-thumb bandwidth: ``0.9 * A * n^(-1/5)``.

    ``A`` is the smaller of the sample standard deviation (ddof=1) and
    IQR / 1.349; when the IQR is zero the standard deviation is used alone.

    Args:
        data (np.ndarray): Sample.

    Returns:
        float: Bandwidth.
    """
    # np.percentile interpolates linearly by default, which matches the default
    # of the legacy scipy.stats.scoreatpercentile and needs no `stats` import.
    q75, q25 = np.percentile(data, [75, 25])
    iqr = (q75 - q25) / 1.349
    std_dev = np.std(data, axis=0, ddof=1)
    sigma = np.minimum(std_dev, iqr) if iqr > 0 else std_dev
    n = len(data)
    return 0.9 * sigma * n ** (-0.2)

def bw_mlcv(data: np.ndarray, k):
    """Bandwidth by maximum-likelihood (leave-one-out) cross-validation.

    Ref: https://rdrr.io/cran/kedd/src/R/MLCV.R

    For a candidate ``h`` the density at each observation is estimated from all
    the *other* observations, and the mean log of those densities is maximised.
    Observations whose leave-one-out density is zero are left out of the mean.

    Args:
        data (np.ndarray): 1-D sample with at least two observations.
        k (callable): Kernel function ``K(u)``; scalar-only kernels are accepted.

    Returns:
        float: The optimised bandwidth, or ``bw_scott(data)`` when the optimiser
        ends at a non-finite, non-positive or larger-than-10 value.

    Raises:
        ValueError: If fewer than two observations are given.
    """
    # Local import: the cell's import list does not bring `optimize` into scope.
    from scipy import optimize

    data = np.asarray(data, dtype=float)
    n = len(data)
    if n < 2:
        raise ValueError('MLCV needs at least two observations.')

    # np.vectorize lets kernels written for scalars be applied to a matrix.
    k_vec = np.vectorize(k, otypes=[float])
    # Pairwise differences between observations. The diagonal (each point
    # against itself) is masked out: that is the "leave one out" part, and it
    # only makes sense when the evaluation points are the observations.
    diffs = data[:, None] - data[None, :]
    off_diag = ~np.eye(n, dtype=bool)

    def mlcv(h):
        h = float(np.ravel(h)[0])
        if h <= 0:
            # A bandwidth must be positive; steer the optimiser away.
            return np.inf
        fj = np.where(off_diag, k_vec(diffs / h), 0.0).sum(axis=1) / ((n - 1) * h)
        positive = fj[fj > 0]
        if positive.size == 0:
            return np.inf
        return -np.mean(np.log(positive))

    result = optimize.minimize(mlcv, 1)
    h_opt = result.x[0]
    if not np.isfinite(h_opt) or h_opt <= 0 or h_opt > 10:
        return bw_scott(data)
    return h_opt

def kde(data, k=None, h=None, x=None):
    """Kernel Density Estimation.

    Args:
        data (np.ndarray): 1-D sample.
        k (function, optional): Kernel function ``K(u)``; scalar-only kernels
            are accepted. Defaults to the Gaussian kernel.
        h (float, optional): Bandwidth. Defaults to ``bw_silverman(data)``.
        x (np.ndarray, optional): 1-D grid of evaluation points. Defaults to
            1000 evenly spaced points between the sample minimum and maximum.

    Returns:
        np.ndarray: Float array with the density estimate at each grid point.
    """
    data = np.asarray(data, dtype=float)
    if x is None:
        x = np.linspace(np.min(data), np.max(data), 1000)
    else:
        # Force a float grid: an integer grid would otherwise produce an
        # integer result array and truncate every density to 0.
        x = np.asarray(x, dtype=float)
    if h is None:
        h = bw_silverman(data)
    if k is None:
        k = kernel('gaussian')

    n = len(data)
    # np.vectorize lets kernels written for scalars be applied to the whole
    # (grid point, observation) matrix at once.
    k_vec = np.vectorize(k, otypes=[float])
    u = (x[:, None] - data[None, :]) / h
    return k_vec(u).sum(axis=1) / (n * h)

# Experiment grid: every (label, object) pair below is unpacked by run_kde.

# Synthetic data sets: display label and the generator function that builds them.
data = [
    ('Normal', make_data_normal),
    ('Bimodal (Normal)', make_data_binormal),
    ('Bimodal (Exp)', make_data_exp),
    ('Bimodal (Uniform)', make_data_uniform)
]
# Kernels: display label and the kernel function returned by kernel().
kernels = [
    ('Gaussian', kernel('gaussian')),
    ('Epanechnikov', kernel('epanechnikov')),
    ('Cosine', kernel('cosine')),
    ('Linear', kernel('linear'))
]
# Bandwidth selectors: display label and the selector function.
bw_algorithms = [
    ('Scott', bw_scott),
    ('Silverman', bw_silverman),
    ('MLCV', bw_mlcv),
]
# Result rows (one dict each); run_kde appends to this list.
mses = []

def run_kde(ax, data, kernel):
    """Plot one data set with one kernel under every bandwidth selector.

    Draws the true density, a rug and a histogram of the sample, then one KDE
    curve per entry of the global ``bw_algorithms``. For each curve a result
    row (data, kernel, selector, bandwidth, scaled MSE against the true
    density) is appended to the global ``mses`` list.

    Args:
        ax: Matplotlib axes to draw on.
        data (tuple): ``(label, generator)``; ``generator()`` must return
            ``(sample, true_density_callable)``.
        kernel (tuple): ``(label, kernel_function)``.
    """
    x, dist = data[1]()
    # Widen the plotting range by 5% of each end point. The factor depends on
    # the sign so the range always grows: multiplying a positive minimum (or a
    # negative maximum) by 1.05 would move it inwards and cut off the sample.
    lo, hi = np.min(x), np.max(x)
    x_plot = np.linspace(
        lo * (1.05 if lo < 0 else 0.95),
        hi * (1.05 if hi > 0 else 0.95),
        1000,
    )
    true_density = dist(x_plot)

    ax.grid(True)
    ax.fill_between(x_plot, true_density, fc='silver', alpha=0.5)
    ax.plot(x, np.full_like(x, -0.02), '|k', markeredgewidth=1)
    ax.hist(x, density=True, alpha=0.2, bins=20, rwidth=0.9)
    for bw in bw_algorithms:
        # MLCV is the only selector that also needs the kernel function.
        if bw[0] == 'MLCV':
            h = bw[1](x, kernel[1])
        else:
            h = bw[1](x)
        x_kde = kde(x, kernel[1], h=h, x=x_plot)
        mse = np.mean((true_density - x_kde) ** 2)
        mses.append({
            'data': data[0],
            'kernel': kernel[0],
            'bw_algorithm': bw[0],
            'h': round(h, 5),
            'mse': round(mse * 1000, 5), # scaled to make differences more noticeable
        })
        # Raw string for the mathtext prefix: '\m' is not a valid escape sequence.
        label = r'$h_{\mathrm{' + bw[0] + '}} = ' + str(round(h, 5)) + '$'
        ax.plot(x_plot, x_kde, linewidth=1, label=label)
    ax.legend(loc='best', fontsize='small')
    ax.set_title(f'{data[0]}, {kernel[0]}')

# One row of panels per data set, one column per kernel.
fig, axs = plt.subplots(len(data), len(kernels), figsize=(16, 12))

for i, d in enumerate(data):
    for j, k in enumerate(kernels):
        run_kde(axs[i, j], d, k)
    # After all kernels ran for this data set, add one summary row per
    # bandwidth selector (kernel '-') averaging h and mse over the kernels.
    for bw in bw_algorithms:
        avg_h = np.mean([m['h'] for m in mses if m['data'] == d[0] and m['bw_algorithm'] == bw[0]])
        avg_mse = np.mean([m['mse'] for m in mses if m['data'] == d[0] and m['bw_algorithm'] == bw[0]])
        mses.append({
            'data': d[0],
            'kernel': '-',
            'bw_algorithm': bw[0],
            'h': round(avg_h, 5),
            'mse': round(avg_mse, 5),
        })

# Show the figure, then write it and the result table to the working directory.
fig.tight_layout()
fig.show()
fig.savefig('eval.pdf')
pd.DataFrame(mses).to_csv('eval.csv', index=False)


In [None]:
# Keep only the units of interest. This filter does not depend on the loop
# variables, so it is computed once instead of on every iteration.
tl_u = tl[tl["UNIDAD"].isin(["ICU", "OR", "SDU_WARD"])]

for i in range(1,4):
    for grd in range(1,9):
        for unidad in ["ICU", "SDU_WARD"]:
            # ---- 1. Filter the dataset down to one unit / GRD / hospital
            v1 = tl_u[(tl_u["UNIDAD"] == unidad) & (tl_u["MS_GRD"] == grd) & (tl_u["HOSPITAL"] == f"Hospital_{i}")]
            # v1 = tl_u[(tl_u["UNIDAD"] == unidad) & (tl_u["MS_GRD"] == grd)]
            if v1.empty:
                # Nothing to bin or fit; np.max below would raise on an empty array.
                continue

            # ---- 2. Count how often each LOS value occurs, ordered by LOS.
            # sort_index + reset_index(name=...) gives the same two columns
            # regardless of how the installed pandas names value_counts output.
            vector = (
                v1["LOS"].value_counts()
                .sort_index()
                .rename_axis("LOS")
                .reset_index(name="count")
            )

            # ---- 3. Rebuild the raw observations from LOS values and their frequencies
            los = np.array(vector["LOS"])
            ocurrencias = np.array(vector["count"])
            raw_data = np.repeat(los, ocurrencias)

            # ---- 4. Define the 12-hour bins
            bin_width = 12
            max_los = np.max(raw_data)
            edges = np.arange(0, max_los + bin_width, bin_width)
            midpoints = (edges[:-1] + edges[1:]) / 2  # midpoint of (a, b] is (a + b) / 2

            # ---- 5. Count the data per bin (0 - 12], (12 - 24], ...
            # Bins with no data get a 0. The comprehension index has its own
            # name so it cannot be confused with the hospital index `i`.
            bin_indices = np.digitize(raw_data, edges, right=True)
            hist = np.array([(bin_indices == bin_idx).sum() for bin_idx in range(1, len(edges))])

#-------------------------------- Everything up to here is fine ----------------------------------------

            # ---- 6. Fit a log-normal distribution to the raw data (location fixed at 0)
            shape, loc, scale = stats.lognorm.fit(raw_data, floc=0)



            # # ---- Optimización
            # def chi2_objective(params, hist, edges, total_count):
            #     shape, loc, scale = params
            #     # Ensure parameters are valid
            #     if shape <= 0 or scale <= 0:
            #         return np.inf
            #     cdf_low = stats.lognorm.cdf(edges[:-1], shape, loc, scale)
            #     cdf_high = stats.lognorm.cdf(edges[1:], shape, loc, scale)
            #     expected_probs = cdf_high - cdf_low
            #     expected_counts = expected_probs * total_count

            #     # Mask for bins with enough expected values
            #     mask = expected_counts >= 5
            #     if not np.any(mask):
            #         return np.inf
            #     obs = hist[mask]
            #     exp = expected_counts[mask]
                
            #     # Normalize expected to match sum of obs
            #     exp *= obs.sum() / exp.sum()

            #     chi2 = np.sum((obs - exp)**2 / exp)
            #     return chi2
            # initial_shape, initial_loc, initial_scale = stats.lognorm.fit(raw_data, floc=0)
            # # Minimize the chi-square
            # result = minimize(
            #     chi2_objective,
            #     x0=[initial_shape, initial_loc, initial_scale],
            #     args=(hist, edges, len(raw_data)),
            #     bounds=[(1e-5, None), (0, None), (1e-5, None)],
            #     method='L-BFGS-B'
            # )
            # # Extract optimized parameters
            # opt_shape, opt_loc, opt_scale = result.x
            # # print(f"Optimized shape={opt_shape:.4f}, loc={opt_loc:.4f}, scale={opt_scale:.2f}")
            # shape = opt_shape
            # loc = opt_loc
            # scale = opt_scale


            # # ---- 7. Compute expected counts per bin using log-normal CDF
            # cdf_low = stats.lognorm.cdf(edges[:-1], shape, loc, scale)
            # cdf_high = stats.lognorm.cdf(edges[1:], shape, loc, scale)
            # expected_probs = cdf_high - cdf_low
            # # Acumulo todo lo que queda de la cola en el ultimo bin
            # # expected_probs[-1] = 1 - (sum(expected_probs) - expected_probs[-1])
            # expected_counts = expected_probs * len(raw_data)


            # # ---- 9. Chi-square test (only where expected counts ≥ 5)
            # mask = expected_counts >= 5
            # obs = hist[mask]
            # exp = expected_counts[mask]

            # # display(obs)
            # # display(exp)

            # # Normalize expected counts to match the total of observed
            # if exp.sum() > 0:
            #     exp *= obs.sum() / exp.sum()

            # # Chi-square test
            # if np.any(mask):
            #     chi2, p = stats.chisquare(f_obs=obs, f_exp=exp)
            #     if p > 0.05:
            #         print(f"Chi² = {chi2:.2f}, p-value = {p:.10f}, Unidad={unidad}, Hospital={i}, GRD={grd}")
                    
            #         # ---- 8. Plot observed vs. expected
            #         plt.figure(figsize=(12, 6))
            #         plt.bar(midpoints, hist, width=bin_width - 2, alpha=0.6, label='Observed (12h bins)')
            #         plt.plot(midpoints, expected_counts, 'r--o', label='Log-normal expected')
            #         plt.xlabel('Length of Stay (hours)')
            #         plt.ylabel('Frequency')
            #         plt.title(f'LOS Histogram vs Log-normal Fit\nUnidad={unidad}, Hospital={i}, GRD={grd}')
            #         plt.grid(True)

            #         # Curva continua
            #         x_vals = np.linspace(1, max_los, 500)
            #         pdf_vals = stats.lognorm.pdf(x_vals, s=shape, loc=loc, scale=scale)
            #         pdf_scaled = pdf_vals * len(raw_data) * bin_width
            #         plt.plot(x_vals, pdf_scaled, 'g-', label='Log-normal PDF (smoothed)')

            #         plt.legend()
            #         plt.show()
            #         break

                    
            # else:
            #     print("Chi-square test skipped: not enough expected counts ≥ 5.")


In [None]:
# Table for the OR unit: per hospital and MS_GRD, the share of stays with
# LOS = 12 and with LOS = 24.

tabla = {
    1: [["MS_GRD", "LOS = 12", "LOS = 24"]],
    2: [["MS_GRD", "LOS = 12", "LOS = 24"]],
    3: [["MS_GRD", "LOS = 12", "LOS = 24"]]
}

# Keep only the units of interest; computed once because it does not depend
# on the loop variables.
tl_u = tl[tl["UNIDAD"].isin(["ICU", "OR", "SDU_WARD"])]

for i in range(1,4):
    for grd in range(1,9):
        for unidad in ["OR"]:
            # ---- 1. Filter the dataset down to one unit / GRD / hospital
            v1 = tl_u[(tl_u["UNIDAD"] == unidad) & (tl_u["MS_GRD"] == grd) & (tl_u["HOSPITAL"] == f"Hospital_{i}")]
            # v1 = tl_u[(tl_u["UNIDAD"] == unidad) & (tl_u["MS_GRD"] == grd)]

            # ---- 2. LOS value counts and their relative frequencies.
            # reset_index(name="count") names the count column explicitly, so
            # this does not depend on the pandas version's default naming.
            vector = (
                v1["LOS"].value_counts()
                .sort_index()
                .rename_axis("LOS")
                .reset_index(name="count")
            )
            vector["%"] = vector["count"] / vector["count"].sum()
            vector1 = vector[["LOS", "%"]].reset_index()

            # Look the two LOS values up directly; a value that never occurs
            # (or an empty group) yields 0.
            share_by_los = vector1.set_index("LOS")["%"]
            los_12 = share_by_los.get(12, 0)
            los_24 = share_by_los.get(24, 0)

            tabla[i].append([grd, round(los_12,5), round(los_24,5)])

    display(tabla[i])



In [None]:
# Convert each hospital's table to a LaTeX table
for key, h in tabla.items():

    # h[0] is the header row; every following entry becomes one LaTeX row.
    # Building the rows in a loop avoids hard-coding exactly eight indices.
    filas = "\n".join(
        f"        {fila[0]} & {fila[1]} & {fila[2]} \\\\" for fila in h[1:]
    )

    # '\\_' writes the LaTeX-escaped underscore; a bare '\_' is an invalid
    # Python escape sequence.
    texto = f"""
\\begin{{table}}[H]
    \\centering
    \\begin{{tabular}}{{ccc}}
        \\toprule
        MS\\_GRD & LOS = 12 & LOS = 24 \\\\
        \\midrule
{filas}
        \\bottomrule
    \\end{{tabular}}
    \\caption{{Probabilidad LOS en OR para Hospital: {key}}}
    \\label{{tab:Probabilidad LOS en OR para Hospital: {key}}}
\\end{{table}}
    """

    print(texto)

In [None]:
# Second attempt (turned out badly)

# for i in range(1,4):
# for grd in range(1,9):
# for unidad in ["ICU", "SDU_WARD"]:

unidad = "ICU"  # change the unit here
grd = 1

# ---- 1. Filter the data to get the LOS values of one unit / GRD
tl_u = tl[tl["UNIDAD"].isin(["ICU", "OR", "SDU_WARD"])]
# v1 = tl_u[(tl_u["UNIDAD"] == unidad) & (tl_u["MS_GRD"] == grd) & (tl_u["HOSPITAL"] == f"Hospital_{i}")]
v1 = tl_u[(tl_u["UNIDAD"] == unidad) & (tl_u["MS_GRD"] == grd)]

# ---- 2. Count how often each LOS value occurs, ordered by LOS.
# reset_index(name=...) names the columns explicitly, independent of the
# pandas version's default naming of value_counts output.
vector = (
    v1["LOS"].value_counts()
    .sort_index()
    .rename_axis("LOS")
    .reset_index(name="count")
)

# ---- 3. Expand to one entry per observation (histogram input)
los = np.array(vector["LOS"])
ocurrencias = np.array(vector["count"])
raw_data = np.repeat(los, ocurrencias) # repeats each LOS by its number of occurrences

# ---- 4. Define the 12-hour bins (used to fit the curve)
bin_width = 12
max_los = np.max(raw_data) # largest LOS
edges = np.arange(0, max_los + bin_width, bin_width) # bin edges
midpoints = (edges[:-1] + edges[1:]) / 2  # bin midpoints

# ---- 5. Count the data per bin (0 - 12], (12 - 24], ...; empty bins get a 0
bin_indices = np.digitize(raw_data, edges, right=True)
hist = np.array([(bin_indices == bin_idx).sum() for bin_idx in range(1, len(edges))])
bin_edges = edges
bin_upper_bounds = bin_edges[1:]

# ---- 6. Fit a log-normal distribution to the raw data (location fixed at 0)
shape, loc, scale = stats.lognorm.fit(raw_data, floc = 0)

# # Plot of the smooth log-normal curve
# x = np.linspace(min(bin_upper_bounds), max(bin_upper_bounds), 1000)
# y = stats.lognorm.pdf(x, shape, loc, scale) * len(raw_data) * bin_width
# plt.hist(raw_data, bins=bin_edges, alpha=0.6, color='g', label="Histogram")
# plt.plot(x, y, label="Fitted Log-normal", color='r')
# plt.xlabel('Value')
# plt.ylabel('Density')
# plt.legend()
# plt.show()

# ---- 7. Expected count per bin under the fitted distribution.
# The probabilities come from the CDF at the same `edges` that define `hist`,
# so expected and observed always have one entry per bin. Deriving the number
# of intervals from floor(max / 12) instead gives one bin too few whenever the
# largest LOS is not a multiple of 12.
N = len(raw_data)
esperado = N * np.diff(stats.lognorm.cdf(edges, shape, loc, scale))
observado = hist

# ---- 8. Chi-square test, restricted to bins with an expected count >= 5
mask = esperado >= 5

obs = observado[mask]
exp = esperado[mask]  # boolean indexing copies, so `esperado` is left untouched

if exp.size == 0:
    # Without any qualifying bin the rescaling below would divide by zero.
    print("Chi-square test skipped: not enough expected counts ≥ 5.")
else:
    # Rescale so both vectors have the same total, as the test requires.
    exp *= obs.sum() / exp.sum()

    # NOTE(review): shape and scale were estimated from the data; consider
    # passing ddof=2 so the degrees of freedom reflect that - confirm.
    chi2, p = stats.chisquare(f_obs=obs, f_exp=exp)
    print(f"chi2 = {chi2}, p = {p}")

In [None]:
# Small hand-written example: LOS values in steps of 12 and how often each occurs.
los = [12, 24, 36, 48, 60, 72]
count = [5, 12, 8, 6, 4, 2]

def continuous(los, count):
    """Spread binned counts onto a grid twice as fine, in steps of 6.

    Each input count is split into a quarter, a half and a quarter. The half
    stays at the centre of its bin; the right quarter of one bin and the left
    quarter of the next are added together at the point they share. The total
    count is preserved.

    Args:
        los (list): Bin positions. Only the length is used; the output grid is
            always 6, 12, 18, ... (assumes the input bins are 12 apart and
            start at 12 - TODO confirm against the caller).
        count (list): Number of occurrences per bin.

    Returns:
        tuple: ``(new_los, new_count)``, two lists of equal length
        (``2 * len(los) + 1`` entries, or empty for empty input).
    """
    cuartos = [round(count[i] / 4, 2) for i in range(len(los))]

    uniones = []
    for idx, cuarto in enumerate(cuartos):
        # Left edge of the bin: its own quarter plus, except for the first
        # bin, the right-hand quarter of the previous bin.
        uniones.append(cuarto if idx == 0 else cuarto + cuartos[idx - 1])
        # Centre of the bin keeps half of the count.
        uniones.append(cuarto * 2)
    if cuartos:
        # Right edge of the last bin. Handled outside the loop so that a
        # single-bin input also gets it (first and last bin coincide there).
        uniones.append(cuartos[-1])

    new_los = [6 * (pos + 1) for pos in range(len(uniones))]

    return new_los, uniones

# Run the example. `uniones` receives the new 6-step LOS grid (the first return
# value) and `count` is overwritten with the redistributed counts.
uniones, count = continuous(los, count)

print(uniones)
print(count)




In [None]:
# Parameters (to become arguments once this is wrapped in a function)
plot = True
bandwidth = 0.82
###################### Data to analyse ######################
tl_u = tl[tl["UNIDAD"].isin(["ICU", "OR", "SDU_WARD"])]
v1 = tl_u[(tl_u["UNIDAD"] == "SDU_WARD") & (tl_u["MS_GRD"] == 1) & (tl_u["HOSPITAL"] == f"Hospital_{1}")]
vector = v1["LOS"].value_counts().reset_index().sort_values(by="LOS")
vector.columns = ["LOS", "count"]
los = np.array(vector["LOS"])/12  # LOS expressed in units of 12
ocurrencias = np.array(vector["count"])
raw_data = np.repeat(los, ocurrencias)
##############################################################

# Kernel density estimator fitted to the expanded observations
# NOTE: this rebinds the global name `kde` to the fitted estimator.
kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(raw_data[:, None])
##############################################################

if plot:
    ###################### Plot ##################################
    # Histogram with unit-wide bins centred on the integers. Splitting
    # [min, max] into (max - min + 1) bins makes every bin narrower than 1,
    # which inflates the density heights relative to the KDE curve. min/max
    # are taken explicitly instead of relying on raw_data being sorted.
    # (assumes los/12 is integer-valued - TODO confirm)
    hist_edges = np.arange(np.floor(np.min(raw_data)) - 0.5, np.ceil(np.max(raw_data)) + 1.5, 1.0)
    plt.figure(figsize=(10, 3)) # figure size
    plt.hist(raw_data, bins=hist_edges, density=True, alpha=0.6, color='g', edgecolor='black', label='Histogram')

    # pdf of the KDE fitted to the data
    x_vals = np.linspace(min(raw_data), max(raw_data), 1000)
    log_dens = kde.score_samples(x_vals[:, None])
    kde_pdf = np.exp(log_dens)
    plt.plot(x_vals, kde_pdf, color='black', label=f'KDE (Bandwidth = {round(bandwidth,2)})')
    # KDE evaluated at the integer points between min and max
    x_vals_puntos = np.linspace(int(min(raw_data)), int(max(raw_data)), int(max(raw_data)) - int(min(raw_data)) + 1)
    log_dens_puntos = kde.score_samples(x_vals_puntos[:, None])
    kde_pdf_puntos = np.exp(log_dens_puntos)
    plt.scatter(x_vals_puntos, kde_pdf_puntos, color='red', label='KDE intersecciones')


    # Labels and legend
    plt.xlabel('LOS')
    plt.ylabel('Densidad')
    plt.title('Histograma y KDE discreto de LOS')
    plt.legend()
    plt.grid(True)
    plt.show()
    ##############################################################

# `calculate_kde_metrics` is not defined in this cell; it must already exist in the kernel.
metricas, _ = calculate_kde_metrics(los, ocurrencias, kde)
display(pd.DataFrame(metricas))

In [None]:
# Points where data and KDE are compared: the integers 1, 2, ..., max(los).
# The grid is built with arange starting at 1 so it always lines up with
# `ocurrencias_con_cero` below (which also runs from 1 to max). The earlier
# linspace(min, max, max) only hit the integers when min(los) was exactly 1.
x_vals = np.arange(1, int(max(los)) + 1, dtype=float)
log_dens = kde.score_samples(x_vals[:, None])
kde_pdf = np.exp(log_dens)

# Store each LOS and its number of occurrences in a dictionary
# (keys are truncated to int; assumes `los` is integer-valued - TODO confirm)
dict_temporal = {}
for i in range(len(los)):
    dict_temporal[int(los[i])] = ocurrencias[i]

# Rebuild the occurrences, inserting a zero for every interval without data
ocurrencias_con_cero = []
for i in range(1, int(max(los)) + 1):
    ocurrencias_con_cero.append(dict_temporal.get(i, 0))
ocurrencias_con_cero = np.array(ocurrencias_con_cero)

# Compute the metrics for the KDE (the only estimator that will be used)
def get_metrics(y_true, y_pred):
    """Compare observed counts with predicted counts using several metrics.

    Args:
        y_true (np.ndarray): Observed counts.
        y_pred (np.ndarray): Predicted counts, same length as ``y_true``.

    Returns:
        dict: One entry per metric ('Chi2', 'KS', 'CC', 'R2', 'RMSE', 'MAE'),
        each a dict with a rounded "Value" and a "p-value" ("NA" where the
        metric has none).
    """
    # Local import: `ks_2samp` is not among the names the notebook's visible
    # import line takes from scipy.stats.
    from scipy.stats import ks_2samp

    # The chi-square test needs both vectors to have the same total.
    prediccion = y_pred * (y_true.sum() / y_pred.sum())
    chi2 = chisquare(f_obs=y_true, f_exp=prediccion)
    ks_stat = ks_2samp(y_true, y_pred)
    cc = pearsonr(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    # Take the root explicitly: the `squared=False` argument of
    # mean_squared_error is not accepted by recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)

    return {
        'Chi2': {"Value": round(chi2.statistic, 3), "p-value": round(chi2.pvalue, 3)},
        'KS': {"Value": round(ks_stat.statistic, 3), "p-value": round(ks_stat.pvalue, 3)},
        'CC': {"Value": round(cc.statistic, 3), "p-value": round(cc.pvalue, 3)},
        'R2': {"Value": round(r2, 3), "p-value": "NA"},
        'RMSE': {"Value": round(rmse, 3), "p-value": "NA"},
        'MAE': {"Value": round(mae, 3), "p-value": "NA"}
    }

# Call the function with counts, not probabilities: the KDE density is
# multiplied by the total number of occurrences before it is passed in.
metrics = get_metrics(ocurrencias_con_cero, kde_pdf*ocurrencias.sum())

# Show the metrics as a table
display(pd.DataFrame(metrics))