In [2]:
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
import time

from ReduMetrics.metrics.ulse import ulse_score_sklearn
from ReduMetrics.metrics.rta import rta_score
from ReduMetrics.metrics.spearman import spearman_correlation
from ReduMetrics.metrics.k_ncp import kncp_score
from ReduMetrics.metrics.cdc import cdc_score

# Flattened Fashion-MNIST table, read without a header row.
original = pd.read_csv('../data/fashion_mnist_flattened.csv', header=None)

# Precomputed embeddings, one CSV per technique (read with their header row).
pca = pd.read_csv('../data/fashion_mnist_pca.csv')
tsne = pd.read_csv('../data/fashion_mnist_tsne.csv')
umap = pd.read_csv('../data/fashion_mnist_umap.csv')

# The last column is taken as the label; every other column is a feature.
features = original.iloc[:, :-1]
labels = original.iloc[:, -1]

In [3]:
# %% [markdown]
# ## 6.2 · Comparativa PCA / t-SNE / UMAP en Fashion-MNIST (setup)

# %%
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE


# Output folders (created if missing, reused otherwise).
FIGDIR = "figuras"
RESDIR = "resultados"
for _out_dir in (FIGDIR, RESDIR):
    os.makedirs(_out_dir, exist_ok=True)

# Global parameters (tune as needed).
SEED = 42
SUBSAMPLE = 10_000        # size of the stratified subset
K_ULSE = 10               # ULSE
T_RTA = 20_000            # RTA
P_SPEARMAN = 20_000       # Spearman
K_NCP = 3                 # with 10 classes -> (C+2)//4 = 3

# Seeded generator so that any random draw made through it is repeatable.
rng = np.random.default_rng(SEED)

def standardize_columns(X: np.ndarray) -> np.ndarray:
    """Z-score each column of ``X`` (zero mean, unit standard deviation).

    Args:
        X: Array whose columns (axis 0 statistics) are standardized.

    Returns:
        A new array ``(X - mean) / std`` computed column-wise. ``X`` itself
        is not modified.
    """
    col_mean = X.mean(axis=0, keepdims=True)
    col_std = X.std(axis=0, keepdims=True)
    # A constant column has zero deviation; divide it by 1 instead so it
    # maps to 0 rather than producing a division by zero.
    safe_std = np.where(col_std == 0, 1.0, col_std)
    return (X - col_mean) / safe_std

def stratified_subsample(X: np.ndarray, y: np.ndarray, size: int, seed: int):
    """Draw a class-stratified subsample of ``size`` rows.

    Args:
        X: Feature array, indexed along its first axis.
        y: Label array used for stratification; ``len(y)`` is the sample count.
        size: Requested number of rows. Values greater than or equal to
            ``len(y)`` select every row.
        seed: Random state forwarded to ``StratifiedShuffleSplit``.

    Returns:
        Tuple ``(X_sub, y_sub, idx_sub)`` where ``idx_sub`` holds the selected
        row positions in ascending order.

    Raises:
        Whatever ``StratifiedShuffleSplit`` raises when the requested split is
        not feasible (propagated unchanged).
    """
    n_samples = len(y)
    if size >= n_samples:
        # StratifiedShuffleSplit rejects an integer train_size that is not
        # strictly smaller than the number of samples, so "take everything"
        # has to be handled here instead of being clamped and passed through.
        idx_sub = np.arange(n_samples)
        return X[idx_sub], y[idx_sub], idx_sub

    splitter = StratifiedShuffleSplit(n_splits=1, train_size=size, random_state=seed)
    (idx_sub, _), = splitter.split(X, y)
    # Sort so the subsample keeps the original row order.
    idx_sub = np.sort(idx_sub)
    return X[idx_sub], y[idx_sub], idx_sub

def scatter2d(ax, emb2d: np.ndarray, labels: np.ndarray, title: str):
    """Draw a label-coloured scatter of a 2-column embedding on ``ax``.

    Args:
        ax: Axes-like object to draw on.
        emb2d: Array whose columns 0 and 1 give the x and y coordinates.
        labels: Per-point values passed as the scatter colour argument.
        title: Title set on the axes.
    """
    xs, ys = emb2d[:, 0], emb2d[:, 1]
    ax.scatter(xs, ys, c=labels, s=3)
    ax.set_title(title)
    # Tick marks are removed on both axes.
    for clear_ticks in (ax.set_xticks, ax.set_yticks):
        clear_ticks([])

def savefig(path, fig=None, dpi=300):
    """Save a figure to ``path`` with a tight bounding box, then close it.

    Args:
        path: Destination file path.
        fig: Figure to save. When ``None``, the current pyplot figure is
            saved and closed instead.
        dpi: Resolution forwarded to the save call.
    """
    save_opts = {"dpi": dpi, "bbox_inches": "tight"}
    if fig is not None:
        fig.savefig(path, **save_opts)
        plt.close(fig)
        return
    plt.savefig(path, **save_opts)
    plt.close(plt.gcf())


In [4]:
# %% [markdown]
# ### Submuestreo estratificado y preparación de datos

# %%
# Row alignment between tables: all four must have the same number of rows.
n0 = len(original)
assert len(pca) == n0 and len(tsne) == n0 and len(umap) == n0, "Los CSV no están alineados."

# Features and labels of the original (high-dimensional) data.
labels_full = labels.to_numpy().astype(int)
X_high_full = features.to_numpy().astype(float)

# Stratified subsample; idx_sub holds row POSITIONS into the full arrays.
X_high, y_sub, idx_sub = stratified_subsample(X_high_full, labels_full, SUBSAMPLE, SEED)

# Standardize the high-dimensional space (recommended with Euclidean distance).
X_high_std = standardize_columns(X_high)

# Precomputed 2D embeddings, restricted to the same rows.
# Expected columns:
# pca: ['PC1','PC2','Class']; tsne: ['t-SNE1','t-SNE2','Class']; umap: ['UMAP1','UMAP2','Class']
# idx_sub are positions, so rows are selected with .iloc: this stays correct
# even if a frame does not carry the default 0..n-1 index.
X_pca_2d  = pca[['PC1', 'PC2']].iloc[idx_sub].to_numpy().astype(float)
X_tsne_2d = tsne[['t-SNE1', 't-SNE2']].iloc[idx_sub].to_numpy().astype(float)
X_umap_2d = umap[['UMAP1', 'UMAP2']].iloc[idx_sub].to_numpy().astype(float)

# Label cross-check against the original labels.
y_chk = pca['Class'].iloc[idx_sub].to_numpy().astype(int)
assert np.all(y_chk == y_sub), "Las etiquetas no coinciden entre original y PCA."

# Same check for the other embeddings when they carry a 'Class' column, so a
# silently misaligned CSV cannot feed wrong rows into the metrics.
for _emb_name, _emb_df in (("t-SNE", tsne), ("UMAP", umap)):
    if 'Class' in _emb_df.columns:
        _emb_labels = _emb_df['Class'].iloc[idx_sub].to_numpy().astype(int)
        assert np.all(_emb_labels == y_sub), f"Las etiquetas no coinciden entre original y {_emb_name}."


In [5]:
# %% [markdown]
# ### Vista previa: dispersión de PCA / t-SNE / UMAP (subconjunto)

# %%
# One row of three panels, one per technique, all coloured by the same labels.
fig, axs = plt.subplots(1, 3, figsize=(12, 4))
_panels = (
    (X_pca_2d, "PCA (2D)"),
    (X_tsne_2d, "t-SNE (2D)"),
    (X_umap_2d, "UMAP (2D)"),
)
for _panel_ax, (_panel_emb, _panel_title) in zip(axs, _panels):
    scatter2d(_panel_ax, _panel_emb, y_sub, _panel_title)
plt.tight_layout()
savefig(os.path.join(FIGDIR, "6_2_dispersion_triple.png"), fig)


In [6]:
# %% [markdown]
# ### Cálculo de métricas (ULSE, RTA, Spearman, k-NCP, CDC) por técnica

# %%
def compute_all_metrics(X_high_std, X_low_2d, labels, seed=SEED):
    """Compute the five quality metrics for one embedding and time each of them.

    Args:
        X_high_std: High-dimensional data, passed as first argument to every metric.
        X_low_2d: Low-dimensional embedding, passed as second argument to every metric.
        labels: Class labels, passed to the k-NCP and CDC metrics.
        seed: Random state forwarded to the RTA and Spearman metrics.

    Returns:
        Dict with the metric values under ``"ULSE"``, ``"RTA"``, ``"Spearman"``,
        ``"k-NCP"`` and ``"CDC"`` (as floats), the elapsed seconds of each one
        under ``"time_ULSE"``, ``"time_RTA"``, ``"time_Spearman"``,
        ``"time_kNCP"`` and ``"time_CDC"``, and the overall elapsed seconds
        under ``"time_total_metrics"``.
    """
    # NOTE(review): K_NCP is defined in the config cell but is not passed to
    # kncp_score here - confirm that kncp_score picks the intended k by default.
    metric_calls = (
        ("ULSE", "time_ULSE",
         lambda: ulse_score_sklearn(X_high_std, X_low_2d, k=K_ULSE)),
        ("RTA", "time_RTA",
         lambda: rta_score(X_high_std, X_low_2d, T=T_RTA, random_state=seed)),
        ("Spearman", "time_Spearman",
         lambda: spearman_correlation(X_high_std, X_low_2d, P=P_SPEARMAN, random_state=seed)),
        ("k-NCP", "time_kNCP",
         lambda: kncp_score(X_high_std, X_low_2d, labels)),
        ("CDC", "time_CDC",
         lambda: cdc_score(X_high_std, X_low_2d, labels)),
    )

    out = {}
    timings = {}
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() is wall-clock time and can jump or be coarse,
    # which matters for the metrics that finish in a few milliseconds.
    started = time.perf_counter()
    for metric_name, time_key, run_metric in metric_calls:
        tick = time.perf_counter()
        out[metric_name] = float(run_metric())
        timings[time_key] = time.perf_counter() - tick
    timings["time_total_metrics"] = time.perf_counter() - started

    # Metric values first, timings afterwards (same key order as before).
    out.update(timings)
    return out

# Metrics and timings for every technique, all against the same standardized
# high-dimensional data and labels.
_embeddings = {"PCA": X_pca_2d, "t-SNE": X_tsne_2d, "UMAP": X_umap_2d}
results = {
    technique: compute_all_metrics(X_high_std, emb2d, y_sub, seed=SEED)
    for technique, emb2d in _embeddings.items()
}

# One row per technique; split into a scores table and a timings table.
_by_technique = pd.DataFrame(results).T
_metric_cols = ["ULSE", "RTA", "Spearman", "k-NCP", "CDC"]
_time_cols = ["time_ULSE", "time_RTA", "time_Spearman", "time_kNCP", "time_CDC", "time_total_metrics"]
df_metrics = _by_technique[_metric_cols]
df_times = _by_technique[_time_cols]

display(df_metrics)
display(df_times)

df_metrics.to_csv(os.path.join(RESDIR, "6_2_metricas.csv"))
df_times.to_csv(os.path.join(RESDIR, "6_2_tiempos_metricas.csv"))

# Grouped bar chart: one group per metric, one bar per technique.
fig, ax = plt.subplots(figsize=(7, 4))
x = np.arange(len(df_metrics.columns))
width = 0.25
for slot, (technique, scores) in enumerate(df_metrics.iterrows()):
    # Shift each technique by one bar width so the middle one sits on the tick.
    ax.bar(x + slot*width - width, scores.to_numpy(dtype=float), width, label=technique)
ax.set_xticks(x)
ax.set_xticklabels(df_metrics.columns)
ax.set_ylim(0, 1)
ax.set_ylabel("puntuación")
ax.set_title("6.2 · Comparativa de métricas por técnica (subconjunto)")
ax.legend()
savefig(os.path.join(FIGDIR, "6_2_barras_metricas.png"), fig)


Unnamed: 0,ULSE,RTA,Spearman,k-NCP,CDC
PCA,0.04285,0.8052,0.69956,0.966667,0.958762
t-SNE,0.28955,0.70815,0.524849,0.833333,0.851383
UMAP,0.19124,0.68865,0.443458,0.8,0.857444


Unnamed: 0,time_ULSE,time_RTA,time_Spearman,time_kNCP,time_CDC,time_total_metrics
PCA,2.507081,0.447407,0.252228,0.04204,0.039034,3.287789
t-SNE,1.255247,0.411373,0.203184,0.029029,0.026021,1.924855
UMAP,1.084749,0.460689,0.257234,0.038034,0.041037,1.881742


In [7]:
# %% [markdown]
# ### Guardado individual de dispersión (por técnica)

# %%
# One standalone figure per technique: (embedding, panel title, output file name).
_single_panels = (
    (X_pca_2d, "PCA (2D)", "6_2_dispersion_pca.png"),
    (X_tsne_2d, "t-SNE (2D)", "6_2_dispersion_tsne.png"),
    (X_umap_2d, "UMAP (2D)", "6_2_dispersion_umap.png"),
)
for _single_emb, _single_title, _single_file in _single_panels:
    fig, ax = plt.subplots(figsize=(4, 4))
    scatter2d(ax, _single_emb, y_sub, _single_title)
    plt.tight_layout()
    savefig(os.path.join(FIGDIR, _single_file), fig)

print("Figuras y resultados guardados en:", os.path.abspath(FIGDIR), "y", os.path.abspath(RESDIR))


Figuras y resultados guardados en: c:\Users\david\Desktop\David\FIB\ReduMetrics\tests\experiments\figuras y c:\Users\david\Desktop\David\FIB\ReduMetrics\tests\experiments\resultados
