In [2]:
import pandas as pd
import re
import numpy as np
def select_pc_matrix(df_feats: pd.DataFrame, require_pca: bool = True) -> pd.DataFrame:
    """Select the PC columns of ``df_feats`` and return them as a numeric matrix.

    Columns whose name matches ``PC_COL_RE`` (a module-level pattern defined
    outside this function) are kept and coerced to numeric; values that cannot
    be parsed become NaN. Rows in which every PC value is NaN are dropped.
    Rows with only some NaN values are kept, so the result may still contain
    NaN. Diagnostic messages are printed to stdout along the way.

    If no column matches ``PC_COL_RE``, every column of ``df_feats`` is coerced
    to numeric instead and columns that end up entirely NaN are discarded.

    Args:
        df_feats: Input feature table.
        require_pca: Only used when no PC column is found. If true, the
            surviving numeric columns are renamed ``PC1``, ``PC2``, ... in
            their current order; otherwise they keep their original names.

    Returns:
        A DataFrame holding the selected columns converted to numeric dtype.

    Raises:
        ValueError: If no PC column is found and no column holds any numeric
            value, or if every row is dropped because all its PC values are
            NaN (this includes an input with zero rows).
    """
    pc_cols = [c for c in df_feats.columns if PC_COL_RE.match(str(c))]

    print(f"[DEBUG] select_pc_matrix: columns = {list(df_feats.columns)}", flush=True)
    print(f"[DEBUG] Found PC columns: {pc_cols}", flush=True)

    if not pc_cols:
        # Fallback: treat every column that holds at least one number as a feature.
        print("[INFO] No PC columns found, using all numeric columns", flush=True)
        Xnum = df_feats.apply(pd.to_numeric, errors="coerce")
        Xnum = Xnum.loc[:, Xnum.notna().any(axis=0)]
        if Xnum.shape[1] == 0:
            raise ValueError("No numeric PCA columns found in input features.")
        if require_pca:
            Xnum.columns = [f"PC{i}" for i in range(1, Xnum.shape[1] + 1)]
        return Xnum

    # Keep the raw values so conversion failures can be reported next to
    # what was originally in the table.
    X_raw = df_feats[pc_cols]

    print(f"[INFO] Found {len(pc_cols)} PC columns: {pc_cols}", flush=True)
    print(f"[INFO] Initial shape: {X_raw.shape[0]} samples × {X_raw.shape[1]} PCs", flush=True)

    # Show a sample of the first matched column. Use whatever name the pattern
    # actually matched rather than assuming a column literally called "PC1".
    first_pc = pc_cols[0]
    print(f"\n[DEBUG] First 3 rows of {first_pc} (RAW, before conversion):", flush=True)
    print(X_raw[first_pc].head(3).tolist(), flush=True)
    print(f"[DEBUG] Data types before conversion: {X_raw.dtypes.unique()}", flush=True)

    # Convert every PC column to numeric; unparseable values become NaN.
    X = X_raw.apply(pd.to_numeric, errors="coerce")

    # Report values that were present in the raw data but failed to parse.
    # Deriving this from the real conversion avoids false positives on valid
    # numbers such as "-1.5e-3". Limited to the first 3 PCs to keep output short.
    for col in pc_cols[:3]:
        raw_col = X_raw[col]
        failed = raw_col[X[col].isna() & raw_col.notna()]
        if not failed.empty:
            non_numeric = failed.unique().tolist()
            print(f"[WARN] Column {col} has non-numeric values: {non_numeric[:10]}", flush=True)

    # After conversion: report NaN counts per column.
    nan_per_col = X.isna().sum()
    if nan_per_col.sum() > 0:
        print("\n[WARN] NaN counts after conversion:", flush=True)
        for col, count in nan_per_col[nan_per_col > 0].items():
            print(f"  {col}: {count} NaN values", flush=True)

        # Positional boolean mask, so the raw rows printed below are exactly
        # the converted rows printed above even when the index has duplicates.
        nan_row_mask = X.isna().any(axis=1).to_numpy()
        print("\n[DEBUG] First 3 rows with NaN (after conversion):", flush=True)
        print(X[nan_row_mask].head(3), flush=True)

        print("\n[DEBUG] Same rows BEFORE conversion (raw values):", flush=True)
        print(X_raw[nan_row_mask].head(3), flush=True)

    # Drop rows where ALL PC values are NaN; partially valid rows are kept.
    bad_mask = X.isna().all(axis=1)

    if bad_mask.any():
        n_bad = int(bad_mask.sum())
        print(f"[WARN] Removed {n_bad} row(s) with all-NaN PC values", flush=True)
        X = X.loc[~bad_mask].copy()

    if X.shape[0] == 0:
        raise ValueError(
            f"ERROR: All {len(df_feats)} input rows were filtered out as invalid.\n"
            f"This usually means the PCA file contains only headers or non-numeric data."
        )

    print(f"[INFO] Final shape: {X.shape[0]} samples × {X.shape[1]} PCs", flush=True)
    print(f"[INFO] Total NaN in final matrix: {X.isna().sum().sum()}\n", flush=True)

    return X
