### PyPPUR

In [4]:
"""
Comparison of PCA with Projection Pursuit using pyppur on multiple synthetic nonlinear datasets.
Simplified to produce a clear summary of metrics without plotting.
"""
import numpy as np
import pandas as pd
from sklearn.datasets import make_moons, make_s_curve, make_swiss_roll
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from pyppur.projection_pursuit import ProjectionPursuit
from pyppur.utils.metrics import evaluate_embedding


def create_synthetic_data(
    data_type="swiss_roll",
    n_samples=1000,
    noise=0.1,
    n_ambient_dims=50,
    random_state=42,
):
    """Generate a labelled synthetic manifold embedded in a noisy ambient space.

    The manifold coordinates occupy the leading columns of the result; the
    remaining columns are filled with zero-mean Gaussian noise scaled by
    ``noise``.

    Parameters
    ----------
    data_type : {"swiss_roll", "s_curve", "moons"}
        Which scikit-learn generator to use.
    n_samples : int
        Number of points to generate.
    noise : float
        Noise level passed to the generator and also used as the standard
        deviation of the padding dimensions.
    n_ambient_dims : int
        Total number of columns in the returned data matrix. Must be at least
        the number of columns produced by the chosen generator.
    random_state : int or None
        Seed used for both the generator and the padding noise, so repeated
        calls return identical data. Pass ``None`` for fresh randomness.

    Returns
    -------
    X_high : numpy.ndarray
        Array with ``n_ambient_dims`` columns: manifold coordinates followed
        by noise columns.
    labels : numpy.ndarray
        Integer labels in ``{0, 1, 2}`` obtained by cutting the generator's
        second return value at its 33rd and 66th percentiles.

    Raises
    ------
    ValueError
        If ``data_type`` is not supported, or if ``n_ambient_dims`` is smaller
        than the dimensionality of the generated manifold.
    """
    if data_type == "swiss_roll":
        X, colors = make_swiss_roll(
            n_samples=n_samples, noise=noise, random_state=random_state
        )
    elif data_type == "s_curve":
        X, colors = make_s_curve(
            n_samples=n_samples, noise=noise, random_state=random_state
        )
    elif data_type == "moons":
        X, colors = make_moons(
            n_samples=n_samples, noise=noise, random_state=random_state
        )
        # make_moons returns class labels; cast so the percentile binning
        # below treats them like the continuous colours of the other sets.
        colors = colors.astype(float)
    else:
        raise ValueError(f"Unsupported data type: {data_type}")

    n_points, n_manifold_dims = X.shape
    if n_ambient_dims < n_manifold_dims:
        # Fail with a clear message instead of an opaque broadcasting error.
        raise ValueError(
            f"n_ambient_dims ({n_ambient_dims}) must be at least "
            f"{n_manifold_dims} for data type {data_type!r}"
        )

    # Bin into three groups: below the 33rd percentile -> 0, between the 33rd
    # and 66th -> 1, at or above the 66th -> 2. np.digitize uses half-open
    # intervals [lo, hi), matching the original comparisons exactly.
    cut_points = np.percentile(colors, [33, 66])
    labels = np.digitize(colors, cut_points).astype(int)

    # Use a dedicated, seeded generator: drawing the padding from the global
    # NumPy RNG made the data differ between otherwise identical runs.
    rng = np.random.default_rng(random_state)
    padding = noise * rng.standard_normal(
        (n_points, n_ambient_dims - n_manifold_dims)
    )
    X_high = np.hstack([X, padding])

    return X_high, labels


def compare_pca_with_projection_pursuit(data_type="swiss_roll", random_state=42):
    """Compare 2-D embeddings from PCA and two projection-pursuit objectives.

    A synthetic dataset of the requested type is generated and standardized,
    then embedded into two dimensions with PCA, with projection pursuit using
    the ``"distance_distortion"`` objective, and with projection pursuit using
    the ``"reconstruction"`` objective. Each embedding is scored with
    ``evaluate_embedding`` against the standardized data.

    Parameters
    ----------
    data_type : str
        Dataset name forwarded to ``create_synthetic_data``.
    random_state : int or None
        Seed handed to PCA and to both ``ProjectionPursuit`` instances so the
        three methods are seeded consistently.

    Returns
    -------
    dict
        ``"Dataset"`` plus, for each of the prefixes ``PCA``, ``PP_Distance``
        and ``PP_Recon``, the entries ``<prefix>_Trust``,
        ``<prefix>_Distortion`` and ``<prefix>_Silhouette``.
    """
    # Generate and standardize the data.
    X, y = create_synthetic_data(data_type=data_type)
    X_scaled = StandardScaler().fit_transform(X)

    # PCA is seeded like the pursuit runs below; depending on the
    # scikit-learn version and the input shape it may select a randomized
    # solver, which would otherwise make the baseline vary between runs.
    X_pca = PCA(n_components=2, random_state=random_state).fit_transform(X_scaled)

    # Both projection-pursuit runs share every setting except the objective.
    pursuit_settings = {
        "n_components": 2,
        "alpha": 1.5,
        "n_init": 3,
        "random_state": random_state,
    }
    X_pp_dist = ProjectionPursuit(
        objective="distance_distortion", **pursuit_settings
    ).fit_transform(X_scaled)
    X_pp_recon = ProjectionPursuit(
        objective="reconstruction", **pursuit_settings
    ).fit_transform(X_scaled)

    # (summary-key prefix, embedding) in the order the columns are reported.
    embeddings = (
        ("PCA", X_pca),
        ("PP_Distance", X_pp_dist),
        ("PP_Recon", X_pp_recon),
    )
    # (summary-key suffix, key in the dict returned by evaluate_embedding).
    reported_metrics = (
        ("Trust", "trustworthiness"),
        ("Distortion", "distance_distortion"),
        ("Silhouette", "silhouette"),
    )

    summary = {"Dataset": data_type}
    for prefix, embedding in embeddings:
        scores = evaluate_embedding(X_scaled, embedding, labels=y)
        for suffix, metric_key in reported_metrics:
            summary[f"{prefix}_{suffix}"] = scores[metric_key]

    return summary


if __name__ == "__main__":
    # Run the comparison once per dataset, collecting one summary row each.
    results = []
    for dataset in ("swiss_roll", "s_curve", "moons"):
        print(f"Evaluating dataset: {dataset}")
        results.append(compare_pca_with_projection_pursuit(data_type=dataset))

    # Keep the full-precision table; round only the printed view.
    results_df = pd.DataFrame(results)
    print("\nSummary of Results:", results_df.round(4), sep="\n")

Evaluating dataset: swiss_roll
Evaluating dataset: s_curve
Evaluating dataset: moons

Summary of Results:
      Dataset  PCA_Trust  PCA_Distortion  PCA_Silhouette  PP_Distance_Trust  \
0  swiss_roll     0.5917         62.8941         -0.0182             0.5821   
1     s_curve     0.5962         62.8573         -0.0261             0.5868   
2       moons     0.5997         61.7778          0.2364             0.5952   

   PP_Distance_Distortion  PP_Distance_Silhouette  PP_Recon_Trust  \
0                 73.4889                 -0.0133          0.5987   
1                 73.3956                 -0.0222          0.5959   
2                 72.8389                  0.3138          0.5952   

   PP_Recon_Distortion  PP_Recon_Silhouette  
0              73.9279              -0.0211  
1              73.9199              -0.0238  
2              73.1381               0.3070  
