In [None]:
# Import libraries
import sys
sys.path.append('./src')

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

from pca_analysis import PCAAnalyzer
from feature_selection import FeatureSelector, compare_feature_selectors
from manifold_learning import ManifoldReducer

# Reaching this line means every import above (including the local modules
# made importable via sys.path) succeeded without raising.
print("✅ All modules imported successfully!")

## Step 1: Generate High-Dimensional Data

In [None]:
# Generate synthetic dataset (e.g., sensor data).
# Column budget requested below: 15 informative + 20 redundant + 5 repeated = 40;
# per the scikit-learn docs the remaining columns are filled with random noise.
X, y = make_classification(
    n_samples=500,
    n_features=50,  # High-dimensional
    n_informative=15,
    n_redundant=20,
    n_repeated=5,
    n_clusters_per_class=3,
    random_state=42  # fixed seed so a re-run yields the same data
)

# Name the columns from the generated matrix itself rather than repeating the
# literal feature count, so changing n_features above cannot produce a
# column-count mismatch when the DataFrame is built.
feature_names = [f"sensor_{i+1}" for i in range(X.shape[1])]
df = pd.DataFrame(X, columns=feature_names)

print(f"Dataset shape: {X.shape}")
print(f"Features: {X.shape[1]}")
print(f"Samples: {X.shape[0]}")

## Step 2: PCA - Dimensionality Reduction

In [None]:
# Perform PCA on the feature DataFrame and report how far it was reduced.
# The variance target is named once so the analyzer and the optimal-component
# lookup below cannot drift apart.
VARIANCE_THRESHOLD = 0.85  # Retain 85% variance
pca = PCAAnalyzer(n_components=VARIANCE_THRESHOLD)
X_pca = pca.fit_transform(df)

# NOTE(review): n_components is not read again in this cell — confirm that a
# later cell uses it (or that the call is wanted for its own output).
n_components = pca.find_optimal_components(threshold=VARIANCE_THRESHOLD)

print("\n📊 PCA Results:")
print(f"Original dimensions: {X.shape[1]}")
print(f"Reduced dimensions: {X_pca.shape[1]}")
print(f"Dimension reduction: {(1 - X_pca.shape[1]/X.shape[1])*100:.1f}%")
# Index the cumulative-variance sequence at the last retained component.
print(f"Variance retained: {pca.get_cumulative_variance()[X_pca.shape[1]-1]*100:.1f}%")

# Show top features for PC1 (component index 0).
# Presumably get_component_loadings returns rows already ordered by loading
# magnitude, so head(5) is the "top 5" — TODO confirm against PCAAnalyzer.
print("\n🎯 Top 5 Features in Principal Component 1:")
loadings = pca.get_component_loadings(0)
for idx, row in loadings.head(5).iterrows():
    print(f"  {row['feature']}: {row['loading']:.3f}")

## Step 3: Feature Selection

In [None]:
# Compare feature selection methods and report the features they agree on.
# The tunables are named once so the code and the printed messages stay in sync.
N_SELECTED_FEATURES = 15   # features requested from every selector
MIN_CONSENSUS_METHODS = 3  # a feature is "consensus" if at least this many methods chose it
METHOD_COLUMNS = ['univariate', 'mutual_info', 'rfe', 'lasso', 'tree']

comparison = compare_feature_selectors(
    df, y, task='classification', n_features=N_SELECTED_FEATURES
)

# Show features selected by multiple methods (consensus).
# The comparison table is indexed by feature name and carries an 'n_methods' column.
consensus_features = comparison[comparison['n_methods'] >= MIN_CONSENSUS_METHODS].index.tolist()

print("\n📊 Feature Selection Results:")
print(f"Features selected by ≥{MIN_CONSENSUS_METHODS} methods: {len(consensus_features)}")
print("\nConsensus Features:")
for feature in consensus_features[:10]:  # cap the listing at ten rows
    # Sum the per-method columns for this feature.
    # Presumably these are 0/1 or boolean flags — TODO confirm in compare_feature_selectors.
    methods_selected = comparison.loc[feature, METHOD_COLUMNS].sum()
    print(f"  {feature}: selected by {methods_selected}/{len(METHOD_COLUMNS)} methods")

# Use mutual information selector
selector = FeatureSelector(
    method='mutual_info', task='classification', n_features=N_SELECTED_FEATURES
)
selector.fit(df, y)
selected_features = selector.get_selected_features()

print(f"\n✅ Selected {len(selected_features)} features using mutual information")

## Step 4: t-SNE Visualization

In [None]:
# Apply t-SNE for visualization: embed the raw feature matrix X into 2 components.
# random_state is fixed so the embedding is repeatable across runs.
tsne = ManifoldReducer(method='tsne', n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X)

print("\n📊 t-SNE Results:")
print(f"Original: {X.shape[1]}D → Reduced: {X_tsne.shape[1]}D")
print("\n💡 t-SNE preserves local structure - clusters in 2D reflect similarity")
# Hint only: the plotting call is described to the reader, not executed here.
print("Use tsne.plot_2d(labels=y) to visualize clusters")