# MLP Autoencoder - Grille d'Hyperparam√®tres Compl√®te
Entra√Ænement syst√©matique sur tous les datasets avec hyperparam√®tres optimaux


In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
import os
import json
from pathlib import Path
import itertools
from tqdm.auto import tqdm
import gc
import sys
sys.path.append('src')

from src.models.ae_mlp import MLPAutoencoder
from src.utils.new_preprocessing import preprocessing_dataset

warnings.filterwarnings('ignore')
torch.set_float32_matmul_precision('medium')  # Performance boost

# Configuration device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"üöÄ Device: {device}")
if torch.cuda.is_available():
    print(f"   GPU: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")


üöÄ Device: cuda
   GPU: NVIDIA GeForce RTX 3070 Laptop GPU
   Memory: 8.0 GB


## 1. Grille d'Hyperparam√®tres


In [2]:
# Grille d'hyperparam√®tres extensive mais optimis√©e
HYPERPARAMS_GRID = {
    # Architecture
    'hidden_layers': [
        (64, 32),      # Small
        (128, 64),     # Medium  
        (256, 128),    # Large
        (128, 64, 32), # Deep medium
    ],
    'latent_dims': [3, 5, 8, 32],
    
    # Regularization
    'dropout_rates': [0.0, 0.2, 0.3],
    'use_batch_norm': [False, True],
    
    # Optimization
    'learning_rates': [1e-4, 5e-4, 1e-3, 2e-3],
    'weight_decays': [0.0, 1e-6, 1e-5],
    'batch_sizes': [64, 128, 256],
    
    # Training
    'activations': ['relu', 'silu'],
    'loss_types': ['mse', 'huber'],
    'epochs': [100],  # Fixed pour temps raisonnable
    'patience': [10]
}

# Estimation du nombre total de combinaisons
total_combinations = np.prod([len(v) for v in HYPERPARAMS_GRID.values()])
print(f"üìä Total combinations: {total_combinations:,}")
print(f"‚è±Ô∏è  Estimated time (5min/model): {total_combinations * 5 / 60:.1f} hours")

# Configuration pour √©chantillonnage intelligent
MAX_COMBINATIONS_PER_DATASET = 20
print(f"üéØ Sampling to {MAX_COMBINATIONS_PER_DATASET} combinations per dataset")


üìä Total combinations: 13,824
‚è±Ô∏è  Estimated time (5min/model): 1152.0 hours
üéØ Sampling to 20 combinations per dataset


## 2. Fonctions Utilitaires


In [3]:
def create_results_directory():
    """Cr√©e le r√©pertoire de r√©sultats avec timestamp"""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_dir = Path(f"results/mlp_ae_grid_search_{timestamp}")
    results_dir.mkdir(parents=True, exist_ok=True)
    return results_dir

def sample_hyperparams(grid, n_samples=200, seed=42):
    """√âchantillonnage intelligent des hyperparam√®tres"""
    np.random.seed(seed)
    
    # G√©n√©ration de toutes les combinaisons
    keys = list(grid.keys())
    values = list(grid.values())
    all_combinations = list(itertools.product(*values))
    
    if len(all_combinations) <= n_samples:
        selected_combinations = all_combinations
    else:
        # √âchantillonnage stratifi√© pour couvrir l'espace
        selected_combinations = np.random.choice(
            len(all_combinations), 
            size=n_samples, 
            replace=False
        )
        selected_combinations = [all_combinations[i] for i in selected_combinations]
    
    # Conversion en liste de dictionnaires
    sampled_configs = []
    for combo in selected_combinations:
        config = dict(zip(keys, combo))
        sampled_configs.append(config)
    
    return sampled_configs

def evaluate_model_performance(model, X_test, scaler=None):
    """√âvaluation rapide des performances"""
    model.eval()
    device = next(model.parameters()).device
    
    with torch.no_grad():
        X_test_tensor = torch.FloatTensor(X_test).to(device)
        X_recon, latent = model(X_test_tensor)
        
        # M√©triques de reconstruction
        mse = torch.nn.functional.mse_loss(X_recon, X_test_tensor).item()
        mae = torch.nn.functional.l1_loss(X_recon, X_test_tensor).item()
        
        # Variance expliqu√©e approximative
        total_var = torch.var(X_test_tensor).item()
        residual_var = torch.var(X_test_tensor - X_recon).item()
        explained_var = max(0, 1 - residual_var / total_var)
        
    return {
        'mse': mse,
        'mae': mae,
        'explained_variance': explained_var,
        'latent_std': torch.std(latent).item()
    }

def save_results(results, results_dir, dataset_name):
    """Sauvegarde des r√©sultats"""
    # JSON pour les m√©tadonn√©es
    json_file = results_dir / f"{dataset_name}_results.json"
    with open(json_file, 'w') as f:
        json.dump(results, f, indent=2)
    
    # CSV pour analyse facile
    df_results = pd.DataFrame(results)
    csv_file = results_dir / f"{dataset_name}_results.csv"
    df_results.to_csv(csv_file, index=False)
    
    return df_results


## 3. Chargement et Pr√©traitement des Donn√©es


In [4]:
def load_sp500_data():
    """Charge les donn√©es S&P 500 compl√®tes"""
    log_returns = pd.read_csv("data/processed/dataset_log_returns.csv", index_col=0, parse_dates=True)
    log_returns = log_returns.dropna(how='all')
    
    # Pr√©traitement causal
    X_df, W_df, M_df = preprocessing_dataset(
        log_returns, 
        win=60, 
        min_periods=40,
        clip_val=3.0,
        min_valid_per_day=50  # Au moins 50 actions valides par jour
    )
    
    return {
        'name': 'SP500_Full',
        'X': X_df.values,
        'dates': X_df.index,
        'features': X_df.columns,
        'description': f'S&P 500 Full ({X_df.shape[0]} days, {X_df.shape[1]} stocks)'
    }

def load_sectoral_data():
    """Charge les donn√©es sectorielles"""
    sectors_data = {}
    sectors_mapping = pd.read_csv("data/raw/tickers_sectors.csv", index_col=0)
    
    # Secteurs principaux seulement (pour √©viter trop de datasets)
    main_sectors = ['Information Technology', 'Financials', 'Health Care', 
                   'Industrials', 'Consumer Discretionary']
    
    for sector in main_sectors:
        sector_path = f"data/processed/sectors/{sector.lower().replace(' ', '_')}/log_returns.csv"
        if os.path.exists(sector_path):
            sector_returns = pd.read_csv(sector_path, index_col=0, parse_dates=True)
            sector_returns = sector_returns.dropna(how='all')
            
            # Pr√©traitement sectoriel
            X_df, W_df, M_df = preprocessing_dataset(
                sector_returns,
                win=60,
                min_periods=40,
                clip_val=3.0,
                min_valid_per_day=max(5, sector_returns.shape[1] // 3)
            )
            
            sectors_data[sector] = {
                'name': f'Sector_{sector.replace(" ", "_")}',
                'X': X_df.values,
                'dates': X_df.index,
                'features': X_df.columns,
                'description': f'{sector} ({X_df.shape[0]} days, {X_df.shape[1]} stocks)'
            }
    
    return sectors_data

def load_yield_curve_data():
    """Charge et pr√©traite les donn√©es de courbe de taux"""
    from src.utils.yield_curve_data import load_preprocessed_yield_curve
    
    # Charger les donn√©es avec normalisation cross-sectionnelle (privil√©gi√©e pour KAN AE)
    df = load_preprocessed_yield_curve(
        start="2000-01-01",
        normalization="cross_section"
    )
    
    return {
        'name': 'Yield_Curve',
        'X': df.values,
        'dates': df.index,
        'features': df.columns,
        'description': f'US Yield Curve ({df.shape[0]} days, {df.shape[1]} maturities)'
    }

# Chargement de tous les datasets
print("üìÇ Loading datasets...")
datasets = {}

# S&P 500 complet
datasets['SP500_Full'] = load_sp500_data()
print(f"‚úÖ {datasets['SP500_Full']['description']}")

# Donn√©es sectorielles
sectoral_data = load_sectoral_data()
datasets.update(sectoral_data)
for sector_name, sector_data in sectoral_data.items():
    print(f"‚úÖ {sector_data['description']}")

# Courbe de taux
datasets['Yield_Curve'] = load_yield_curve_data()
print(f"‚úÖ {datasets['Yield_Curve']['description']}")

print(f"\nüéØ Total datasets: {len(datasets)}")


üìÇ Loading datasets...
‚úÖ S&P 500 Full (3734 days, 423 stocks)
‚úÖ Information Technology (3734 days, 53 stocks)
‚úÖ Financials (3734 days, 64 stocks)
‚úÖ Health Care (3734 days, 53 stocks)
‚úÖ Industrials (3734 days, 64 stocks)
‚úÖ Consumer Discretionary (3734 days, 43 stocks)
‚úÖ US Yield Curve (9303 days, 7 maturities)

üéØ Total datasets: 7


## 4. Fonction d'Entra√Ænement Principal


In [5]:
def train_mlp_ae_config(X_data, config, dataset_name, cv_splits=3):
    """Entra√Æne un MLP AE avec une configuration donn√©e"""
    results = {
        'dataset': dataset_name,
        'config': config,
        'cv_scores': [],
        'mean_score': 0,
        'std_score': 0,
        'training_time': 0,
        'status': 'success'
    }
    
    try:
        start_time = datetime.now()
        
        # Cross-validation temporelle
        tscv = TimeSeriesSplit(n_splits=cv_splits)
        cv_scores = []
        
        for fold, (train_idx, val_idx) in enumerate(tscv.split(X_data)):
            # Split des donn√©es
            X_train, X_val = X_data[train_idx], X_data[val_idx]
            
            # Normalisation
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_val_scaled = scaler.transform(X_val)
            
            # Cr√©ation du mod√®le
            model = MLPAutoencoder(
                input_dim=X_train_scaled.shape[1],
                k=config['latent_dims'],
                hidden=config['hidden_layers'],
                activation=config['activations'],
                use_bn=config['use_batch_norm'],
                dropout_p=config['dropout_rates'],
                loss_type=config['loss_types']
            )
            
            # Entra√Ænement
            X_train_tensor = torch.FloatTensor(X_train_scaled)
            X_val_tensor = torch.FloatTensor(X_val_scaled)
            
            history = model.fit(
                X_train_tensor,
                # X_val=X_val_tensor,
                epochs=config['epochs'],
                batch_size=config['batch_sizes'],
                learning_rate=config['learning_rates'],
                weight_decay=config['weight_decays'],
                patience=config['patience'],
                verbose=False,  # Silencieux pour la grille
                device=device
            )
            
            # √âvaluation
            performance = evaluate_model_performance(model, X_val_scaled)
            cv_scores.append(performance['explained_variance'])
            
            # Nettoyage m√©moire
            del model, X_train_tensor, X_val_tensor
            torch.cuda.empty_cache() if torch.cuda.is_available() else None
            gc.collect()
        
        # Agr√©gation des r√©sultats CV
        results['cv_scores'] = cv_scores
        results['mean_score'] = np.mean(cv_scores)
        results['std_score'] = np.std(cv_scores)
        results['training_time'] = (datetime.now() - start_time).total_seconds()
        
    except Exception as e:
        results['status'] = 'failed'
        results['error'] = str(e)
        results['mean_score'] = -1  # Score sentinelle pour √©checs
    
    return results


## 5. Entra√Ænement Principal - Grille Compl√®te


In [6]:
# Cr√©ation du r√©pertoire de r√©sultats
results_dir = create_results_directory()
print(f"üìÅ Results directory: {results_dir}")

# √âchantillonnage des hyperparam√®tres
sampled_configs = sample_hyperparams(HYPERPARAMS_GRID, MAX_COMBINATIONS_PER_DATASET)
print(f"üé≤ Sampled {len(sampled_configs)} configurations")

# Entra√Ænement sur tous les datasets
all_results = {}
total_experiments = len(datasets) * len(sampled_configs)
experiment_count = 0

print(f"\nüöÄ Starting grid search: {total_experiments} total experiments")
print(f"‚è±Ô∏è  Estimated time: {total_experiments * 2 / 60:.1f} hours\n")

for dataset_name, dataset_info in datasets.items():
    print(f"\nüìä Processing {dataset_name}: {dataset_info['description']}")
    
    dataset_results = []
    X_data = dataset_info['X']
    
    # Progress bar pour ce dataset
    pbar = tqdm(sampled_configs, desc=f"Training {dataset_name}", leave=False)
    
    for config_idx, config in enumerate(pbar):
        experiment_count += 1
        
        # Mise √† jour de la progress bar
        pbar.set_postfix({
            'Exp': f"{experiment_count}/{total_experiments}",
            'Config': f"{config_idx+1}/{len(sampled_configs)}"
        })
        
        # Entra√Ænement
        result = train_mlp_ae_config(X_data, config, dataset_name)
        dataset_results.append(result)
        
        if (config_idx + 1) % 20 == 0:
            temp_df = save_results(dataset_results, results_dir, f"{dataset_name}_temp")
            best_score = temp_df['mean_score'].max()
            pbar.set_postfix({
                'Exp': f"{experiment_count}/{total_experiments}",
                'Best': f"{best_score:.4f}"
            })
    
    # Sauvegarde finale pour ce dataset
    df_results = save_results(dataset_results, results_dir, dataset_name)
    all_results[dataset_name] = dataset_results
    
    # R√©sum√© des performances
    successful_runs = df_results[df_results['status'] == 'success']
    if len(successful_runs) > 0:
        best_score = successful_runs['mean_score'].max()
        best_config_idx = successful_runs['mean_score'].idxmax()
        best_config = successful_runs.loc[best_config_idx, 'config']
        
        print(f"‚úÖ {dataset_name}: {len(successful_runs)}/{len(dataset_results)} successful")
        print(f"   üèÜ Best score: {best_score:.4f}")
        print(f"   ‚öôÔ∏è  Best config: latent={best_config['latent_dims']}, "
              f"hidden={best_config['hidden_layers']}, lr={best_config['learning_rates']}")
    else:
        print(f"‚ùå {dataset_name}: No successful runs")
    
    # Nettoyage m√©moire
    torch.cuda.empty_cache() if torch.cuda.is_available() else None
    gc.collect()

print(f"\nüéâ Grid search completed! Results saved in: {results_dir}")


üìÅ Results directory: results\mlp_ae_grid_search_20250902_034428
üé≤ Sampled 20 configurations

üöÄ Starting grid search: 140 total experiments
‚è±Ô∏è  Estimated time: 4.7 hours


üìä Processing SP500_Full: S&P 500 Full (3734 days, 423 stocks)


Training SP500_Full:   0%|          | 0/20 [00:00<?, ?it/s]

‚úÖ SP500_Full: 14/20 successful
   üèÜ Best score: 0.2957
   ‚öôÔ∏è  Best config: latent=32, hidden=(128, 64), lr=0.0005

üìä Processing Information Technology: Information Technology (3734 days, 53 stocks)


Training Information Technology:   0%|          | 0/20 [00:00<?, ?it/s]

‚úÖ Information Technology: 14/20 successful
   üèÜ Best score: 0.6372
   ‚öôÔ∏è  Best config: latent=32, hidden=(128, 64), lr=0.0005

üìä Processing Financials: Financials (3734 days, 64 stocks)


Training Financials:   0%|          | 0/20 [00:00<?, ?it/s]

‚úÖ Financials: 14/20 successful
   üèÜ Best score: 0.6188
   ‚öôÔ∏è  Best config: latent=32, hidden=(128, 64), lr=0.0005

üìä Processing Health Care: Health Care (3734 days, 53 stocks)


Training Health Care:   0%|          | 0/20 [00:00<?, ?it/s]

‚úÖ Health Care: 14/20 successful
   üèÜ Best score: 0.6470
   ‚öôÔ∏è  Best config: latent=32, hidden=(128, 64), lr=0.0005

üìä Processing Industrials: Industrials (3734 days, 64 stocks)


Training Industrials:   0%|          | 0/20 [00:00<?, ?it/s]

‚úÖ Industrials: 14/20 successful
   üèÜ Best score: 0.5619
   ‚öôÔ∏è  Best config: latent=32, hidden=(128, 64), lr=0.0005

üìä Processing Consumer Discretionary: Consumer Discretionary (3734 days, 43 stocks)


Training Consumer Discretionary:   0%|          | 0/20 [00:00<?, ?it/s]

‚úÖ Consumer Discretionary: 14/20 successful
   üèÜ Best score: 0.8057
   ‚öôÔ∏è  Best config: latent=32, hidden=(128, 64), lr=0.0005

üìä Processing Yield_Curve: US Yield Curve (9303 days, 7 maturities)


Training Yield_Curve:   0%|          | 0/20 [00:00<?, ?it/s]

‚úÖ Yield_Curve: 20/20 successful
   üèÜ Best score: 0.9650
   ‚öôÔ∏è  Best config: latent=32, hidden=(128, 64), lr=0.0005

üéâ Grid search completed! Results saved in: results\mlp_ae_grid_search_20250902_034428


## 6. Extraction et Analyse des Facteurs Latents


In [9]:
def extract_latent_factors_best_models(results_dir, datasets):
    """Extrait les facteurs latents des meilleurs mod√®les pour chaque dataset"""
    
    print("üß† Extracting latent factors from best models...")
    
    # Chargement des r√©sultats
    all_results = []
    for csv_file in results_dir.glob("*_results.csv"):
        if "temp" not in csv_file.name:
            df = pd.read_csv(csv_file)
            df['dataset'] = csv_file.stem.replace('_results', '')
            all_results.append(df)
    
    if not all_results:
        print("‚ùå No results found")
        return {}
    
    combined_df = pd.concat(all_results, ignore_index=True)
    successful_df = combined_df[combined_df['status'] == 'success']
    
    latent_factors = {}
    
    for dataset_name in datasets.keys():
        dataset_results = successful_df[successful_df['dataset'] == dataset_name]
        
        if len(dataset_results) == 0:
            print(f"‚ö†Ô∏è  No successful results for {dataset_name}")
            continue
        
        # Meilleure configuration
        best_result = dataset_results.loc[dataset_results['mean_score'].idxmax()]
        best_config = eval(best_result['config'])
        
        print(f"üîç {dataset_name}: Extracting factors (score: {best_result['mean_score']:.4f})")
        
        # Donn√©es du dataset
        X_data = datasets[dataset_name]['X']
        
        # Normalisation
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_data)
        
        # Entra√Ænement du meilleur mod√®le sur tout le dataset
        model = MLPAutoencoder(
            input_dim=X_scaled.shape[1],
            k=best_config['latent_dims'],
            hidden=best_config['hidden_layers'],
            activation=best_config['activations'],
            use_bn=best_config['use_batch_norm'],
            dropout_p=best_config['dropout_rates'],
            loss_type=best_config['loss_types']
        )
        
        # Entra√Ænement final (80/20 split)
        split_idx = int(0.8 * len(X_scaled))
        X_train_final = torch.FloatTensor(X_scaled[:split_idx])
        X_val_final = torch.FloatTensor(X_scaled[split_idx:])
        
        history = model.fit(
            X_train_final,
            # X_val=X_val_final,
            epochs=best_config['epochs'],
            batch_size=best_config['batch_sizes'],
            learning_rate=best_config['learning_rates'],
            weight_decay=best_config['weight_decays'],
            patience=best_config['patience'],
            verbose=False,
            device=device
        )
        
        # Extraction des facteurs latents
        model.eval()
        with torch.no_grad():
            X_tensor = torch.FloatTensor(X_scaled).to(device)
            X_recon, latent_factors_tensor = model(X_tensor)
            
            latent_np = latent_factors_tensor.cpu().numpy()
            recon_np = X_recon.cpu().numpy()
        
        # Calcul des m√©triques de performance
        mse = np.mean((X_scaled - recon_np) ** 2)
        explained_var = 1 - np.var(X_scaled - recon_np) / np.var(X_scaled)
        
        # Stockage des r√©sultats
        latent_factors[dataset_name] = {
            'factors': latent_np,
            'reconstruction': recon_np,
            'original': X_scaled,
            'dates': datasets[dataset_name]['dates'],
            'features': datasets[dataset_name]['features'],
            'config': best_config,
            'performance': {
                'mse': mse,
                'explained_variance': explained_var,
                'cv_score': best_result['mean_score']
            },
            'scaler': scaler
        }
        
        print(f"   ‚úÖ Extracted {latent_np.shape[1]} factors, Explained Var: {explained_var:.4f}")
        
        # Nettoyage m√©moire
        del model, X_tensor, latent_factors_tensor, X_recon
        torch.cuda.empty_cache() if torch.cuda.is_available() else None
        gc.collect()
    
    return latent_factors

# Extraction des facteurs latents
latent_factors_mlp = extract_latent_factors_best_models(results_dir, datasets)


üß† Extracting latent factors from best models...
üîç SP500_Full: Extracting factors (score: 0.2957)
   ‚úÖ Extracted 32 factors, Explained Var: 0.3862
üîç Information Technology: Extracting factors (score: 0.6372)
   ‚úÖ Extracted 32 factors, Explained Var: 0.7514
üîç Financials: Extracting factors (score: 0.6188)
   ‚úÖ Extracted 32 factors, Explained Var: 0.7332
üîç Health Care: Extracting factors (score: 0.6470)
   ‚úÖ Extracted 32 factors, Explained Var: 0.7579
üîç Industrials: Extracting factors (score: 0.5619)
   ‚úÖ Extracted 32 factors, Explained Var: 0.6945
üîç Consumer Discretionary: Extracting factors (score: 0.8057)
   ‚úÖ Extracted 32 factors, Explained Var: 0.8851
üîç Yield_Curve: Extracting factors (score: 0.9650)
   ‚úÖ Extracted 32 factors, Explained Var: 1.0000


## 7. Comparaison avec PCA


In [10]:
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr

def perform_pca_comparison(datasets, latent_factors_mlp):
    """Compare MLP AE factors with PCA"""
    
    print("üìä Performing PCA comparison...")
    
    pca_results = {}
    comparison_results = {}
    
    for dataset_name, dataset_info in datasets.items():
        if dataset_name not in latent_factors_mlp:
            print(f"‚ö†Ô∏è  Skipping {dataset_name} - no MLP results")
            continue
            
        print(f"\nüîç Analyzing {dataset_name}")
        
        # Donn√©es normalis√©es (m√™me que MLP AE)
        X_scaled = latent_factors_mlp[dataset_name]['original']
        n_components = latent_factors_mlp[dataset_name]['factors'].shape[1]
        
        # PCA avec m√™me nombre de composantes que MLP AE
        pca = PCA(n_components=n_components)
        pca_factors = pca.fit_transform(X_scaled)
        pca_reconstruction = pca.inverse_transform(pca_factors)
        
        # M√©triques PCA
        pca_mse = mean_squared_error(X_scaled, pca_reconstruction)
        pca_explained_var = 1 - np.var(X_scaled - pca_reconstruction) / np.var(X_scaled)
        
        pca_results[dataset_name] = {
            'factors': pca_factors,
            'reconstruction': pca_reconstruction,
            'components': pca.components_,
            'explained_variance_ratio': pca.explained_variance_ratio_,
            'performance': {
                'mse': pca_mse,
                'explained_variance': pca_explained_var,
                'cumulative_variance': np.cumsum(pca.explained_variance_ratio_)
            }
        }
        
        # Comparaison MLP AE vs PCA
        mlp_factors = latent_factors_mlp[dataset_name]['factors']
        mlp_performance = latent_factors_mlp[dataset_name]['performance']
        
        # Corr√©lations entre facteurs (ordre peut √™tre diff√©rent)
        factor_correlations = []
        for i in range(n_components):
            max_corr = 0
            best_j = 0
            for j in range(n_components):
                corr, _ = pearsonr(mlp_factors[:, i], pca_factors[:, j])
                if abs(corr) > abs(max_corr):
                    max_corr = corr
                    best_j = j
            factor_correlations.append((i, best_j, max_corr))
        
        # Comparaison des performances
        performance_comparison = {
            'mlp_explained_var': mlp_performance['explained_variance'],
            'pca_explained_var': pca_explained_var,
            'mlp_mse': mlp_performance['mse'],
            'pca_mse': pca_mse,
            'improvement': mlp_performance['explained_variance'] - pca_explained_var,
            'factor_correlations': factor_correlations,
            'max_correlation': max([abs(corr) for _, _, corr in factor_correlations]),
            'mean_correlation': np.mean([abs(corr) for _, _, corr in factor_correlations])
        }
        
        comparison_results[dataset_name] = performance_comparison
        
        print(f"   üìà PCA Explained Var: {pca_explained_var:.4f}")
        print(f"   ü§ñ MLP Explained Var: {mlp_performance['explained_variance']:.4f}")
        print(f"   üìä Improvement: {performance_comparison['improvement']:.4f}")
        print(f"   üîó Mean Factor Correlation: {performance_comparison['mean_correlation']:.4f}")
        
    return pca_results, comparison_results

# Comparaison avec PCA
pca_results, comparison_results = perform_pca_comparison(datasets, latent_factors_mlp)


üìä Performing PCA comparison...

üîç Analyzing SP500_Full
   üìà PCA Explained Var: 0.4056
   ü§ñ MLP Explained Var: 0.3862
   üìä Improvement: -0.0194
   üîó Mean Factor Correlation: 0.4318

üîç Analyzing Information Technology
   üìà PCA Explained Var: 0.7600
   ü§ñ MLP Explained Var: 0.7514
   üìä Improvement: -0.0086
   üîó Mean Factor Correlation: 0.4276

üîç Analyzing Financials
   üìà PCA Explained Var: 0.7444
   ü§ñ MLP Explained Var: 0.7332
   üìä Improvement: -0.0111
   üîó Mean Factor Correlation: 0.4134

üîç Analyzing Health Care
   üìà PCA Explained Var: 0.7709
   ü§ñ MLP Explained Var: 0.7579
   üìä Improvement: -0.0130
   üîó Mean Factor Correlation: 0.4206

üîç Analyzing Industrials
   üìà PCA Explained Var: 0.7053
   ü§ñ MLP Explained Var: 0.6945
   üìä Improvement: -0.0108
   üîó Mean Factor Correlation: 0.4036

üîç Analyzing Consumer Discretionary
   üìà PCA Explained Var: 0.8921
   ü§ñ MLP Explained Var: 0.8851
   üìä Improvement: -0.0

ValueError: n_components=32 must be between 0 and min(n_samples, n_features)=7 with svd_solver='covariance_eigh'

## 8. Visualisations Comparatives


In [11]:
def create_comprehensive_comparison_plots(comparison_results, pca_results, latent_factors_mlp, results_dir):
    """Cr√©e des visualisations comparatives compl√®tes MLP AE vs PCA"""
    
    # Configuration des couleurs selon les pr√©f√©rences
    colors = {
        'mlp': '#FF6B6B',      # Rouge
        'pca': '#4ECDC4',      # Vert/Teal
        'improvement': '#45B7D1', # Bleu
        'correlation': '#FECA57'  # Jaune/Orange
    }
    
    # Cr√©ation de la figure principale
    fig = plt.figure(figsize=(20, 16))
    gs = fig.add_gridspec(4, 4, hspace=0.3, wspace=0.3)
    
    # Donn√©es pour les plots
    datasets_with_results = list(comparison_results.keys())
    
    if not datasets_with_results:
        print("‚ùå No comparison data available for visualization")
        return
    
    # 1. Comparaison des performances globales
    ax1 = fig.add_subplot(gs[0, :2])
    
    mlp_scores = [comparison_results[ds]['mlp_explained_var'] for ds in datasets_with_results]
    pca_scores = [comparison_results[ds]['pca_explained_var'] for ds in datasets_with_results]
    
    x_pos = np.arange(len(datasets_with_results))
    width = 0.35
    
    bars1 = ax1.bar(x_pos - width/2, mlp_scores, width, label='MLP AE', color=colors['mlp'], alpha=0.8)
    bars2 = ax1.bar(x_pos + width/2, pca_scores, width, label='PCA', color=colors['pca'], alpha=0.8)
    
    ax1.set_xlabel('Dataset')
    ax1.set_ylabel('Explained Variance')
    ax1.set_title('MLP Autoencoder vs PCA: Explained Variance Comparison', fontweight='bold', fontsize=14)
    ax1.set_xticks(x_pos)
    ax1.set_xticklabels([ds.replace('_', '\n') for ds in datasets_with_results], rotation=45, ha='right')
    ax1.legend()
    ax1.grid(True, alpha=0.3)
    
    # Ajout des valeurs sur les barres
    for bar in bars1:
        height = bar.get_height()
        ax1.text(bar.get_x() + bar.get_width()/2., height + 0.005,
                f'{height:.3f}', ha='center', va='bottom', fontweight='bold')
    for bar in bars2:
        height = bar.get_height()
        ax1.text(bar.get_x() + bar.get_width()/2., height + 0.005,
                f'{height:.3f}', ha='center', va='bottom', fontweight='bold')
    
    # 2. Am√©lioration MLP AE vs PCA
    ax2 = fig.add_subplot(gs[0, 2:])
    
    improvements = [comparison_results[ds]['improvement'] for ds in datasets_with_results]
    colors_improvement = [colors['improvement'] if imp >= 0 else colors['mlp'] for imp in improvements]
    
    bars = ax2.bar(x_pos, improvements, color=colors_improvement, alpha=0.8)
    ax2.axhline(y=0, color='black', linestyle='-', alpha=0.5)
    ax2.set_xlabel('Dataset')
    ax2.set_ylabel('Improvement (MLP AE - PCA)')
    ax2.set_title('Performance Improvement: MLP AE over PCA', fontweight='bold', fontsize=14)
    ax2.set_xticks(x_pos)
    ax2.set_xticklabels([ds.replace('_', '\n') for ds in datasets_with_results], rotation=45, ha='right')
    ax2.grid(True, alpha=0.3)
    
    # Ajout des valeurs
    for bar, imp in zip(bars, improvements):
        height = bar.get_height()
        ax2.text(bar.get_x() + bar.get_width()/2., height + (0.002 if height >= 0 else -0.005),
                f'{imp:+.4f}', ha='center', va='bottom' if height >= 0 else 'top', fontweight='bold')
    
    # 3. Corr√©lations entre facteurs
    ax3 = fig.add_subplot(gs[1, 0])
    
    mean_correlations = [comparison_results[ds]['mean_correlation'] for ds in datasets_with_results]
    bars = ax3.bar(x_pos, mean_correlations, color=colors['correlation'], alpha=0.8)
    ax3.set_xlabel('Dataset')
    ax3.set_ylabel('Mean |Correlation|')
    ax3.set_title('Factor Correlations\n(MLP AE vs PCA)', fontweight='bold', fontsize=12)
    ax3.set_xticks(x_pos)
    ax3.set_xticklabels([ds.replace('_', '\n') for ds in datasets_with_results], rotation=45, ha='right')
    ax3.set_ylim(0, 1)
    ax3.grid(True, alpha=0.3)
    
    for bar in bars:
        height = bar.get_height()
        ax3.text(bar.get_x() + bar.get_width()/2., height + 0.02,
                f'{height:.3f}', ha='center', va='bottom', fontweight='bold')
    
    # 4. Distribution des am√©liorations
    ax4 = fig.add_subplot(gs[1, 1])
    
    ax4.hist(improvements, bins=10, color=colors['improvement'], alpha=0.7, edgecolor='black')
    ax4.axvline(x=0, color='red', linestyle='--', alpha=0.8, label='No improvement')
    ax4.axvline(x=np.mean(improvements), color='green', linestyle='-', alpha=0.8, label=f'Mean: {np.mean(improvements):.4f}')
    ax4.set_xlabel('Improvement (Explained Variance)')
    ax4.set_ylabel('Frequency')
    ax4.set_title('Distribution of\nPerformance Improvements', fontweight='bold', fontsize=12)
    ax4.legend()
    ax4.grid(True, alpha=0.3)
    
    # 5. Scatter plot: MLP AE vs PCA performance
    ax5 = fig.add_subplot(gs[1, 2])
    
    scatter = ax5.scatter(pca_scores, mlp_scores, c=mean_correlations, cmap='viridis', 
                         s=100, alpha=0.8, edgecolors='black', linewidth=1)
    
    # Ligne de parit√©
    min_score = min(min(pca_scores), min(mlp_scores))
    max_score = max(max(pca_scores), max(mlp_scores))
    ax5.plot([min_score, max_score], [min_score, max_score], 'r--', alpha=0.8, label='Parity line')
    
    ax5.set_xlabel('PCA Explained Variance')
    ax5.set_ylabel('MLP AE Explained Variance')
    ax5.set_title('Performance Scatter\n(colored by correlation)', fontweight='bold', fontsize=12)
    ax5.legend()
    ax5.grid(True, alpha=0.3)
    
    # Colorbar
    cbar = plt.colorbar(scatter, ax=ax5, shrink=0.8)
    cbar.set_label('Mean Factor Correlation')
    
    # 6. Nombre de facteurs latents par dataset
    ax6 = fig.add_subplot(gs[1, 3])
    
    n_factors = [latent_factors_mlp[ds]['factors'].shape[1] for ds in datasets_with_results]
    bars = ax6.bar(x_pos, n_factors, color=colors['mlp'], alpha=0.8)
    ax6.set_xlabel('Dataset')
    ax6.set_ylabel('Number of Latent Factors')
    ax6.set_title('Latent Dimensions\nper Dataset', fontweight='bold', fontsize=12)
    ax6.set_xticks(x_pos)
    ax6.set_xticklabels([ds.replace('_', '\n') for ds in datasets_with_results], rotation=45, ha='right')
    ax6.grid(True, alpha=0.3)
    
    for bar in bars:
        height = bar.get_height()
        ax6.text(bar.get_x() + bar.get_width()/2., height + 0.1,
                f'{int(height)}', ha='center', va='bottom', fontweight='bold')
    
    # 7-10. √âvolution temporelle des facteurs pour datasets s√©lectionn√©s
    selected_datasets = list(datasets_with_results)[:4]  # 4 premiers datasets
    
    for idx, dataset_name in enumerate(selected_datasets):
        ax = fig.add_subplot(gs[2 + idx//2, (idx%2)*2:(idx%2)*2+2])
        
        # Facteurs MLP AE
        mlp_factors = latent_factors_mlp[dataset_name]['factors']
        dates = latent_factors_mlp[dataset_name]['dates']
        
        # Facteurs PCA
        pca_factors = pca_results[dataset_name]['factors']
        
        # Plot des 3 premiers facteurs de chaque m√©thode
        n_factors_to_plot = min(3, mlp_factors.shape[1])
        
        for i in range(n_factors_to_plot):
            ax.plot(dates[:len(mlp_factors)], mlp_factors[:, i], 
                   label=f'MLP F{i+1}', color=colors['mlp'], alpha=0.7, linewidth=1.5)
            ax.plot(dates[:len(pca_factors)], pca_factors[:, i], 
                   label=f'PCA F{i+1}', color=colors['pca'], alpha=0.7, linestyle='--', linewidth=1)
        
        ax.set_title(f'{dataset_name.replace("_", " ")} - Factor Evolution', fontweight='bold', fontsize=11)
        ax.set_xlabel('Time')
        ax.set_ylabel('Factor Value')
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=8)
        ax.grid(True, alpha=0.3)
        
        # Rotation des dates
        ax.tick_params(axis='x', rotation=45)
    
    # Titre principal
    fig.suptitle('MLP Autoencoder vs PCA: Comprehensive Comparison Analysis', 
                fontsize=18, fontweight='bold', y=0.98)
    
    # Sauvegarde
    viz_file = results_dir / "mlp_ae_vs_pca_comprehensive_analysis.png"
    plt.savefig(viz_file, dpi=300, bbox_inches='tight', facecolor='white', edgecolor='none')
    print(f"üìä Comprehensive visualization saved to: {viz_file}")
    
    plt.tight_layout()
    plt.show()
    
    # R√©sum√© statistique
    print("\n" + "="*80)
    print("üìà MLP AUTOENCODER vs PCA - STATISTICAL SUMMARY")
    print("="*80)
    print(f"üìä Datasets analyzed: {len(datasets_with_results)}")
    print(f"üèÜ MLP AE wins: {sum(1 for imp in improvements if imp > 0)} / {len(improvements)}")
    print(f"üìà Average improvement: {np.mean(improvements):.4f} ¬± {np.std(improvements):.4f}")
    print(f"üîó Average factor correlation: {np.mean(mean_correlations):.4f} ¬± {np.std(mean_correlations):.4f}")
    print(f"üìä Best improvement: {max(improvements):.4f} ({datasets_with_results[improvements.index(max(improvements))]})")
    print(f"üìä Worst performance: {min(improvements):.4f} ({datasets_with_results[improvements.index(min(improvements))]})")
    print("="*80)

# Cr√©ation des visualisations comparatives
if comparison_results and pca_results:
    create_comprehensive_comparison_plots(comparison_results, pca_results, latent_factors_mlp, results_dir)
else:
    print("‚ö†Ô∏è  No comparison data available for visualization")


NameError: name 'comparison_results' is not defined

## 9. Sauvegarde des R√©sultats Finaux


In [12]:
# Sauvegarde compl√®te des r√©sultats pour comparaison future avec KAN AE
final_results = {
    'mlp_ae_factors': latent_factors_mlp,
    'pca_results': pca_results,
    'comparison_results': comparison_results,
    'datasets_info': {name: {'shape': info['X'].shape, 'description': info['description']} 
                     for name, info in datasets.items()},
    'timestamp': datetime.now().isoformat(),
    'hyperparams_tested': len(sampled_configs),
    'total_experiments': len(datasets) * len(sampled_configs)
}

# Sauvegarde JSON pour m√©tadonn√©es
final_results_file = results_dir / "mlp_ae_final_results.json"
with open(final_results_file, 'w') as f:
    # Conversion des arrays numpy en listes pour JSON
    json_safe_results = {}
    for key, value in final_results.items():
        if key in ['mlp_ae_factors', 'pca_results']:
            json_safe_results[key] = {
                dataset: {
                    'performance': data.get('performance', {}),
                    'config': data.get('config', {}),
                    'factors_shape': data.get('factors', np.array([])).shape,
                    'dates_count': len(data.get('dates', []))
                }
                for dataset, data in value.items()
            }
        else:
            json_safe_results[key] = value
    
    json.dump(json_safe_results, f, indent=2, default=str)

print(f"üíæ Final results saved to: {final_results_file}")

# R√©sum√© final
print(f"\nüéâ MLP AUTOENCODER ANALYSIS COMPLETED!")
print(f"="*60)
print(f"üìä Total datasets processed: {len(datasets)}")
print(f"üß† Latent factors extracted: {len(latent_factors_mlp)}")
print(f"üìà PCA comparisons: {len(pca_results)}")
print(f"‚öôÔ∏è  Hyperparameters tested: {len(sampled_configs)}")
print(f"üìÅ Results directory: {results_dir}")
print(f"‚è∞ Completed at: {datetime.now().strftime('%H:%M:%S')}")
print(f"="*60)

if comparison_results:
    improvements = [comparison_results[ds]['improvement'] for ds in comparison_results.keys()]
    print(f"üèÜ MLP AE outperforms PCA in {sum(1 for imp in improvements if imp > 0)}/{len(improvements)} datasets")
    print(f"üìä Average improvement: {np.mean(improvements):.4f}")
    print(f"üîù Best performing dataset: {list(comparison_results.keys())[improvements.index(max(improvements))]}")
    
print(f"\n‚úÖ Ready for KAN AE comparison!")


NameError: name 'pca_results' is not defined