# In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import RobustScaler, PowerTransformer, StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor, 
                             VotingRegressor, StackingRegressor)
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
import lightgbm as lgb

import matplotlib.pyplot as plt
import joblib
import seaborn as sns
import warnings
from scipy import stats
from scipy.stats import skew
import os
import pickle
warnings.filterwarnings('ignore')

class ComprehensiveKFoldNoisePredictor:
    def __init__(self):
        self.models = {}
        self.best_models = {}  # 存储最佳模型
        self.scalers = {}
        self.target_transformers = {}
        self.feature_selectors = {}
        self.pca_transformers = {}
        self.feature_names = {}
        self.noise_types = ['Mean Noise', 'Low Freq', 'Mid Freq', 'High Freq']
        self.feature_types = ['street', 'remote', 'fusion']
        self.feature_names_display = {
            'street': 'Street View', 
            'remote': 'Remote Sensing', 
            'fusion': 'Fusion'
        }
        
        # 8个模型配置
        self.model_configs = self._get_all_model_configs()
        
    def _get_all_model_configs(self):
        """Return the default configuration of the seven base regressors.

        Returns:
            dict: Maps a model display name to a dict with three keys:
                'class' (the estimator class), 'params' (keyword arguments used
                to build the default estimator) and 'param_grid' (candidate
                values for the randomized hyperparameter search).
        """
        configs = {
            'XGBoost': {
                'class': xgb.XGBRegressor,
                'params': {
                    'objective': 'reg:squarederror',
                    'max_depth': 8,
                    'learning_rate': 0.03,
                    'n_estimators': 1000,
                    'subsample': 0.85,
                    'colsample_bytree': 0.85,
                    'reg_alpha': 0.1,
                    'reg_lambda': 1.2,
                    'random_state': 42,
                    'n_jobs': -1,
                    'verbosity': 0
                },
                'param_grid': {
                    'max_depth': [6, 8, 10],
                    'learning_rate': [0.01, 0.03, 0.05],
                    'n_estimators': [800, 1000, 1200],
                    'subsample': [0.8, 0.85, 0.9],
                    'colsample_bytree': [0.8, 0.85, 0.9]
                }
            },
            'LightGBM': {
                'class': lgb.LGBMRegressor,
                'params': {
                    'objective': 'regression',
                    'max_depth': 8,
                    'learning_rate': 0.03,
                    'n_estimators': 1000,
                    'subsample': 0.85,
                    # LightGBM ignores `subsample` while `subsample_freq` is 0
                    # (its default), so bagging must be switched on explicitly.
                    'subsample_freq': 1,
                    'colsample_bytree': 0.85,
                    'reg_alpha': 0.1,
                    'reg_lambda': 1.2,
                    'random_state': 42,
                    'n_jobs': -1,
                    'verbose': -1
                },
                'param_grid': {
                    'max_depth': [6, 8, 10],
                    'learning_rate': [0.01, 0.03, 0.05],
                    'n_estimators': [800, 1000, 1200],
                    'subsample': [0.8, 0.85, 0.9],
                    # Single-value entry: keeps bagging enabled during the
                    # search so that the `subsample` candidates have an effect.
                    'subsample_freq': [1]
                }
            },
            'Random Forest': {
                'class': RandomForestRegressor,
                'params': {
                    'n_estimators': 300,
                    'max_depth': 20,
                    'min_samples_split': 3,
                    'min_samples_leaf': 1,
                    'max_features': 'sqrt',
                    'random_state': 42,
                    'n_jobs': -1
                },
                'param_grid': {
                    'n_estimators': [200, 300, 400],
                    'max_depth': [15, 20, 25],
                    'min_samples_split': [2, 3, 5],
                    'max_features': ['sqrt', 'log2']
                }
            },
            'Gradient Boosting': {
                'class': GradientBoostingRegressor,
                'params': {
                    'n_estimators': 300,
                    'max_depth': 8,
                    'learning_rate': 0.05,
                    'subsample': 0.85,
                    'min_samples_split': 3,
                    'random_state': 42
                },
                'param_grid': {
                    'n_estimators': [200, 300, 400],
                    'max_depth': [6, 8, 10],
                    'learning_rate': [0.03, 0.05, 0.1],
                    'subsample': [0.8, 0.85, 0.9]
                }
            },
            'SVR': {
                'class': SVR,
                'params': {
                    'kernel': 'rbf',
                    'C': 100,
                    'gamma': 'scale',
                    'epsilon': 0.05
                },
                'param_grid': {
                    'kernel': ['rbf', 'poly'],
                    'C': [50, 100, 200],
                    'gamma': ['scale', 'auto'],
                    'epsilon': [0.01, 0.05, 0.1]
                }
            },
            'KNN': {
                'class': KNeighborsRegressor,
                'params': {
                    'n_neighbors': 5,
                    'weights': 'distance',
                    'algorithm': 'auto',
                    'leaf_size': 30,
                    'p': 2,
                    'n_jobs': -1
                },
                'param_grid': {
                    'n_neighbors': [3, 5, 7, 9],
                    'weights': ['uniform', 'distance'],
                    'leaf_size': [20, 30, 40],
                    'p': [1, 2]
                }
            },
            'Lasso': {
                'class': Lasso,
                'params': {
                    'alpha': 0.1,
                    'random_state': 42,
                    'max_iter': 2000
                },
                'param_grid': {
                    'alpha': [0.001, 0.01, 0.1, 1.0],
                    'max_iter': [1000, 2000, 3000]
                }
            }
        }

        return configs

    def load_data(self, street_view_path, remote_sensing_path, noise_labels_path):
        """加载数据"""
        street_features = np.load(street_view_path)
        remote_features = np.load(remote_sensing_path)
        noise_labels = np.load(noise_labels_path)

        print(f"Street view features shape: {street_features.shape}")
        print(f"Remote sensing features shape: {remote_features.shape}")
        print(f"Noise labels shape: {noise_labels.shape}")

        if noise_labels.shape[1] != 4:
            raise ValueError(f"Noise labels should have 4 columns, but got {noise_labels.shape[1]}")

        # 生成特征名称
        self.feature_names['street'] = [f'street_feat_{i}' for i in range(street_features.shape[1])]
        self.feature_names['remote'] = [f'remote_feat_{i}' for i in range(remote_features.shape[1])]
        self.feature_names['fusion'] = self.feature_names['street'] + self.feature_names['remote']

        # 拼接特征用于融合模型
        fusion_features = np.concatenate([street_features, remote_features], axis=1)

        return street_features, remote_features, fusion_features, noise_labels

    def advanced_preprocessing(self, X_train, X_test, y_train):
        """Filter outlier rows, robust-scale the features and optionally transform the target.

        Steps:
            1. Drop training rows in which any feature has |z-score| >= 4.
            2. Fit a ``RobustScaler`` on the remaining training rows and apply
               it to both the training and the test features.
            3. If the (filtered) target has skewness > 1, fit a Yeo-Johnson
               ``PowerTransformer`` on it and use the transformed target.

        Args:
            X_train: Training feature matrix.
            X_test: Test/validation feature matrix (only scaled, never filtered).
            y_train: Training target vector.

        Returns:
            tuple: ``(X_train_scaled, X_test_scaled, y_train_transformed, scaler,
            target_transformer)``. ``target_transformer`` is ``None`` unless a
            transformer was fitted successfully, so callers can rely on it to
            decide whether predictions need ``inverse_transform``.
        """
        # 1. Outlier detection and removal (best effort)
        outlier_threshold = 4
        X_train_clean = X_train
        y_train_clean = y_train
        try:
            z_scores = np.abs(stats.zscore(X_train, axis=0, nan_policy='omit'))
            # A constant column yields NaN z-scores. NaN compares False here, so
            # such columns never flag a row; testing `(z < threshold).all()`
            # instead would discard every row as soon as one column is constant.
            keep_mask = ~(z_scores >= outlier_threshold).any(axis=1)

            # Filter only when something is removed and something is left.
            if keep_mask.any() and not keep_mask.all():
                # Build both filtered arrays before rebinding so that X and y
                # cannot get out of sync if one of the lookups fails.
                filtered = (X_train[keep_mask], y_train[keep_mask])
                X_train_clean, y_train_clean = filtered
        except Exception as e:
            print(f"    Outlier filtering skipped: {e}")

        # 2. RobustScaler fitted on the cleaned training rows only
        scaler = RobustScaler()
        X_train_scaled = scaler.fit_transform(X_train_clean)
        X_test_scaled = scaler.transform(X_test)

        # 3. Target transformation for strongly right-skewed targets
        target_transformer = None
        y_train_transformed = y_train_clean
        try:
            if len(y_train_clean) > 0 and skew(y_train_clean) > 1:
                candidate = PowerTransformer(method='yeo-johnson')
                y_train_transformed = candidate.fit_transform(
                    y_train_clean.reshape(-1, 1)
                ).flatten()
                # Expose the transformer only after a successful fit; otherwise
                # callers would inverse-transform predictions made on the raw target.
                target_transformer = candidate
        except Exception as e:
            print(f"    Target transformation skipped: {e}")
            y_train_transformed = y_train_clean

        return X_train_scaled, X_test_scaled, y_train_transformed, scaler, target_transformer

    def advanced_feature_engineering(self, X_train, X_test, y_train):
        """高级特征工程"""
        # 1. 移除常数特征
        constant_features = []
        for i in range(X_train.shape[1]):
            if np.var(X_train[:, i]) < 1e-10:
                constant_features.append(i)
        
        if constant_features:
            X_train = np.delete(X_train, constant_features, axis=1)
            X_test = np.delete(X_test, constant_features, axis=1)
        
        # 2. 特征选择
        feature_selector = None
        n_features_to_select = min(300, X_train.shape[1])
        if X_train.shape[1] > n_features_to_select:
            feature_selector = SelectKBest(score_func=f_regression, k=n_features_to_select)
            X_train_selected = feature_selector.fit_transform(X_train, y_train)
            X_test_selected = feature_selector.transform(X_test)
        else:
            X_train_selected = X_train
            X_test_selected = X_test
        
        # 3. PCA降维
        pca_transformer = None
        if X_train_selected.shape[1] > 100:
            try:
                pca_transformer = PCA(n_components=0.95, random_state=42)
                X_train_pca = pca_transformer.fit_transform(X_train_selected)
                X_test_pca = pca_transformer.transform(X_test_selected)
            except Exception as e:
                X_train_pca = X_train_selected
                X_test_pca = X_test_selected
                pca_transformer = None
        else:
            X_train_pca = X_train_selected
            X_test_pca = X_test_selected
        
        return X_train_pca, X_test_pca, feature_selector, pca_transformer

    def k_fold_cross_validation_all_models(self, features_dict, labels, k=5, optimize_hyperparams=False, save_best_models=True):
        """Cross-validate every base model and both ensembles per noise target and feature set.

        For each noise type (one column of ``labels``) and each feature type,
        every model in ``self.model_configs`` is evaluated with shuffled K-fold
        CV. If at least two base models produced scores, the Voting and
        Stacking ensembles are cross-validated too. For every model the
        estimator and preprocessors of the fold with the highest R² are stored
        in ``self.best_models[noise_type][feature_type][model_name]``.

        Args:
            features_dict: Mapping with the keys 'street', 'remote' and
                'fusion', each holding a feature matrix.
            labels: 2-D array; column ``i`` is the target of ``self.noise_types[i]``.
            k: Number of CV folds.
            optimize_hyperparams: If True, the estimator of the first fold is
                obtained from ``optimize_hyperparameters``.
            save_best_models: If True, ``save_all_models()`` is called at the end.

        Returns:
            pandas.DataFrame: One row per (noise type, feature type, model)
            with mean/std of R², RMSE, MAE and MSE. The same table is written
            to 'comprehensive_kfold_results.csv'.
        """
        # Derive the counts from the configuration so the banner cannot go stale.
        n_models = len(self.model_configs)
        print(f"\n{'='*120}")
        print(f"          K-FOLD CROSS-VALIDATION FOR ALL {n_models} MODELS + ENSEMBLES (K={k})")
        print(f"{'='*120}")
        print(f"Models: {', '.join(self.model_configs)}")
        print(f"Ensembles: Voting ({n_models} models), Stacking ({n_models} models + Lasso)")

        all_results = []

        # Read all three matrices up front so a missing key fails before any training.
        street_features = features_dict['street']
        remote_features = features_dict['remote']
        fusion_features = features_dict['fusion']
        features_by_type = {'street': street_features, 'remote': remote_features}

        for noise_idx, noise_type in enumerate(self.noise_types):
            print(f"\n{'='*100}")
            print(f"NOISE TYPE: {noise_type}")
            print(f"{'='*100}")

            y_full = labels[:, noise_idx]
            models_for_noise = self.best_models.setdefault(noise_type, {})

            for feature_type in self.feature_types:
                feature_label = self.feature_names_display[feature_type]
                print(f"\n{'-'*100}")
                print(f"Feature Type: {feature_label}")
                print(f"{'-'*100}")

                # Any feature type other than 'street'/'remote' uses the fused matrix.
                X_full = features_by_type.get(feature_type, fusion_features)
                models_for_feature = models_for_noise.setdefault(feature_type, {})

                # K-fold validation of every base model
                n_scored_models = 0
                for model_name in self.model_configs:
                    print(f"\n  Training {model_name}...")
                    cv_run = self._cv_run_base_model(
                        X_full, y_full, k, model_name, optimize_hyperparams
                    )
                    fold_scores = cv_run['scores']
                    if not fold_scores['R²']:
                        # Every fold failed; nothing to report or store.
                        continue

                    summary = self._cv_summarize_scores(fold_scores)
                    print(f"\n  {model_name} {k}-Fold CV Results:")
                    print(f"    R²:   {summary['mean_r2']:.4f} ± {summary['std_r2']:.4f}")
                    print(f"    RMSE: {summary['mean_rmse']:.4f} ± {summary['std_rmse']:.4f}")
                    print(f"    MAE:  {summary['mean_mae']:.4f} ± {summary['std_mae']:.4f}")

                    all_results.append(
                        self._cv_result_row(noise_type, feature_label, model_name, 'Base', summary)
                    )

                    # Keep the model of the best fold (by R²). The per-fold
                    # lists are index-aligned, see _cv_run_base_model.
                    best_fold_idx = int(np.argmax(fold_scores['R²']))
                    models_for_feature[model_name] = {
                        'model': cv_run['models'][best_fold_idx],
                        'preprocessors': cv_run['preprocessors'][best_fold_idx],
                        'cv_scores': fold_scores,
                        'mean_r2': summary['mean_r2'],
                        'mean_rmse': summary['mean_rmse'],
                        'mean_mae': summary['mean_mae']
                    }
                    n_scored_models += 1

                # Ensembles are evaluated only when at least two base models worked.
                if n_scored_models >= 2:
                    print(f"\n  Creating Ensemble Models...")
                    ensemble_runners = (
                        ('Voting', self._k_fold_ensemble_voting),
                        ('Stacking', self._k_fold_ensemble_stacking),
                    )
                    for ensemble_name, run_ensemble in ensemble_runners:
                        print(f"\n  Training {ensemble_name} Ensemble...")
                        ensemble_result = run_ensemble(X_full, y_full, k, noise_type, feature_type)
                        if ensemble_result:
                            all_results.append(
                                self._cv_result_row(
                                    noise_type, feature_label, ensemble_name,
                                    'Ensemble', ensemble_result['scores']
                                )
                            )
                            models_for_feature[ensemble_name] = ensemble_result

        # Results table
        results_df = pd.DataFrame(all_results)

        # Display the results
        self._display_kfold_results(results_df)

        # Persist the results
        results_df.to_csv('comprehensive_kfold_results.csv', index=False)
        print(f"\n{'='*120}")
        print(f"Results saved to 'comprehensive_kfold_results.csv'")

        # Persist all best models
        if save_best_models:
            self.save_all_models()

        print(f"{'='*120}")

        return results_df

    def _cv_run_base_model(self, X_full, y_full, k, model_name, optimize_hyperparams):
        """Run K-fold CV for one base model; return index-aligned per-fold scores, models and preprocessors."""
        config = self.model_configs[model_name]
        kf = KFold(n_splits=k, shuffle=True, random_state=42)
        fold_scores = {'R²': [], 'RMSE': [], 'MAE': [], 'MSE': []}
        fold_models = []
        fold_preprocessors = []

        for fold_num, (train_index, val_index) in enumerate(kf.split(X_full), 1):
            X_train_fold = X_full[train_index]
            X_val_fold = X_full[val_index]
            y_train_fold = y_full[train_index]
            y_val_fold = y_full[val_index]

            try:
                # Preprocessing
                X_train_scaled, X_val_scaled, y_train_transformed, scaler, target_transformer = \
                    self.advanced_preprocessing(X_train_fold, X_val_fold, y_train_fold)

                # Feature engineering
                X_train_final, X_val_final, feature_selector, pca_transformer = \
                    self.advanced_feature_engineering(X_train_scaled, X_val_scaled, y_train_transformed)

                # NOTE(review): only the first fold uses tuned hyperparameters;
                # the remaining folds use the configured defaults, so the CV
                # mean mixes both settings — confirm this is intended.
                if optimize_hyperparams and fold_num == 1:
                    model = self.optimize_hyperparameters(
                        X_train_final, y_train_transformed, model_name, n_iter=10
                    )
                else:
                    model = config['class'](**config['params'])

                model.fit(X_train_final, y_train_transformed)

                # Predict and map back to the original target scale
                y_pred = model.predict(X_val_final)
                if target_transformer is not None:
                    y_pred = target_transformer.inverse_transform(
                        y_pred.reshape(-1, 1)
                    ).flatten()

                # Metrics
                r2 = r2_score(y_val_fold, y_pred)
                mse = mean_squared_error(y_val_fold, y_pred)
                rmse = np.sqrt(mse)
                mae = mean_absolute_error(y_val_fold, y_pred)
            except Exception as e:
                print(f"    Fold {fold_num}/{k} failed: {e}")
                continue

            # Record the fold only after everything succeeded. Appending the
            # model before prediction/metrics would shift the model list
            # against the score list whenever a later step fails, and the
            # argmax over the scores would then pick the wrong model.
            fold_scores['R²'].append(r2)
            fold_scores['RMSE'].append(rmse)
            fold_scores['MAE'].append(mae)
            fold_scores['MSE'].append(mse)
            fold_models.append(model)
            fold_preprocessors.append({
                'scaler': scaler,
                'target_transformer': target_transformer,
                'feature_selector': feature_selector,
                'pca_transformer': pca_transformer
            })

            print(f"    Fold {fold_num}/{k}: R²={r2:.4f}, RMSE={rmse:.4f}, MAE={mae:.4f}")

        return {
            'scores': fold_scores,
            'models': fold_models,
            'preprocessors': fold_preprocessors
        }

    def _cv_summarize_scores(self, fold_scores):
        """Return mean and standard deviation of every metric in a per-fold score dict."""
        return {
            'mean_r2': np.mean(fold_scores['R²']),
            'std_r2': np.std(fold_scores['R²']),
            'mean_rmse': np.mean(fold_scores['RMSE']),
            'std_rmse': np.std(fold_scores['RMSE']),
            'mean_mae': np.mean(fold_scores['MAE']),
            'std_mae': np.std(fold_scores['MAE']),
            'mean_mse': np.mean(fold_scores['MSE']),
            'std_mse': np.std(fold_scores['MSE'])
        }

    def _cv_result_row(self, noise_type, feature_label, model_name, model_type, summary):
        """Build one row of the results table from a mean/std summary dict."""
        return {
            'Noise Type': noise_type,
            'Feature Type': feature_label,
            'Model': model_name,
            'Model Type': model_type,
            'CV R² Mean': summary['mean_r2'],
            'CV R² Std': summary['std_r2'],
            'CV RMSE Mean': summary['mean_rmse'],
            'CV RMSE Std': summary['std_rmse'],
            'CV MAE Mean': summary['mean_mae'],
            'CV MAE Std': summary['std_mae'],
            'CV MSE Mean': summary['mean_mse'],
            'CV MSE Std': summary['std_mse']
        }

    def _k_fold_ensemble_voting(self, X_full, y_full, k, noise_type, feature_type):
        """Cross-validate a ``VotingRegressor`` built from all configured base models.

        Args:
            X_full: Feature matrix.
            y_full: Target vector.
            k: Number of CV folds.
            noise_type: Not used by this method; kept for a uniform call signature.
            feature_type: Not used by this method; kept for a uniform call signature.

        Returns:
            dict or None: ``None`` when every fold failed. Otherwise a dict with
            'model' and 'preprocessors' (taken from the fold with the highest
            R²), 'cv_scores' (per-fold metric lists) and 'scores' (mean/std of
            R², RMSE, MAE and MSE).
        """
        kf = KFold(n_splits=k, shuffle=True, random_state=42)
        fold_scores = {'R²': [], 'RMSE': [], 'MAE': [], 'MSE': []}
        fold_models = []
        fold_preprocessors = []

        for fold_num, (train_index, val_index) in enumerate(kf.split(X_full), 1):
            X_train_fold = X_full[train_index]
            X_val_fold = X_full[val_index]
            y_train_fold = y_full[train_index]
            y_val_fold = y_full[val_index]

            try:
                # Preprocessing and feature engineering fitted on this fold's training part
                X_train_scaled, X_val_scaled, y_train_transformed, scaler, target_transformer = \
                    self.advanced_preprocessing(X_train_fold, X_val_fold, y_train_fold)

                X_train_final, X_val_final, feature_selector, pca_transformer = \
                    self.advanced_feature_engineering(X_train_scaled, X_val_scaled, y_train_transformed)

                # One fresh estimator per configured base model
                estimators = [
                    (model_name, config['class'](**config['params']))
                    for model_name, config in self.model_configs.items()
                ]

                voting_model = VotingRegressor(estimators=estimators)
                voting_model.fit(X_train_final, y_train_transformed)

                # Predict and map back to the original target scale
                y_pred = voting_model.predict(X_val_final)
                if target_transformer is not None:
                    y_pred = target_transformer.inverse_transform(
                        y_pred.reshape(-1, 1)
                    ).flatten()

                # Metrics
                r2 = r2_score(y_val_fold, y_pred)
                mse = mean_squared_error(y_val_fold, y_pred)
                rmse = np.sqrt(mse)
                mae = mean_absolute_error(y_val_fold, y_pred)
            except Exception as e:
                print(f"    Fold {fold_num}/{k} failed: {e}")
                continue

            # Record the fold only once it fully succeeded so that the model,
            # preprocessor and score lists stay index-aligned; otherwise the
            # argmax over the scores below could select the wrong fold's model.
            fold_scores['R²'].append(r2)
            fold_scores['RMSE'].append(rmse)
            fold_scores['MAE'].append(mae)
            fold_scores['MSE'].append(mse)
            fold_models.append(voting_model)
            fold_preprocessors.append({
                'scaler': scaler,
                'target_transformer': target_transformer,
                'feature_selector': feature_selector,
                'pca_transformer': pca_transformer
            })

            print(f"    Fold {fold_num}/{k}: R²={r2:.4f}, RMSE={rmse:.4f}, MAE={mae:.4f}")

        if not fold_scores['R²']:
            return None

        mean_r2 = np.mean(fold_scores['R²'])
        std_r2 = np.std(fold_scores['R²'])
        mean_rmse = np.mean(fold_scores['RMSE'])
        std_rmse = np.std(fold_scores['RMSE'])
        mean_mae = np.mean(fold_scores['MAE'])
        std_mae = np.std(fold_scores['MAE'])

        print(f"\n  Voting Ensemble {k}-Fold CV Results:")
        print(f"    R²:   {mean_r2:.4f} ± {std_r2:.4f}")
        print(f"    RMSE: {mean_rmse:.4f} ± {std_rmse:.4f}")
        print(f"    MAE:  {mean_mae:.4f} ± {std_mae:.4f}")

        # Return the model of the best fold (by R²)
        best_fold_idx = int(np.argmax(fold_scores['R²']))

        return {
            'model': fold_models[best_fold_idx],
            'preprocessors': fold_preprocessors[best_fold_idx],
            'cv_scores': fold_scores,
            'scores': {
                'mean_r2': mean_r2,
                'std_r2': std_r2,
                'mean_rmse': mean_rmse,
                'std_rmse': std_rmse,
                'mean_mae': mean_mae,
                'std_mae': std_mae,
                'mean_mse': np.mean(fold_scores['MSE']),
                'std_mse': np.std(fold_scores['MSE'])
            }
        }

    def _k_fold_ensemble_stacking(self, X_full, y_full, k, noise_type, feature_type):
        """Cross-validate a ``StackingRegressor`` built from all configured base models.

        The stack uses a ``Lasso(alpha=1.0)`` final estimator and an internal
        5-fold CV (``cv=5``) to produce the meta-features.

        Args:
            X_full: Feature matrix.
            y_full: Target vector.
            k: Number of outer CV folds.
            noise_type: Not used by this method; kept for a uniform call signature.
            feature_type: Not used by this method; kept for a uniform call signature.

        Returns:
            dict or None: ``None`` when every fold failed. Otherwise a dict with
            'model' and 'preprocessors' (taken from the fold with the highest
            R²), 'cv_scores' (per-fold metric lists) and 'scores' (mean/std of
            R², RMSE, MAE and MSE).
        """
        kf = KFold(n_splits=k, shuffle=True, random_state=42)
        fold_scores = {'R²': [], 'RMSE': [], 'MAE': [], 'MSE': []}
        fold_models = []
        fold_preprocessors = []

        for fold_num, (train_index, val_index) in enumerate(kf.split(X_full), 1):
            X_train_fold = X_full[train_index]
            X_val_fold = X_full[val_index]
            y_train_fold = y_full[train_index]
            y_val_fold = y_full[val_index]

            try:
                # Preprocessing and feature engineering fitted on this fold's training part
                X_train_scaled, X_val_scaled, y_train_transformed, scaler, target_transformer = \
                    self.advanced_preprocessing(X_train_fold, X_val_fold, y_train_fold)

                X_train_final, X_val_final, feature_selector, pca_transformer = \
                    self.advanced_feature_engineering(X_train_scaled, X_val_scaled, y_train_transformed)

                # One fresh estimator per configured base model
                estimators = [
                    (model_name, config['class'](**config['params']))
                    for model_name, config in self.model_configs.items()
                ]

                stacking_model = StackingRegressor(
                    estimators=estimators,
                    final_estimator=Lasso(alpha=1.0, random_state=42),
                    cv=5,
                    n_jobs=-1
                )
                stacking_model.fit(X_train_final, y_train_transformed)

                # Predict and map back to the original target scale
                y_pred = stacking_model.predict(X_val_final)
                if target_transformer is not None:
                    y_pred = target_transformer.inverse_transform(
                        y_pred.reshape(-1, 1)
                    ).flatten()

                # Metrics
                r2 = r2_score(y_val_fold, y_pred)
                mse = mean_squared_error(y_val_fold, y_pred)
                rmse = np.sqrt(mse)
                mae = mean_absolute_error(y_val_fold, y_pred)
            except Exception as e:
                print(f"    Fold {fold_num}/{k} failed: {e}")
                continue

            # Record the fold only once it fully succeeded so that the model,
            # preprocessor and score lists stay index-aligned; otherwise the
            # argmax over the scores below could select the wrong fold's model.
            fold_scores['R²'].append(r2)
            fold_scores['RMSE'].append(rmse)
            fold_scores['MAE'].append(mae)
            fold_scores['MSE'].append(mse)
            fold_models.append(stacking_model)
            fold_preprocessors.append({
                'scaler': scaler,
                'target_transformer': target_transformer,
                'feature_selector': feature_selector,
                'pca_transformer': pca_transformer
            })

            print(f"    Fold {fold_num}/{k}: R²={r2:.4f}, RMSE={rmse:.4f}, MAE={mae:.4f}")

        if not fold_scores['R²']:
            return None

        mean_r2 = np.mean(fold_scores['R²'])
        std_r2 = np.std(fold_scores['R²'])
        mean_rmse = np.mean(fold_scores['RMSE'])
        std_rmse = np.std(fold_scores['RMSE'])
        mean_mae = np.mean(fold_scores['MAE'])
        std_mae = np.std(fold_scores['MAE'])

        print(f"\n  Stacking Ensemble {k}-Fold CV Results:")
        print(f"    R²:   {mean_r2:.4f} ± {std_r2:.4f}")
        print(f"    RMSE: {mean_rmse:.4f} ± {std_rmse:.4f}")
        print(f"    MAE:  {mean_mae:.4f} ± {std_mae:.4f}")

        # Return the model of the best fold (by R²)
        best_fold_idx = int(np.argmax(fold_scores['R²']))

        return {
            'model': fold_models[best_fold_idx],
            'preprocessors': fold_preprocessors[best_fold_idx],
            'cv_scores': fold_scores,
            'scores': {
                'mean_r2': mean_r2,
                'std_r2': std_r2,
                'mean_rmse': mean_rmse,
                'std_rmse': std_rmse,
                'mean_mae': mean_mae,
                'std_mae': std_mae,
                'mean_mse': np.mean(fold_scores['MSE']),
                'std_mse': np.std(fold_scores['MSE'])
            }
        }

    def optimize_hyperparameters(self, X_train, y_train, model_name, n_iter=10):
        """超参数优化"""
        if 'param_grid' not in self.model_configs[model_name]:
            return self.model_configs[model_name]['class'](
                **self.model_configs[model_name]['params']
            )
        
        model_class = self.model_configs[model_name]['class']
        param_grid = self.model_configs[model_name]['param_grid']
        
        # 创建基础模型
        if model_name in ['XGBoost', 'LightGBM']:
            base_params = {'random_state': 42, 'n_jobs': -1}
            if model_name == 'XGBoost':
                base_params['verbosity'] = 0
            else:
                base_params['verbose'] = -1
            base_model = model_class(**base_params)
        elif model_name == 'KNN':
            base_model = model_class(n_jobs=-1)
        elif model_name == 'Lasso':
            base_model = model_class(random_state=42)
        else:
            base_model = model_class(random_state=42)
        
        random_search = RandomizedSearchCV(
            base_model,
            param_grid,
            n_iter=n_iter,
            cv=3,
            scoring='r2',
            n_jobs=-1,
            random_state=42,
            verbose=0
        )
        
        try:
            random_search.fit(X_train, y_train)
            return random_search.best_estimator_
        except Exception as e:
            return self.model_configs[model_name]['class'](
                **self.model_configs[model_name]['params']
            )

    def save_all_models(self, save_dir='saved_models'):
        """保存所有最佳模型和预处理器"""
        print(f"\n{'='*120}")
        print("SAVING ALL BEST MODELS...")
        print(f"{'='*120}")
        
        # 创建主目录
        os.makedirs(save_dir, exist_ok=True)
        
        saved_count = 0
        
        for noise_type in self.noise_types:
            if noise_type not in self.best_models:
                continue
                
            # 创建噪声类型目录
            noise_dir = os.path.join(save_dir, noise_type.replace(' ', '_'))
            os.makedirs(noise_dir, exist_ok=True)
            
            for feature_type in self.feature_types:
                if feature_type not in self.best_models[noise_type]:
                    continue
                
                # 创建特征类型目录
                feature_dir = os.path.join(noise_dir, feature_type)
                os.makedirs(feature_dir, exist_ok=True)
                
                for model_name, model_data in self.best_models[noise_type][feature_type].items():
                    try:
                        # 保存模型
                        model_path = os.path.join(feature_dir, f'{model_name}_model.pkl')
                        joblib.dump(model_data['model'], model_path)
                        
                        # 保存预处理器
                        preprocessor_path = os.path.join(feature_dir, f'{model_name}_preprocessors.pkl')
                        joblib.dump(model_data['preprocessors'], preprocessor_path)
                        
                        # 保存模型信息
                        info = {
                            'noise_type': noise_type,
                            'feature_type': feature_type,
                            'model_name': model_name,
                            'mean_r2': model_data.get('mean_r2', model_data.get('scores', {}).get('mean_r2', 0)),
                            'mean_rmse': model_data.get('mean_rmse', model_data.get('scores', {}).get('mean_rmse', 0)),
                            'mean_mae': model_data.get('mean_mae', model_data.get('scores', {}).get('mean_mae', 0))
                        }
                        info_path = os.path.join(feature_dir, f'{model_name}_info.pkl')
                        joblib.dump(info, info_path)
                        
                        saved_count += 1
                        print(f"  ✓ Saved: {noise_type} - {feature_type} - {model_name}")
                        
                    except Exception as e:
                        print(f"  ✗ Failed to save {noise_type} - {feature_type} - {model_name}: {e}")
        
        # 保存完整的predictor对象
        try:
            predictor_path = os.path.join(save_dir, 'predictor_complete.pkl')
            joblib.dump(self, predictor_path)
            print(f"\n  ✓ Saved complete predictor object")
        except Exception as e:
            print(f"\n  ✗ Failed to save complete predictor: {e}")
        
        print(f"\n{'='*120}")
        print(f"Successfully saved {saved_count} models to '{save_dir}' directory")
        print(f"{'='*120}")

    def load_saved_model(self, noise_type, feature_type, model_name, save_dir='saved_models'):
        """Load one saved model with its preprocessors and metrics summary.

        Reads ``<model_name>_model.pkl``, ``<model_name>_preprocessors.pkl``
        and ``<model_name>_info.pkl`` from
        ``<save_dir>/<noise_type with spaces replaced by underscores>/<feature_type>/``
        and prints the stored R², RMSE and MAE.

        NOTE: ``joblib.load`` unpickles the files, so only load artifacts
        from a trusted source.

        Args:
            noise_type: Noise target name, e.g. ``'Mean Noise'``.
            feature_type: Feature-set name used as the sub-directory.
            model_name: Model name used as the file-name prefix.
            save_dir: Root directory the artifacts were written to.

        Returns:
            Tuple ``(model, preprocessors, info)``.
        """
        feature_dir = os.path.join(save_dir, noise_type.replace(' ', '_'), feature_type)

        def load_artifact(kind):
            return joblib.load(os.path.join(feature_dir, f'{model_name}_{kind}.pkl'))

        # Same read order as the files were written: model, preprocessors, info
        model = load_artifact('model')
        preprocessors = load_artifact('preprocessors')
        info = load_artifact('info')

        print(f"Loaded model: {noise_type} - {feature_type} - {model_name}")
        for label, key in (('R²', 'mean_r2'), ('RMSE', 'mean_rmse'), ('MAE', 'mean_mae')):
            print(f"  {label}: {info[key]:.4f}")

        return model, preprocessors, info

    def predict_with_saved_model(self, X_new, noise_type, feature_type, model_name, save_dir='saved_models'):
        """使用保存的模型进行预测"""
        # 加载模型和预处理器
        model, preprocessors, info = self.load_saved_model(
            noise_type, feature_type, model_name, save_dir
        )
        
        # 应用预处理
        X_processed = X_new
        
        if preprocessors['scaler'] is not None:
            X_processed = preprocessors['scaler'].transform(X_processed)
        
        if preprocessors['feature_selector'] is not None:
            X_processed = preprocessors['feature_selector'].transform(X_processed)
        
        if preprocessors['pca_transformer'] is not None:
            X_processed = preprocessors['pca_transformer'].transform(X_processed)
        
        # 预测
        y_pred = model.predict(X_processed)
        
        # 逆变换
        if preprocessors['target_transformer'] is not None:
            y_pred = preprocessors['target_transformer'].inverse_transform(
                y_pred.reshape(-1, 1)
            ).flatten()
        
        return y_pred

    @staticmethod
    def load_complete_predictor(save_dir='saved_models'):
        """Restore the whole predictor object from ``<save_dir>/predictor_complete.pkl``.

        NOTE: ``joblib.load`` unpickles the file, so only load artifacts
        from a trusted source.

        Args:
            save_dir: Directory containing ``predictor_complete.pkl``.

        Returns:
            The object stored in that file.
        """
        restored = joblib.load(os.path.join(save_dir, 'predictor_complete.pkl'))
        print(f"Loaded complete predictor from '{save_dir}'")
        return restored

    def _display_kfold_results(self, results_df):
        """显示K折验证结果"""
        print(f"\n{'='*120}")
        print("                          K-FOLD CROSS-VALIDATION RESULTS SUMMARY")
        print(f"{'='*120}")
        
        # 按噪声类型分组显示
        for noise_type in self.noise_types:
            print(f"\n{'='*100}")
            print(f"NOISE TYPE: {noise_type}")
            print(f"{'='*100}")
            
            subset = results_df[results_df['Noise Type'] == noise_type]
            
            if not subset.empty:
                # 按R²排序并显示前10名
                top_models = subset.nlargest(10, 'CV R² Mean')
                
                print(f"\nTop 10 Models (by R² Score):")
                print("-" * 100)
                for idx, row in top_models.iterrows():
                    print(f"{row['Model']:20} ({row['Model Type']:10}) + {row['Feature Type']:15}")
                    print(f"  R²:   {row['CV R² Mean']:.4f} ± {row['CV R² Std']:.4f}")
                    print(f"  RMSE: {row['CV RMSE Mean']:.4f} ± {row['CV RMSE Std']:.4f}")
                    print(f"  MAE:  {row['CV MAE Mean']:.4f} ± {row['CV MAE Std']:.4f}")
                    print("-" * 100)
        
        # 显示整体最佳模型
        print(f"\n{'='*120}")
        print("                          OVERALL TOP 10 BEST MODELS")
        print(f"{'='*120}")
        
        overall_best = results_df.nlargest(10, 'CV R² Mean')
        for idx, row in overall_best.iterrows():
            print(f"\n{row['Model']:20} ({row['Model Type']:10})")
            print(f"  Noise: {row['Noise Type']}, Features: {row['Feature Type']}")
            print(f"  R²:   {row['CV R² Mean']:.4f} ± {row['CV R² Std']:.4f}")
            print(f"  RMSE: {row['CV RMSE Mean']:.4f} ± {row['CV RMSE Std']:.4f}")
            print(f"  MAE:  {row['CV MAE Mean']:.4f} ± {row['CV MAE Std']:.4f}")
        
        print(f"\n{'='*120}")


# ============ Main entry point ============
# Trains and evaluates every model on the 25 m noise labels, then shows how
# the saved artifacts can be loaded again.
if __name__ == "__main__":
    # Build the predictor
    predictor = ComprehensiveKFoldNoisePredictor()

    # Input file paths
    street_view_path = './revise_data/features_dinov3-vitb16-pretrain-lvd1689m.npy'
    remote_sensing_path = './revise_data/features_dinov3-vitl16-pretrain-sat493m.npy'
    noise_labels_path = './revise_data/noise_labels_25.npy'

    # Fold count, bound once so the summary below cannot drift from the
    # value actually used for cross-validation.
    n_folds = 5

    try:
        # Load the data
        print("Loading data...")
        street_features, remote_features, fusion_features, labels = predictor.load_data(
            street_view_path, remote_sensing_path, noise_labels_path
        )
        
        # Feature sets keyed by feature type
        features_dict = {
            'street': street_features,
            'remote': remote_features,
            'fusion': fusion_features
        }
        
        # Run K-fold cross-validation and save the models
        print("\nStarting comprehensive K-fold cross-validation...")
        results_df = predictor.k_fold_cross_validation_all_models(
            features_dict=features_dict,
            labels=labels,
            k=n_folds,
            optimize_hyperparams=False,  # set to True to enable hyperparameter search (slower)
            save_best_models=True  # save all best models automatically
        )
        
        print(f"\n{'='*120}")
        print("                           EXPERIMENT COMPLETED SUCCESSFULLY")
        print(f"{'='*120}")
        print("\nSummary:")
        print("• Total models evaluated: 9 (7 base models + 2 ensemble models)")
        print("• Base models: XGBoost, LightGBM, Random Forest, Gradient Boosting, SVR, KNN, Lasso")
        print("• Ensemble models: Voting (7 models), Stacking (7 models + Lasso)")
        print("• Feature types: Street View, Remote Sensing, Fusion")
        print("• Noise types: Mean Noise, Low Freq, Mid Freq, High Freq")
        print(f"• Cross-validation: {n_folds}-fold")
        print("• Metrics: R², RMSE, MAE, MSE (all with mean ± std)")
        print("• All models saved to 'saved_models/' directory")
        print("\nResults saved to 'comprehensive_kfold_results.csv'")
        
        # ============ Demo: how to use the saved models ============
        print(f"\n{'='*120}")
        print("DEMONSTRATION: How to Load and Use Saved Models")
        print(f"{'='*120}")
        
        # Example 1: load a single model
        print("\n1. Loading a single model:")
        try:
            model, preprocessors, info = predictor.load_saved_model(
                noise_type='Mean Noise',
                feature_type='fusion',
                model_name='XGBoost',
                save_dir='saved_models'
            )
        except FileNotFoundError:
            # Only a missing artifact means "not trained yet". Any other
            # failure (corrupt file, incompatible pickle, ...) is left to
            # the outer handler, which prints the traceback.
            print("  (Model not found - will be available after training)")
        
        # Example 2: predict with a saved model
        print("\n2. Making predictions with a saved model:")
        print("  predictions = predictor.predict_with_saved_model(")
        print("      X_new=new_data,")
        print("      noise_type='Mean Noise',")
        print("      feature_type='fusion',")
        print("      model_name='XGBoost'")
        print("  )")
        
        # Example 3: load the complete predictor
        print("\n3. Loading complete predictor:")
        print("  loaded_predictor = ComprehensiveKFoldNoisePredictor.load_complete_predictor('saved_models')")
        
    except FileNotFoundError as e:
        print(f"File not found: {e}")
        print("Please ensure the following files exist:")
        print(f"- {street_view_path}")
        print(f"- {remote_sensing_path}")
        print(f"- {noise_labels_path}")
    except Exception as e:
        print(f"Error occurred: {e}")
        import traceback
        traceback.print_exc()

Loading data...
Street view features shape: (923, 768)
Remote sensing features shape: (923, 1024)
Noise labels shape: (923, 4)

Starting comprehensive K-fold cross-validation...

          K-FOLD CROSS-VALIDATION FOR ALL 8 MODELS + ENSEMBLES (K=5)
Models: XGBoost, LightGBM, Random Forest, Gradient Boosting, SVR, KNN, Lasso
Ensembles: Voting (7 models), Stacking (7 models + Lasso)

NOISE TYPE: Mean Noise

----------------------------------------------------------------------------------------------------
Feature Type: Street View
----------------------------------------------------------------------------------------------------

  Training XGBoost...
    Fold 1/5: R²=0.3822, RMSE=2.8377, MAE=2.3427
    Fold 2/5: R²=0.2627, RMSE=3.1293, MAE=2.4382
    Fold 3/5: R²=0.2944, RMSE=3.3126, MAE=2.4554
    Fold 4/5: R²=0.2802, RMSE=3.1211, MAE=2.4876
    Fold 5/5: R²=0.2302, RMSE=3.4768, MAE=2.7839

  XGBoost 5-Fold CV Results:
    R²:   0.2900 ± 0.0509
    RMSE: 3.1755 ± 0.2139
    MAE:  2.50

In [2]:
# Notebook cell: repeat the experiment on the 50 m noise labels.
# Build the predictor
predictor = ComprehensiveKFoldNoisePredictor()

# Input file paths
street_view_path = './revise_data/features_dinov3-vitb16-pretrain-lvd1689m.npy'
remote_sensing_path = './revise_data/features_dinov3-vitl16-pretrain-sat493m.npy'
noise_labels_path = './revise_data/noise_labels_50.npy'

# Fold count, bound once so the summary below cannot drift from the value
# actually used for cross-validation.
n_folds = 5

try:
    # Load the data
    print("Loading data...")
    street_features, remote_features, fusion_features, labels = predictor.load_data(
        street_view_path, remote_sensing_path, noise_labels_path
    )
    
    # Feature sets keyed by feature type
    features_dict = {
        'street': street_features,
        'remote': remote_features,
        'fusion': fusion_features
    }
    
    # Run K-fold cross-validation and save the models
    print("\nStarting comprehensive K-fold cross-validation...")
    results_df = predictor.k_fold_cross_validation_all_models(
        features_dict=features_dict,
        labels=labels,
        k=n_folds,
        optimize_hyperparams=False,  # set to True to enable hyperparameter search (slower)
        save_best_models=True  # save all best models automatically
    )
    
    print(f"\n{'='*120}")
    print("                           EXPERIMENT COMPLETED SUCCESSFULLY")
    print(f"{'='*120}")
    print("\nSummary:")
    print("• Total models evaluated: 9 (7 base models + 2 ensemble models)")
    print("• Base models: XGBoost, LightGBM, Random Forest, Gradient Boosting, SVR, KNN, Lasso")
    print("• Ensemble models: Voting (7 models), Stacking (7 models + Lasso)")
    print("• Feature types: Street View, Remote Sensing, Fusion")
    print("• Noise types: Mean Noise, Low Freq, Mid Freq, High Freq")
    print(f"• Cross-validation: {n_folds}-fold")
    print("• Metrics: R², RMSE, MAE, MSE (all with mean ± std)")
    print("• All models saved to 'saved_models/' directory")
    print("\nResults saved to 'comprehensive_kfold_results.csv'")
    
    # ============ Demo: how to use the saved models ============
    print(f"\n{'='*120}")
    print("DEMONSTRATION: How to Load and Use Saved Models")
    print(f"{'='*120}")
    
    # Example 1: load a single model
    print("\n1. Loading a single model:")
    try:
        model, preprocessors, info = predictor.load_saved_model(
            noise_type='Mean Noise',
            feature_type='fusion',
            model_name='XGBoost',
            save_dir='saved_models'
        )
    except FileNotFoundError:
        # Only a missing artifact means "not trained yet". Any other failure
        # (corrupt file, incompatible pickle, ...) is left to the outer
        # handler, which prints the traceback.
        print("  (Model not found - will be available after training)")
    
    # Example 2: predict with a saved model
    print("\n2. Making predictions with a saved model:")
    print("  predictions = predictor.predict_with_saved_model(")
    print("      X_new=new_data,")
    print("      noise_type='Mean Noise',")
    print("      feature_type='fusion',")
    print("      model_name='XGBoost'")
    print("  )")
    
    # Example 3: load the complete predictor
    print("\n3. Loading complete predictor:")
    print("  loaded_predictor = ComprehensiveKFoldNoisePredictor.load_complete_predictor('saved_models')")
    
except FileNotFoundError as e:
    print(f"File not found: {e}")
    print("Please ensure the following files exist:")
    print(f"- {street_view_path}")
    print(f"- {remote_sensing_path}")
    print(f"- {noise_labels_path}")
except Exception as e:
    print(f"Error occurred: {e}")
    import traceback
    traceback.print_exc()

Loading data...
Street view features shape: (923, 768)
Remote sensing features shape: (923, 1024)
Noise labels shape: (923, 4)

Starting comprehensive K-fold cross-validation...

          K-FOLD CROSS-VALIDATION FOR ALL 8 MODELS + ENSEMBLES (K=5)
Models: XGBoost, LightGBM, Random Forest, Gradient Boosting, SVR, KNN, Lasso
Ensembles: Voting (7 models), Stacking (7 models + Lasso)

NOISE TYPE: Mean Noise

----------------------------------------------------------------------------------------------------
Feature Type: Street View
----------------------------------------------------------------------------------------------------

  Training XGBoost...
    Fold 1/5: R²=0.3703, RMSE=2.6496, MAE=2.0483
    Fold 2/5: R²=0.3269, RMSE=2.6060, MAE=1.9446
    Fold 3/5: R²=0.3038, RMSE=2.8343, MAE=2.1008
    Fold 4/5: R²=0.3439, RMSE=2.6721, MAE=2.0263
    Fold 5/5: R²=0.3293, RMSE=2.9291, MAE=2.1659

  XGBoost 5-Fold CV Results:
    R²:   0.3348 ± 0.0219
    RMSE: 2.7382 ± 0.1228
    MAE:  2.05

In [3]:
# Notebook cell: repeat the experiment on the 100 m noise labels.
# Build the predictor
predictor = ComprehensiveKFoldNoisePredictor()

# Input file paths
street_view_path = './revise_data/features_dinov3-vitb16-pretrain-lvd1689m.npy'
remote_sensing_path = './revise_data/features_dinov3-vitl16-pretrain-sat493m.npy'
noise_labels_path = './revise_data/noise_labels_100.npy'

# Fold count, bound once so the summary below cannot drift from the value
# actually used for cross-validation.
n_folds = 5

try:
    # Load the data
    print("Loading data...")
    street_features, remote_features, fusion_features, labels = predictor.load_data(
        street_view_path, remote_sensing_path, noise_labels_path
    )
    
    # Feature sets keyed by feature type
    features_dict = {
        'street': street_features,
        'remote': remote_features,
        'fusion': fusion_features
    }
    
    # Run K-fold cross-validation and save the models
    print("\nStarting comprehensive K-fold cross-validation...")
    results_df = predictor.k_fold_cross_validation_all_models(
        features_dict=features_dict,
        labels=labels,
        k=n_folds,
        optimize_hyperparams=False,  # set to True to enable hyperparameter search (slower)
        save_best_models=True  # save all best models automatically
    )
    
    print(f"\n{'='*120}")
    print("                           EXPERIMENT COMPLETED SUCCESSFULLY")
    print(f"{'='*120}")
    print("\nSummary:")
    print("• Total models evaluated: 9 (7 base models + 2 ensemble models)")
    print("• Base models: XGBoost, LightGBM, Random Forest, Gradient Boosting, SVR, KNN, Lasso")
    print("• Ensemble models: Voting (7 models), Stacking (7 models + Lasso)")
    print("• Feature types: Street View, Remote Sensing, Fusion")
    print("• Noise types: Mean Noise, Low Freq, Mid Freq, High Freq")
    print(f"• Cross-validation: {n_folds}-fold")
    print("• Metrics: R², RMSE, MAE, MSE (all with mean ± std)")
    print("• All models saved to 'saved_models/' directory")
    print("\nResults saved to 'comprehensive_kfold_results.csv'")
    
    # ============ Demo: how to use the saved models ============
    print(f"\n{'='*120}")
    print("DEMONSTRATION: How to Load and Use Saved Models")
    print(f"{'='*120}")
    
    # Example 1: load a single model
    print("\n1. Loading a single model:")
    try:
        model, preprocessors, info = predictor.load_saved_model(
            noise_type='Mean Noise',
            feature_type='fusion',
            model_name='XGBoost',
            save_dir='saved_models'
        )
    except FileNotFoundError:
        # Only a missing artifact means "not trained yet". Any other failure
        # (corrupt file, incompatible pickle, ...) is left to the outer
        # handler, which prints the traceback.
        print("  (Model not found - will be available after training)")
    
    # Example 2: predict with a saved model
    print("\n2. Making predictions with a saved model:")
    print("  predictions = predictor.predict_with_saved_model(")
    print("      X_new=new_data,")
    print("      noise_type='Mean Noise',")
    print("      feature_type='fusion',")
    print("      model_name='XGBoost'")
    print("  )")
    
    # Example 3: load the complete predictor
    print("\n3. Loading complete predictor:")
    print("  loaded_predictor = ComprehensiveKFoldNoisePredictor.load_complete_predictor('saved_models')")
    
except FileNotFoundError as e:
    print(f"File not found: {e}")
    print("Please ensure the following files exist:")
    print(f"- {street_view_path}")
    print(f"- {remote_sensing_path}")
    print(f"- {noise_labels_path}")
except Exception as e:
    print(f"Error occurred: {e}")
    import traceback
    traceback.print_exc()

Loading data...
Street view features shape: (923, 768)
Remote sensing features shape: (923, 1024)
Noise labels shape: (923, 4)

Starting comprehensive K-fold cross-validation...

          K-FOLD CROSS-VALIDATION FOR ALL 8 MODELS + ENSEMBLES (K=5)
Models: XGBoost, LightGBM, Random Forest, Gradient Boosting, SVR, KNN, Lasso
Ensembles: Voting (7 models), Stacking (7 models + Lasso)

NOISE TYPE: Mean Noise

----------------------------------------------------------------------------------------------------
Feature Type: Street View
----------------------------------------------------------------------------------------------------

  Training XGBoost...
    Fold 1/5: R²=0.4111, RMSE=2.2515, MAE=1.7678
    Fold 2/5: R²=0.3657, RMSE=2.1504, MAE=1.5759
    Fold 3/5: R²=0.3743, RMSE=2.4517, MAE=1.7801
    Fold 4/5: R²=0.3629, RMSE=2.4104, MAE=1.8008
    Fold 5/5: R²=0.3648, RMSE=2.5581, MAE=1.8688

  XGBoost 5-Fold CV Results:
    R²:   0.3757 ± 0.0181
    RMSE: 2.3644 ± 0.1455
    MAE:  1.75

In [4]:
# Notebook cell: repeat the experiment on the `noise_labels_15_new_2` labels.
# Build the predictor
predictor = ComprehensiveKFoldNoisePredictor()

# Input file paths
street_view_path = './revise_data/features_dinov3-vitb16-pretrain-lvd1689m.npy'
remote_sensing_path = './revise_data/features_dinov3-vitl16-pretrain-sat493m.npy'
noise_labels_path = './revise_data/noise_labels_15_new_2.npy'

# Fold count, bound once so the summary below cannot drift from the value
# actually used for cross-validation.
n_folds = 5

try:
    # Load the data
    print("Loading data...")
    street_features, remote_features, fusion_features, labels = predictor.load_data(
        street_view_path, remote_sensing_path, noise_labels_path
    )
    
    # Feature sets keyed by feature type
    features_dict = {
        'street': street_features,
        'remote': remote_features,
        'fusion': fusion_features
    }
    
    # Run K-fold cross-validation and save the models
    print("\nStarting comprehensive K-fold cross-validation...")
    results_df = predictor.k_fold_cross_validation_all_models(
        features_dict=features_dict,
        labels=labels,
        k=n_folds,
        optimize_hyperparams=False,  # set to True to enable hyperparameter search (slower)
        save_best_models=True  # save all best models automatically
    )
    
    print(f"\n{'='*120}")
    print("                           EXPERIMENT COMPLETED SUCCESSFULLY")
    print(f"{'='*120}")
    print("\nSummary:")
    print("• Total models evaluated: 9 (7 base models + 2 ensemble models)")
    print("• Base models: XGBoost, LightGBM, Random Forest, Gradient Boosting, SVR, KNN, Lasso")
    print("• Ensemble models: Voting (7 models), Stacking (7 models + Lasso)")
    print("• Feature types: Street View, Remote Sensing, Fusion")
    print("• Noise types: Mean Noise, Low Freq, Mid Freq, High Freq")
    print(f"• Cross-validation: {n_folds}-fold")
    print("• Metrics: R², RMSE, MAE, MSE (all with mean ± std)")
    print("• All models saved to 'saved_models/' directory")
    print("\nResults saved to 'comprehensive_kfold_results.csv'")
    
    # ============ Demo: how to use the saved models ============
    print(f"\n{'='*120}")
    print("DEMONSTRATION: How to Load and Use Saved Models")
    print(f"{'='*120}")
    
    # Example 1: load a single model
    print("\n1. Loading a single model:")
    try:
        model, preprocessors, info = predictor.load_saved_model(
            noise_type='Mean Noise',
            feature_type='fusion',
            model_name='XGBoost',
            save_dir='saved_models'
        )
    except FileNotFoundError:
        # Only a missing artifact means "not trained yet". Any other failure
        # (corrupt file, incompatible pickle, ...) is left to the outer
        # handler, which prints the traceback.
        print("  (Model not found - will be available after training)")
    
    # Example 2: predict with a saved model
    print("\n2. Making predictions with a saved model:")
    print("  predictions = predictor.predict_with_saved_model(")
    print("      X_new=new_data,")
    print("      noise_type='Mean Noise',")
    print("      feature_type='fusion',")
    print("      model_name='XGBoost'")
    print("  )")
    
    # Example 3: load the complete predictor
    print("\n3. Loading complete predictor:")
    print("  loaded_predictor = ComprehensiveKFoldNoisePredictor.load_complete_predictor('saved_models')")
    
except FileNotFoundError as e:
    print(f"File not found: {e}")
    print("Please ensure the following files exist:")
    print(f"- {street_view_path}")
    print(f"- {remote_sensing_path}")
    print(f"- {noise_labels_path}")
except Exception as e:
    print(f"Error occurred: {e}")
    import traceback
    traceback.print_exc()

Loading data...
Street view features shape: (923, 768)
Remote sensing features shape: (923, 1024)
Noise labels shape: (923, 4)

Starting comprehensive K-fold cross-validation...

          K-FOLD CROSS-VALIDATION FOR ALL 8 MODELS + ENSEMBLES (K=5)
Models: XGBoost, LightGBM, Random Forest, Gradient Boosting, SVR, KNN, Lasso
Ensembles: Voting (7 models), Stacking (7 models + Lasso)

NOISE TYPE: Mean Noise

----------------------------------------------------------------------------------------------------
Feature Type: Street View
----------------------------------------------------------------------------------------------------

  Training XGBoost...
    Fold 1/5: R²=0.3044, RMSE=3.1386, MAE=2.5488
    Fold 2/5: R²=0.2342, RMSE=3.3780, MAE=2.6315
    Fold 3/5: R²=0.2608, RMSE=3.4966, MAE=2.5975
    Fold 4/5: R²=0.2835, RMSE=3.3013, MAE=2.5644
    Fold 5/5: R²=0.2110, RMSE=3.6844, MAE=2.9019

  XGBoost 5-Fold CV Results:
    R²:   0.2588 ± 0.0334
    RMSE: 3.3998 ± 0.1837
    MAE:  2.64