In [None]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (mean_squared_error, mean_absolute_error, 
                            r2_score, explained_variance_score)
from sklearn.inspection import (permutation_importance, 
                              PartialDependenceDisplay)
import shap
from scipy import stats
import time
from typing import Dict, List, Tuple, Optional

In [None]:
class ModelEvaluator:
    """Evaluate and compare pickled regression models on a held-out test set.

    The test CSV must contain a ``price`` column, which is used as the target;
    every other column is passed to the models as a feature.

    Results are accumulated in ``self.results``: one entry per model name
    (metrics, feature importance, inference speed, residuals, normality test)
    plus one ``"<A>_vs_<B>"`` entry per model pair when more than one model
    is loaded.
    """

    # Model names for which SHAP TreeExplainer values are computed.
    _TREE_MODEL_NAMES = ('Random Forest', 'Decision Tree', 'XGBoost')

    def __init__(self, models_path: Dict[str, str], test_data_path: str):
        """
        Initialize the evaluator with model paths and test data

        Args:
            models_path: Dictionary of model names and their file paths
            test_data_path: Path to test data (CSV) containing a 'price' column
        """
        self.models = self._load_models(models_path)
        self.test_data = pd.read_csv(test_data_path)
        self.X_test = self.test_data.drop('price', axis=1)
        self.y_test = self.test_data['price']
        self.results = {}

    def _load_models(self, model_paths: Dict[str, str]) -> Dict[str, object]:
        """Load pretrained models from pickle files.

        Args:
            model_paths: Mapping of model name -> pickle file path.

        Returns:
            Mapping of model name -> unpickled object, in the same order.
        """
        loaded = {}
        for name, path in model_paths.items():
            # SECURITY: pickle.load can execute arbitrary code; only load
            # model files that come from a trusted source.
            # The context manager guarantees the handle is closed even if
            # unpickling fails.
            with open(path, 'rb') as model_file:
                loaded[name] = pickle.load(model_file)
        return loaded

    def evaluate_all(self) -> Dict[str, Dict]:
        """Run complete evaluation suite for all models.

        Returns:
            ``self.results`` after it has been populated for every model
            (and for every model pair when more than one model is loaded).
        """
        for name, model in self.models.items():
            print(f"\nEvaluating {name}...")
            self.results[name] = {}

            # Basic metrics
            self.results[name]['metrics'] = self._calculate_metrics(model)

            # Feature importance
            self.results[name]['importance'] = self._feature_importance(model, name)

            # Inference speed
            self.results[name]['inference_speed'] = self._measure_inference_speed(model)

            # Residual analysis
            self.results[name]['residuals'] = self._analyze_residuals(model)

            # Residual normality is a per-model property, so it is computed
            # regardless of how many models are being evaluated.
            self.results[name]['normality_test'] = self._check_residual_normality(
                self.results[name]['residuals']['residuals']
            )

        # Comparative analysis only makes sense with at least two models
        if len(self.models) > 1:
            self._compare_models()

        return self.results

    @staticmethod
    def _mape(y_true, y_pred) -> float:
        """Mean absolute percentage error (in percent).

        Samples whose true value is exactly zero are excluded, because their
        percentage error is undefined. Returns NaN when no sample has a
        non-zero true value.
        """
        actual = np.asarray(y_true, dtype=float).ravel()
        predicted = np.asarray(y_pred, dtype=float).ravel()
        nonzero = actual != 0
        if not nonzero.any():
            return float('nan')
        relative_error = (actual[nonzero] - predicted[nonzero]) / actual[nonzero]
        return float(np.mean(np.abs(relative_error)) * 100)

    def _calculate_metrics(self, model) -> Dict[str, float]:
        """Calculate evaluation metrics for ``model`` on the test set.

        Returns:
            Dict with keys 'RMSE', 'MAE', 'R2', 'Explained Variance' and
            'Mean Absolute Percentage Error'.
        """
        y_pred = model.predict(self.X_test)

        return {
            'RMSE': np.sqrt(mean_squared_error(self.y_test, y_pred)),
            'MAE': mean_absolute_error(self.y_test, y_pred),
            'R2': r2_score(self.y_test, y_pred),
            'Explained Variance': explained_variance_score(self.y_test, y_pred),
            'Mean Absolute Percentage Error': self._mape(self.y_test, y_pred)
        }

    def _feature_importance(self, model, model_name: str) -> Dict:
        """Calculate feature importance using multiple methods.

        Always computes permutation importance (10 repeats, fixed seed).
        Adds the model's own ``feature_importances_`` when it has that
        attribute, and mean absolute SHAP values when ``model_name`` is one
        of the tree-model names in ``_TREE_MODEL_NAMES``.

        Returns:
            Dict with key 'permutation' and optionally 'native' and 'shap'.
        """
        importance = {}

        # Permutation importance
        perm_result = permutation_importance(
            model, self.X_test, self.y_test, n_repeats=10, random_state=42
        )
        importance['permutation'] = {
            'importances': perm_result.importances_mean,
            'std': perm_result.importances_std
        }

        # Model-specific importance
        if hasattr(model, 'feature_importances_'):
            importance['native'] = model.feature_importances_

        # SHAP values (for tree-based models, selected by name)
        if model_name in self._TREE_MODEL_NAMES:
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(self.X_test)
            importance['shap'] = np.abs(shap_values).mean(axis=0)

        return importance

    def _measure_inference_speed(self, model, n_iter: int = 100) -> Dict:
        """Measure single-row inference latency.

        Args:
            model: Object exposing ``predict``.
            n_iter: Number of timed single-row predictions.

        Returns:
            Dict with 'mean_latency_ms' and 'p95_latency_ms'.
        """
        # Slice once, outside the timed region, so only predict() is measured.
        single_row = self.X_test.iloc[:1]

        times = []
        for _ in range(n_iter):
            # perf_counter is monotonic and high-resolution, unlike time.time
            # which can jump when the system clock is adjusted.
            start = time.perf_counter()
            model.predict(single_row)
            times.append(time.perf_counter() - start)

        return {
            'mean_latency_ms': np.mean(times) * 1000,
            'p95_latency_ms': np.percentile(times, 95) * 1000
        }

    def _analyze_residuals(self, model) -> Dict:
        """Analyze prediction residuals (``y_test - prediction``).

        Returns:
            Dict with the raw 'residuals' plus their mean, standard
            deviation and skewness.
        """
        y_pred = model.predict(self.X_test)
        residuals = self.y_test - y_pred

        return {
            'residuals': residuals,
            'residual_mean': np.mean(residuals),
            'residual_std': np.std(residuals),
            'residual_skew': stats.skew(residuals)
        }

    def _check_residual_normality(self, residuals) -> Dict:
        """Check if residuals follow a normal distribution (Shapiro-Wilk).

        Returns:
            Dict with 'shapiro_stat', 'shapiro_p' and 'is_normal'
            (True when p > 0.05). With fewer than three residuals the test
            cannot be computed, so NaN statistics and ``is_normal=False``
            are returned.
        """
        if len(residuals) < 3:
            return {
                'shapiro_stat': float('nan'),
                'shapiro_p': float('nan'),
                'is_normal': False
            }

        stat, p = stats.shapiro(residuals)
        return {
            'shapiro_stat': stat,
            'shapiro_p': p,
            'is_normal': p > 0.05
        }

    def _compare_models(self):
        """Compare every pair of models and store the outcome in ``self.results``.

        For each pair an entry keyed ``"<m1>_vs_<m2>"`` is written holding a
        paired t-test on the two residual series and the name of the model
        with the lower mean absolute residual.
        """
        model_names = list(self.models.keys())
        for i in range(len(model_names)):
            for j in range(i+1, len(model_names)):
                m1 = model_names[i]
                m2 = model_names[j]

                res1 = self.results[m1]['residuals']['residuals']
                res2 = self.results[m2]['residuals']['residuals']

                # Paired t-test
                # NOTE(review): on signed residuals this compares the mean
                # bias of the two models, while 'better_model' below is
                # chosen by mean absolute error -- confirm this is intended.
                t_stat, p_val = stats.ttest_rel(res1, res2)
                self.results[f'{m1}_vs_{m2}'] = {
                    't_test': {'statistic': t_stat, 'p_value': p_val},
                    'better_model': m1 if np.mean(np.abs(res1)) < np.mean(np.abs(res2)) else m2
                }

    def generate_report(self) -> None:
        """Generate comprehensive evaluation report.

        Prints the metrics table (sorted by RMSE), shows the feature
        importance and residual plots, then prints the latency table
        (sorted by mean latency). Expects ``evaluate_all`` to have been run.
        """
        print("\n=== MODEL EVALUATION REPORT ===\n")

        # Metrics comparison (pairwise comparison entries have no 'metrics')
        metrics_df = pd.DataFrame({
            name: data['metrics'] for name, data in self.results.items()
            if 'metrics' in data
        }).T
        print("Performance Metrics:")
        print(metrics_df.sort_values('RMSE'))

        # Feature importance visualization
        self._plot_feature_importance()

        # Residual analysis
        self._plot_residuals()

        # Inference speed comparison
        speed_df = pd.DataFrame({
            name: data['inference_speed'] for name, data in self.results.items()
            if 'inference_speed' in data
        }).T
        print("\nInference Speed (ms):")
        print(speed_df.sort_values('mean_latency_ms'))

    def _plot_feature_importance(self) -> None:
        """Visualize feature importance across models.

        One horizontal bar chart per model, preferring SHAP values, then the
        model's native importances, then permutation importances. Panels are
        laid out two per row; the grid grows with the number of models.
        """
        # Only model entries carry 'importance'; filtering first keeps the
        # subplot positions independent of any other keys in self.results.
        entries = [
            (name, data['importance']) for name, data in self.results.items()
            if 'importance' in data
        ]
        if not entries:
            return

        n_cols = 2
        # At least two rows so that up to four models keep the 2x2 layout.
        n_rows = max(2, int(np.ceil(len(entries) / n_cols)))
        fig = plt.figure(figsize=(15, 4 * n_rows))

        for position, (name, importance) in enumerate(entries, start=1):
            ax = fig.add_subplot(n_rows, n_cols, position)
            if 'shap' in importance:
                importances = importance['shap']
            elif 'native' in importance:
                importances = importance['native']
            else:
                importances = importance['permutation']['importances']

            importances = np.asarray(importances)
            sorted_idx = np.argsort(importances)
            ax.barh(self.X_test.columns[sorted_idx], importances[sorted_idx])
            ax.set_title(f"{name} Feature Importance")
        fig.tight_layout()
        plt.show()

    def _plot_residuals(self) -> None:
        """Plot residual distributions, one histogram (with KDE) per model."""
        entries = [
            (name, data['residuals']['residuals']) for name, data in self.results.items()
            if 'residuals' in data
        ]
        if not entries:
            return

        fig = plt.figure(figsize=(15, 5))
        for position, (name, residuals) in enumerate(entries, start=1):
            ax = fig.add_subplot(1, len(entries), position)
            sns.histplot(residuals, kde=True, ax=ax)
            ax.set_title(f"{name} Residuals")
        fig.tight_layout()
        plt.show()

In [None]:
if __name__ == "__main__":
    # Display name -> pickle file for every model under evaluation.
    # Insertion order is kept, so models are evaluated in this order.
    MODEL_PATHS = dict([
        ("Decision Tree", "decision_tree_model.pkl"),
        ("Random Forest", "random_forest_model.pkl"),
        ("XGBoost", "xgboost_model.pkl"),
        ("Ridge", "ridge_model.pkl"),
    ])
    # Held-out CSV that the evaluator reads its features and target from
    TEST_DATA_PATH = "test_data.csv"

    # Load everything, score each model, then print and plot the report
    evaluator = ModelEvaluator(MODEL_PATHS, TEST_DATA_PATH)
    results = evaluator.evaluate_all()
    evaluator.generate_report()

    # Persist the complete results dictionary for later inspection
    with open('evaluation_results.pkl', 'wb') as results_file:
        pickle.dump(results, results_file)