## Model Evaluation

In [1]:
# Notebook setup cell: step the working directory up one level.
# Presumably the notebook lives in a sub-folder and the relative paths used
# later (e.g. "config/config.yaml") are meant to resolve from the project
# root -- confirm against the repository layout.
import os
# IPython magic: show the working directory before the change.
%pwd
os.chdir("../")
# IPython magic: show the working directory after the change.
%pwd


'd:\\Data Science\\END to END Proj\\NVDNLP'

In [2]:
# ============================================
#     ENTITY: MODEL EVALUATION CONFIG
# ============================================

from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelEvaluationConfig:
    """Immutable set of filesystem paths consumed by the model-evaluation stage.

    Instances are frozen, so fields cannot be reassigned after construction.
    """

    # Directory for this stage; created by ConfigurationManager before use.
    root_dir: Path
    # CSV file read with pandas to obtain the evaluation samples.
    test_data_path: Path
    # Serialized trained model, loaded with joblib.
    model_path: Path
    # Serialized label encoder, loaded with joblib.
    label_encoder_path: Path
    # Serialized TF-IDF vectorizer, loaded with joblib.
    tfidf_vectorizer_path: Path
    # Target of the CSV metrics summary; the detailed JSON report is written
    # to the same path with its suffix replaced by ".json".
    metric_file_name: Path

In [3]:
# ============================================
# ⚙️ CONFIGURATION MANAGER
# ============================================

from src.NVDNLP.utils.common import read_yaml, create_directories 
# from src.NVDNLP.entity.config_entity import ModelEvaluationConfig

class ConfigurationManager:
    """Read the project YAML files and hand out per-stage config objects."""

    def __init__(
        self,
        config_filepath = "config/config.yaml",
        params_filepath = "params.yaml",
    ):
        """Load both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        """Build the ModelEvaluationConfig from the ``model_evaluation`` section.

        The stage's root directory is created as a side effect.

        Returns:
            A ModelEvaluationConfig whose fields are all ``Path`` objects.
        """
        section = self.config.model_evaluation
        create_directories([section.root_dir])

        # Every field of the entity is a path taken verbatim from the section.
        path_fields = (
            "root_dir",
            "test_data_path",
            "model_path",
            "label_encoder_path",
            "tfidf_vectorizer_path",
            "metric_file_name",
        )
        return ModelEvaluationConfig(
            **{field: Path(getattr(section, field)) for field in path_fields}
        )

In [4]:
# ============================================
#     MODEL EVALUATION COMPONENT (FIXED FOR CLASS MISMATCH)
# ============================================

import os
import joblib
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from src.NVDNLP.entity.config_entity import ModelEvaluationConfig
from src.NVDNLP import logger

class ModelEvaluation:
    """Evaluate a trained severity classifier on a held-out test CSV.

    The pipeline (see :meth:`evaluate`) loads the persisted model, label
    encoder and TF-IDF vectorizer, filters the test set down to the classes
    the label encoder knows, predicts, computes metrics and writes them to
    ``config.metric_file_name`` (CSV) plus a sibling ``.json`` report.
    """

    # Columns the test CSV must contain for evaluation to work.
    _REQUIRED_COLUMNS = ('Severity', 'Description', 'encoded_severity')

    def __init__(self, config: ModelEvaluationConfig):
        """Store the configuration; artifacts are loaded lazily.

        Args:
            config: Paths to the model, encoder, vectorizer, test data and
                metrics output.
        """
        self.config = config
        self.model = None
        self.label_encoder = None
        self.tfidf_vectorizer = None

    @staticmethod
    def _label_ids(class_names):
        """Return the encoded label ids ``[0, 1, ..., n_classes - 1]``.

        NOTE(review): assumes the ``encoded_severity`` column and the model's
        predictions use the usual label-encoder convention where class ``i``
        of ``classes_`` is encoded as the integer ``i`` -- confirm against
        the training/transformation stage.
        """
        return list(range(len(class_names)))

    def load_artifacts(self):
        """Load model, label encoder, and TF-IDF vectorizer.

        Raises:
            Exception: Whatever ``joblib.load`` raises is logged and re-raised.
        """
        try:
            logger.info("     Loading evaluation artifacts...")

            # SECURITY NOTE: joblib.load unpickles arbitrary objects; these
            # paths must only ever point at trusted, locally produced files.
            self.model = joblib.load(self.config.model_path)
            logger.info(f"  Model loaded: {self.config.model_path}")

            self.label_encoder = joblib.load(self.config.label_encoder_path)
            logger.info(f"  Label encoder loaded: {self.config.label_encoder_path}")
            logger.info(f"     Label encoder classes: {list(self.label_encoder.classes_)}")

            self.tfidf_vectorizer = joblib.load(self.config.tfidf_vectorizer_path)
            logger.info(f"  TF-IDF vectorizer loaded: {self.config.tfidf_vectorizer_path}")

        except Exception as e:
            logger.error(f"   Failed to load artifacts: {e}")
            raise

    def load_test_data(self):
        """Load and prepare test data - filter to only known classes.

        Requires :meth:`load_artifacts` to have been called first.

        Returns:
            tuple: ``(X_test_tfidf, y_test_encoded, test_df_filtered)`` where
            the first item is the vectorizer output for the ``Description``
            column, the second is the ``encoded_severity`` column and the
            third is the filtered DataFrame itself.

        Raises:
            KeyError: If a required column is missing from the CSV.
            ValueError: If no rows remain after filtering to known classes.
        """
        try:
            logger.info("     Loading test data...")

            test_df = pd.read_csv(self.config.test_data_path)
            logger.info(f"  Test data loaded: {len(test_df)} samples")

            # Fail early with a clear message instead of a bare KeyError
            # further down.
            missing_columns = [
                column for column in self._REQUIRED_COLUMNS
                if column not in test_df.columns
            ]
            if missing_columns:
                raise KeyError(
                    f"Test data is missing required column(s): {missing_columns}"
                )

            known_classes = list(self.label_encoder.classes_)
            logger.info(f"     Known severity classes: {known_classes}")

            unique_test_severities = test_df['Severity'].unique()
            logger.info(f"     All severities in test data: {list(unique_test_severities)}")

            # Keep only rows whose severity the label encoder knows.
            original_size = len(test_df)
            test_df_filtered = test_df[test_df['Severity'].isin(known_classes)].copy()
            filtered_size = len(test_df_filtered)

            if original_size != filtered_size:
                logger.warning(f"     Filtered out {original_size - filtered_size} samples with unknown classes")
                logger.info(f"    Using {filtered_size} samples for evaluation")

            # Nothing to evaluate: stop here rather than failing obscurely
            # inside the vectorizer/model.
            if filtered_size == 0:
                raise ValueError(
                    "No test samples remain after filtering to known severity classes"
                )

            encoded_severity_counts = test_df_filtered['encoded_severity'].value_counts().sort_index()
            logger.info(f"    Encoded severity distribution: {dict(encoded_severity_counts)}")

            X_test_descriptions = test_df_filtered['Description'].astype(str)
            y_test_encoded = test_df_filtered['encoded_severity']

            X_test_tfidf = self.tfidf_vectorizer.transform(X_test_descriptions)
            logger.info(f"  Test features transformed: {X_test_tfidf.shape}")

            return X_test_tfidf, y_test_encoded, test_df_filtered

        except Exception as e:
            logger.error(f"   Failed to load test data: {e}")
            raise

    def make_predictions(self, X_test):
        """Make predictions on test data.

        Args:
            X_test: Feature matrix accepted by ``self.model.predict``.

        Returns:
            The model's predictions for ``X_test``.
        """
        try:
            logger.info("    Making predictions on test data...")

            y_pred = self.model.predict(X_test)
            logger.info(f"  Predictions completed: {len(y_pred)} predictions")

            unique_predictions = np.unique(y_pred)
            logger.info(f"    Unique predicted classes: {list(unique_predictions)}")

            return y_pred

        except Exception as e:
            logger.error(f"   Prediction failed: {e}")
            raise

    def calculate_metrics(self, y_test, y_pred):
        """Calculate evaluation metrics over all classes known to the encoder.

        Args:
            y_test: True encoded labels.
            y_pred: Predicted encoded labels.

        Returns:
            dict with keys ``accuracy``, ``classification_report`` (dict),
            ``confusion_matrix`` (nested list), ``severity_classes`` (list of
            str) and ``test_samples`` (int).
        """
        try:
            logger.info("    Calculating evaluation metrics...")

            # Plain ``str`` names keep the result JSON-serializable and
            # joinable even if the encoder stores non-string labels.
            class_names = [str(name) for name in self.label_encoder.classes_]
            label_ids = self._label_ids(class_names)

            accuracy = float(accuracy_score(y_test, y_pred))

            # ``labels`` must be passed explicitly: without it scikit-learn
            # infers the labels from the data and raises ValueError when a
            # known class is absent from both y_test and y_pred, because the
            # inferred label count no longer matches ``target_names``.
            class_report = classification_report(
                y_test,
                y_pred,
                labels=label_ids,
                target_names=class_names,
                output_dict=True,
                zero_division=0
            )

            # Same ``labels`` so rows/columns always line up with
            # ``severity_classes``.
            conf_matrix = confusion_matrix(y_test, y_pred, labels=label_ids)

            logger.info(f"    Accuracy: {accuracy*100:.2f}%")

            return {
                'accuracy': accuracy,
                'classification_report': class_report,
                # Nested list so the matrix can be dumped to JSON.
                'confusion_matrix': conf_matrix.tolist(),
                'severity_classes': class_names,
                'test_samples': len(y_test)
            }

        except Exception as e:
            logger.error(f"   Metric calculation failed: {e}")
            raise

    def save_metrics(self, metrics):
        """Save evaluation metrics to file.

        Writes a flat ``metric,value`` CSV to ``config.metric_file_name`` and
        then a detailed JSON report to the same path with a ``.json`` suffix.

        Args:
            metrics: The dict returned by :meth:`calculate_metrics`.
        """
        try:
            logger.info("    Saving evaluation metrics...")

            metrics_report = {
                'overall_accuracy': metrics['accuracy'],
                'test_samples': metrics['test_samples'],
                'severity_classes': metrics['severity_classes'],
                'confusion_matrix': metrics['confusion_matrix'],
                'detailed_classification_report': metrics['classification_report']
            }

            # Flat rows for the CSV summary.
            metric_names = ['overall_accuracy', 'test_samples']
            metric_values = [metrics['accuracy'], metrics['test_samples']]

            report = metrics['classification_report']
            for class_name in metrics['severity_classes']:
                class_metrics = report.get(class_name)
                if class_metrics is None:
                    continue
                metric_names.extend([
                    f'{class_name}_precision',
                    f'{class_name}_recall',
                    f'{class_name}_f1_score',
                    f'{class_name}_support'
                ])
                metric_values.extend([
                    class_metrics['precision'],
                    class_metrics['recall'],
                    class_metrics['f1-score'],
                    class_metrics['support']
                ])

            # Make sure the output directory exists even when this method is
            # used without the configuration manager having created it.
            self.config.metric_file_name.parent.mkdir(parents=True, exist_ok=True)

            metrics_df = pd.DataFrame({'metric': metric_names, 'value': metric_values})
            metrics_df.to_csv(self.config.metric_file_name, index=False)
            logger.info(f"  Metrics saved to: {self.config.metric_file_name}")

            # Detailed report as JSON, written after the CSV (order kept from
            # the original so a ``.json`` metric_file_name ends up as JSON).
            import json
            detailed_metrics_file = self.config.metric_file_name.with_suffix('.json')
            with open(detailed_metrics_file, 'w', encoding='utf-8') as f:
                json.dump(metrics_report, f, indent=4)
            logger.info(f"  Detailed metrics saved to: {detailed_metrics_file}")

        except Exception as e:
            logger.error(f"   Failed to save metrics: {e}")
            raise

    def print_evaluation_summary(self, metrics, y_test, y_pred):
        """Log a comprehensive evaluation summary.

        Args:
            metrics: The dict returned by :meth:`calculate_metrics`.
            y_test: True encoded labels.
            y_pred: Predicted encoded labels.
        """
        try:
            class_names = [str(name) for name in metrics['severity_classes']]

            logger.info("\n" + "="*60)
            logger.info("    MODEL EVALUATION SUMMARY")
            logger.info("="*60)
            logger.info(f"    Overall Accuracy: {metrics['accuracy']*100:.2f}%")
            logger.info(f"    Test Samples: {metrics['test_samples']}")
            logger.info(f" Severity Classes: {', '.join(class_names)}")

            logger.info("\n Classification Report:")
            # Explicit ``labels`` for the same reason as in calculate_metrics:
            # a class missing from the data must not break the report.
            clean_report = classification_report(
                y_test,
                y_pred,
                labels=self._label_ids(class_names),
                target_names=class_names,
                zero_division=0
            )
            logger.info(clean_report)

            logger.info("\n Confusion Matrix:")
            logger.info("Rows: Actual, Columns: Predicted")
            logger.info(f"Labels: {class_names}")
            logger.info(np.array2string(
                np.array(metrics['confusion_matrix']),
                formatter={'int': lambda x: f'{x:6d}'}
            ))

        except Exception as e:
            logger.error(f"   Failed to print evaluation summary: {e}")
            raise

    def debug_data_issues(self):
        """Log diagnostics about the raw test data (best effort).

        Requires :meth:`load_artifacts` to have been called first.

        Returns:
            bool: True when the test CSV contains severity values the label
            encoder does not know; False otherwise, and also False when the
            analysis itself fails (the failure is logged, not raised).
        """
        try:
            logger.info(" DEBUG: Data Issue Analysis")

            test_df = pd.read_csv(self.config.test_data_path)

            severity_counts = test_df['Severity'].value_counts()
            logger.info(f"    Full test data severity distribution:\n{severity_counts}")

            known_classes = list(self.label_encoder.classes_)
            unknown_classes = set(test_df['Severity'].unique()) - set(known_classes)

            if unknown_classes:
                unknown_rows = test_df['Severity'].isin(list(unknown_classes))
                logger.warning(f" Unknown severity classes found: {list(unknown_classes)}")
                logger.warning(f"    Samples with unknown classes: {int(unknown_rows.sum())}")

            logger.info(f"    Encoded values range: {test_df['encoded_severity'].min()} to {test_df['encoded_severity'].max()}")

            return len(unknown_classes) > 0

        except Exception as e:
            # Deliberately non-fatal: this step is diagnostic only.
            logger.error(f"   Debug analysis failed: {e}")
            return False

    def evaluate(self):
        """Run the complete model evaluation pipeline.

        Evaluation is skipped when ``config.metric_file_name`` already exists.

        Returns:
            dict: With ``status`` either ``'skipped'`` (plus ``message`` and
            ``metrics_file``) or ``'completed'`` (plus ``message``,
            ``accuracy``, ``metrics_file``, ``test_samples``,
            ``severity_classes`` and ``has_unknown_classes``).
        """
        try:
            # An existing metrics file marks the stage as already done.
            if self.config.metric_file_name.exists():
                logger.info("  Metrics file already exists. Skipping evaluation...")
                return {
                    'status': 'skipped',
                    'message': 'Evaluation already completed',
                    'metrics_file': self.config.metric_file_name
                }

            logger.info(" Starting Model Evaluation Pipeline...")

            # Step 1: Load artifacts
            self.load_artifacts()

            # Step 2: Debug data issues
            has_unknown_classes = self.debug_data_issues()

            # Step 3: Load and prepare test data (filters unknown classes)
            X_test, y_test, _ = self.load_test_data()

            # Step 4: Make predictions
            y_pred = self.make_predictions(X_test)

            # Step 5: Calculate metrics
            metrics = self.calculate_metrics(y_test, y_pred)

            # Step 6: Save metrics
            self.save_metrics(metrics)

            # Step 7: Print summary
            self.print_evaluation_summary(metrics, y_test, y_pred)

            if has_unknown_classes:
                logger.warning(" Evaluation completed with filtered data (unknown classes removed)")
            else:
                logger.info("  Model Evaluation completed successfully!")

            return {
                'status': 'completed',
                'message': 'Evaluation completed successfully',
                'accuracy': metrics['accuracy'],
                'metrics_file': self.config.metric_file_name,
                'test_samples': metrics['test_samples'],
                'severity_classes': metrics['severity_classes'],
                'has_unknown_classes': has_unknown_classes
            }

        except Exception as e:
            logger.error(f"   Model evaluation pipeline failed: {e}")
            raise

In [6]:
# ============================================
#     MODEL EVALUATION PIPELINE
# ============================================

from src.NVDNLP.config.configuration import ConfigurationManager
# from src.NVDNLP.components.ModelEvaluation import ModelEvaluation
from src.NVDNLP import logger

STAGE_NAME = "Model Evaluation stage"

class ModelEvaluationTrainingPipeline:
    """Thin driver that wires configuration into the evaluation component."""

    def __init__(self):
        """No state is kept; everything is built inside :meth:`main`."""

    def main(self):
        """Run the evaluation stage and return its result dict.

        The evaluation itself is skipped by the component when the metrics
        file already exists.
        """
        manager = ConfigurationManager()
        evaluator = ModelEvaluation(config=manager.get_model_evaluation_config())
        return evaluator.evaluate()

if __name__ == "__main__":
    # Script entry point: run the stage, then log either the result summary
    # or the "skipped" notice; any failure is logged with traceback and
    # re-raised.
    try:
        logger.info(f">>>>>> Stage {STAGE_NAME} started <<<<<<")
        outcome = ModelEvaluationTrainingPipeline().main()

        if result_is_skipped := (outcome['status'] != 'completed'):
            logger.info(f">>>>>> Stage {STAGE_NAME} skipped (evaluation already completed) <<<<<<\n\nx==========x")
        else:
            accuracy_pct = outcome['accuracy'] * 100
            logger.info(" Evaluation completed successfully!")
            logger.info(f" Accuracy: {accuracy_pct:.2f}%")
            logger.info(f" Test samples: {outcome['test_samples']}")
            logger.info(f" Metrics saved at: {outcome['metrics_file']}")
            logger.info(f">>>>>> Stage {STAGE_NAME} completed <<<<<<\n\nx==========x")

    except Exception as e:
        logger.exception(e)
        raise e

[2025-10-22 23:07:01,184: INFO: 390888635: >>>>>> Stage Model Evaluation stage started <<<<<<]
[2025-10-22 23:07:01,192: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-10-22 23:07:01,198: INFO: common: yaml file: params.yaml loaded successfully]
[2025-10-22 23:07:01,200: INFO: common: created directory at: artifacts]
[2025-10-22 23:07:01,202: INFO: common: created directory at: artifacts/model_evaluation]
[2025-10-22 23:07:01,204: INFO: 3855641508:   Metrics file already exists. Skipping evaluation...]
[2025-10-22 23:07:01,207: INFO: 390888635: >>>>>> Stage Model Evaluation stage skipped (evaluation already completed) <<<<<<

