In [1]:
import os
# Show the working directory, step up to the parent directory, then show it
# again so the change is visible.
# NOTE(review): os.chdir("../") is relative, so re-running this cell moves up one
# more level each time - run it exactly once per kernel session.
%pwd
os.chdir("../")
%pwd


'd:\\Data Science\\END to END Proj\\DNAseqMLOPS'

In [None]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelEvaluationConfig:
    """Immutable bundle of settings consumed by the model-evaluation stage.

    Attributes:
        root_dir: Root directory configured for this stage.
        test_data_path: Path to the test features (features.npy).
        test_labels_path: Path to the test labels (labels.npy).
        model_dir: Directory containing the trained models.
        metric_file_name: File the evaluation metrics are saved to.
        mlflow_uri: MLflow tracking URI.
        all_params: Parameters taken from params.yaml.
    """

    root_dir: Path
    test_data_path: Path
    test_labels_path: Path
    model_dir: Path
    metric_file_name: Path
    mlflow_uri: str
    all_params: dict


In [None]:
from dataclasses import dataclass
from pathlib import Path
import numpy as np
import joblib
import json
import mlflow
from urllib.parse import urlparse
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from DNASeqMLOPS import logger
import os

In [None]:
from src.DNASeqMLOPS.utils.common import read_yaml, create_directories
from src.DNASeqMLOPS.constant import *

class ConfigurationManager:
    """Reads the project's YAML files and builds typed per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH
    ):
        """Load the config, params and schema files and create the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        """Assemble the ``ModelEvaluationConfig`` from the loaded YAML content.

        Returns:
            A config whose path-like entries from the ``model_evaluation``
            section are wrapped in ``Path`` and whose ``all_params`` is the
            ``model_training`` section of the params file.
        """
        section = self.config.model_evaluation

        # These entries are converted to Path objects; order matches the dataclass.
        path_fields = (
            "root_dir",
            "test_data_path",
            "test_labels_path",
            "model_dir",
            "metric_file_name",
        )
        path_kwargs = {name: Path(getattr(section, name)) for name in path_fields}

        return ModelEvaluationConfig(
            **path_kwargs,
            mlflow_uri=section.mlflow_uri,
            all_params=self.params.model_training,  # All model params from params.yaml
        )

In [None]:
class ModelEvaluation:
    """Scores every saved model on the test set and records the results.

    The class reads test arrays and ``.joblib`` model files from the locations
    given in its config, computes classification metrics for each model, writes
    them to a JSON file and logs them to MLflow.
    """

    # File-name suffix that identifies a saved model inside ``model_dir``.
    _MODEL_SUFFIX = ".joblib"

    def __init__(self, config: "ModelEvaluationConfig"):
        """Store the evaluation settings.

        Args:
            config: Paths, MLflow URI and parameters for this stage.
        """
        self.config = config

    @classmethod
    def _model_name(cls, model_file):
        """Return ``model_file`` without its trailing ``.joblib`` suffix.

        Only the suffix is stripped, so dotted names such as ``xgb.v2.joblib``
        keep their full stem (``xgb.v2``) instead of collapsing to ``xgb`` and
        colliding with other versions.
        """
        return model_file[: -len(cls._MODEL_SUFFIX)]

    @staticmethod
    def _f1_average(y_true, y_pred):
        """Choose the ``average`` mode to pass to ``f1_score``.

        ``f1_score`` defaults to ``average='binary'``, which raises a
        ``ValueError`` as soon as more than two classes are present. With at
        most two distinct labels across both arrays the default is kept, so
        binary results are unchanged; otherwise the support-weighted average
        is used.

        Returns:
            ``"binary"`` or ``"weighted"``.
        """
        labels = np.union1d(np.asarray(y_true).ravel(), np.asarray(y_pred).ravel())
        return "binary" if labels.size <= 2 else "weighted"

    def _model_files(self):
        """List the ``.joblib`` file names in ``model_dir`` in sorted order.

        ``os.listdir`` returns entries in arbitrary order; sorting keeps the
        evaluation and logging order reproducible between runs.
        """
        return sorted(
            entry
            for entry in os.listdir(self.config.model_dir)
            if entry.endswith(self._MODEL_SUFFIX)
        )

    def _load_test_data(self):
        """Load test features and labels.

        Returns:
            Tuple ``(X_test, y_test)`` of the arrays read with ``np.load``.

        Raises:
            Exception: Any error raised while reading is logged and re-raised.
        """
        try:
            X_test = np.load(self.config.test_data_path)
            y_test = np.load(self.config.test_labels_path)
            return X_test, y_test
        except Exception as e:
            logger.error(f"Error loading test data: {e}")
            raise

    def _load_models(self):
        """Load all trained models.

        Returns:
            Dict mapping model name (file name without ``.joblib``) to the
            object returned by ``joblib.load``.
        """
        models = {}
        for model_file in self._model_files():
            model_path = os.path.join(self.config.model_dir, model_file)
            # NOTE: joblib.load unpickles the file, which can execute arbitrary
            # code - model_dir must only contain trusted artifacts.
            models[self._model_name(model_file)] = joblib.load(model_path)

        if not models:
            # Make the empty case visible instead of silently producing "{}".
            logger.warning(f"No {self._MODEL_SUFFIX} models found in {self.config.model_dir}")
        return models

    def evaluate_models(self, models=None):
        """Evaluate all models and return metrics.

        Args:
            models: Optional dict of already-loaded models keyed by name. When
                omitted, the models are loaded from ``model_dir``.

        Returns:
            Dict keyed by model name; each value holds ``accuracy``,
            ``f1_score``, ``classification_report`` (as a dict) and
            ``confusion_matrix`` (as nested lists).
        """
        X_test, y_test = self._load_test_data()
        if models is None:
            models = self._load_models()

        metrics = {}
        for model_name, model in models.items():
            y_pred = model.predict(X_test)

            metrics[model_name] = {
                # float() keeps the values plain Python numbers for json/MLflow.
                'accuracy': float(accuracy_score(y_test, y_pred)),
                'f1_score': float(
                    f1_score(y_test, y_pred, average=self._f1_average(y_test, y_pred))
                ),
                'classification_report': classification_report(y_test, y_pred, output_dict=True),
                'confusion_matrix': confusion_matrix(y_test, y_pred).tolist()
            }
            logger.info(f"\n{model_name} Evaluation:\n{json.dumps(metrics[model_name], indent=2)}")

        return metrics

    def log_into_mlflow(self):
        """Log evaluation results to MLflow.

        If the metrics file already exists nothing is evaluated. Otherwise the
        models are evaluated inside a new MLflow run, the metrics are written
        to the metrics file, parameters and the scalar metrics are logged, and
        - unless the tracking URI scheme is ``file`` - each model is logged and
        registered.

        Returns:
            ``True`` when a new evaluation was performed, ``False`` when it was
            skipped because the metrics file already exists.
        """
        metric_path = Path(self.config.metric_file_name)
        if metric_path.exists():
            logger.info(f"Metrics file {self.config.metric_file_name} already exists - skipping evaluation")
            return False

        # Create the target directory up front so writing the metrics cannot
        # fail with FileNotFoundError after the run has already started.
        metric_path.parent.mkdir(parents=True, exist_ok=True)

        mlflow.set_tracking_uri(self.config.mlflow_uri)
        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

        with mlflow.start_run():
            # Load each model once and reuse it for evaluation and logging.
            models = self._load_models()
            metrics = self.evaluate_models(models)

            # Save metrics to file
            with open(metric_path, 'w') as f:
                json.dump(metrics, f, indent=4)

            # Log parameters and the scalar metrics
            mlflow.log_params(self.config.all_params)
            for model_name, model_metrics in metrics.items():
                for metric_name in ('accuracy', 'f1_score'):
                    mlflow.log_metric(f"{model_name}_{metric_name}", model_metrics[metric_name])

            # Log models
            if tracking_url_type_store != "file":
                for model_name, model in models.items():
                    mlflow.sklearn.log_model(
                        sk_model=model,
                        artifact_path=f"{model_name}_model",
                        registered_model_name=f"DNA_Seq_{model_name}"
                    )

            logger.info("Evaluation results logged to MLflow")
        return True

In [None]:
try:
    # Build the stage settings from the YAML files and hand them to the evaluator.
    config = ConfigurationManager()
    eval_config = config.get_model_evaluation_config()
    evaluator = ModelEvaluation(eval_config)

    # log_into_mlflow() returns True after a fresh evaluation and False when it
    # skipped because the metrics file was already present.
    logger.info(
        "New evaluation performed and logged"
        if evaluator.log_into_mlflow()
        else "Using existing evaluation results"
    )

except Exception as e:
    # Record the failure, then let it propagate so the cell visibly fails.
    logger.error(f"Model evaluation failed: {e}")
    raise