In [1]:
import os
%pwd
os.chdir("../")
%pwd


'd:\\Data Science\\END to END Proj\\DNAseqMLOPS'

In [2]:
%pip install dagshub
import dagshub
dagshub.init(repo_owner='gowtham-dd', repo_name='DNAseqMLOPS', mlflow=True)

import mlflow
with mlflow.start_run():
  mlflow.log_param('parameter name', 'value')
  mlflow.log_metric('metric name', 1)

Collecting dagshub
  Using cached dagshub-0.6.2-py3-none-any.whl (261 kB)
Collecting appdirs>=1.4.4
  Using cached appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)
Collecting rich>=13.1.0
  Downloading rich-14.1.0-py3-none-any.whl (243 kB)
     -------------------------------------- 243.4/243.4 kB 1.7 MB/s eta 0:00:00
Collecting dacite~=1.6.0
  Using cached dacite-1.6.0-py3-none-any.whl (12 kB)
Collecting tenacity>=8.2.2
  Using cached tenacity-9.1.2-py3-none-any.whl (28 kB)
Collecting gql[requests]
  Using cached gql-3.5.3-py2.py3-none-any.whl (74 kB)
Collecting dataclasses-json
  Using cached dataclasses_json-0.6.7-py3-none-any.whl (28 kB)
Collecting treelib>=1.6.4
  Using cached treelib-1.8.0-py3-none-any.whl (30 kB)
Collecting pathvalidate>=3.0.0
  Using cached pathvalidate-3.3.1-py3-none-any.whl (24 kB)
Collecting boto3
  Downloading boto3-1.39.13-py3-none-any.whl (139 kB)
     -------------------------------------- 139.9/139.9 kB 1.4 MB/s eta 0:00:00
Collecting semver
  Using cached s


[notice] A new release of pip available: 22.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
from dataclasses import dataclass
from pathlib import Path
@dataclass(frozen=True)
class ModelEvaluationConfig:
    """Immutable settings consumed by the model-evaluation stage.

    Instances are built by ``ConfigurationManager.get_model_evaluation_config``
    and handed to ``ModelEvaluation``.
    """
    # Directory that ModelEvaluation creates on construction.
    root_dir: Path
    # File read with np.load to obtain the test features.
    test_data_path: Path
    # File read with np.load to obtain the test labels.
    test_labels_path: Path
    # Directory scanned for ``*.joblib`` files, each loaded as one model.
    model_dir: Path
    # JSON file the metrics are written to; if it already exists the
    # evaluation is skipped.
    metric_file_name: Path
    # Passed to mlflow.set_tracking_uri before a run is started.
    mlflow_uri: str
    # Logged with mlflow.log_params; filled from the ``model_training``
    # section of the params file.
    all_params: dict

In [12]:
from dataclasses import dataclass
from pathlib import Path
import numpy as np
import joblib
import json
import mlflow
from urllib.parse import urlparse
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from DNASeqMLOPS import logger
import os

In [13]:
from src.DNASeqMLOPS.utils.common import read_yaml, create_directories
from src.DNASeqMLOPS.constant import *

class ConfigurationManager:
    """Read the project's YAML files and hand out typed stage configurations."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH
    ):
        """Load the config, params and schema YAML files, then create the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        yaml_sources = (
            ("config", config_filepath),
            ("params", params_filepath),
            ("schema", schema_filepath),
        )
        # Files are read in the order listed; each result lands on the
        # attribute of the same name.
        for attribute, yaml_path in yaml_sources:
            setattr(self, attribute, read_yaml(yaml_path))

        create_directories([self.config.artifacts_root])

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        """Build the evaluation settings from the ``model_evaluation`` config section.

        Returns:
            ModelEvaluationConfig: path-like entries wrapped in ``Path``, the
            MLflow URI as given, and the ``model_training`` section of the
            params file as ``all_params``.
        """
        section = self.config.model_evaluation

        path_fields = (
            "root_dir",
            "test_data_path",
            "test_labels_path",
            "model_dir",
            "metric_file_name",
        )
        as_paths = {field: Path(getattr(section, field)) for field in path_fields}

        return ModelEvaluationConfig(
            **as_paths,
            mlflow_uri=section.mlflow_uri,
            # Every model's training parameters, taken from the params file.
            all_params=self.params.model_training,
        )

In [14]:
from dataclasses import dataclass
from pathlib import Path
import numpy as np
import joblib
import json
import mlflow
import os
from urllib.parse import urlparse
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from DNASeqMLOPS import logger



class ModelEvaluation:
    """Score every persisted model on the test set and record the results.

    The stage loads the test arrays and all ``*.joblib`` models found in
    ``config.model_dir``, computes classification metrics per model, writes
    them to ``config.metric_file_name`` and logs them to MLflow. The metrics
    file doubles as a "already done" marker: when it exists, evaluation is
    skipped.
    """

    def __init__(self, config: "ModelEvaluationConfig"):
        """Store the configuration and make sure the stage directory exists.

        Args:
            config: Settings for this stage (paths, MLflow URI, parameters).
        """
        self.config = config
        # Create evaluation directory if it doesn't exist
        os.makedirs(self.config.root_dir, exist_ok=True)

    def _load_test_data(self):
        """Load test features and labels.

        Returns:
            tuple: ``(X_test, y_test)`` as returned by ``np.load``.

        Raises:
            Exception: Any loading error is logged and re-raised unchanged.
        """
        try:
            X_test = np.load(self.config.test_data_path)
            y_test = np.load(self.config.test_labels_path)
            return X_test, y_test
        except Exception as e:
            logger.error(f"Error loading test data: {e}")
            raise

    def _load_models(self):
        """Load all trained models found in ``config.model_dir``.

        Returns:
            dict: Maps the file name without its ``.joblib`` suffix to the
            loaded object, in sorted file-name order.

        Raises:
            Exception: Any listing/loading error is logged and re-raised.
        """
        suffix = '.joblib'
        models = {}
        try:
            # os.listdir returns entries in arbitrary order; sorting keeps the
            # evaluation and logging order stable between runs.
            for model_file in sorted(os.listdir(self.config.model_dir)):
                if not model_file.endswith(suffix):
                    continue
                # Strip only the suffix so names that contain dots stay
                # distinct ("rf.v1.joblib" and "rf.v2.joblib" must not both
                # collapse to "rf").
                model_name = model_file[:-len(suffix)]
                model_path = os.path.join(self.config.model_dir, model_file)
                # NOTE(security): joblib.load unpickles the file, which can
                # execute arbitrary code - model_dir must only contain
                # artifacts produced by a trusted training run.
                models[model_name] = joblib.load(model_path)
            return models
        except Exception as e:
            logger.error(f"Error loading models: {e}")
            raise

    def evaluate_models(self, models=None):
        """Evaluate all models and return metrics.

        Args:
            models: Optional mapping of model name to an already loaded model.
                When omitted, the models are loaded from ``config.model_dir``.

        Returns:
            dict: Maps each successfully evaluated model name to a dict with
            ``accuracy``, ``f1_score``, ``classification_report`` and
            ``confusion_matrix``. Models whose evaluation raised are logged
            and left out.

        Raises:
            Exception: Failures outside the per-model loop (e.g. loading the
                test data) are logged and re-raised.
        """
        try:
            X_test, y_test = self._load_test_data()
            if models is None:
                models = self._load_models()

            metrics = {}
            for model_name, model in models.items():
                try:
                    y_pred = model.predict(X_test)

                    metrics[model_name] = {
                        'accuracy': accuracy_score(y_test, y_pred),
                        'f1_score': f1_score(y_test, y_pred),
                        'classification_report': classification_report(y_test, y_pred, output_dict=True),
                        'confusion_matrix': confusion_matrix(y_test, y_pred).tolist()
                    }
                    logger.info(f"\n{model_name} Evaluation:\n{json.dumps(metrics[model_name], indent=2)}")
                except Exception as e:
                    # Best effort: one broken model must not hide the results
                    # of the others.
                    logger.error(f"Error evaluating {model_name}: {e}")
                    continue

            return metrics
        except Exception as e:
            logger.error(f"Evaluation failed: {e}")
            raise

    def _save_metrics(self, metrics):
        """Save metrics to JSON file.

        Args:
            metrics: JSON-serialisable mapping produced by ``evaluate_models``.

        Raises:
            Exception: Any write error is logged and re-raised.
        """
        try:
            # The metrics file is not required to live inside root_dir, so its
            # own parent directory may still be missing.
            parent_dir = os.path.dirname(self.config.metric_file_name)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            with open(self.config.metric_file_name, 'w') as f:
                json.dump(metrics, f, indent=4)
            logger.info(f"Metrics saved to {self.config.metric_file_name}")
        except Exception as e:
            logger.error(f"Error saving metrics: {e}")
            raise

    def log_into_mlflow(self):
        """Log evaluation results to MLflow.

        Returns:
            bool: ``True`` when a new evaluation was run and logged, ``False``
            when the metrics file already existed and the work was skipped.

        Raises:
            ValueError: No model produced any metrics.
            Exception: Any other failure is logged and re-raised.
        """
        try:
            if os.path.exists(self.config.metric_file_name):
                logger.info(f"Metrics file {self.config.metric_file_name} already exists - skipping evaluation")
                return False

            mlflow.set_tracking_uri(self.config.mlflow_uri)
            tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

            with mlflow.start_run():
                # Load the models once and reuse them for both evaluation and
                # model logging instead of deserialising every file twice.
                models = self._load_models()

                # Evaluate models
                metrics = self.evaluate_models(models)

                # An empty metrics file would make every later run skip the
                # evaluation, so refuse to write one.
                if not metrics:
                    raise ValueError("No model could be evaluated - metrics file not written")

                # Save metrics to file
                self._save_metrics(metrics)

                # Log parameters and metrics
                mlflow.log_params(self.config.all_params)
                for model_name, model_metrics in metrics.items():
                    for metric_name, value in model_metrics.items():
                        # Only scalar entries are logged; the nested report
                        # and the confusion matrix stay in the JSON file.
                        if isinstance(value, (int, float)):
                            mlflow.log_metric(f"{model_name}_{metric_name}", value)

                # Log models
                # NOTE(review): models are only logged/registered when the
                # tracking URI scheme is not "file" - presumably because a
                # local file store cannot back the model registry; confirm.
                if tracking_url_type_store != "file":
                    for model_name, model in models.items():
                        try:
                            if hasattr(model, 'predict'):
                                mlflow.sklearn.log_model(
                                    sk_model=model,
                                    artifact_path=f"{model_name}_model",
                                    registered_model_name=f"DNA_Seq_{model_name}"
                                )
                        except Exception as e:
                            # Best effort: a failed upload of one model does
                            # not invalidate the metrics already logged.
                            logger.error(f"Error logging {model_name} to MLflow: {e}")

                logger.info("Evaluation results logged to MLflow")
            return True
        except Exception as e:
            logger.error(f"MLflow logging failed: {e}")
            raise

In [15]:
# Run the evaluation stage end to end: build the configuration, evaluate, and
# report whether fresh results were produced or existing ones were reused.
try:
    config = ConfigurationManager()
    eval_config = config.get_model_evaluation_config()
    evaluator = ModelEvaluation(eval_config)

    # log_into_mlflow() returns True only when a new evaluation actually ran.
    logger.info(
        "New evaluation performed and logged"
        if evaluator.log_into_mlflow()
        else "Using existing evaluation results"
    )

except Exception as e:
    logger.error(f"Model evaluation failed: {e}")
    raise

[2025-07-25 17:36:14,575: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-07-25 17:36:14,588: INFO: common: yaml file: params.yaml loaded successfully]
[2025-07-25 17:36:14,595: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-07-25 17:36:14,601: INFO: common: created directory at: artifacts]


[2025-07-25 17:36:20,752: INFO: 3732966409: 
RandomForest Evaluation:
{
  "accuracy": 0.98906,
  "f1_score": 0.9889992760035395,
  "classification_report": {
    "0": {
      "precision": 0.9849472162253164,
      "recall": 0.9933284062081777,
      "f1-score": 0.9891200572837935,
      "support": 50063.0
    },
    "1": {
      "precision": 0.9932540243582234,
      "recall": 0.9847808238380359,
      "f1-score": 0.9889992760035395,
      "support": 49937.0
    },
    "accuracy": 0.98906,
    "macro avg": {
      "precision": 0.9891006202917699,
      "recall": 0.9890546150231068,
      "f1-score": 0.9890596666436665,
      "support": 100000.0
    },
    "weighted avg": {
      "precision": 0.9890953870026462,
      "recall": 0.98906,
      "f1-score": 0.9890597427358732,
      "support": 100000.0
    }
  },
  "confusion_matrix": [
    [
      49729,
      334
    ],
    [
      760,
      49177
    ]
  ]
}]
[2025-07-25 17:38:54,632: INFO: 3732966409: 
SVM Evaluation:
{
  "accuracy": 

Successfully registered model 'DNA_Seq_RandomForest'.
2025/07/25 17:41:02 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: DNA_Seq_RandomForest, version 1
Created version '1' of model 'DNA_Seq_RandomForest'.
Successfully registered model 'DNA_Seq_SVM'.
2025/07/25 17:41:43 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: DNA_Seq_SVM, version 1
Created version '1' of model 'DNA_Seq_SVM'.
Successfully registered model 'DNA_Seq_XGBoost'.
2025/07/25 17:42:00 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: DNA_Seq_XGBoost, version 1


[2025-07-25 17:42:00,698: INFO: 3732966409: Evaluation results logged to MLflow]


Created version '1' of model 'DNA_Seq_XGBoost'.


[2025-07-25 17:42:01,330: INFO: 3201019539: New evaluation performed and logged]
