In [1]:
import os
%pwd
os.chdir("../")
%pwd


'd:\\Data Science\\END to END Proj\\DNAseqMLOPS'

In [2]:

## ENTITY
from dataclasses import dataclass
from pathlib import Path
@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable bundle of settings for the model-training stage.

    Instances are frozen: fields cannot be reassigned after construction.
    """

    # Working directory of the model-training stage.
    root_dir: Path
    # Directory the transformed training data is read from.
    transformed_data_dir: Path
    # Directory the fitted models are written to.
    model_dir: Path
    # Mapping of model name -> constructor keyword arguments (from params.yaml).
    models_params: dict

In [3]:
from src.DNASeqMLOPS.utils.common import read_yaml, create_directories
from src.DNASeqMLOPS.constant import *

class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH
    ):
        """Load config, params and schema, then create the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        # Files are read in the order config -> params -> schema.
        self.config, self.params, self.schema = (
            read_yaml(yaml_path)
            for yaml_path in (config_filepath, params_filepath, schema_filepath)
        )

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the model-trainer settings and create its output directories.

        Returns:
            A ModelTrainerConfig filled from the ``model_trainer`` section of
            the config file and ``model_training.models`` of the params file.
        """
        trainer_section = self.config.model_trainer
        hyperparams = self.params.model_training.models

        # Build each output path once and reuse it for both the directory
        # creation and the returned config.
        stage_root = Path(trainer_section.root_dir)
        models_out = Path(trainer_section.model_dir)
        create_directories([stage_root, models_out])

        return ModelTrainerConfig(
            root_dir=stage_root,
            transformed_data_dir=Path(trainer_section.transformed_data_dir),
            model_dir=models_out,
            models_params=hyperparams,
        )

In [9]:
from dataclasses import dataclass
from pathlib import Path
import numpy as np
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, classification_report
import os
from typing import Dict
from DNASeqMLOPS import logger
class ModelTrainer:
    """Fit the configured classifiers and persist them with joblib.

    Every entry of ``config.models_params`` whose key is one of
    ``"RandomForest"``, ``"SVM"`` or ``"XGBoost"`` is instantiated with its
    parameter mapping, fitted on ``X.npy`` / ``y.npy`` loaded from
    ``config.transformed_data_dir`` and written to ``config.model_dir`` as
    ``<model_name>.joblib``. Metrics are computed on the same data the models
    were fitted on and stored in ``training_results.joblib``.
    """

    # Name of the metrics file. It is not a model, so it must not count as one
    # when deciding whether training can be skipped.
    _RESULTS_FILENAME = "training_results.joblib"

    def __init__(self, config: ModelTrainerConfig):
        """Store the config and the model-name -> estimator-class registry."""
        self.config = config
        self.model_classes = {
            "RandomForest": RandomForestClassifier,
            "SVM": SVC,
            "XGBoost": xgb.XGBClassifier
        }

    def _trained_models_exist(self) -> bool:
        """Return True if the model directory holds at least one model file.

        The metrics file is ignored: it is written even when every model
        fails to train, and counting it would make all later runs skip
        training although no model was ever saved.
        """
        model_dir = self.config.model_dir
        if not os.path.exists(model_dir):
            return False
        return any(
            fname.endswith(".joblib") and fname != self._RESULTS_FILENAME
            for fname in os.listdir(model_dir)
        )

    @staticmethod
    def _evaluate(y_true, y_pred) -> dict:
        """Compute accuracy, F1 and the text classification report."""
        # f1_score defaults to average="binary", which raises ValueError when
        # the target has more than two classes. Keep the default for binary
        # targets and use the weighted average otherwise.
        average = "binary" if np.unique(y_true).size <= 2 else "weighted"
        return {
            "accuracy": accuracy_score(y_true, y_pred),
            "f1": f1_score(y_true, y_pred, average=average),
            "classification_report": classification_report(y_true, y_pred)
        }

    def train(self):
        """Train models only if no trained models exist yet.

        Returns:
            False if model files were already present and training was
            skipped; True if training was attempted. True is returned even
            when individual models failed - failures are logged and the
            remaining models are still trained.
        """
        if self._trained_models_exist():
            logger.info(f"Model files already exist in {self.config.model_dir} - skipping training")
            return False

        os.makedirs(self.config.model_dir, exist_ok=True)
        logger.info("Training models...")

        X = np.load(os.path.join(self.config.transformed_data_dir, "X.npy"))
        y = np.load(os.path.join(self.config.transformed_data_dir, "y.npy"))

        results = {}
        for model_name, params in self.config.models_params.items():
            model_class = self.model_classes.get(model_name)
            if model_class is None:
                # Surface typos / unsupported names instead of dropping them silently.
                logger.warning(f"Unknown model '{model_name}' in params - skipping")
                continue

            try:
                logger.info(f"Training {model_name} with params: {params}")
                model = model_class(**params)
                model.fit(X, y)

                # Save model
                model_path = os.path.join(self.config.model_dir, f"{model_name}.joblib")
                joblib.dump(model, model_path)

                # Evaluate (on the training data itself)
                y_pred = model.predict(X)
                results[model_name] = self._evaluate(y, y_pred)
                logger.info(f"{model_name} trained successfully")

            except Exception as e:
                # Best effort: one failing model must not stop the others.
                logger.error(f"Error training {model_name}: {str(e)}")
                continue

        # Save evaluation results
        joblib.dump(results, os.path.join(self.config.model_dir, self._RESULTS_FILENAME))
        logger.info("Model training completed")
        return True


In [10]:
# Run the model-training stage end to end; any failure is logged and re-raised
# so the cell visibly fails.
try:
    config_manager = ConfigurationManager()
    trainer_config = config_manager.get_model_trainer_config()
    trainer = ModelTrainer(trainer_config)

    # train() returns a truthy value when it trained and a falsy one when it
    # skipped because model files were already present.
    logger.info(
        "New models trained and saved" if trainer.train() else "Using existing models"
    )

except Exception as e:
    logger.error(f"Model training pipeline failed: {str(e)}")
    raise

[2025-07-25 14:52:58,538: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-07-25 14:52:58,542: INFO: common: yaml file: params.yaml loaded successfully]
[2025-07-25 14:52:58,549: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-07-25 14:52:58,553: INFO: common: created directory at: artifacts]
[2025-07-25 14:52:58,554: INFO: common: created directory at: artifacts\model_trainer]
[2025-07-25 14:52:58,557: INFO: common: created directory at: artifacts\model_trainer\models]
[2025-07-25 14:52:58,557: INFO: 3996164474: Training models...]
[2025-07-25 14:52:59,006: INFO: 3996164474: Training RandomForest with params: {'n_estimators': 200, 'max_depth': 10, 'random_state': 42}]
[2025-07-25 14:55:41,135: INFO: 3996164474: RandomForest trained successfully]
[2025-07-25 14:55:41,135: INFO: 3996164474: Training SVM with params: {'kernel': 'rbf', 'C': 1.0, 'gamma': 'scale', 'random_state': 42, 'probability': True}]
[2025-07-25 15:26:33,386: INFO: 3996164474: SVM