In [1]:
%cd ../notebooks/freethrow_predictions

/workspaces/spl_freethrow_biomechanics_analysis_ml_prediction/notebooks/freethrow_predictions


In [5]:
%%writefile ml/mlflow/mlflow_logger.py
import mlflow
import mlflow.sklearn
from mlflow.data import from_pandas
import os
import logging
import pandas as pd

class MLflowLogger:
    """
    Convenience wrapper around the MLflow tracking API with a logging-only fallback.

    When ``enable_mlflow`` is true, calls are forwarded to the ``mlflow`` module.
    When it is false, nothing is sent to MLflow: each method writes a summary
    through the standard ``logging`` module instead and returns ``None`` where
    a value would otherwise be returned.
    """

    def __init__(self, tracking_uri=None, experiment_name="Default Experiment", enable_mlflow=True):
        """
        Configure logging and, if enabled, the MLflow tracking URI and experiment.

        Args:
            tracking_uri (str): MLflow tracking URI. When falsy, MLflow's default is kept.
            experiment_name (str): Experiment activated via ``mlflow.set_experiment``.
            enable_mlflow (bool): If False, this class makes no MLflow calls.
        """
        self.enable_mlflow = enable_mlflow
        # Stored unconditionally so the attribute exists even when MLflow is disabled.
        self.experiment_name = experiment_name

        self.logger = logging.getLogger(__name__)
        # hasHandlers() also inspects ancestor loggers. Attaching a private handler
        # when the root logger is already configured (e.g. via logging.basicConfig)
        # would emit every message twice through propagation.
        if not self.logger.hasHandlers():
            handler = logging.StreamHandler()
            handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
            self.logger.addHandler(handler)
        # Only pick a level when nobody configured one for this logger.
        if self.logger.level == logging.NOTSET:
            self.logger.setLevel(logging.INFO)

        if not self.enable_mlflow:
            return

        if tracking_uri:
            mlflow.set_tracking_uri(tracking_uri)
            self.logger.info(f"MLflow Tracking URI set to: {tracking_uri}")
        else:
            self.logger.warning("MLflow Tracking URI not provided. Using default.")
        mlflow.set_experiment(self.experiment_name)
        self.logger.info(f"MLflow Experiment set to: {self.experiment_name}")

    def run_context(self, run_name, nested=False, tags=None):
        """
        Return a context manager for a run.

        Args:
            run_name (str): Name of the run.
            nested (bool): Whether the run is nested under an active run.
            tags (dict): Tags passed to ``mlflow.start_run``.

        Returns:
            The object returned by ``mlflow.start_run(...)`` when MLflow is
            enabled, otherwise a ``contextlib.nullcontext()`` that does nothing.
        """
        if self.enable_mlflow:
            return mlflow.start_run(run_name=run_name, nested=nested, tags=tags)
        # No-op stand-in so callers can always write ``with logger.run_context(...)``.
        from contextlib import nullcontext
        return nullcontext()

    def log_run(self, run_name, params, metrics, artifacts=None, tags=None, datasets=None, nested=False):
        """
        Log a single run with MLflow or basic logging.

        Args:
            run_name (str): Name of the run.
            params (dict): Parameters to log.
            metrics (dict): Metrics to log. A list/tuple value is logged one
                entry per step (step index = position in the sequence).
            artifacts (str, os.PathLike or iterable of those): Paths to
                artifacts (files or directories).
            tags (dict): Tags to set for the run.
            datasets (list): Dicts with the keys "dataframe", "source" and "name".
            nested (bool): Whether this run is nested under an active run.

        Raises:
            Exception: Any error raised while logging to MLflow is logged and re-raised.
        """
        if not self.enable_mlflow:
            self._log_run_without_mlflow(run_name, params, metrics, artifacts, tags, datasets)
            return

        try:
            with mlflow.start_run(run_name=run_name, nested=nested):
                self.logger.info(f"Started MLflow run: {run_name}")

                if params:
                    mlflow.log_params(params)
                    self.logger.debug(f"Logged parameters: {params}")

                if metrics:
                    self._log_metrics(metrics)

                if tags:
                    mlflow.set_tags(tags)
                    self.logger.debug(f"Set tags: {tags}")

                if artifacts:
                    self._log_artifacts(artifacts)

                if datasets:
                    for dataset in datasets:
                        self.log_datasets(dataset["dataframe"], dataset["source"], dataset["name"])
        except Exception as e:
            self.logger.error(f"Failed to log MLflow run '{run_name}': {e}")
            raise

    def _log_metrics(self, metrics):
        """Send each metric to MLflow; sequence values are logged step by step."""
        for key, value in metrics.items():
            if isinstance(value, (list, tuple)):  # e.g. one value per epoch
                for step, metric_value in enumerate(value):
                    mlflow.log_metric(key, metric_value, step=step)
                    self.logger.debug(f"Logged metric '{key}' at step {step}: {metric_value}")
            else:
                mlflow.log_metric(key, value)
                self.logger.debug(f"Logged metric '{key}': {value}")

    def _log_artifacts(self, artifacts):
        """Log one artifact path or every path of an iterable of paths."""
        # A lone str/PathLike is a single artifact; anything else is treated as a
        # collection, so tuples and Path objects are no longer silently ignored.
        if isinstance(artifacts, (str, os.PathLike)):
            artifacts = [artifacts]
        for artifact_path in artifacts:
            mlflow.log_artifact(os.fspath(artifact_path))
            self.logger.debug(f"Logged artifact: {artifact_path}")

    def _log_run_without_mlflow(self, run_name, params, metrics, artifacts, tags, datasets):
        """Write a run summary through ``logging`` when MLflow is disabled."""
        self.logger.info(f"Run Name: {run_name}")
        if params:
            self.logger.info(f"Parameters: {params}")
        if metrics:
            self.logger.info(f"Metrics: {metrics}")
        if tags:
            self.logger.info(f"Tags: {tags}")
        if artifacts:
            self.logger.info(f"Artifacts: {artifacts}")
        if datasets:
            for dataset in datasets:
                self.logger.info(f"Dataset Logged: {dataset['name']} from {dataset['source']}")

    def log_model(self, model, model_name, conda_env=None):
        """
        Log a model to MLflow or perform basic logging.

        Args:
            model: Trained model object, handed to ``mlflow.sklearn.log_model``.
            model_name (str): Name of the model; also used as the artifact path.
            conda_env (str): Path to a Conda environment file.

        Raises:
            Exception: Any error raised by MLflow is logged and re-raised.
        """
        if not self.enable_mlflow:
            self.logger.info(f"Model '{model_name}' training complete. (MLflow logging disabled)")
            return

        try:
            mlflow.sklearn.log_model(model, artifact_path=model_name, conda_env=conda_env)
            self.logger.info(f"Model '{model_name}' logged to MLflow.")
        except Exception as e:
            self.logger.error(f"Failed to log model '{model_name}' to MLflow: {e}")
            raise

    def log_datasets(self, dataframe, source, name):
        """
        Create a dataset log entry or perform basic logging.

        Args:
            dataframe (pd.DataFrame): The dataset.
            source (str): Data source (e.g., file path, S3 URI).
            name (str): Dataset name.

        Returns:
            The dataset object built by ``mlflow.data.from_pandas`` and passed
            to ``mlflow.log_input``, or None when MLflow is disabled.

        Raises:
            Exception: Any error raised by MLflow is logged and re-raised.
        """
        if not self.enable_mlflow:
            self.logger.info(f"Dataset '{name}' from source '{source}' logged. (MLflow logging disabled)")
            return None

        try:
            dataset = from_pandas(dataframe, source=source, name=name)
            mlflow.log_input(dataset)
            self.logger.info(f"Dataset '{name}' logged to MLflow from source '{source}'.")
            return dataset
        except Exception as e:
            self.logger.error(f"Failed to log dataset '{name}' to MLflow: {e}")
            raise

    def get_active_run(self):
        """
        Retrieve the current active MLflow run or log a message.

        Returns:
            The result of ``mlflow.active_run()``, or None when MLflow is disabled.
        """
        if self.enable_mlflow:
            return mlflow.active_run()
        self.logger.info("No active MLflow run. MLflow logging is disabled.")
        return None

    def get_last_run(self):
        """
        Retrieve the last active MLflow run or log a message.

        Returns:
            The result of ``mlflow.last_active_run()``, or None when MLflow is disabled.
        """
        if self.enable_mlflow:
            return mlflow.last_active_run()
        self.logger.info("No last MLflow run available. MLflow logging is disabled.")
        return None


Overwriting ml/mlflow/mlflow_logger.py


In [6]:
%%writefile ml/train_utils/model_savers.py
import joblib
from pathlib import Path
import mlflow.sklearn

class LocalModelSaver:
    """Persist models as joblib files named ``<model_name>_model.pkl`` in one directory."""

    def __init__(self, save_dir: Path):
        """
        Remember the target directory and create it if it does not exist.

        Args:
            save_dir: Directory for the model files. A plain string (or any
                other path-like value) is accepted and converted to ``Path``.
        """
        # Coerce so callers passing a str do not crash on ``.mkdir`` / ``/``.
        self.save_dir = Path(save_dir)
        self.save_dir.mkdir(parents=True, exist_ok=True)

    def _model_path(self, model_name: str) -> Path:
        """Return the file path used for ``model_name`` inside ``save_dir``."""
        return self.save_dir / f"{model_name}_model.pkl"

    def save(self, model, model_name: str):
        """
        Serialize ``model`` with ``joblib.dump``.

        Returns:
            Path of the written file.
        """
        model_path = self._model_path(model_name)
        joblib.dump(model, model_path)
        return model_path

    def load(self, model_name: str):
        """
        Load the model previously stored under ``model_name``.

        Returns:
            The object returned by ``joblib.load``.
        """
        # NOTE(security): joblib.load unpickles the file, which can execute
        # arbitrary code. Only load model files from a trusted location.
        return joblib.load(self._model_path(model_name))

class MLflowModelSaver:
    """Save and load scikit-learn models through MLflow's sklearn flavor."""

    def __init__(self, mlflow_logger, artifact_path: str = "model"):
        """
        Args:
            mlflow_logger: Logger object kept on the instance; ``save`` and
                ``load`` call ``mlflow.sklearn`` directly and do not use it.
            artifact_path: Stored on the instance. ``save`` uses ``model_name``
                as the artifact path, so this value is currently informational.
        """
        self.mlflow_logger = mlflow_logger
        self.artifact_path = artifact_path

    def save(self, model, model_name: str, conda_env: str = None):
        """
        Log ``model`` to MLflow under the artifact path ``model_name``.

        Args:
            model: Trained model object.
            model_name: Used as the artifact path of the logged model.
            conda_env: Path to a Conda environment file.

        Returns:
            str: URI of the model that was just logged, suitable for ``load``.
        """
        model_info = mlflow.sklearn.log_model(model, artifact_path=model_name, conda_env=conda_env)
        # The model is only logged, never registered, so a registry URI such as
        # "models:/<name>/1" would normally not resolve. Return the URI MLflow
        # reports for the logged model instead.
        return model_info.model_uri

    def load(self, model_uri: str):
        """Load and return the model stored at ``model_uri``."""
        return mlflow.sklearn.load_model(model_uri)


Overwriting ml/train_utils/model_savers.py


In [9]:
%%writefile ml/jobs/tuning_job.py
import logging
from pathlib import Path
import json
import pandas as pd

from ml.config.config_loader import load_config
from ml.config.config_models import AppConfig
from ml.train_utils.train_utils import bayes_best_model_train
from ml.mlflow.mlflow_logger import MLflowLogger

# Import the feature metadata loader
from ml.feature_selection.feature_importance_calculator import manage_features

class TuningJob:
    """
    Tuning pipeline: read the raw CSV, preprocess it, and run
    ``bayes_best_model_train`` inside an MLflow run context.
    """

    def __init__(self, config_path: Path, mlflow_logger: MLflowLogger):
        """
        Args:
            config_path: Path of the configuration file passed to ``load_config``.
            mlflow_logger: Provides ``run_context`` for the tuning step.
        """
        self.config_path = config_path
        self.config: AppConfig = load_config(config_path)
        self.logger = logging.getLogger(__name__)
        self.mlflow_logger = mlflow_logger

    def run(self):
        """
        Execute the tuning job.

        Failures while reading the data, loading feature metadata or
        preprocessing are logged and end the job early (returning None).
        A failure inside the tuning call is logged and re-raised.
        """
        # Output locations derived from the configured model directory.
        cfg_paths = self.config.paths
        save_root = Path(cfg_paths.model_save_base_dir).resolve()
        report_file = save_root / "classification_report.txt"
        tuning_file = save_root / "tuning_results.json"

        # Raw training data.
        csv_path = Path(cfg_paths.data_dir).resolve() / cfg_paths.raw_data
        try:
            frame = pd.read_csv(csv_path)
            self.logger.info(f"Loaded data from {csv_path}.")
        except Exception as exc:
            self.logger.error(f"Failed to load dataset: {exc}")
            return

        # Feature metadata; these paths are relative to the current working directory.
        metadata = manage_features(
            mode='load',
            paths={
                'features': '../../data/preprocessor/features_info/final_ml_df_selected_features_columns.pkl',
                'ordinal_categoricals': '../../data/preprocessor/features_info/ordinal_categoricals.pkl',
                'nominal_categoricals': '../../data/preprocessor/features_info/nominal_categoricals.pkl',
                'numericals': '../../data/preprocessor/features_info/numericals.pkl',
                'y_variable': '../../data/preprocessor/features_info/y_variable.pkl'
            },
        )
        if not metadata:
            self.logger.error("Failed to load feature metadata.")
            return
        target_column = metadata.get('y_variable')

        # Imported here, as in the original flow, right before it is first needed.
        from datapreprocessor import DataPreprocessor
        preprocessor_settings = dict(
            model_type="Tree Based Classifier",
            y_variable=target_column,
            ordinal_categoricals=metadata.get('ordinal_categoricals'),
            nominal_categoricals=metadata.get('nominal_categoricals'),
            numericals=metadata.get('numericals'),
            mode='train',
            debug=self.config.logging.debug,
            normalize_debug=False,
            normalize_graphs_output=False,
            graphs_output_dir=Path(cfg_paths.plots_output_dir),
            transformers_dir=Path(cfg_paths.transformers_save_base_dir),
        )
        data_prep = DataPreprocessor(**preprocessor_settings)
        try:
            # Only the first four returned items are used; any extras are discarded.
            X_train, X_test, y_train, y_test, *_ = data_prep.final_preprocessing(frame)
        except Exception as exc:
            self.logger.error(f"Error during preprocessing: {exc}")
            return

        # The tuning itself runs inside an MLflow run (or a no-op context).
        with self.mlflow_logger.run_context("Tuning Job"):
            try:
                bayes_best_model_train(
                    X_train=X_train,
                    y_train=y_train,
                    X_test=X_test,
                    y_test=y_test,
                    selection_metric=self.config.models.selection_metric,
                    model_save_dir=save_root,
                    classification_save_path=report_file,
                    tuning_results_save=tuning_file,
                    selected_models=self.config.models.selected_models,
                    use_pca=True
                )
            except Exception as exc:
                self.logger.error(f"Tuning failed: {exc}")
                raise

        self.logger.info("Tuning job completed successfully.")

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    # Track runs in a local file store. Path.as_uri() builds a well-formed
    # file URI on every platform; prefixing "file:///" to an absolute POSIX
    # path by hand produces four slashes.
    abs_tracking_uri = Path("../../data/model/mlruns").resolve().as_uri()
    mlflow_logger = MLflowLogger(tracking_uri=abs_tracking_uri, experiment_name="SPL Feedback Experiment")
    print("Using absolute MLflow Tracking URI:", abs_tracking_uri)

    config_path = Path('../../data/model/preprocessor_config/preprocessor_config.yaml')
    job = TuningJob(config_path, mlflow_logger)
    job.run()


Overwriting ml/jobs/tuning_job.py


In [11]:
%%writefile ml/jobs/training_job.py
import logging
from pathlib import Path
import json
import pandas as pd

from ml.config.config_loader import load_config
from ml.config.config_models import AppConfig
from ml.train_utils.train_utils import save_model, load_model, bayes_best_model_train
from ml.mlflow.mlflow_logger import MLflowLogger

# Import feature metadata loader
from ml.feature_selection.feature_importance_calculator import manage_features
from datapreprocessor import DataPreprocessor

class TrainingJob:
    """
    Training pipeline: refit the model named in ``tuning_results.json`` on the
    preprocessed training split, save it, and log it through ``MLflowLogger``.
    """

    def __init__(self, config_path: Path, mlflow_logger: MLflowLogger):
        """
        Args:
            config_path: Path of the configuration file passed to ``load_config``.
            mlflow_logger: Used for the run context and for ``log_model``.
        """
        self.config_path = config_path
        self.config: AppConfig = load_config(config_path)
        self.logger = logging.getLogger(__name__)
        self.mlflow_logger = mlflow_logger

    def run(self):
        """
        Execute the training job.

        Every step logs its own error and ends the job early (returning None)
        on failure; nothing is re-raised.
        """
        cfg_paths = self.config.paths
        save_root = Path(cfg_paths.model_save_base_dir).resolve()
        # Computed as in the tuning job; not referenced again in this method.
        classification_report_path = save_root / "classification_report.txt"
        tuning_file = save_root / "tuning_results.json"

        # --- Raw data -------------------------------------------------------
        csv_path = Path(cfg_paths.data_dir).resolve() / cfg_paths.raw_data
        try:
            frame = pd.read_csv(csv_path)
            self.logger.info(f"Loaded data from {csv_path}. Shape: {frame.shape}")
        except Exception as exc:
            self.logger.error(f"Failed to load dataset: {exc}")
            return

        # --- Feature metadata (paths relative to the working directory) -----
        metadata = manage_features(
            mode='load',
            paths={
                'features': '../../data/preprocessor/features_info/final_ml_df_selected_features_columns.pkl',
                'ordinal_categoricals': '../../data/preprocessor/features_info/ordinal_categoricals.pkl',
                'nominal_categoricals': '../../data/preprocessor/features_info/nominal_categoricals.pkl',
                'numericals': '../../data/preprocessor/features_info/numericals.pkl',
                'y_variable': '../../data/preprocessor/features_info/y_variable.pkl'
            },
        )
        if not metadata:
            self.logger.error("Failed to load feature metadata.")
            return
        target_column = metadata.get('y_variable')

        # --- Preprocessing --------------------------------------------------
        data_prep = DataPreprocessor(
            model_type="Tree Based Classifier",
            y_variable=target_column,
            ordinal_categoricals=metadata.get('ordinal_categoricals'),
            nominal_categoricals=metadata.get('nominal_categoricals'),
            numericals=metadata.get('numericals'),
            mode='train',
            debug=self.config.logging.debug,
            normalize_debug=False,
            normalize_graphs_output=False,
            graphs_output_dir=Path(cfg_paths.plots_output_dir),
            transformers_dir=Path(cfg_paths.transformers_save_base_dir)
        )
        try:
            # Only the first four returned items are used; any extras are discarded.
            X_train, X_test, y_train, y_test, *_ = data_prep.final_preprocessing(frame)
            self.logger.info(f"Preprocessing complete. X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
        except Exception as exc:
            self.logger.error(f"Error during preprocessing: {exc}")
            return

        # --- Which model won the tuning? ------------------------------------
        try:
            with tuning_file.open("r") as handle:
                tuning_summary = json.load(handle)
            winner = tuning_summary.get("Best Model")
            if not winner:
                self.logger.error("No best model found in tuning results.")
                return
            winner_name = winner.get("model_name")
            self.logger.info(f"Best model selected from tuning: {winner_name}")
        except Exception as exc:
            self.logger.error(f"Error loading tuning results: {exc}")
            return

        # --- Load, refit, save ----------------------------------------------
        try:
            estimator = load_model(winner_name, save_root)
            self.logger.info(f"Loaded best model '{winner_name}' from disk.")
        except Exception as exc:
            self.logger.error(f"Error loading model '{winner_name}': {exc}")
            return

        try:
            estimator.fit(X_train, y_train)
            self.logger.info(f"Model '{winner_name}' retrained on full training data.")
        except Exception as exc:
            self.logger.error(f"Error during retraining of model '{winner_name}': {exc}")
            return

        try:
            save_model(estimator, winner_name, save_dir=save_root)
            self.logger.info(f"Retrained model '{winner_name}' saved successfully.")
        except Exception as exc:
            self.logger.error(f"Failed to save retrained model '{winner_name}': {exc}")
            return

        # --- MLflow ---------------------------------------------------------
        with self.mlflow_logger.run_context("Training Job"):
            try:
                self.mlflow_logger.log_model(estimator, winner_name)
            except Exception as exc:
                self.logger.error(f"Failed to log retrained model '{winner_name}' to MLflow: {exc}")
                return

        self.logger.info("Training job completed successfully.")

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    # Track runs in a local file store. Path.as_uri() builds a well-formed
    # file URI on every platform; prefixing "file:///" to an absolute POSIX
    # path by hand produces four slashes.
    abs_tracking_uri = Path("../../data/model/mlruns").resolve().as_uri()
    mlflow_logger = MLflowLogger(tracking_uri=abs_tracking_uri, experiment_name="SPL Feedback Experiment")
    print("Using absolute MLflow Tracking URI:", abs_tracking_uri)

    config_path = Path('../../data/model/preprocessor_config/preprocessor_config.yaml')
    job = TrainingJob(config_path, mlflow_logger)
    job.run()


Overwriting ml/jobs/training_job.py


In [12]:
%%writefile ml/jobs/inference_job.py
import logging
from pathlib import Path
import pandas as pd
import json

from ml.config.config_loader import load_config
from ml.config.config_models import AppConfig
from ml.mlflow.mlflow_logger import MLflowLogger
from ml.train_utils.train_utils import load_model

# Import feature metadata loader
from ml.feature_selection.feature_importance_calculator import manage_features
from datapreprocessor import DataPreprocessor
from ml.predict.predict import predict_and_attach_predict_probs

class InferenceJob:
    """
    Inference pipeline: preprocess the configured CSV in predict mode, run the
    model named in ``tuning_results.json`` on it, and write the result to a CSV.
    """

    def __init__(self, config_path: Path, mlflow_logger: MLflowLogger):
        """
        Args:
            config_path: Path of the configuration file passed to ``load_config``.
            mlflow_logger: Provides ``run_context`` for the prediction step.
        """
        self.config_path = config_path
        self.config: AppConfig = load_config(config_path)
        self.logger = logging.getLogger(__name__)
        self.mlflow_logger = mlflow_logger

    def run(self):
        """
        Execute the inference job.

        Each step logs its own error and ends the job early (returning None)
        on failure. A failure while writing the CSV is logged only.
        """
        cfg_paths = self.config.paths
        model_root = Path(cfg_paths.model_save_base_dir).resolve()

        # --- Which model won the tuning? ------------------------------------
        tuning_file = model_root / "tuning_results.json"
        try:
            with tuning_file.open("r") as handle:
                tuning_summary = json.load(handle)
            winner = tuning_summary.get("Best Model")
            if not winner:
                self.logger.error("No best model info found in tuning results.")
                return
            winner_name = winner.get("model_name")
            self.logger.info(f"Best model for inference: {winner_name}")
        except Exception as exc:
            self.logger.error(f"Error loading tuning results: {exc}")
            return

        # --- Data to predict on ---------------------------------------------
        csv_path = Path(cfg_paths.data_dir).resolve() / cfg_paths.raw_data
        try:
            frame = pd.read_csv(csv_path)
            self.logger.info(f"Loaded prediction data from {csv_path}.")
        except Exception as exc:
            self.logger.error(f"Error loading prediction data: {exc}")
            return

        # --- Feature metadata (paths relative to the working directory) -----
        metadata = manage_features(
            mode='load',
            paths={
                'features': '../../data/preprocessor/features_info/final_ml_df_selected_features_columns.pkl',
                'ordinal_categoricals': '../../data/preprocessor/features_info/ordinal_categoricals.pkl',
                'nominal_categoricals': '../../data/preprocessor/features_info/nominal_categoricals.pkl',
                'numericals': '../../data/preprocessor/features_info/numericals.pkl',
                'y_variable': '../../data/preprocessor/features_info/y_variable.pkl'
            },
        )
        if not metadata:
            self.logger.error("Failed to load feature metadata.")
            return

        # --- Preprocessing in predict mode ----------------------------------
        data_prep = DataPreprocessor(
            model_type="Tree Based Classifier",
            y_variable=metadata.get('y_variable'),
            ordinal_categoricals=metadata.get('ordinal_categoricals'),
            nominal_categoricals=metadata.get('nominal_categoricals'),
            numericals=metadata.get('numericals'),
            mode='predict',
            options={},
            debug=self.config.logging.debug,
            normalize_debug=False,
            normalize_graphs_output=False,
            graphs_output_dir=Path(cfg_paths.plots_output_dir),
            transformers_dir=Path(cfg_paths.transformers_save_base_dir)
        )
        try:
            # The middle item of the returned triple is not used by this job.
            X_preprocessed, recommendations, X_inversed = data_prep.final_preprocessing(frame)
            self.logger.info(f"Preprocessing complete for inference. Processed data shape: {X_preprocessed.shape}")
        except Exception as exc:
            self.logger.error(f"Error during preprocessing: {exc}")
            return

        # --- Model ----------------------------------------------------------
        try:
            estimator = load_model(winner_name, model_root)
            self.logger.info(f"Loaded model '{winner_name}' successfully.")
        except Exception as exc:
            self.logger.error(f"Error loading model '{winner_name}': {exc}")
            return

        # --- Predict and persist, inside an MLflow run (or no-op context) ---
        with self.mlflow_logger.run_context("Inference Job"):
            try:
                predictions, prediction_probs, X_inversed = predict_and_attach_predict_probs(
                    estimator, X_preprocessed, X_inversed
                )
            except Exception as exc:
                self.logger.error(f"Error during prediction: {exc}")
                return

            # The third returned item is what gets written to disk.
            out_dir = Path(cfg_paths.predictions_output_dir).resolve()
            out_dir.mkdir(parents=True, exist_ok=True)
            file_stem = winner_name.replace(" ", "_")
            out_file = out_dir / f'predictions_{file_stem}.csv'
            try:
                X_inversed.to_csv(out_file, index=False)
                self.logger.info(f"Predictions saved to {out_file}")
            except Exception as exc:
                self.logger.error(f"Error saving predictions: {exc}")

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    # Track runs in a local file store. Path.as_uri() builds a well-formed
    # file URI on every platform; prefixing "file:///" to an absolute POSIX
    # path by hand produces four slashes.
    abs_tracking_uri = Path("../../data/model/mlruns").resolve().as_uri()
    mlflow_logger = MLflowLogger(tracking_uri=abs_tracking_uri, experiment_name="SPL Feedback Experiment")
    print("Using absolute MLflow Tracking URI:", abs_tracking_uri)

    config_path = Path('../../data/model/preprocessor_config/preprocessor_config.yaml')
    job = InferenceJob(config_path, mlflow_logger)
    job.run()


Overwriting ml/jobs/inference_job.py
