In [None]:
# Defining a DAG for model training
# Components:
# 1. Data ingestion
# 2. Data preprocessing
# 3. Model training
# 4. Model evaluation
# 5. Model deployment

In [10]:
%%writefile trainingFlow.py

# Importing Metaflow-related libraries
from metaflow import FlowSpec, step, Parameter, conda, batch

import optuna
import mlflow
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import cross_val_score
import datetime
from mlopshelpers.data.dataset import get_data, split_data, preprocess_data
from mlopshelpers.training.model_train import train_random_forest, cv_random_forest
from mlopshelpers.evaluation.evaluate import evaluate_classification

def setup_env(tracking_uri="http://localhost:5000", experiment_name="mlflow-tutorial"):
    """Point MLflow at a tracking server and select the active experiment.

    Called at the top of every flow step that logs to MLflow, so that each
    step configures the client before opening a run.

    Args:
        tracking_uri: URI handed to ``mlflow.set_tracking_uri``. Defaults to
            the local server at ``http://localhost:5000``.
        experiment_name: Name handed to ``mlflow.set_experiment``. Defaults to
            ``"mlflow-tutorial"``.

    Returns:
        None.

    Notes:
        No credentials are set here. If the tracking server's artifact or
        backend store needs them (e.g. ``AWS_ACCESS_KEY_ID``,
        ``AWS_SECRET_ACCESS_KEY``, ``MLFLOW_S3_ENDPOINT_URL``, ``MYSQL_*``),
        export them in the process environment instead of hard-coding them
        in this file.
    """
    # Dependency pins noted for a possible future @conda setup:
    #   boto3 1.26.37, mlflow 2.1.1, scikit-learn 1.2.0,
    #   pandas 1.5.2, optuna 3.0.5
    import mlflow  # local import keeps this helper self-contained

    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment(experiment_name)

class trainingFlow(FlowSpec):
    """Metaflow flow that tunes, trains, evaluates and registers a random forest.

    The steps form a linear chain:
    ``start -> ingest_data -> data_split -> data_preprocessing ->
    hyperparameter_tuning -> train_final_model -> end``.

    Values assigned to ``self`` in one step (``X``, ``y``, the train/test
    splits, ``best_params``) are read back by the steps that follow.
    """

    @step
    def start(self):
        """Entry step; only hands control to data ingestion."""
        self.next(self.ingest_data)

    #@conda(libraries=requirements, python='3.9.5')
    @step
    def ingest_data(self):
        """Load the "iris" dataset into ``self.X`` and ``self.y``."""
        self.X, self.y = get_data("iris")
        self.next(self.data_split)

    @step
    def data_split(self):
        """Split ``X``/``y`` into train and test parts via ``split_data``."""
        (self.X_train,
         self.X_test,
         self.y_train,
         self.y_test) = split_data(self.X, self.y)
        self.next(self.data_preprocessing)

    @step
    def data_preprocessing(self):
        """Preprocess the train/test features, replacing them in place.

        All four iris measurements are declared continuous; there are no
        categorical features.
        """
        continuous_features = [
            'sepal length (cm)',
            'sepal width (cm)',
            'petal length (cm)',
            'petal width (cm)',
        ]
        # The third return value is discarded here
        # (presumably the fitted preprocessor -- confirm in mlopshelpers).
        self.X_train, self.X_test, _ = preprocess_data(
            self.X_train,
            self.X_test,
            continuous_features=continuous_features,
            categorical_features=[],
        )
        self.next(self.hyperparameter_tuning)

    #@conda(libraries=requirements,python='3.9.5')
    @step
    def hyperparameter_tuning(self):
        """Search random-forest hyperparameters with Optuna, logging each trial.

        Every trial is recorded as its own MLflow run with the sampled
        hyperparameters and the ``mean_cv_score`` metric. The best parameter
        set is stored in ``self.best_params`` for the final training step.
        """
        setup_env()

        def optimize_rf(trial):
            """Optuna objective: cross-validate one sampled configuration."""
            timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
            with mlflow.start_run(run_name=f"optuna-hp-{trial.number}-{timestamp}"):
                # Sample the hyperparameter values that we want to optimize
                params = {
                    'n_estimators': trial.suggest_int('n_estimators', 1, 100),
                    'max_depth': trial.suggest_int('max_depth', 2, 10),
                    'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
                    'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
                    'max_features': trial.suggest_float('max_features', 0.1, 1.0),
                }

                score = cv_random_forest(self.X_train, self.y_train, params)

                # Log the hyperparameters and cross-validation score to MLflow
                for param_name, param_value in params.items():
                    mlflow.log_param(param_name, param_value)
                mlflow.log_metric('mean_cv_score', score)

            # The mean cross-validation score is the objective value
            return score

        # Optuna minimizes by default, which would select the configuration
        # with the LOWEST cross-validation score. The objective is a score,
        # so the study must maximize it.
        # NOTE(review): assumes cv_random_forest returns a higher-is-better
        # score (e.g. mean CV accuracy) -- confirm against mlopshelpers.
        study = optuna.create_study(direction="maximize")

        # Run the optimization loop
        study.optimize(optimize_rf, n_trials=100)

        # Keep the best hyperparameter values for the final model
        self.best_params = study.best_params

        self.next(self.train_final_model)

    #@conda(libraries=requirements,python='3.9.5')
    @step
    def train_final_model(self):
        """Train on the best hyperparameters, evaluate on the test split, register.

        Inside a single MLflow run named ``optuna-hp-final`` this logs the
        chosen hyperparameters, every metric returned by
        ``evaluate_classification`` (called with "macro"), and the fitted
        model under the registered name ``random_forest_model``.
        """
        setup_env()
        with mlflow.start_run(run_name="optuna-hp-final"):
            # Create the final model using the best hyperparameters
            final_model = train_random_forest(self.X_train, self.y_train, self.best_params)

            y_pred = final_model.predict(self.X_test)
            metrics = evaluate_classification(self.y_test, y_pred, "macro")

            # Log hyperparameters to MLflow
            for param_name, param_value in self.best_params.items():
                mlflow.log_param(param_name, param_value)

            # Log metrics to MLflow
            for metric_name, metric_value in metrics.items():
                mlflow.log_metric(metric_name, metric_value)

            # Log the model to the "Models" section
            mlflow.sklearn.log_model(
                final_model,
                "random_forest_model",
                registered_model_name="random_forest_model",
            )

        self.next(self.end)

    @step
    def end(self):
        """Terminal step; no work is done here."""
        pass
    
# Script entry point: instantiating the flow hands control to Metaflow's
# command-line handling (e.g. `python trainingFlow.py run`).
if __name__ == "__main__":
    trainingFlow()



Overwriting trainingFlow.py


In [None]:
# poetry shell
# python trainingFlow.py
# python trainingFlow.py run

In [None]:
# docker run -p 3000:3000 -e METAFLOW_SERVICE=http://localhost:8083/ metaflow-ui:latest