In [None]:
# Defining a DAG for model training
# Components:
# 1. Data ingestion
# 2. Data preprocessing
# 3. Model training
# 4. Model evaluation
# 5. Model deployment

In [13]:
%%writefile trainingFlow.py

# Importing libraries related to Metaflow
from metaflow import FlowSpec, step, Parameter, conda, batch

import optuna
import mlflow
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import datetime

def setup_env():
    """Prepare the process environment and the MLflow client for the local stack.

    Exports S3-compatible credentials/endpoint and MySQL settings as
    environment variables (overwriting any existing values), then points
    MLflow at the tracking server on ``http://localhost:5000`` and selects
    the ``mlflow-tutorial`` experiment.

    Returns:
        None
    """
    import os

    # Settings for the locally running services; existing values are replaced.
    local_settings = {
        "AWS_ACCESS_KEY_ID": "minio",
        "AWS_SECRET_ACCESS_KEY": "minio123",
        "MLFLOW_S3_ENDPOINT_URL": "http://localhost:9000",
        "MYSQL_DATABASE": "mlflow_database",
        "MYSQL_USER": "mlflow_user",
        "MYSQL_PASSWORD": "mlflow",
        "MYSQL_ROOT_PASSWORD": "mysql",
    }
    os.environ.update(local_settings)

    # Pinned package versions, kept for reference only (currently unused):
    # requirements = {
    #         "boto3":"1.26.37",
    #         "mlflow":"2.1.1",
    #         "scikit-learn":"1.2.0",
    #         "pandas":"1.5.2",
    #         "optuna":"3.0.5",
    #     }

    # Imported after the environment is populated, as in the original ordering.
    import mlflow

    tracking_uri = "http://localhost:5000"
    experiment_name = "mlflow-tutorial"
    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment(experiment_name)



def get_data():
    """Generate the synthetic classification dataset used by the flow.

    Builds 1000 samples with 4 informative features (no redundant ones)
    spread over 3 classes, one cluster per class, using a fixed
    ``random_state`` of 40 so repeated calls use the same seed.

    Returns:
        tuple: ``(X, y)`` as returned by
        ``sklearn.datasets.make_classification``.
    """
    from sklearn.datasets import make_classification

    generator_options = {
        "n_samples": 1000,
        "n_features": 4,
        "n_informative": 4,
        "n_redundant": 0,
        "n_classes": 3,
        "n_clusters_per_class": 1,
        "class_sep": 0.5,
        "random_state": 40,
    }
    features, labels = make_classification(**generator_options)
    return features, labels

class trainingFlow(FlowSpec):
    """Metaflow pipeline that tunes, trains and registers a random forest.

    Steps run linearly:
    ``start`` -> ``ingest_data`` -> ``hyperparameter_tuning`` ->
    ``train_final_model`` -> ``end``.

    Artifacts carried between steps: ``X``, ``y`` (dataset) and
    ``best_params`` (winning hyperparameters).
    """

    @step
    def start(self):
        """Entry point of the flow; hands off directly to data ingestion."""
        self.next(self.ingest_data)

    #@conda(libraries=requirements, python='3.9.5')
    @step
    def ingest_data(self):
        """Load the dataset and store it as the ``X`` / ``y`` artifacts."""
        self.X, self.y = get_data()
        self.next(self.hyperparameter_tuning)

    #@conda(libraries=requirements,python='3.9.5')
    @step
    def hyperparameter_tuning(self):
        """Search random forest hyperparameters with Optuna.

        Each trial is recorded as its own MLflow run (parameters plus the
        mean 5-fold cross-validation score). The best parameter set is
        stored in ``self.best_params`` for the next step.
        """
        setup_env()

        def optimize_rf(trial):
            """Optuna objective: mean cross-validation score for one trial."""
            with mlflow.start_run(run_name=f"optuna-hp-{trial.number}-{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}"):
                # Set the hyperparameter values that we want to optimize
                n_estimators = trial.suggest_int('n_estimators', 1, 100)
                max_depth = trial.suggest_int('max_depth', 2, 10)
                min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
                min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
                max_features = trial.suggest_float('max_features', 0.1, 1.0)

                # Create a random forest classifier using the suggested hyperparameters
                rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                            min_samples_split=min_samples_split,
                                            min_samples_leaf=min_samples_leaf,
                                            max_features=max_features)

                # Use cross-validation to evaluate the performance of the classifier
                scores = cross_val_score(rf, self.X, self.y, cv=5)

                # Log the hyperparameters and cross-validation scores to MLflow
                mlflow.log_param('n_estimators', n_estimators)
                mlflow.log_param('max_depth', max_depth)
                mlflow.log_param('min_samples_split', min_samples_split)
                mlflow.log_param('min_samples_leaf', min_samples_leaf)
                mlflow.log_param('max_features', max_features)
                mlflow.log_metric('mean_cv_score', scores.mean())

            # Return the mean of the cross-validation scores as the objective value
            return scores.mean()

        # The objective is a score where higher is better, so the study must
        # maximize it. Optuna's default direction is "minimize", which would
        # make `best_params` the worst-scoring configuration.
        study = optuna.create_study(direction="maximize")

        # Run the optimization loop
        study.optimize(optimize_rf, n_trials=100)

        # Get the best hyperparameter values
        self.best_params = study.best_params

        self.next(self.train_final_model)

    #@conda(libraries=requirements,python='3.9.5')
    @step
    def train_final_model(self):
        """Fit a model with the best hyperparameters and register it in MLflow.

        The model is trained on the full ``X`` / ``y`` dataset and logged
        under the registered model name ``random_forest_model``.
        """
        setup_env()
        with mlflow.start_run(run_name="optuna-hp-final"):
            # Create the final model using the best hyperparameters
            final_model = RandomForestClassifier(**self.best_params)

            # Train the final model on the entire dataset
            final_model.fit(self.X, self.y)

            # Log the model to the "Models" section
            mlflow.sklearn.log_model(final_model, "random_forest_model", registered_model_name="random_forest_model")

        self.next(self.end)

    @step
    def end(self):
        """Terminal step; no further work is performed."""
        pass
    
if __name__ == "__main__":
    # Instantiating the flow class hands control to the Metaflow runner
    # when this file is executed as a script.
    trainingFlow()



Overwriting trainingFlow.py


In [None]:
# docker run -p 3000:3000 -e METAFLOW_SERVICE=http://localhost:8083/ metaflow-ui:latest