In [0]:
%pip install mlflow numpy scikit-learn xgboost hyperopt

Python interpreter will be restarted.
Python interpreter will be restarted.


In [0]:
# import libraries
from sklearn.model_selection import train_test_split
import pandas as pd
import mlflow
import mlflow.pyfunc
import mlflow.sklearn
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import cloudpickle
import time
from mlflow.tracking import MlflowClient
import mlflow.xgboost
from xgboost import XGBClassifier
import xgboost as xgb
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

# The predict method of sklearn's RandomForestClassifier returns a binary classification (0 or 1). 
# The following code creates a wrapper function, SklearnModelWrapper, that uses 
# the predict_proba method to return the probability that the observation belongs to each class. 

    
# take the data path as a parameter
dbutils.widgets.text("data_path", "")
data_path = dbutils.widgets.get("data_path")
data = pd.read_csv(data_path)

# Split input and output values
X = data.drop(["quality"], axis=1)
y = data.quality

# Split out the training data
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=123)

# Split the remaining data equally into validation and test
X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=123)

In [0]:


# Set the MLflow experiment name
mlflow.set_experiment("/Users/ganendra.vadlamu@tigeranalytics.com/wine_quality")

current_run = mlflow.active_run()

if current_run is not None:
    # Get the current run ID
    run_id = current_run.info.run_id

    # Get the previous run's name
    previous_run_name = mlflow.get_run(run_id).data.tags.get("mlflow.runName")

    # Increment the run name
    if previous_run_name:
        # Extract the run number from the previous run's name
        previous_run_number = int(previous_run_name.split("_")[-1])

        # Increment the run number
        new_run_number = previous_run_number + 1

        # Create the new run name
        new_run_name = f"xgboost_run_{new_run_number}"
    else:
        # If there is no previous run name, start with Run_1
        new_run_name = "xgboost_run_1"

    # Set the new run name
    mlflow.set_tag("mlflow.runName", new_run_name)

    run_name = new_run_name
else:
    # Start a new MLflow run if there is no active run
    run_name = "xgboost_run_1"

# Function to train an XGBoost model
def train_xgboost(params):
    train = xgb.DMatrix(data=X_train, label=y_train)
    validation = xgb.DMatrix(data=X_test, label=y_test)
    model = xgb.train(params=params, dtrain=train, num_boost_round=2, evals=[(validation, "validation")],
                      early_stopping_rounds=2)
    predictions = model.predict(validation)
    auc = roc_auc_score(y_test, predictions)
    return {'status': STATUS_OK, 'loss': -auc, 'model': model}


# Define the search space for hyperparameters
search_space = {
    'max_depth': hp.randint('max_depth', 1, 11),  # Set the range for max_depth
}

# Start a new MLflow run
with mlflow.start_run():
    # Perform hyperparameter tuning using Bayesian optimization
    best = fmin(fn=train_xgboost, space=search_space, algo=tpe.suggest, max_evals=1, trials=Trials())
    print(best)
    # Retrieve the best hyperparameters and train the final model
    best_max_depth = int(best['max_depth'])

    best_params = {
        'max_depth': best_max_depth,
    }

    best_model = train_xgboost(best_params)['model']
    xgb_best_auc=train_xgboost(best_params)['loss']
    # Log the best hyperparameters and model
    mlflow.log_params(best)
    mlflow.xgboost.log_model(best_model, "model")   
    mlflow.log_metric("auc", xgb_best_auc)
    
    dmatrix = xgb.DMatrix(X_test)
    # Compare the AUC score of the current production model and the best model
    production_model = mlflow.pyfunc.load_model("models:/wine_quality/production")
    production_auc = roc_auc_score(y_test, production_model.predict(X_test))
    best_model_auc = roc_auc_score(y_test, best_model.predict(dmatrix))
        # If the best model has a higher AUC score than the production model, archive the production model
    if best_model_auc > production_auc:
        production_model_version = mlflow.search_registered_models("wine_quality")[0].latest_versions[0]
        mlflow.register_model(f"runs:/{production_model_version.source}/model", "archived_production_model")

        # Transition the best model to production
        best_model_version = mlflow.register_model(f"runs:/{mlflow.active_run().info.run_id}/model", "production")
        mlflow.pyfunc.update_registered_model("wine_quality", production_model_version.version, stage="archived")
        mlflow.pyfunc.update_registered_model("wine_quality", best_model_version.version, stage="production")


   

    time.sleep(15)

    # To simulate a new corpus of data, save the existing X_train data to a Delta table. 
    # In the real world, this would be a new batch of data.
    spark_df = spark.createDataFrame(X_train)
    # Replace <username> with your username before running this cell.
    table_path = "dbfs:/ganendra.vadlamu@tigeranalytics.com/delta/wine_data"
    # Delete the contents of this path in case this cell has already been run
    dbutils.fs.rm(table_path, True)
    spark_df.write.format("delta").save(table_path)

  0%|          | 0/1 [00:00<?, ?trial/s, best loss=?]                                                     [0]	validation-rmse:0.42511
  0%|          | 0/1 [00:00<?, ?trial/s, best loss=?]                                                     [1]	validation-rmse:0.38254
  0%|          | 0/1 [00:00<?, ?trial/s, best loss=?]100%|██████████| 1/1 [00:00<00:00, 26.59trial/s, best loss: -0.8354791517001917]
{'max_depth': 7}
[0]	validation-rmse:0.42511
[1]	validation-rmse:0.38254
[0]	validation-rmse:0.42511
[1]	validation-rmse:0.38254




In [0]:
"""
import mlflow.pyfunc

# Specify the name of the registered model to delete
model_name = "wine_quality"

We can specify list of version and loop through each to move stage to Archived and then call delete registered model
to delete the registered models
client.transition_model_version_stage(
  name=model_name,
  version=6,
  stage="Archived",
)

# Delete the registered models
client = MlflowClient()

client.delete_registered_model(name=model_name)

"""



In [0]:
"""
from mlflow.models.signature import infer_signature
from mlflow.utils.environment import _mlflow_conda_env


class SklearnModelWrapper(mlflow.pyfunc.PythonModel):
  def __init__(self, model):
    self.model = model
    
  def predict(self, context, model_input):
    return self.model.predict_proba(model_input)[:,1]
	
mlflow.set_experiment("/Users/ganendra.vadlamu@tigeranalytics.com/wine_quality")

current_run = mlflow.active_run()

if current_run is not None:
    # Get the current run ID
    run_id = current_run.info.run_id

    # Get the previous run's name
    previous_run_name = mlflow.get_run(run_id).data.tags.get("mlflow.runName")

    # Increment the run name
    if previous_run_name:
        # Extract the run number from the previous run's name
        previous_run_number = int(previous_run_name.split("_")[-1])

        # Increment the run number
        new_run_number = previous_run_number + 1

        # Create the new run name
        new_run_name = f"random_forest_untuned_run_{new_run_number}"
    else:
        # If there is no previous run name, start with Run_1
        new_run_name = "random_forest_untuned_run_1"

    # Set the new run name
    mlflow.set_tag("mlflow.runName", new_run_name)

    run_name=new_run_name
else:
    # Start a new MLflow run if there is no active run
    run_name="random_forest_untuned_run_1"


# mlflow.start_run creates a new MLflow run to track the performance of this model. 
# Within the context, you call mlflow.log_param to keep track of the parameters used, and
# mlflow.log_metric to record metrics like accuracy.
with mlflow.start_run(run_name=run_name):
  n_estimators = 10
  model = RandomForestClassifier(n_estimators=n_estimators, random_state=np.random.RandomState(123))
  model.fit(X_train, y_train)

  # predict_proba returns [prob_negative, prob_positive], so slice the output with [:, 1]
  predictions_test = model.predict_proba(X_test)[:,1]
  auc_score = roc_auc_score(y_test, predictions_test)
  mlflow.log_param('n_estimators', n_estimators)
  # Use the area under the ROC curve as a metric.
  mlflow.log_metric('auc', auc_score)
  wrappedModel = SklearnModelWrapper(model)
  # Log the model with a signature that defines the schema of the model's inputs and outputs. 
  # When the model is deployed, this signature will be used to validate inputs.
  signature = infer_signature(X_train, wrappedModel.predict(None, X_train))
  
  # MLflow contains utilities to create a conda environment used to serve models.
  # The necessary dependencies are added to a conda.yaml file which is logged along with the model.
  conda_env =  _mlflow_conda_env(
        additional_conda_deps=None,
        additional_pip_deps=["cloudpickle=={}".format(cloudpickle.__version__), "scikit-learn=={}".format(sklearn.__version__)],
        additional_conda_channels=None,
    )
  mlflow.pyfunc.log_model("random_forest_model", python_model=wrappedModel, conda_env=conda_env, signature=signature)

  feature_importances = pd.DataFrame(model.feature_importances_, index=X_train.columns.tolist(), columns=['importance'])
feature_importances.sort_values('importance', ascending=False)
#model registry
run_id = mlflow.search_runs(filter_string='tags.mlflow.runName LIKE  "random_forest_untuned_run%"').iloc[0].run_id

# If you see the error "PERMISSION_DENIED: User does not have any permission level assigned to the registered model", 
# the cause may be that a model already exists with the name "wine_quality". Try using a different name.
model_name = "wine_quality"
model_version = mlflow.register_model(f"runs:/{run_id}/random_forest_model", model_name)

# Registering the model takes a few seconds, so add a small delay
time.sleep(15)

#registering the model to production
client = MlflowClient()
client.transition_model_version_stage(
  name=model_name,
  version=model_version.version,
  stage="Production",
)

model = mlflow.pyfunc.load_model(f"models:/{model_name}/production")

# Sanity-check: This should match the AUC logged by MLflow
print(f'AUC: {roc_auc_score(y_test, model.predict(X_test))}')

# To simulate a new corpus of data, save the existing X_train data to a Delta table. 
# In the real world, this would be a new batch of data.
spark_df = spark.createDataFrame(X_train)
# Replace <username> with your username before running this cell.
table_path = "dbfs:/ganendra.vadlamu@tigeranalytics.com/delta/wine_data"
# Delete the contents of this path in case this cell has already been run
dbutils.fs.rm(table_path, True)
spark_df.write.format("delta").save(table_path)"""

