In [0]:
%sh
# Start from an empty lab folder. -f keeps rm quiet when the folder does not
# exist yet (first run); -p lets mkdir succeed whether or not it already exists.
rm -rf /dbfs/mlflow_lab
mkdir -p /dbfs/mlflow_lab
# Download the penguins dataset into the lab folder.
wget -O /dbfs/mlflow_lab/penguins.csv https://raw.githubusercontent.com/MicrosoftLearning/mslearn-databricks/main/data/penguins.csv

rm: cannot remove '/dbfs/mlflow_lab': No such file or directory
--2025-01-23 11:57:49--  https://raw.githubusercontent.com/MicrosoftLearning/mslearn-databricks/main/data/penguins.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9533 (9.3K) [text/plain]
Saving to: ‘/dbfs/mlflow_lab/penguins.csv’

     0K .........                                             100%  552K=0.02s

2025-01-23 11:57:49 (552 KB/s) - ‘/dbfs/mlflow_lab/penguins.csv’ saved [9533/9533]



### Prepare the data

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
   
# Fixed seed so the preview sample and the train/test split (and therefore the
# metrics logged later) are the same on every "Restart & Run All".
SPLIT_SEED = 42

# Read the CSV using its header row for column names.
data = spark.read.format("csv").option("header", "true").load("/mlflow_lab/penguins.csv")
# Drop rows with missing values, then cast each column to the type used for training.
data = data.dropna().select(col("Island").astype("string"),
                            col("CulmenLength").astype("float"),
                            col("CulmenDepth").astype("float"),
                            col("FlipperLength").astype("float"),
                            col("BodyMass").astype("float"),
                            col("Species").astype("int")
                          )
# Preview roughly 20% of the rows.
display(data.sample(fraction=0.2, seed=SPLIT_SEED))

# 70/30 split into training and testing sets.
splits = data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)
train = splits[0]
test = splits[1]
print ("Training Rows:", train.count(), " Testing Rows:", test.count())

Island,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species
Torgersen,39.1,18.7,181.0,3750.0,0
Torgersen,40.3,18.0,195.0,3250.0,0
Torgersen,42.0,20.2,190.0,4250.0,0
Torgersen,34.6,21.1,198.0,4400.0,0
Torgersen,38.7,19.0,195.0,3450.0,0
Dream,42.2,18.5,180.0,3550.0,0
Dream,37.0,16.9,185.0,3000.0,0
Dream,41.1,19.0,182.0,3425.0,0
Dream,37.5,18.9,179.0,2975.0,0
Dream,42.3,21.2,191.0,4150.0,0


Training Rows: 245  Testing Rows: 97


### Run an MLflow experiment

You can use the same libraries and techniques you normally use to train and evaluate a model (in this case, we’ll use the Spark MLlib library), but do so within the context of an MLflow experiment that includes additional commands to log important metrics and information during the process.


In [0]:
import mlflow
import mlflow.spark
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, MinMaxScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import time
   
# Everything inside this block is recorded against a single MLflow run
with mlflow.start_run():
    # Hyperparameters for this run
    maxIterations = 5
    regularization = 0.5

    # Input columns: one categorical feature plus the numeric measurements
    catFeature = "Island"
    numFeatures = ["CulmenLength", "CulmenDepth", "FlipperLength", "BodyMass"]

    # Feature engineering: index the categorical column, assemble and
    # min-max scale the numeric columns, then combine both into "Features"
    catIndexer = StringIndexer(
        inputCol=catFeature,
        outputCol=catFeature + "Idx",
    )
    numVector = VectorAssembler(
        inputCols=numFeatures,
        outputCol="numericFeatures",
    )
    numScaler = MinMaxScaler(
        inputCol=numVector.getOutputCol(),
        outputCol="normalizedFeatures",
    )
    featureVector = VectorAssembler(
        inputCols=["IslandIdx", "normalizedFeatures"],
        outputCol="Features",
    )

    # Classifier trained on the combined feature vector
    algo = LogisticRegression(
        labelCol="Species",
        featuresCol="Features",
        maxIter=maxIterations,
        regParam=regularization,
    )

    # Run the steps in order as one pipeline
    pipeline = Pipeline(
        stages=[catIndexer, numVector, numScaler, featureVector, algo]
    )

    # Record the hyperparameters, then fit on the training split
    print ("Training Logistic Regression model...")
    mlflow.log_param('maxIter', algo.getMaxIter())
    mlflow.log_param('regParam', algo.getRegParam())
    model = pipeline.fit(train)

    # Score the test split; print and log each metric
    prediction = model.transform(test)
    metrics = ["accuracy", "weightedRecall", "weightedPrecision"]
    for metric in metrics:
        evaluator = MulticlassClassificationEvaluator(
            labelCol="Species",
            predictionCol="prediction",
            metricName=metric,
        )
        metricValue = evaluator.evaluate(prediction)
        print("%s: %s" % (metric, metricValue))
        mlflow.log_metric(metric, metricValue)

    # Log the fitted pipeline as a run artifact and also save a copy under
    # /model; the timestamp suffix keeps each name unique
    unique_model_name = "classifier-" + str(time.time())
    mlflow.spark.log_model(
        model,
        unique_model_name,
        mlflow.spark.get_default_conda_env(),
    )
    modelpath = "/model/%s" % (unique_model_name)
    mlflow.spark.save_model(model, modelpath)

    print("Experiment run complete.")

  """The sequence number of this run attempt for a triggered job run. The initial attempt of a run
  """The sequence number of this run attempt for a triggered job run. The initial attempt of a run
  """The sequence number of this run attempt for a triggered job run. The initial attempt of a run


Training Logistic Regression model...
accuracy: 0.8350515463917526
weightedRecall: 0.8350515463917525
weightedPrecision: 0.8813528667028395


Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

2025/01/23 11:59:20 INFO mlflow.tracking._tracking_service.client: 🏃 View run whimsical-grouse-485 at: adb-3941174412663662.2.azuredatabricks.net/ml/experiments/887869904992919/runs/541d7443d1d440109b9b40b7729a4fb3.
2025/01/23 11:59:20 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: adb-3941174412663662.2.azuredatabricks.net/ml/experiments/887869904992919.


Experiment run complete.


### Create a function

In machine learning projects, data scientists often try training models with different parameters, logging the results each time. To accomplish that, it’s common to create a function that encapsulates the training process and call it with the parameters you want to try.

Create a function based on the training code you used previously:

In [0]:
def train_penguin_model(training_data, test_data, maxIterations, regularization):
    """Train, evaluate and log a penguin species classifier in one MLflow run.

    Builds a Spark ML pipeline (StringIndexer on "Island", MinMaxScaler over
    the numeric measurement columns, LogisticRegression on "Species"), fits it
    on ``training_data`` and scores it on ``test_data``. The hyperparameters,
    the accuracy / weightedRecall / weightedPrecision metrics and the fitted
    pipeline are recorded with MLflow.

    Args:
        training_data: DataFrame the pipeline is fitted on.
        test_data: DataFrame the fitted pipeline is evaluated on.
        maxIterations: Passed to LogisticRegression as ``maxIter``.
        regularization: Passed to LogisticRegression as ``regParam``.

    Returns:
        None. Results are printed and logged to MLflow, and the model is also
        saved under ``/model/classifier-<timestamp>``.
    """
    import mlflow
    import mlflow.spark
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import StringIndexer, VectorAssembler, MinMaxScaler
    from pyspark.ml.classification import LogisticRegression
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    import time

    # Everything below is recorded against a single MLflow run
    with mlflow.start_run():
        cat_col = "Island"
        num_cols = ["CulmenLength", "CulmenDepth", "FlipperLength", "BodyMass"]

        # Feature engineering: index the categorical column, assemble and
        # min-max scale the numeric columns, then combine both into "Features"
        indexer = StringIndexer(inputCol=cat_col, outputCol=cat_col + "Idx")
        assembler = VectorAssembler(inputCols=num_cols, outputCol="numericFeatures")
        scaler = MinMaxScaler(
            inputCol=assembler.getOutputCol(),
            outputCol="normalizedFeatures",
        )
        combiner = VectorAssembler(
            inputCols=["IslandIdx", "normalizedFeatures"],
            outputCol="Features",
        )
        classifier = LogisticRegression(
            labelCol="Species",
            featuresCol="Features",
            maxIter=maxIterations,
            regParam=regularization,
        )

        # Run the steps in order as one pipeline
        stages = [indexer, assembler, scaler, combiner, classifier]
        pipeline = Pipeline(stages=stages)

        # Record the hyperparameters, then fit on the training data
        print ("Training Logistic Regression model...")
        mlflow.log_param('maxIter', classifier.getMaxIter())
        mlflow.log_param('regParam', classifier.getRegParam())
        fitted = pipeline.fit(training_data)

        # Score the test data; print and log each metric
        scored = fitted.transform(test_data)
        for metric_name in ["accuracy", "weightedRecall", "weightedPrecision"]:
            evaluator = MulticlassClassificationEvaluator(
                labelCol="Species",
                predictionCol="prediction",
                metricName=metric_name,
            )
            metric_value = evaluator.evaluate(scored)
            print("%s: %s" % (metric_name, metric_value))
            mlflow.log_metric(metric_name, metric_value)

        # Log the fitted pipeline as a run artifact and also save a copy under
        # /model; the timestamp suffix keeps each name unique
        run_model_name = "classifier-" + str(time.time())
        mlflow.spark.log_model(
            fitted,
            run_model_name,
            mlflow.spark.get_default_conda_env(),
        )
        save_path = "/model/%s" % (run_model_name)
        mlflow.spark.save_model(fitted, save_path)

        print("Experiment run complete.")

Call the function with different parameter values (10 iterations and a regularization parameter of 0.2):

In [0]:
train_penguin_model(train, test, 10, 0.2)

Training Logistic Regression model...
accuracy: 0.8865979381443299
weightedRecall: 0.8865979381443299
weightedPrecision: 0.9105868358445677


Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

2025/01/23 12:08:50 INFO mlflow.tracking._tracking_service.client: 🏃 View run lyrical-dove-351 at: adb-3941174412663662.2.azuredatabricks.net/ml/experiments/887869904992919/runs/0ebc86ef3a6f458183263c2e332b9964.
2025/01/23 12:08:50 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: adb-3941174412663662.2.azuredatabricks.net/ml/experiments/887869904992919.


Experiment run complete.
