In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up on the next cell execution without restarting the kernel.
%load_ext autoreload
%autoreload 2

# MLflow Regression Pipeline Notebook

This notebook runs the MLflow Regression Pipeline on Databricks and inspects its results. For more information about the MLflow Regression Pipeline, including usage examples, see the [Regression Pipeline overview documentation](https://mlflow.org/docs/latest/pipelines.html#regression-pipeline) and the [Regression Pipeline API documentation](https://mlflow.org/docs/latest/python_api/mlflow.pipelines.html#module-mlflow.pipelines.regression.v1.pipeline).

In [3]:
from mlflow.pipelines import Pipeline

# Build the pipeline handle with the "domino" profile. Every later cell drives
# the pipeline through `p`, so this cell must run first on a fresh kernel.
# In the recorded run the log reports the pipeline name as 'code'.
p = Pipeline(profile="domino")

2022/08/17 13:40:10 INFO mlflow.pipelines.pipeline: Creating MLflow Pipeline 'code' with profile: 'domino'


In [4]:
# Reset the pipeline before running the individual steps below.
# NOTE(review): presumably this clears cached step outputs so each step
# re-executes from scratch -- confirm against the MLflow Pipelines API docs.
p.clean()

In [5]:
# Inspect the pipeline as a whole (no step name is passed). No text output was
# captured for this cell in the recorded run.
# NOTE(review): presumably renders a visual overview of the pipeline -- confirm.
p.inspect()

In [6]:
# Run the "ingest" step. In the recorded run the log shows the input being
# resolved from /mnt/code/data/sample.parquet, stored in parquet format under
# the pipeline's step outputs, and profiled; the schema table follows.
p.run("ingest")

2022/08/17 13:40:18 INFO mlflow.pipelines.steps.ingest.datasets: Resolving input data from '['/mnt/code/data/sample.parquet']'
2022/08/17 13:40:18 INFO mlflow.pipelines.steps.ingest.datasets: Resolved input data to '/tmp/tmpag3fle1h/sample.parquet'
2022/08/17 13:40:18 INFO mlflow.pipelines.steps.ingest.datasets: Converting dataset to parquet format, if necessary
2022/08/17 13:40:18 INFO mlflow.pipelines.steps.ingest: Successfully stored data in parquet format at '/home/ubuntu/.mlflow/pipelines/a470467330da5f46201f08d8a54e26788963462ee2548405bb1e813771386f5d/steps/ingest/outputs/dataset.parquet'
2022/08/17 13:40:18 INFO mlflow.pipelines.steps.ingest: Profiling ingested dataset
2022/08/17 13:40:25 INFO mlflow.pipelines.steps.ingest: Wrote dataset profile to '/home/ubuntu/.mlflow/pipelines/a470467330da5f46201f08d8a54e26788963462ee2548405bb1e813771386f5d/steps/ingest/outputs/dataset_profile.html'


name,type
tpep_pickup_datetime,datetime
tpep_dropoff_datetime,datetime
trip_distance,number
fare_amount,number
pickup_zip,integer
dropoff_zip,integer


In [7]:
# Run the "split" step. In the recorded run the 10000-row dataset was divided
# into train (8051 rows), validation (959 rows) and test (990 rows) splits.
p.run("split")

2022/08/17 13:40:27 INFO mlflow.pipelines.steps.split: Creating hash buckets on input dataset containing 10000 rows consumes 0.094940185546875 seconds.
2022/08/17 13:40:27 INFO mlflow.pipelines.steps.split: Split dataset result: train split (8051 rows), validation split (959 rows), test split (990 rows).


In [8]:
# Run the "transform" step. The recorded output shows two Name/Type tables:
# one listing the dataset columns (including pickup_dow, pickup_hour and
# trip_duration) and one listing hour_encoder__pickup_hour_* float64 columns.
p.run("transform")

Name,Type
tpep_pickup_datetime,datetime64[ns]
tpep_dropoff_datetime,datetime64[ns]
trip_distance,float64
fare_amount,float64
pickup_zip,int32
dropoff_zip,int32
pickup_dow,int64
pickup_hour,int64
trip_duration,float64

Name,Type
hour_encoder__pickup_hour_0,float64
hour_encoder__pickup_hour_1,float64
hour_encoder__pickup_hour_2,float64
hour_encoder__pickup_hour_3,float64
hour_encoder__pickup_hour_4,float64
hour_encoder__pickup_hour_5,float64
hour_encoder__pickup_hour_6,float64
hour_encoder__pickup_hour_7,float64
hour_encoder__pickup_hour_8,float64
hour_encoder__pickup_hour_9,float64


In [9]:
# Run the "train" step. In the recorded run the log shows the MLflow experiment
# 'sklearn_regression_experiment' being created, followed by a training vs.
# validation metrics table, the model input/output schema, the rows with the
# largest absolute error, and a Latest/Best run comparison.
p.run("train")

2022/08/17 13:42:58 INFO mlflow.pipelines.utils.tracking: Experiment with name 'sklearn_regression_experiment' does not exist. Creating a new experiment.
2022/08/17 13:42:59 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
  inputs = _infer_schema(model_input)
2022/08/17 13:43:11 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2022/08/17 13:43:11 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2022/08/17 13:43:12 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2022/08/17 13:43:14 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.


Metric,training,validation
root_mean_squared_error,3.43472,3.25884
example_count,8019.0,955.0
max_error,212.188,53.1241
mean_absolute_error,1.5208,1.63894
mean_absolute_percentage_error,0.148791,0.146808
mean_on_label,12.3563,13.0743
mean_squared_error,11.7973,10.62
r2_score,0.890054,0.908446
score,0.890054,0.908446
sum_on_label,99085.0,12486.0

Name,Type
tpep_pickup_datetime,datetime
tpep_dropoff_datetime,datetime
trip_distance,double
pickup_zip,integer
dropoff_zip,integer
pickup_dow,long
pickup_hour,long
trip_duration,double

Name,Type
-,"Tensor('float64', (-1,))"

absolute_error,prediction,fare_amount,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,pickup_zip,dropoff_zip,pickup_dow,pickup_hour,trip_duration
212.188444,62.811556,275.0,2016-02-12 20:55:19,2016-02-12 21:52:38,20.85,10013,7008,4,20,57.316667
51.38178,3.61822,55.0,2016-02-28 04:50:41,2016-02-28 04:52:32,0.18,10115,10027,6,4,1.85
39.557483,45.442517,85.0,2016-02-11 17:52:13,2016-02-11 18:38:17,14.46,10282,7114,3,17,46.066667
38.458072,13.541928,52.0,2016-01-26 09:04:58,2016-01-26 09:43:15,3.0,11109,10199,1,9,38.283333
36.095139,15.904861,52.0,2016-02-26 16:54:41,2016-02-26 17:06:30,4.02,11371,11367,4,16,11.816667
31.547252,13.452748,45.0,2016-02-06 23:18:10,2016-02-06 23:35:18,3.51,10013,7302,5,23,17.133333
31.402555,20.597445,52.0,2016-01-16 00:12:20,2016-01-16 00:27:06,6.1,11378,10012,5,0,14.766667
29.897227,58.102773,88.0,2016-02-11 12:47:12,2016-02-11 13:16:59,19.02,10119,10710,3,12,29.783333
28.73453,31.23453,2.5,2016-01-16 17:50:50,2016-01-16 17:51:24,9.6,10007,10007,5,17,0.566667
22.171355,24.671355,2.5,2016-01-04 10:20:18,2016-01-04 11:20:43,7.2,11370,11205,0,10,60.416667

Unnamed: 0,Latest,Best
Model Rank,1,1
root_mean_squared_error,3.25884,3.25884
weighted_mean_squared_error,8.18055,8.18055
max_error,53.1241,53.1241
mean_absolute_error,1.63894,1.63894
mean_absolute_percentage_error,0.146808,0.146808
mean_squared_error,10.62,10.62
Run Time,2022-08-17 13:42:59,2022-08-17 13:42:59
Run ID,1c678d8a6fb7463486776624f4dd2137,1c678d8a6fb7463486776624f4dd2137


In [13]:
# Run the "evaluate" step. The recorded output compares validation and test
# metrics and checks three metrics against thresholds (all marked validated).
# The repeated "X does not have valid feature names" lines in the output are
# warnings emitted while the Shap explainer runs, per the surrounding log.
p.run("evaluate")

2022/08/17 14:03:47 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
X does not have valid feature names, but SGDRegressor was fitted with feature names
2022/08/17 14:03:49 INFO mlflow.models.evaluation.default_evaluator: Shap explainer _PatchedKernelExplainer is used.

  0%|          | 0/10 [00:00<?, ?it/s]X does not have valid feature names, but SGDRegressor was fitted with feature names
X does not have valid feature names, but SGDRegressor was fitted with feature names

 10%|█         | 1/10 [00:00<00:01,  6.70it/s]X does not have valid feature names, but SGDRegressor was fitted with feature names
X does not have valid feature names, but SGDRegressor was fitted with feature names

 20%|██        | 2/10 [00:00<00:01,  4.27it/s]X does not have valid feature names, but SGDRegressor was fitted with feature names
X does not have valid feature names, but SGDRegressor was fitted with feature names

 30%|███       | 3/10 [00:00<00:01,  3.78it/s]X does not

Metric,validation,test
root_mean_squared_error,3.25884,2.248191
example_count,955.0,987.0
max_error,53.1241,17.95838
mean_absolute_error,1.63894,1.513182
mean_absolute_percentage_error,0.146808,0.59546
mean_on_label,13.0743,12.180355
mean_squared_error,10.62,5.054361
r2_score,0.908446,0.947664
score,0.908446,0.947664
sum_on_label,12486.0,12022.01

metric,greater_is_better,value,threshold,validated
root_mean_squared_error,False,2.24819,10,✅
mean_absolute_error,False,1.51318,50,✅
weighted_mean_squared_error,False,3.29751,20,✅


In [14]:
# Run the "register" step. In the recorded run the log reports version '1' of
# the registered model 'taxi_fare_regressor' being created.
p.run("register")

Registered model 'taxi_fare_regressor' already exists. Creating a new version of this model...
2022/08/17 14:04:08 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: taxi_fare_regressor, version 1
Created version '1' of model 'taxi_fare_regressor'.


In [19]:
# Re-display the results of the "train" step. The recorded output matches the
# tables shown when the train step was run above (metrics, schema, worst
# predictions, Latest/Best comparison).
p.inspect("train")

Metric,training,validation
root_mean_squared_error,3.43472,3.25884
example_count,8019.0,955.0
max_error,212.188,53.1241
mean_absolute_error,1.5208,1.63894
mean_absolute_percentage_error,0.148791,0.146808
mean_on_label,12.3563,13.0743
mean_squared_error,11.7973,10.62
r2_score,0.890054,0.908446
score,0.890054,0.908446
sum_on_label,99085.0,12486.0

Name,Type
tpep_pickup_datetime,datetime
tpep_dropoff_datetime,datetime
trip_distance,double
pickup_zip,integer
dropoff_zip,integer
pickup_dow,long
pickup_hour,long
trip_duration,double

Name,Type
-,"Tensor('float64', (-1,))"

absolute_error,prediction,fare_amount,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,pickup_zip,dropoff_zip,pickup_dow,pickup_hour,trip_duration
212.188444,62.811556,275.0,2016-02-12 20:55:19,2016-02-12 21:52:38,20.85,10013,7008,4,20,57.316667
51.38178,3.61822,55.0,2016-02-28 04:50:41,2016-02-28 04:52:32,0.18,10115,10027,6,4,1.85
39.557483,45.442517,85.0,2016-02-11 17:52:13,2016-02-11 18:38:17,14.46,10282,7114,3,17,46.066667
38.458072,13.541928,52.0,2016-01-26 09:04:58,2016-01-26 09:43:15,3.0,11109,10199,1,9,38.283333
36.095139,15.904861,52.0,2016-02-26 16:54:41,2016-02-26 17:06:30,4.02,11371,11367,4,16,11.816667
31.547252,13.452748,45.0,2016-02-06 23:18:10,2016-02-06 23:35:18,3.51,10013,7302,5,23,17.133333
31.402555,20.597445,52.0,2016-01-16 00:12:20,2016-01-16 00:27:06,6.1,11378,10012,5,0,14.766667
29.897227,58.102773,88.0,2016-02-11 12:47:12,2016-02-11 13:16:59,19.02,10119,10710,3,12,29.783333
28.73453,31.23453,2.5,2016-01-16 17:50:50,2016-01-16 17:51:24,9.6,10007,10007,5,17,0.566667
22.171355,24.671355,2.5,2016-01-04 10:20:18,2016-01-04 11:20:43,7.2,11370,11205,0,10,60.416667

Unnamed: 0,Latest,Best
Model Rank,1,1
root_mean_squared_error,3.25884,3.25884
weighted_mean_squared_error,8.18055,8.18055
max_error,53.1241,53.1241
mean_absolute_error,1.63894,1.63894
mean_absolute_percentage_error,0.146808,0.146808
mean_squared_error,10.62,10.62
Run Time,2022-08-17 13:42:59,2022-08-17 13:42:59
Run ID,1c678d8a6fb7463486776624f4dd2137,1c678d8a6fb7463486776624f4dd2137


In [20]:
# Fetch the "test_data" artifact from the pipeline and show summary statistics
# for it; the recorded describe() output reports a count of 987 for each column.
test_data = p.get_artifact("test_data")
test_data.describe()

Unnamed: 0,trip_distance,fare_amount,pickup_zip,dropoff_zip,pickup_dow,pickup_hour,trip_duration
count,987.0,987.0,987.0,987.0,987.0,987.0,987.0
mean,2.84463,12.180355,10130.107396,10171.005066,3.180344,13.680851,13.646707
std,3.479708,9.832273,317.412339,376.237618,1.97651,6.353072,19.105446
min,0.03,0.01,10001.0,7423.0,0.0,0.0,0.2
25%,1.0,6.5,10012.0,10013.0,1.0,9.0,6.441667
50%,1.68,9.0,10022.0,10023.0,3.0,14.0,10.283333
75%,2.995,14.0,10110.0,10119.0,5.0,19.0,16.775
max,24.49,75.5,11436.0,11435.0,6.0,23.0,517.0


In [22]:
# Fetch the "model" artifact from the pipeline and print its text description.
# In the recorded run this is an mlflow.pyfunc loaded model with flavor
# mlflow.sklearn, artifact path train/model, tied to the train step's run ID.
trained_model = p.get_artifact("model")
print(trained_model)

mlflow.pyfunc.loaded_model:
  artifact_path: train/model
  flavor: mlflow.sklearn
  run_id: 1c678d8a6fb7463486776624f4dd2137

