In [1]:
# Enable IPython's autoreload extension so that edits to imported modules
# are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

# MLflow Regression Pipeline Notebook

This notebook runs the MLflow Regression Pipeline (here with the `local` profile) and inspects its results. For more information about the MLflow Regression Pipeline, including usage examples, see the [Regression Pipeline overview documentation](https://mlflow.org/docs/latest/pipelines.html#regression-pipeline) and the [Regression Pipeline API documentation](https://mlflow.org/docs/latest/python_api/mlflow.pipelines.html#module-mlflow.pipelines.regression.v1.pipeline).

In [2]:
# Pipeline is the entry point used by every cell below.
from mlflow.pipelines import Pipeline

# Build the pipeline object with the "local" profile. The log line emitted by
# this cell reports the pipeline name as 'mlp-regression-template'.
p = Pipeline(profile="local")

2022/08/12 09:27:49 INFO mlflow.pipelines.pipeline: Creating MLflow Pipeline 'mlp-regression-template' with profile: 'local'


In [3]:
# Reset the pipeline before running its steps.
# NOTE(review): presumably this discards previously cached step outputs --
# confirm against the mlflow.pipelines documentation.
p.clean()

In [4]:
# Inspect the pipeline as a whole (no step name is passed here, unlike the
# per-step inspect call made after the pipeline has been run).
p.inspect()

In [None]:
# Run the "ingest" step on its own.
# NOTE(review): in the saved notebook the ingest log lines appear under the
# following "split" cell, so later steps appear to run ingest themselves when
# its outputs are missing -- confirm whether this cell is still needed.
p.run("ingest")

In [9]:
# Run the "split" step.
# In the saved output this call also logged the ingest step: it resolved
# data/sample.parquet, stored it as parquet, and profiled the ingested dataset.
p.run("split")

2022/08/12 09:43:27 INFO mlflow.pipelines.steps.ingest.datasets: Resolving input data from '['/Users/glenn/code/mlp-regression-template/data/sample.parquet']'
2022/08/12 09:43:27 INFO mlflow.pipelines.steps.ingest.datasets: Resolved input data to '/private/var/folders/s5/c5lc_x31043b1t6556x7kglr0000gn/T/tmpxoj24p05/sample.parquet'
2022/08/12 09:43:27 INFO mlflow.pipelines.steps.ingest.datasets: Converting dataset to parquet format, if necessary
2022/08/12 09:43:28 INFO mlflow.pipelines.steps.ingest: Successfully stored data in parquet format at '/Users/glenn/.mlflow/pipelines/fa9f28a75fdc3501a54a0081aecc4d0cc48e31de8f055635698cfa016075fac0/steps/ingest/outputs/dataset.parquet'
2022/08/12 09:43:28 INFO mlflow.pipelines.steps.ingest: Profiling ingested dataset
2022/08/12 09:43:32 INFO mlflow.pipelines.steps.ingest: Wrote dataset profile to '/Users/glenn/.mlflow/pipelines/fa9f28a75fdc3501a54a0081aecc4d0cc48e31de8f055635698cfa016075fac0/steps/ingest/outputs/dataset_profile.html'
2022/08/12

In [11]:
# Run the "transform" step.
# The saved output shows two Name/Type schema tables: one listing the dataset
# columns (including fare_amount) and one listing encoded feature columns
# named hour_encoder__pickup_hour_<N>.
p.run("transform")

Name,Type
tpep_pickup_datetime,datetime64[ns]
tpep_dropoff_datetime,datetime64[ns]
trip_distance,float64
fare_amount,float64
pickup_zip,int32
dropoff_zip,int32
pickup_dow,int64
pickup_hour,int64
trip_duration,float64

Name,Type
hour_encoder__pickup_hour_0,float64
hour_encoder__pickup_hour_1,float64
hour_encoder__pickup_hour_2,float64
hour_encoder__pickup_hour_3,float64
hour_encoder__pickup_hour_4,float64
hour_encoder__pickup_hour_5,float64
hour_encoder__pickup_hour_6,float64
hour_encoder__pickup_hour_7,float64
hour_encoder__pickup_hour_8,float64
hour_encoder__pickup_hour_9,float64


In [12]:
# Run the "train" step.
# The saved output reports training vs. validation metrics, the model's
# input/output schema, the rows with the largest absolute error, and a
# Latest/Best comparison table.
# NOTE(review): in the saved run the validation r2_score (0.487596) is far
# below the training r2_score (0.967695) and validation max_error is 223.0 --
# worth investigating before relying on this model.
p.run("train")

2022/08/12 09:45:10 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2022/08/12 09:45:10 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

Metric,training,validation
root_mean_squared_error,1.86182,7.70958
example_count,8019.0,955.0
max_error,50.2218,223.0
mean_absolute_error,0.946436,1.27688
mean_absolute_percentage_error,0.0883914,0.0899405
mean_on_label,12.3563,13.0743
mean_squared_error,3.46638,59.4377
r2_score,0.967695,0.487596
score,0.967695,0.487596
sum_on_label,99085.0,12486.0

Name,Type
tpep_pickup_datetime,datetime
tpep_dropoff_datetime,datetime
trip_distance,double
pickup_zip,integer
dropoff_zip,integer
pickup_dow,long
pickup_hour,long
trip_duration,double

Name,Type
-,"Tensor('float64', (-1,))"

absolute_error,prediction,fare_amount,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,pickup_zip,dropoff_zip,pickup_dow,pickup_hour,trip_duration
50.221821,4.778179,55.0,2016-02-28 04:50:41,2016-02-28 04:52:32,0.18,10115,10027,6,4,1.85
36.316667,48.683333,85.0,2016-02-11 17:52:13,2016-02-11 18:38:17,14.46,10282,7114,3,17,46.066667
35.821918,16.178082,52.0,2016-02-26 16:54:41,2016-02-26 17:06:30,4.02,11371,11367,4,16,11.816667
33.070796,18.929204,52.0,2016-01-26 09:04:58,2016-01-26 09:43:15,3.0,11109,10199,1,9,38.283333
31.586294,20.413706,52.0,2016-01-16 00:12:20,2016-01-16 00:27:06,6.1,11378,10012,5,0,14.766667
31.472222,56.527778,88.0,2016-02-11 12:47:12,2016-02-11 13:16:59,19.02,10119,10710,3,12,29.783333
31.032374,13.967626,45.0,2016-02-06 23:18:10,2016-02-06 23:35:18,3.51,10013,7302,5,23,17.133333
21.972222,56.527778,78.5,2016-01-22 10:09:42,2016-01-22 11:04:16,19.8,10028,7114,4,10,54.566667
21.972222,56.527778,78.5,2016-01-11 17:21:51,2016-01-11 18:17:18,19.3,10021,7114,0,17,55.45
20.062992,51.937008,72.0,2016-02-04 15:12:55,2016-02-04 15:58:36,17.57,10111,7114,3,15,45.683333

Unnamed: 0,Latest,Best
Model Rank,1,1
root_mean_squared_error,7.70958,7.70958
weighted_mean_squared_error,9.39079,9.39079
max_error,223,223
mean_absolute_error,1.27688,1.27688
mean_absolute_percentage_error,0.0899405,0.0899405
mean_squared_error,59.4377,59.4377
Run Time,2022-08-12 09:45:17,2022-08-12 09:45:17
Run ID,4f078375d4194cf6809287b4c4a78b07,4f078375d4194cf6809287b4c4a78b07


In [13]:
# Run the "evaluate" step.
# The saved output compares validation and test metrics and lists each
# checked metric with its threshold; all three rows were marked as validated
# in the saved run.
p.run("evaluate")

2022/08/12 09:45:58 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2022/08/12 09:45:59 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.


Metric,validation,test
root_mean_squared_error,7.70958,1.790105
example_count,955.0,987.0
max_error,223.0,37.0
mean_absolute_error,1.27688,0.943789
mean_absolute_percentage_error,0.0899405,0.583893
mean_on_label,13.0743,12.180355
mean_squared_error,59.4377,3.204478
r2_score,0.487596,0.966819
score,0.487596,0.966819
sum_on_label,12486.0,12022.01

metric,greater_is_better,value,threshold,validated
root_mean_squared_error,False,1.79011,10,✅
mean_absolute_error,False,0.943789,50,✅
weighted_mean_squared_error,False,1.37047,20,✅


In [14]:
# Run the "register" step.
# In the saved run this created version 1 of the registered model
# 'taxi_fare_regressor'.
p.run("register")

Successfully registered model 'taxi_fare_regressor'.
2022/08/12 09:46:25 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: taxi_fare_regressor, version 1
Created version '1' of model 'taxi_fare_regressor'.


In [15]:
# Display the results of the "train" step again.
# The saved output matches what the train cell printed (same Run ID and
# Run Time), i.e. it shows the existing results rather than a new training run.
p.inspect("train")

Metric,training,validation
root_mean_squared_error,1.86182,7.70958
example_count,8019.0,955.0
max_error,50.2218,223.0
mean_absolute_error,0.946436,1.27688
mean_absolute_percentage_error,0.0883914,0.0899405
mean_on_label,12.3563,13.0743
mean_squared_error,3.46638,59.4377
r2_score,0.967695,0.487596
score,0.967695,0.487596
sum_on_label,99085.0,12486.0

Name,Type
tpep_pickup_datetime,datetime
tpep_dropoff_datetime,datetime
trip_distance,double
pickup_zip,integer
dropoff_zip,integer
pickup_dow,long
pickup_hour,long
trip_duration,double

Name,Type
-,"Tensor('float64', (-1,))"

absolute_error,prediction,fare_amount,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,pickup_zip,dropoff_zip,pickup_dow,pickup_hour,trip_duration
50.221821,4.778179,55.0,2016-02-28 04:50:41,2016-02-28 04:52:32,0.18,10115,10027,6,4,1.85
36.316667,48.683333,85.0,2016-02-11 17:52:13,2016-02-11 18:38:17,14.46,10282,7114,3,17,46.066667
35.821918,16.178082,52.0,2016-02-26 16:54:41,2016-02-26 17:06:30,4.02,11371,11367,4,16,11.816667
33.070796,18.929204,52.0,2016-01-26 09:04:58,2016-01-26 09:43:15,3.0,11109,10199,1,9,38.283333
31.586294,20.413706,52.0,2016-01-16 00:12:20,2016-01-16 00:27:06,6.1,11378,10012,5,0,14.766667
31.472222,56.527778,88.0,2016-02-11 12:47:12,2016-02-11 13:16:59,19.02,10119,10710,3,12,29.783333
31.032374,13.967626,45.0,2016-02-06 23:18:10,2016-02-06 23:35:18,3.51,10013,7302,5,23,17.133333
21.972222,56.527778,78.5,2016-01-22 10:09:42,2016-01-22 11:04:16,19.8,10028,7114,4,10,54.566667
21.972222,56.527778,78.5,2016-01-11 17:21:51,2016-01-11 18:17:18,19.3,10021,7114,0,17,55.45
20.062992,51.937008,72.0,2016-02-04 15:12:55,2016-02-04 15:58:36,17.57,10111,7114,3,15,45.683333

Unnamed: 0,Latest,Best
Model Rank,1,1
root_mean_squared_error,7.70958,7.70958
weighted_mean_squared_error,9.39079,9.39079
max_error,223,223
mean_absolute_error,1.27688,1.27688
mean_absolute_percentage_error,0.0899405,0.0899405
mean_squared_error,59.4377,59.4377
Run Time,2022-08-12 09:45:17,2022-08-12 09:45:17
Run ID,4f078375d4194cf6809287b4c4a78b07,4f078375d4194cf6809287b4c4a78b07


In [16]:
# Fetch the "test_data" artifact from the pipeline and summarize it.
test_data = p.get_artifact("test_data")
# Leave describe() as the bare last expression so the notebook renders the
# summary statistics (count/mean/std/quantiles) as the cell output.
test_data.describe()

Unnamed: 0,trip_distance,fare_amount,pickup_zip,dropoff_zip,pickup_dow,pickup_hour,trip_duration
count,987.0,987.0,987.0,987.0,987.0,987.0,987.0
mean,2.84463,12.180355,10130.107396,10171.005066,3.180344,13.680851,13.646707
std,3.479708,9.832273,317.412339,376.237618,1.97651,6.353072,19.105446
min,0.03,0.01,10001.0,7423.0,0.0,0.0,0.2
25%,1.0,6.5,10012.0,10013.0,1.0,9.0,6.441667
50%,1.68,9.0,10022.0,10023.0,3.0,14.0,10.283333
75%,2.995,14.0,10110.0,10119.0,5.0,19.0,16.775
max,24.49,75.5,11436.0,11435.0,6.0,23.0,517.0


In [17]:
# Fetch the "model" artifact from the pipeline.
trained_model = p.get_artifact("model")
# Print its text representation; in the saved run this shows an
# mlflow.pyfunc loaded model with the mlflow.sklearn flavor and its run_id.
print(trained_model)

mlflow.pyfunc.loaded_model:
  artifact_path: train/model
  flavor: mlflow.sklearn
  run_id: 4f078375d4194cf6809287b4c4a78b07

