In [None]:
%load_ext autoreload
%autoreload 2

# MLflow Regression Pipeline Notebook

This notebook runs the MLflow Regression Pipeline on Databricks and inspects its results. For more information about the MLflow Regression Pipeline, including usage examples, see the [Regression Pipeline overview documentation](https://mlflow.org/docs/latest/pipelines.html#regression-pipeline) and the [Regression Pipeline API documentation](https://mlflow.org/docs/latest/python_api/mlflow.pipelines.html#module-mlflow.pipelines.regression.v1.pipeline).

In [12]:
from mlflow.pipelines import Pipeline

p = Pipeline(profile="local")

2022/09/11 15:10:24 INFO mlflow.pipelines.pipeline: Creating MLflow Pipeline 'sklearn_regression' with profile: 'local'


In [None]:
p.clean()

In [None]:
p.inspect()

In [None]:
p.ingest(location="./data/sample.parquet", format="parquet")

In [13]:
# NOTE(review): if the ratios map positionally onto (train, val, test), only 10% of the data
# is used for training and 80% is held out for testing — confirm this is intended.
train_df, val_df, test_df = p.split([0.1, 0.1, 0.8])

In [24]:
%%mlp_code steps/transform.py

"""
This module defines the following routines used by the 'transform' step of the regression pipeline:

- ``transformer_fn``: Defines customizable logic for transforming input data before it is passed
  to the estimator during model inference.
"""

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline as SkPipeline

from pandas import DataFrame
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer

def calculate_features(df: DataFrame):
    """
    Return a copy of ``df`` extended with pickup day of week, pickup hour, and trip duration
    in minutes, with the pickup and dropoff datetime columns removed.

    The input frame is not modified, so the function can be called repeatedly on the same
    data without failing on the already-dropped datetime columns.

    :param df: DataFrame holding datetime-typed ``tpep_pickup_datetime`` and
               ``tpep_dropoff_datetime`` columns; any other columns are passed through.
    :return: A new DataFrame with ``pickup_dow`` (Monday=0), ``pickup_hour`` and
             ``trip_duration`` (minutes, float) appended and the two datetime columns dropped.
    :raises KeyError: If either datetime column is missing from ``df``.
    """
    pickup = df["tpep_pickup_datetime"]
    dropoff = df["tpep_dropoff_datetime"]
    # ``assign`` / ``drop`` build a new frame instead of writing into the caller's one.
    enriched = df.assign(
        pickup_dow=pickup.dt.dayofweek,
        pickup_hour=pickup.dt.hour,
        # Vectorised timedelta -> minutes; avoids a per-row Python lambda.
        trip_duration=(dropoff - pickup).dt.total_seconds() / 60,
    )
    return enriched.drop(columns=["tpep_pickup_datetime", "tpep_dropoff_datetime"])


def transformer_fn():
    """
    Build and return an *unfitted* two-stage scikit-learn pipeline.

    Stage one applies ``calculate_features`` through a ``FunctionTransformer``. Stage two is a
    ``ColumnTransformer`` that one-hot encodes ``pickup_hour`` and ``pickup_dow`` and
    standard-scales ``trip_distance`` and ``trip_duration``. The returned object exposes the
    usual ``fit()`` / ``transform()`` methods.
    """
    # Stage 1: derive the time-of-pickup and duration columns.
    feature_builder = FunctionTransformer(calculate_features, feature_names_out="one-to-one")

    # Stage 2: per-column encoders, listed as (name, transformer, columns).
    column_encoders = [
        ("hour_encoder", OneHotEncoder(categories="auto", sparse=False), ["pickup_hour"]),
        ("day_encoder", OneHotEncoder(categories="auto", sparse=False), ["pickup_dow"]),
        ("std_scaler", StandardScaler(), ["trip_distance", "trip_duration"]),
    ]
    encoder = ColumnTransformer(transformers=column_encoders)

    pipeline_steps = [
        ("calculate_time_and_duration_features", feature_builder),
        ("encoder", encoder),
    ]
    return SkPipeline(steps=pipeline_steps)

In [25]:
# NOTE(review): the dotted path appears to point at ``transformer_fn`` in steps/transform.py,
# the file named by the %%mlp_code cell above — confirm that cell has been run first.
p.transform("steps.transform.transformer_fn")

Name,Type
tpep_pickup_datetime,datetime64[ns]
tpep_dropoff_datetime,datetime64[ns]
trip_distance,float64
fare_amount,float64
pickup_zip,int32
dropoff_zip,int32

Name,Type
hour_encoder__pickup_hour_0,float64
hour_encoder__pickup_hour_1,float64
hour_encoder__pickup_hour_2,float64
hour_encoder__pickup_hour_3,float64
hour_encoder__pickup_hour_4,float64
hour_encoder__pickup_hour_5,float64
hour_encoder__pickup_hour_6,float64
hour_encoder__pickup_hour_7,float64
hour_encoder__pickup_hour_8,float64
hour_encoder__pickup_hour_9,float64


In [None]:
p.run("train")

In [None]:
p.run("evaluate")

In [None]:
p.run("register")

In [None]:
p.inspect("train")

In [None]:
test_data = p.get_artifact("test_data")
test_data.describe()

In [None]:
trained_model = p.get_artifact("model")
print(trained_model)