Dataset used: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page

In [2]:
# mlflow setup

import mlflow

# Select the SQLite-backed tracking store BEFORE choosing the experiment.
# set_experiment() looks the experiment up (or creates it) in whichever store
# is active at call time; calling it first would register the experiment in
# the default store, and later runs sent to SQLite would carry an experiment
# id that store does not know about.
mlflow.set_tracking_uri(uri="sqlite:///mlflow.db")

mlflow.set_experiment("w2-nyc-taxi-experiment")


In [5]:
import pandas as pd
import numpy as np

# Remote source of the training month, kept for reference:
# yellow_taxi = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet")
# The notebook reads a local parquet file instead
# (presumably a downloaded copy of the URL above — TODO confirm).
yellow_taxi = pd.read_parquet("./data/yellow-taxi-2301.parquet")

def duration_in_minutes(df):
    """Return each trip's length in minutes as a float Series.

    The length is the gap between the ``tpep_dropoff_datetime`` and
    ``tpep_pickup_datetime`` columns of ``df``; ``df`` itself is not modified.
    """
    one_minute = pd.Timedelta(minutes=1)
    elapsed = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    # Dividing a timedelta by a one-minute timedelta yields plain floats.
    return elapsed / one_minute

def prepare_data(df, samp: bool=False, frac: float=0.05, random_state: int=42):
    """Keep trips lasting 1 to 60 minutes and attach their duration.

    Parameters
    ----------
    df : DataFrame with ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime``.
        It is left untouched: the ``duration`` column is added only to the
        returned frame, so re-running the cell always starts from the same input.
    samp : when True, return a random sample of the filtered trips.
    frac : fraction of the filtered trips to keep when ``samp`` is True.
    random_state : seed for the sampling, so the subset is reproducible.

    Returns
    -------
    A new DataFrame holding the kept trips plus a ``duration`` column (minutes).
    """
    duration = duration_in_minutes(df)

    # Inclusive on both ends, i.e. 1 <= duration <= 60; missing durations drop out.
    keep = duration.between(1, 60)

    filtered_df = df.loc[keep].copy()
    # Assign positionally so the result does not depend on index alignment.
    filtered_df['duration'] = duration[keep].to_numpy()

    if samp:
        return filtered_df.sample(frac=frac, random_state=random_state)
    return filtered_df
    

# Build the training frame; samp=True makes prepare_data return a
# fixed-seed random sample of the filtered trips rather than all of them.
df = prepare_data(yellow_taxi, samp = True)

In [6]:
# Feature columns: pickup/drop-off location IDs (categorical) and trip distance (numeric).
# NOTE(review): prepare_dict does not read these module-level lists; it carries
# its own column lists.
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

# Both location ID columns must be str type, otherwise the DictVectorizer
# will treat them as numeric and will not one-hot encode them.

def prepare_dict(df, categorical=('PULocationID', 'DOLocationID'), numerical=('trip_distance',)):
    """Turn the feature columns of ``df`` into a list of per-row dicts.

    Parameters
    ----------
    df : DataFrame containing every column named in ``categorical`` and ``numerical``.
        It is not modified; the string cast is applied to a selected copy.
    categorical : columns cast to ``str`` so that a DictVectorizer one-hot
        encodes them instead of treating them as numbers.
    numerical : columns passed through with their existing values.

    Returns
    -------
    A list with one ``{column: value}`` dict per row, categorical keys first.
    """
    categorical = list(categorical)
    numerical = list(numerical)

    # astype returns a new frame, so the caller's columns keep their dtypes.
    features = df[categorical + numerical].astype({col: str for col in categorical})

    return features.to_dict(orient='records')

from sklearn.feature_extraction import DictVectorizer

# One feature dict per sampled training trip
train_dicts = prepare_dict(df)

# Regression target: the duration column added by prepare_data
y_train = df['duration'].values

# Fit the vectorizer on the training dicts; the fitted `vec` is reused
# to transform the validation set
vec = DictVectorizer()
X_train = vec.fit_transform(train_dicts)

X_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 451377 stored elements and shape (150459, 479)>

In [7]:
# Validation set, loaded from a local parquet file
# (remote source kept for reference):
# yellow_taxi_2 = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet')
yellow_taxi_2 = pd.read_parquet("./data/yellow-taxi-2302.parquet")

# Same filtering and sampling as the training frame
df2 = prepare_data(yellow_taxi_2, samp=True)

# Target first, then features encoded with the vectorizer fitted on the training data
y_val = df2['duration'].values
X_val = vec.transform(prepare_dict(df2))

In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

# Baseline: ordinary least squares on the vectorized features
reg = linear_model.LinearRegression()

reg.fit(X_train, y_train)

y_pred = reg.predict(X_val)

# Validation mean squared error
mse = mean_squared_error(y_val, y_pred)

# Bare last expression so the cell actually displays the score
mse

64.79780199657012

# Use Lasso instead

In [8]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

from mlflow.models import infer_signature

# Parameters recorded with the MLflow run below.
# NOTE(review): the notebook reads its data from local ./data/*.parquet files,
# while the paths logged here are remote URLs — confirm they refer to the same data.
params = {
    'train-data-path': 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet',
    'val-data-path': 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet',
    'alpha': 0.1,  # passed to Lasso as its alpha argument
}

# Train a Lasso model inside an MLflow run, recording its parameters,
# validation MSE and a descriptive tag.
with mlflow.start_run():

    # Record the data paths and alpha for this run
    mlflow.log_params(params)

    lasso = linear_model.Lasso(alpha=params['alpha'])
    lasso.fit(X_train, y_train)

    # Score on the validation features
    y_pred = lasso.predict(X_val)

    mse_lasso = mean_squared_error(y_val, y_pred)
    
    mlflow.log_metrics({
        'mse': mse_lasso,
    })
    
    mlflow.set_tag("Training Info", "Demonstrate Mlflow tracking.")
    
    # Model logging is currently disabled.
    # NOTE(review): the snippet below passes `mse_lasso` (the metric value) as
    # `sk_model`; the fitted estimator in this cell is `lasso`. It also passes
    # the whole of X_train as the input example.
    # signature = infer_signature(X_train, lasso.predict(X_train))
    
    # # Log the model
    # model_info = mlflow.sklearn.log_model(
    #     sk_model=mse_lasso,
    #     artifact_path="lr-lasso",
    #     signature=signature,
    #     input_example=X_train,
    #     registered_model_name="lasso-duration",
    # )

In [7]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

# Same values as the `params` dict in the MLflow-tracked Lasso cell;
# this cell refits the model without opening a run.
params = {
    'train-data-path': 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet',
    'val-data-path': 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet',
    'alpha': 0.1,
}

# Fit Lasso on the vectorized training features
lasso = linear_model.Lasso(alpha=params['alpha'])
lasso.fit(X_train, y_train)

In [8]:
# Predictions of the fitted Lasso model for the validation features
lasso.predict(X_val)

array([13.24620498, 21.81315923, 30.08160668, ..., 12.91778725,
       12.86755557, 12.59914871])

# MLflow logging

In [None]:
# Log the fitted Lasso model, its parameters and its validation error to MLflow.
# This cell relies on `params`, `lasso`, `X_train`, `X_val` and `y_val` from the
# cells above, plus the earlier `infer_signature` and `mean_squared_error` imports.
with mlflow.start_run():
    # Log the hyperparameters
    mlflow.log_params(params)

    # Log the loss metric: mean squared error on the validation set
    mlflow.log_metric("mse", mean_squared_error(y_val, lasso.predict(X_val)))

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Training Info", "Lasso model for NYC taxi trip duration")

    # Infer the model signature from the training features and predictions
    signature = infer_signature(X_train, lasso.predict(X_train))

    # Log the model; a handful of rows is enough for the input example,
    # the full training matrix would be stored alongside the model otherwise
    model_info = mlflow.sklearn.log_model(
        sk_model=lasso,
        artifact_path="lr-lasso",
        signature=signature,
        input_example=X_train[:5],
        registered_model_name="lasso-duration",
    )