## 02-experiment-tracking

#### The goal of this homework is to become familiar with MLflow, a tool for experiment tracking and model management.

In [1]:
# Import required dependencies

import os
import pickle

import click
import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error

#### Q1. Install MLflow

In [2]:
# Display the installed MLflow version (the answer to Q1).
mlflow.__version__

'2.22.0'

In [3]:
#### Q2. Download and preprocess the data

In [3]:
# Use a SQLite database file (mlflow.db) as the MLflow tracking store.
mlflow.set_tracking_uri("sqlite:///mlflow.db")

In [4]:
# Select the experiment by name; the returned Experiment object is displayed as the cell output.
mlflow.set_experiment("New York City Taxi Experiment")

<Experiment: artifact_location='/workspaces/mlops-zoomcamp/02-experiment-tracking/mlruns/1', creation_time=1748319704231, experiment_id='1', last_update_time=1748319704231, lifecycle_stage='active', name='New York City Taxi Experiment', tags={}>

In [6]:
!wget "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet" -P TAXI_DATA_FOLDER

--2025-05-27 05:52:16--  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 3.164.82.112, 3.164.82.160, 3.164.82.197, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|3.164.82.112|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 47673370 (45M) [application/x-www-form-urlencoded]
Saving to: ‘TAXI_DATA_FOLDER/yellow_tripdata_2023-01.parquet’


2025-05-27 05:52:18 (24.6 MB/s) - ‘TAXI_DATA_FOLDER/yellow_tripdata_2023-01.parquet’ saved [47673370/47673370]



In [7]:
!wget "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet" -P TAXI_DATA_FOLDER

--2025-05-27 05:52:19--  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 3.164.82.197, 3.164.82.112, 3.164.82.40, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|3.164.82.197|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 47748012 (46M) [application/x-www-form-urlencoded]
Saving to: ‘TAXI_DATA_FOLDER/yellow_tripdata_2023-02.parquet’


2025-05-27 05:52:21 (23.4 MB/s) - ‘TAXI_DATA_FOLDER/yellow_tripdata_2023-02.parquet’ saved [47748012/47748012]



In [8]:
!wget "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet" -P TAXI_DATA_FOLDER

--2025-05-27 05:52:21--  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 3.164.82.160, 3.164.82.40, 3.164.82.197, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|3.164.82.160|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 56127762 (54M) [binary/octet-stream]
Saving to: ‘TAXI_DATA_FOLDER/yellow_tripdata_2023-03.parquet’


2025-05-27 05:52:24 (25.7 MB/s) - ‘TAXI_DATA_FOLDER/yellow_tripdata_2023-03.parquet’ saved [56127762/56127762]



In [5]:
# Show what is currently in the download folder (order is whatever os.listdir returns).
import os
os.listdir("TAXI_DATA_FOLDER")

['.ipynb_checkpoints',
 'yellow_tripdata_2023-01.parquet',
 'yellow_tripdata_2023-02.parquet',
 'yellow_tripdata_2023-03.parquet']

In [6]:
def read_dataframe(filename: str):
    """Load one parquet file of taxi trips and prepare it for duration modelling.

    Args:
        filename: Path to a parquet file that uses either the yellow
            (``tpep_*``) or the green (``lpep_*``) pickup/dropoff column names.

    Returns:
        A DataFrame restricted to trips lasting between 1 and 60 minutes
        (inclusive), with an added ``duration`` column in minutes and the
        ``PULocationID`` / ``DOLocationID`` columns cast to ``str``.
    """
    df = pd.read_parquet(filename)

    # Handle both yellow and green taxi schema
    pickup_col = 'tpep_pickup_datetime' if 'tpep_pickup_datetime' in df.columns else 'lpep_pickup_datetime'
    dropoff_col = 'tpep_dropoff_datetime' if 'tpep_dropoff_datetime' in df.columns else 'lpep_dropoff_datetime'

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    df['duration'] = (df[dropoff_col] - df[pickup_col]).dt.total_seconds() / 60

    # .copy() makes the filtered frame independent of the unfiltered one, so the
    # column assignment below does not raise SettingWithCopyWarning.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [7]:
# Location and flavour of the downloaded trip files.
raw_data_path = "TAXI_DATA_FOLDER"
dataset = "yellow"

# February is the validation month, March the test month.
df_val = read_dataframe(
    os.path.join(raw_data_path, f"{dataset}_tripdata_2023-02.parquet")
)
df_test = read_dataframe(
    os.path.join(raw_data_path, f"{dataset}_tripdata_2023-03.parquet")
)

In [16]:
# January is the training month; it is loaded in its own cell, separately from the val/test frames.
df_train = read_dataframe(os.path.join(raw_data_path, f"{dataset}_tripdata_2023-01.parquet"))

In [8]:
# Force a garbage-collection pass; the cell output is gc.collect()'s return value.
import gc
gc.collect()

0

In [17]:
# Pull the regression target out of each split as a plain array.
target = 'duration'
y_train, y_val, y_test = (
    frame[target].values for frame in (df_train, df_val, df_test)
)

In [10]:
# Force a garbage-collection pass; the cell output is gc.collect()'s return value.
import gc
gc.collect()

0

In [11]:
import os
import pickle
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
def dump_pickle(obj, filename: str):
    """Serialize ``obj`` with pickle into the file at ``filename``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. Returns ``None``.
    """
    with open(filename, "wb") as sink:
        pickle.dump(obj, sink)

In [12]:
# Folder that receives the pickled artifacts; created if it does not exist yet.
dest_path = "output"
os.makedirs(dest_path, exist_ok=True)

In [13]:
# Create the DictVectorizer; it is not fitted until fit_transform is called on the training dicts.
dv = DictVectorizer()

In [None]:
# NOTE(review): at this point in the notebook `dv` has only been constructed —
# fit_transform is called in a later cell — so this writes an unfitted
# vectorizer to dv.pkl. Confirm the intent, or dump again after fitting.
dump_pickle(dv, os.path.join(dest_path, "dv.pkl"))

In [14]:
# Force a garbage-collection pass; the cell output is gc.collect()'s return value.
import gc
gc.collect()

0

In [18]:
# Build the training feature matrix: one combined pickup/dropoff category plus trip distance.
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
categorical = ['PU_DO']
numerical = ['trip_distance']
dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(dicts)

# Persist the vectorizer now that it is fitted; the earlier dump of dv.pkl happens
# before fit_transform and therefore stores an unfitted (unusable) vectorizer.
dump_pickle(dv, os.path.join(dest_path, "dv.pkl"))

In [19]:
# Force a garbage-collection pass; the cell output is gc.collect()'s return value.
import gc
gc.collect()

0

In [17]:
# Store the training features and targets together as one pickled tuple.
dump_pickle((X_train, y_train), os.path.join(dest_path, "train.pkl"))

In [None]:
# Encode the validation split with the vectorizer `dv` (transform only, no refit).
categorical, numerical = ['PU_DO'], ['trip_distance']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']
dicts = df_val[[*categorical, *numerical]].to_dict(orient='records')
X_val = dv.transform(dicts)

In [None]:
# Store the validation features and targets together as one pickled tuple.
dump_pickle((X_val, y_val), os.path.join(dest_path, "val.pkl"))

In [None]:
# Encode the test split with the vectorizer `dv` (transform only, no refit).
categorical, numerical = ['PU_DO'], ['trip_distance']
df_test['PU_DO'] = df_test['PULocationID'] + '_' + df_test['DOLocationID']
dicts = df_test[[*categorical, *numerical]].to_dict(orient='records')
X_test = dv.transform(dicts)

In [None]:
# Store the test features and targets together as one pickled tuple.
dump_pickle((X_test, y_test), os.path.join(dest_path, "test.pkl"))

In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
import gc
gc.collect()
# run_data_prep comes from the local preprocess_data module, which is not shown here.
from preprocess_data import run_data_prep

# NOTE(review): presumably this performs the same preprocessing as the manual cells
# above (raw folder -> pickles in ./output) — verify against preprocess_data.py.
run_data_prep("./TAXI_DATA_FOLDER", "./output", dataset="yellow")


In [None]:
# Check which files ended up in the output folder.
import os

print("Files in output folder:", os.listdir("./output"))

In [None]:
# Train a Lasso baseline and record its parameters, validation RMSE and pickled
# model in a single MLflow run.
with mlflow.start_run():
    mlflow.set_tag("developer", "cristian")

    # Log the files this notebook actually loads (yellow taxi, 2023-01 / 2023-02).
    mlflow.log_param(
        "train-data-path",
        os.path.join(raw_data_path, f"{dataset}_tripdata_2023-01.parquet"),
    )
    mlflow.log_param(
        "valid-data-path",
        os.path.join(raw_data_path, f"{dataset}_tripdata_2023-02.parquet"),
    )

    alpha = 0.1
    mlflow.log_param("alpha", alpha)

    lr = Lasso(alpha)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)
    # mean_squared_error(..., squared=False) is no longer accepted by recent
    # scikit-learn releases; taking the square root works on every version.
    rmse = float(np.sqrt(mean_squared_error(y_val, y_pred)))
    mlflow.log_metric("rmse", rmse)

    # Write the vectorizer and model to disk first so the artifact being logged exists.
    os.makedirs("models", exist_ok=True)
    with open("models/lin_reg.bin", "wb") as f_out:
        pickle.dump((dv, lr), f_out)
    mlflow.log_artifact(local_path="models/lin_reg.bin", artifact_path="models_pickle")