In [1]:
!python -V

Python 3.13.7


In [2]:
import pandas as pd

In [3]:
import os
import pickle

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import root_mean_squared_error

In [6]:
import mlflow


# Tracking backend (a SQLite database file) and the experiment that every run
# in this notebook is recorded under.
tracking_uri = "sqlite:///mlflow.db"
experiment_name = "nyc-taxi-experiment"

mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)

2026/02/18 15:21:51 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.schemas
2026/02/18 15:21:51 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.tables
2026/02/18 15:21:51 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.types
2026/02/18 15:21:51 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.constraints
2026/02/18 15:21:51 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.defaults
2026/02/18 15:21:51 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.comments
2026/02/18 15:21:51 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/02/18 15:21:51 INFO alembic.runtime.migration: Will assume non-transactional DDL.


<Experiment: artifact_location='/Users/dmytro_khodyriev/mlops/02-experiments-tracking/mlruns/1', creation_time=1771418929655, experiment_id='1', last_update_time=1771418929655, lifecycle_stage='active', name='nyc-taxi-experiment', tags={'mlflow.experimentKind': 'custom_model_development'}>

In [7]:
def read_dataframe(filename):
    """Load a trip-record parquet file and prepare it for modelling.

    Adds a ``duration`` column holding the trip length in minutes (computed
    from ``lpep_pickup_datetime`` and ``lpep_dropoff_datetime``), keeps only
    trips lasting between 1 and 60 minutes inclusive, and casts the
    ``PULocationID`` / ``DOLocationID`` columns to strings.

    Args:
        filename: Path of the parquet file, passed to ``pd.read_parquet``.

    Returns:
        A new, independent ``DataFrame`` containing the filtered trips.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion instead of a per-row apply.
    trip_time = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Copy the filtered rows explicitly so the assignment below (and any later
    # column additions by callers) write to an independent frame rather than
    # to a slice of the unfiltered one (avoids SettingWithCopyWarning).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [8]:
# Training and validation sets come from two separate monthly files.
train_path = './data/green_tripdata_2021-01.parquet'
val_path = './data/green_tripdata_2021-02.parquet'

df_train, df_val = (read_dataframe(path) for path in (train_path, val_path))

In [9]:
# Number of rows in the training and validation frames.
df_train.shape[0], df_val.shape[0]

(73908, 61921)

In [10]:
# Join the pickup and drop-off location ids into one combined column so that
# each pickup/drop-off pair can be treated as a single category.
for trips in (df_train, df_val):
    trips['PU_DO'] = trips['PULocationID'] + '_' + trips['DOLocationID']

In [11]:
# Feature columns: the combined pickup/drop-off pair plus the trip distance.
categorical = ['PU_DO']
numerical = ['trip_distance']
feature_columns = categorical + numerical

dv = DictVectorizer()

train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

# Fit the vectorizer on the training records only; the validation records are
# transformed with the mapping learned from the training set.
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [12]:
# Regression target arrays for the training and validation sets.
target = 'duration'
y_train, y_val = (frame[target].values for frame in (df_train, df_val))

In [13]:
# Baseline model: plain linear regression on the vectorized features.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_val)

# Validation RMSE, shown as the cell output.
root_mean_squared_error(y_val, y_pred)

7.758715203343134

In [14]:
# Create the output directory if needed so this cell also works on a fresh
# checkout, then persist the fitted vectorizer together with the model.
os.makedirs('models', exist_ok=True)
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

In [15]:
with mlflow.start_run():

    mlflow.set_tag("developer", "cristian")

    # Log the files that were actually loaded: the data is read from parquet
    # files, not csv.
    mlflow.log_param("train-data-path", "./data/green_tripdata_2021-01.parquet")
    mlflow.log_param("valid-data-path", "./data/green_tripdata_2021-02.parquet")

    alpha = 0.1
    mlflow.log_param("alpha", alpha)
    lr = Lasso(alpha=alpha)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Attach the model this run actually trained. Previously the earlier
    # LinearRegression pickle was logged, which did not match the alpha and
    # rmse recorded for this run.
    os.makedirs("models", exist_ok=True)
    with open("models/lasso.bin", "wb") as f_out:
        pickle.dump((dv, lr), f_out)
    mlflow.log_artifact(local_path="models/lasso.bin", artifact_path="models_pickle")

In [16]:
import xgboost as xgb

In [17]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

  import pkg_resources


In [18]:
# Wrap the feature matrices and their targets in XGBoost DMatrix objects.
train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=X_val, label=y_val)

In [19]:
def objective(params):
    """Train one XGBoost model for a hyperopt trial and report its loss.

    Every call runs inside its own MLflow run, tagged ``model=xgboost``, in
    which ``params`` and the validation RMSE are logged.

    Args:
        params: XGBoost training parameters for this trial.

    Returns:
        A dict with the validation RMSE under ``'loss'`` and ``STATUS_OK``
        under ``'status'``.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        # At most 1000 boosting rounds, with early stopping (50 rounds)
        # monitored on the validation set.
        model = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50,
        )

        validation_rmse = root_mean_squared_error(y_val, model.predict(valid))
        mlflow.log_metric("rmse", validation_rmse)

        return {'loss': validation_rmse, 'status': STATUS_OK}

In [20]:
# Hyperparameter search space. The hp.loguniform bounds are given in log
# space, e.g. learning_rate is drawn from [exp(-3), exp(0)].
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # 'reg:linear' is a deprecated alias; 'reg:squarederror' is the current
    # name of the same squared-error regression objective.
    'objective': 'reg:squarederror',
    'seed': 42
}

# Run 50 trials of TPE-guided search; each trial calls objective() once.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

[0]	validation-rmse:9.13007                           
[1]	validation-rmse:7.70912                           
  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

  self.starting_round = model.num_boosted_rounds()



[2]	validation-rmse:7.09163                           
[3]	validation-rmse:6.83387                           
[4]	validation-rmse:6.71288                           
[5]	validation-rmse:6.65135                           
[6]	validation-rmse:6.61885                           
[7]	validation-rmse:6.59900                           
[8]	validation-rmse:6.58870                           
[9]	validation-rmse:6.58444                           
[10]	validation-rmse:6.57652                          
[11]	validation-rmse:6.57029                          
[12]	validation-rmse:6.56719                          
[13]	validation-rmse:6.56294                          
[14]	validation-rmse:6.55904                          
[15]	validation-rmse:6.55486                          
[16]	validation-rmse:6.55113                          
[17]	validation-rmse:6.54898                          
[18]	validation-rmse:6.54622                          
[19]	validation-rmse:6.54352                          
[20]	valid

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:11.08744                                                   
[1]	validation-rmse:10.16257                                                   
[2]	validation-rmse:9.41181                                                    
[3]	validation-rmse:8.80256                                                    
[4]	validation-rmse:8.31483                                                    
[5]	validation-rmse:7.92405                                                    
[6]	validation-rmse:7.61771                                                    
[7]	validation-rmse:7.37561                                                    
[8]	validation-rmse:7.18268                                                    
[9]	validation-rmse:7.03099                                                    
[10]	validation-rmse:6.91226                                                   
[11]	validation-rmse:6.81913                                                   
[12]	validation-rmse:6.74656            

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:11.34864                                                   
[1]	validation-rmse:10.60350                                                   
[2]	validation-rmse:9.96230                                                    
[3]	validation-rmse:9.41481                                                    
[4]	validation-rmse:8.94825                                                    
[5]	validation-rmse:8.55231                                                    
[6]	validation-rmse:8.21941                                                    
[7]	validation-rmse:7.93875                                                    
[8]	validation-rmse:7.70319                                                    
[9]	validation-rmse:7.50704                                                    
[10]	validation-rmse:7.34260                                                   
[11]	validation-rmse:7.20634                                                   
[12]	validation-rmse:7.09261            

  self.starting_round = model.num_boosted_rounds()



[4]	validation-rmse:10.12770                                                   
[5]	validation-rmse:9.80944                                                    
[6]	validation-rmse:9.51897                                                    
[7]	validation-rmse:9.25402                                                    
[8]	validation-rmse:9.01297                                                    
[9]	validation-rmse:8.79392                                                    
[10]	validation-rmse:8.59509                                                   
[11]	validation-rmse:8.41471                                                   
[12]	validation-rmse:8.25128                                                   
[13]	validation-rmse:8.10353                                                   
[14]	validation-rmse:7.96997                                                   
[15]	validation-rmse:7.84950                                                   
[16]	validation-rmse:7.74017            

  self.starting_round = model.num_boosted_rounds()



[7]	validation-rmse:7.88109                                                    
[8]	validation-rmse:7.66873                                                    
[9]	validation-rmse:7.49451                                                    
[10]	validation-rmse:7.35331                                                   
[11]	validation-rmse:7.23697                                                   
[12]	validation-rmse:7.14242                                                   
[13]	validation-rmse:7.06507                                                   
[14]	validation-rmse:7.00151                                                   
[15]	validation-rmse:6.94929                                                   
[16]	validation-rmse:6.90510                                                   
[17]	validation-rmse:6.86881                                                   
[18]	validation-rmse:6.83929                                                   
[19]	validation-rmse:6.81432            

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:11.13841                                                   
[1]	validation-rmse:10.24903                                                   
[2]	validation-rmse:9.51843                                                    
[3]	validation-rmse:8.92420                                                    
[4]	validation-rmse:8.44189                                                    
[5]	validation-rmse:8.05444                                                    
[6]	validation-rmse:7.74383                                                    
[7]	validation-rmse:7.49652                                                    
[8]	validation-rmse:7.30052                                                    
[9]	validation-rmse:7.14500                                                    
[10]	validation-rmse:7.02129                                                   
[11]	validation-rmse:6.92218                                                   
[12]	validation-rmse:6.84433            

  self.starting_round = model.num_boosted_rounds()



[1]	validation-rmse:8.23688                                                    
[2]	validation-rmse:7.48249                                                    
[3]	validation-rmse:7.09148                                                    
[4]	validation-rmse:6.89067                                                    
[5]	validation-rmse:6.77667                                                    
[6]	validation-rmse:6.71127                                                    
[7]	validation-rmse:6.67109                                                    
[8]	validation-rmse:6.64542                                                    
[9]	validation-rmse:6.62570                                                    
[10]	validation-rmse:6.61286                                                   
[11]	validation-rmse:6.59920                                                   
[12]	validation-rmse:6.59025                                                   
[13]	validation-rmse:6.58337            

  self.starting_round = model.num_boosted_rounds()



[1]	validation-rmse:6.60400                                                    
[2]	validation-rmse:6.58708                                                    
[3]	validation-rmse:6.58092                                                    
[4]	validation-rmse:6.57265                                                    
[5]	validation-rmse:6.56141                                                    
[6]	validation-rmse:6.55376                                                    
[7]	validation-rmse:6.54915                                                    
[8]	validation-rmse:6.53843                                                    
[9]	validation-rmse:6.52925                                                    
[10]	validation-rmse:6.52351                                                   
[11]	validation-rmse:6.51660                                                   
[12]	validation-rmse:6.50816                                                   
[13]	validation-rmse:6.50207            

  self.starting_round = model.num_boosted_rounds()



[1]	validation-rmse:9.40794                                                    
[2]	validation-rmse:8.55900                                                    
[3]	validation-rmse:7.96150                                                    
[4]	validation-rmse:7.54580                                                    
[5]	validation-rmse:7.25988                                                    
[6]	validation-rmse:7.06065                                                    
[7]	validation-rmse:6.92219                                                    
[8]	validation-rmse:6.82523                                                    
[9]	validation-rmse:6.75597                                                    
[10]	validation-rmse:6.70477                                                   
[11]	validation-rmse:6.66872                                                   
[12]	validation-rmse:6.63877                                                   
[13]	validation-rmse:6.61717            

  self.starting_round = model.num_boosted_rounds()



[14]	validation-rmse:6.72399                                                   
[15]	validation-rmse:6.72151                                                   
[16]	validation-rmse:6.71976                                                   
[17]	validation-rmse:6.71623                                                   
[18]	validation-rmse:6.71393                                                   
[19]	validation-rmse:6.71361                                                   
[20]	validation-rmse:6.70885                                                   
[21]	validation-rmse:6.70579                                                   
[22]	validation-rmse:6.70424                                                   
[23]	validation-rmse:6.70228                                                   
[24]	validation-rmse:6.70045                                                   
[25]	validation-rmse:6.70020                                                   
[26]	validation-rmse:6.69815            

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:11.73084                                                    
[1]	validation-rmse:11.28535                                                    
[2]	validation-rmse:10.87511                                                    
[3]	validation-rmse:10.49615                                                    
[4]	validation-rmse:10.14740                                                    
[5]	validation-rmse:9.82693                                                     
[6]	validation-rmse:9.53286                                                     
[7]	validation-rmse:9.26341                                                     
[8]	validation-rmse:9.01736                                                     
[9]	validation-rmse:8.79069                                                     
[10]	validation-rmse:8.58541                                                    
[11]	validation-rmse:8.39700                                                    
[12]	validation-rmse:8.22623

  self.starting_round = model.num_boosted_rounds()



[4]	validation-rmse:7.80692                                                     
[5]	validation-rmse:7.48889                                                     
[6]	validation-rmse:7.25914                                                     
[7]	validation-rmse:7.09440                                                     
[8]	validation-rmse:6.97564                                                     
[9]	validation-rmse:6.88855                                                     
[10]	validation-rmse:6.82472                                                    
[11]	validation-rmse:6.77622                                                    
[12]	validation-rmse:6.74063                                                    
[13]	validation-rmse:6.71448                                                    
[14]	validation-rmse:6.69591                                                    
[15]	validation-rmse:6.67908                                                    
[16]	validation-rmse:6.66386

  self.starting_round = model.num_boosted_rounds()



[8]	validation-rmse:6.67285                                                     
[9]	validation-rmse:6.66208                                                     
[10]	validation-rmse:6.65885                                                    
[11]	validation-rmse:6.65199                                                    
[12]	validation-rmse:6.64965                                                    
[13]	validation-rmse:6.64637                                                    
[14]	validation-rmse:6.64114                                                    
[15]	validation-rmse:6.63835                                                    
[16]	validation-rmse:6.63541                                                    
[17]	validation-rmse:6.63176                                                    
[18]	validation-rmse:6.62660                                                    
[19]	validation-rmse:6.62533                                                    
[20]	validation-rmse:6.62298

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:11.37329                                                    
[1]	validation-rmse:10.64581                                                    
[2]	validation-rmse:10.01770                                                    
[3]	validation-rmse:9.47891                                                     
[4]	validation-rmse:9.01847                                                     
[5]	validation-rmse:8.62652                                                     
[6]	validation-rmse:8.29370                                                     
[7]	validation-rmse:8.01289                                                     
[8]	validation-rmse:7.77637                                                     
[9]	validation-rmse:7.57659                                                     
[10]	validation-rmse:7.40887                                                    
[11]	validation-rmse:7.26935                                                    
[12]	validation-rmse:7.15215

  self.starting_round = model.num_boosted_rounds()



[4]	validation-rmse:8.80930                                                     
[5]	validation-rmse:8.43013                                                     
[6]	validation-rmse:8.11825                                                     
[7]	validation-rmse:7.86345                                                     
[8]	validation-rmse:7.65424                                                     
[9]	validation-rmse:7.48375                                                     
[10]	validation-rmse:7.34558                                                    
[11]	validation-rmse:7.23251                                                    
[12]	validation-rmse:7.14087                                                    
[13]	validation-rmse:7.06637                                                    
[14]	validation-rmse:7.00566                                                    
[15]	validation-rmse:6.95596                                                    
[16]	validation-rmse:6.91384

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:11.37562                                                    
[1]	validation-rmse:10.64953                                                    
[2]	validation-rmse:10.02208                                                    
[3]	validation-rmse:9.48235                                                     
[4]	validation-rmse:9.01878                                                     
[5]	validation-rmse:8.62307                                                     
[6]	validation-rmse:8.28754                                                     
[7]	validation-rmse:8.00382                                                     
[8]	validation-rmse:7.76260                                                     
[9]	validation-rmse:7.56112                                                     
[10]	validation-rmse:7.39097                                                    
[11]	validation-rmse:7.24920                                                    
[12]	validation-rmse:7.12916

  self.starting_round = model.num_boosted_rounds()



[2]	validation-rmse:9.56143                                                     
[3]	validation-rmse:8.97402                                                     
[4]	validation-rmse:8.49883                                                     
[5]	validation-rmse:8.11544                                                     
[6]	validation-rmse:7.80723                                                     
[7]	validation-rmse:7.56116                                                     
[8]	validation-rmse:7.36567                                                     
[9]	validation-rmse:7.20997                                                     
[10]	validation-rmse:7.08508                                                    
[11]	validation-rmse:6.98593                                                    
[12]	validation-rmse:6.90693                                                    
[13]	validation-rmse:6.84348                                                    
[14]	validation-rmse:6.79327

  self.starting_round = model.num_boosted_rounds()



[2]	validation-rmse:10.43070                                                    
[3]	validation-rmse:9.96508                                                     
[4]	validation-rmse:9.55171                                                     
[5]	validation-rmse:9.18774                                                     
[6]	validation-rmse:8.86690                                                     
[7]	validation-rmse:8.58763                                                     
[8]	validation-rmse:8.34068                                                     
[9]	validation-rmse:8.12533                                                     
[10]	validation-rmse:7.93679                                                    
[11]	validation-rmse:7.77216                                                    
[12]	validation-rmse:7.63031                                                    
[13]	validation-rmse:7.50781                                                    
[14]	validation-rmse:7.39779

  self.starting_round = model.num_boosted_rounds()



[3]	validation-rmse:6.99374                                                     
[4]	validation-rmse:6.82277                                                     
[5]	validation-rmse:6.73231                                                     
[6]	validation-rmse:6.67996                                                     
[7]	validation-rmse:6.64864                                                     
[8]	validation-rmse:6.62894                                                     
[9]	validation-rmse:6.61751                                                     
[10]	validation-rmse:6.60961                                                    
[11]	validation-rmse:6.60524                                                    
[12]	validation-rmse:6.59771                                                    
[13]	validation-rmse:6.59497                                                    
[14]	validation-rmse:6.59148                                                    
[15]	validation-rmse:6.58752

  self.starting_round = model.num_boosted_rounds()



[1]	validation-rmse:11.29599                                                    
[2]	validation-rmse:10.88917                                                    
[3]	validation-rmse:10.51398                                                    
[4]	validation-rmse:10.16825                                                    
[5]	validation-rmse:9.85051                                                     
[6]	validation-rmse:9.55863                                                     
[7]	validation-rmse:9.29062                                                     
[8]	validation-rmse:9.04505                                                     
[9]	validation-rmse:8.82055                                                     
[10]	validation-rmse:8.61525                                                    
[11]	validation-rmse:8.42761                                                    
[12]	validation-rmse:8.25607                                                    
[13]	validation-rmse:8.10027

KeyboardInterrupt: 

In [21]:
# Disable MLflow's XGBoost autologging; presumably so the next run records
# only what it logs explicitly. Confirm this matches the intent.
mlflow.xgboost.autolog(disable=True)

In [22]:
with mlflow.start_run():

    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    # Hard-coded hyperparameters for the final model.
    # NOTE(review): presumably copied from the best hyperopt trial; confirm.
    best_params = {
        'learning_rate': 0.09585355369315604,
        'max_depth': 30,
        'min_child_weight': 1.060597050922164,
        # 'reg:linear' is a deprecated alias; 'reg:squarederror' is the
        # current name of the same squared-error regression objective.
        'objective': 'reg:squarederror',
        'reg_alpha': 0.018060244040060163,
        'reg_lambda': 0.011658731377413597,
        'seed': 42
    }

    mlflow.log_params(best_params)

    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )

    y_pred = booster.predict(valid)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Save the fitted DictVectorizer next to the model so the same feature
    # encoding can be reproduced at prediction time. The directory is created
    # first so this also works on a fresh checkout.
    os.makedirs("models", exist_ok=True)
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")

[0]	validation-rmse:11.44482


  self.starting_round = model.num_boosted_rounds()


[1]	validation-rmse:10.77202
[2]	validation-rmse:10.18363
[3]	validation-rmse:9.67396
[4]	validation-rmse:9.23166
[5]	validation-rmse:8.84808
[6]	validation-rmse:8.51883
[7]	validation-rmse:8.23597
[8]	validation-rmse:7.99320
[9]	validation-rmse:7.78709
[10]	validation-rmse:7.61022
[11]	validation-rmse:7.45952
[12]	validation-rmse:7.33049
[13]	validation-rmse:7.22098
[14]	validation-rmse:7.12713
[15]	validation-rmse:7.04752
[16]	validation-rmse:6.98005
[17]	validation-rmse:6.92232
[18]	validation-rmse:6.87112
[19]	validation-rmse:6.82740
[20]	validation-rmse:6.78995
[21]	validation-rmse:6.75792
[22]	validation-rmse:6.72994
[23]	validation-rmse:6.70547
[24]	validation-rmse:6.68390
[25]	validation-rmse:6.66421
[26]	validation-rmse:6.64806
[27]	validation-rmse:6.63280
[28]	validation-rmse:6.61924
[29]	validation-rmse:6.60773
[30]	validation-rmse:6.59777
[31]	validation-rmse:6.58875
[32]	validation-rmse:6.58107
[33]	validation-rmse:6.57217
[34]	validation-rmse:6.56557
[35]	validation-rmse:



In [23]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import LinearSVR

mlflow.sklearn.autolog()

# Train several scikit-learn regressors on the same features, one MLflow run
# per model class, so their validation RMSE can be compared.
for model_class in (RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, LinearSVR):

    with mlflow.start_run():

        # Log the files that were actually loaded: the data is read from
        # parquet files, not csv.
        mlflow.log_param("train-data-path", "./data/green_tripdata_2021-01.parquet")
        mlflow.log_param("valid-data-path", "./data/green_tripdata_2021-02.parquet")
        mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

        # All four estimators are stochastic; a fixed seed (the same value
        # used for the XGBoost models) keeps the logged RMSE reproducible.
        mlmodel = model_class(random_state=42)
        mlmodel.fit(X_train, y_train)

        y_pred = mlmodel.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)
        

