In [1]:
import pandas as pd

In [2]:
import sklearn

In [3]:

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import root_mean_squared_error


In [4]:
import mlflow
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("my_experiment")

print(f"Tracking URI: {mlflow.get_tracking_uri()}")
print(f"Experiment ID: {mlflow.get_experiment_by_name('my_experiment').experiment_id}")

2025/05/10 14:41:46 INFO mlflow.tracking.fluent: Experiment with name 'my_experiment' does not exist. Creating a new experiment.


Tracking URI: sqlite:///mlflow.db
Experiment ID: 1


In [4]:
df = pd.read_parquet('../data/yellow_tripdata_2023-01.parquet')


In [5]:
len(df.columns)

19

In [8]:
df.duration.describe()

count    3.066766e+06
mean     1.566900e+01
std      4.259435e+01
min     -2.920000e+01
25%      7.116667e+00
50%      1.151667e+01
75%      1.830000e+01
max      1.002918e+04
Name: duration, dtype: float64

In [None]:
42.59
std      4.259435e+01

In [7]:
df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

In [9]:
df = df[(df.duration >= 1) & (df.duration <= 60)]


In [10]:
df.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee', 'duration'],
      dtype='object')

In [10]:
len(df)

3009173

In [11]:
3009173/3066766

0.9812202822125979

In [12]:
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

df[categorical] = df[categorical].astype(str) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[categorical] = df[categorical].astype(str)


In [13]:
train_dicts = df[categorical].to_dict(orient='records')

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [15]:
X_train.shape

(3009173, 515)

In [16]:
target = 'duration'
y_train = df[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

root_mean_squared_error(y_train, y_pred)

7.649261932106969

In [None]:
df

In [None]:
df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

In [None]:
df = df[(df.duration >= 1) & (df.duration <= 60)]

categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

df[categorical] = df[categorical].astype(str)

In [None]:
train_dicts = df[categorical + numerical].to_dict(orient='records')

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

target = 'duration'
y_train = df[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

root_mean_squared_error(y_train, y_pred)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.distplot(y_pred, label='prediction')
sns.distplot(y_train, label='actual')

plt.legend()

In [8]:
!pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-20.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Downloading pyarrow-20.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (42.3 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 MB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m
[?25hInstalling collected packages: pyarrow
Successfully installed pyarrow-20.0.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [9]:
!pip install fastparquet

Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.0 kB)
Downloading fastparquet-2024.11.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m[31m6.7 MB/s[0m eta [36m0:00:01[0m
[?25hDownloading cramjam-2.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: cramjam, fastparquet
Successfully installed cramjam-2.10.0 fastparquet-2024.11.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49

Q6. Evaluating the model


In [5]:
import pandas as pd
def read_dataframe(filename):
    df = pd.read_parquet(filename, columns=['trip_distance','tpep_dropoff_datetime', 'tpep_pickup_datetime', 'PULocationID', 'DOLocationID'])

    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [6]:
df_train = read_dataframe('../01-intro/data/yellow_tripdata_2023-01.parquet')
df_val = read_dataframe('../01-intro/data/yellow_tripdata_2023-02.parquet')

In [7]:
len(df_train), len(df_val)


(3009173, 2855951)

In [9]:
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

In [7]:
categorical =  ['PULocationID', 'DOLocationID'] #['PU_DO']
numerical = ['trip_distance']

dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [8]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [12]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

root_mean_squared_error(y_val, y_pred)


7.820259863004852

In [12]:
import pickle
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

In [12]:
!pwd

/workspaces/mlops-zoomcamp/02-experiment-tracking


In [9]:
with mlflow.start_run():
    mlflow.set_tag("developer", "Duman")

    mlflow.log_param("train_path", "data/yellow_tripdata_2023-01.parquet")
    mlflow.log_param("val_path", "data/yellow_tripdata_2023-02.parquet")

    alpha = 0.01
    mlflow.log_param("alpha", alpha)
    lr = Lasso(alpha)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)

    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

In [11]:
import xgboost as xgb


In [12]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [16]:
from sklearn.metrics import mean_squared_error


In [13]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [14]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [15]:
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:linear',
    'seed': 42
}

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

  self.starting_round = model.num_boosted_rounds()



[0]	validation-rmse:7.97700                           
[1]	validation-rmse:6.61226                           
[2]	validation-rmse:5.77668                           
[3]	validation-rmse:5.27266                           
[4]	validation-rmse:4.98983                           
[5]	validation-rmse:4.82649                           
[6]	validation-rmse:4.73911                           
[7]	validation-rmse:4.68806                           
[8]	validation-rmse:4.66142                           
[9]	validation-rmse:4.64668                           
[10]	validation-rmse:4.63765                          
[11]	validation-rmse:4.63391                          
[12]	validation-rmse:4.63184                          
[13]	validation-rmse:4.63152                          
[14]	validation-rmse:4.63180                          
[15]	validation-rmse:4.63248                          
[16]	validation-rmse:4.63361                          
[17]	validation-rmse:4.63525                          
[18]	valid

job exception: name 'mean_squared_error' is not defined



  0%|          | 0/50 [03:39<?, ?trial/s, best loss=?]


NameError: name 'mean_squared_error' is not defined

In [17]:
mlflow.xgboost.autolog(disable=True)


In [21]:
import pickle

In [22]:
with mlflow.start_run():
    
    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    best_params = {
        'learning_rate': 0.09585355369315604,
        'max_depth': 30,
        'min_child_weight': 1.060597050922164,
        'objective': 'reg:linear',
        'reg_alpha': 0.018060244040060163,
        'reg_lambda': 0.011658731377413597,
        'seed': 42
    }

    mlflow.log_params(best_params)

    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )

    y_pred = booster.predict(valid)
    rmse = mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")

  self.starting_round = model.num_boosted_rounds()


[0]	validation-rmse:9.34798
[1]	validation-rmse:8.71461
[2]	validation-rmse:8.15750
[3]	validation-rmse:7.66878
[4]	validation-rmse:7.24278
[5]	validation-rmse:6.87279
[6]	validation-rmse:6.55280
[7]	validation-rmse:6.27351
[8]	validation-rmse:6.03615
[9]	validation-rmse:5.83215
[10]	validation-rmse:5.65607
[11]	validation-rmse:5.50742
[12]	validation-rmse:5.38152
[13]	validation-rmse:5.27467
[14]	validation-rmse:5.18091
[15]	validation-rmse:5.10423
[16]	validation-rmse:5.03863
[17]	validation-rmse:4.98333
[18]	validation-rmse:4.93645
[19]	validation-rmse:4.89665
[20]	validation-rmse:4.86284
[21]	validation-rmse:4.83233
[22]	validation-rmse:4.80801
[23]	validation-rmse:4.78633
[24]	validation-rmse:4.76663
[25]	validation-rmse:4.75023
[26]	validation-rmse:4.73620
[27]	validation-rmse:4.72441
[28]	validation-rmse:4.71360
[29]	validation-rmse:4.70176
[30]	validation-rmse:4.69350
[31]	validation-rmse:4.68619
[32]	validation-rmse:4.67969
[33]	validation-rmse:4.67432
[34]	validation-rmse:4.6

  xgb_model.save_model(model_data_path)


In [23]:
dv

In [1]:
from mlflow.tracking import MlflowClient
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)


In [3]:
client

<mlflow.tracking.client.MlflowClient at 0x7bc89287db50>

In [4]:
client.search_experiments()

[<Experiment: artifact_location='/workspaces/mlops-zoomcamp/02-experiment-tracking/mlruns/1', creation_time=1746888106375, experiment_id='1', last_update_time=1746888106375, lifecycle_stage='active', name='my_experiment', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1746885449570, experiment_id='0', last_update_time=1746885449570, lifecycle_stage='active', name='Default', tags={}>]

In [5]:
client.create_experiment("cool_experiment")

'2'

In [15]:
from mlflow.entities import ViewType

runs = client.search_runs(
    experiment_ids=["1"],
    filter_string="",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"]
)

In [17]:
runs

[<Run: data=<RunData: metrics={'rmse': 8.137900681701906}, params={'alpha': '0.01',
  'train_path': 'data/yellow_tripdata_2023-01.parquet',
  'val_path': 'data/yellow_tripdata_2023-02.parquet'}, tags={'developer': 'Duman',
  'mlflow.runName': 'orderly-duck-662',
  'mlflow.source.name': '/home/codespace/.local/lib/python3.12/site-packages/ipykernel_launcher.py',
  'mlflow.source.type': 'LOCAL',
  'mlflow.user': 'codespace'}>, info=<RunInfo: artifact_uri='/workspaces/mlops-zoomcamp/02-experiment-tracking/mlruns/1/8716abab92a94e249e04ae8df65193e2/artifacts', end_time=1746888188859, experiment_id='1', lifecycle_stage='active', run_id='8716abab92a94e249e04ae8df65193e2', run_name='orderly-duck-662', run_uuid='8716abab92a94e249e04ae8df65193e2', start_time=1746888162840, status='FINISHED', user_id='codespace'>, inputs=<RunInputs: dataset_inputs=[]>>,
 <Run: data=<RunData: metrics={'rmse': 20.71977313948911}, params={'learning_rate': '0.09585355369315604',
  'max_depth': '30',
  'min_child_weig

In [19]:
for run in runs:
    print(run.info.run_id, run.data.metrics["rmse"])

8716abab92a94e249e04ae8df65193e2 8.137900681701906
5cba55ecfda14699b6eaca4639c03510 20.71977313948911
d84ce5ef4a2b4728bf007a62daafd164 20.71977313948911


KeyError: 'rmse'

'2.22.0'

In [None]:
import mlflow
mlflow.set_tracking_uri("sqlite:///mlflow.db")

In [21]:
run_id = "8716abab92a94e249e04ae8df65193e2"
model_uri = f"runs:/{run_id}/models"
  

In [22]:
mlflow.register_model(model_uri=model_uri,name="xgboost_model")

Successfully registered model 'xgboost_model'.
Created version '1' of model 'xgboost_model'.


<ModelVersion: aliases=[], creation_timestamp=1747909333457, current_stage='None', description=None, last_updated_timestamp=1747909333457, name='xgboost_model', run_id='8716abab92a94e249e04ae8df65193e2', run_link=None, source='/workspaces/mlops-zoomcamp/02-experiment-tracking/mlruns/1/8716abab92a94e249e04ae8df65193e2/artifacts/models', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [26]:
client.search_registered_models()


[<RegisteredModel: aliases={}, creation_timestamp=1747909333444, description=None, last_updated_timestamp=1747909333457, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1747909333457, current_stage='None', description=None, last_updated_timestamp=1747909333457, name='xgboost_model', run_id='8716abab92a94e249e04ae8df65193e2', run_link=None, source='/workspaces/mlops-zoomcamp/02-experiment-tracking/mlruns/1/8716abab92a94e249e04ae8df65193e2/artifacts/models', status='READY', status_message=None, tags={}, user_id=None, version=1>], name='xgboost_model', tags={}>]

In [27]:
model_name = "xgboost_model"
client.get_latest_versions(model_name)

  client.get_latest_versions(model_name)


[<ModelVersion: aliases=[], creation_timestamp=1747909333457, current_stage='None', description=None, last_updated_timestamp=1747909333457, name='xgboost_model', run_id='8716abab92a94e249e04ae8df65193e2', run_link=None, source='/workspaces/mlops-zoomcamp/02-experiment-tracking/mlruns/1/8716abab92a94e249e04ae8df65193e2/artifacts/models', status='READY', status_message=None, tags={}, user_id=None, version=1>]

In [28]:
client.transition_model_version_stage(
    name=model_name,
    version=1,
    stage="Staging",
    archive_existing_versions=False
)


  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1747909333457, current_stage='Staging', description=None, last_updated_timestamp=1747910534478, name='xgboost_model', run_id='8716abab92a94e249e04ae8df65193e2', run_link=None, source='/workspaces/mlops-zoomcamp/02-experiment-tracking/mlruns/1/8716abab92a94e249e04ae8df65193e2/artifacts/models', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [29]:
from sklearn.metrics import mean_squared_error
import pandas as pd


def read_dataframe(filename):
    df = pd.read_csv(filename)

    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df


def preprocess(df, dv):
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    categorical = ['PU_DO']
    numerical = ['trip_distance']
    train_dicts = df[categorical + numerical].to_dict(orient='records')
    return dv.transform(train_dicts)


def test_model(name, stage, X_test, y_test):
    model = mlflow.pyfunc.load_model(f"models:/{name}/{stage}")
    y_pred = model.predict(X_test)
    return {"rmse": mean_squared_error(y_test, y_pred, squared=False)}

In [None]:
df = read_dataframe("data/green_tripdata_2021-03.csv")


In [None]:
client.download_artifacts(run_id=run_id, path='preprocessor', dst_path='.')


In [None]:
import pickle

with open("preprocessor/preprocessor.b", "rb") as f_in:
    dv = pickle.load(f_in)

In [None]:
X_test = preprocess(df, dv)
target = "duration"
y_test = df[target].values

In [None]:
%time test_model(name=model_name, stage="Production", X_test=X_test, y_test=y_test)


In [None]:
%time test_model(name=model_name, stage="Staging", X_test=X_test, y_test=y_test)


In [None]:
client.transition_model_version_stage(
    name=model_name,
    version=4,
    stage="Production",
    archive_existing_versions=True
)