In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction import DictVectorizer
from mlflow.tracking import MlflowClient
import mlflow


In [2]:
mlflow.set_tracking_uri("sqlite:///C:/Users/LENOVO/Documents/mlops-zoomcamp/mlflow.db")

mlflow.set_experiment("mlops-zoomcamp-experiment-tracking")

<Experiment: artifact_location='file:///c:/Users/LENOVO/Documents/mlops-zoomcamp/mlops-zoomcamp/01-intro/mlruns/1', creation_time=1716923969588, experiment_id='1', last_update_time=1716923969588, lifecycle_stage='active', name='mlops-zoomcamp-experiment-tracking', tags={}>

Read Dataset

In [3]:
df = pd.read_parquet(r'C:\Users\LENOVO\Documents\mlops-zoomcamp\mlops-zoomcamp\01-intro\data\yellow_tripdata_2023-01.parquet')
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [4]:
#import the val data
val_df = pd.read_parquet(r'C:\Users\LENOVO\Documents\mlops-zoomcamp\mlops-zoomcamp\01-intro\data\yellow_tripdata_2023-02.parquet')


Data Cleaning and Processing

In [5]:
#create the target column for training
df['duration'] = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
#convert duration to minutes
df['duration'] = df.duration.apply(lambda td: td.total_seconds() / 60)
df['duration']

0           8.433333
1           6.316667
2          12.750000
3           9.616667
4          10.833333
             ...    
3066761    13.983333
3066762    19.450000
3066763    24.516667
3066764    13.000000
3066765    14.400000
Name: duration, Length: 3066766, dtype: float64

In [6]:
#create the target column for validation
val_df['duration'] = val_df['tpep_dropoff_datetime'] - val_df['tpep_pickup_datetime']
#convert duration to minutes
val_df['duration'] = val_df.duration.apply(lambda td: td.total_seconds() / 60)


In [7]:
#delete trips that are less then one minute or longer than 60 minutes
total_records = len(df)
new_df = df[(df.duration >= 1) & (df.duration <= 60)]
new_df_records = len(new_df)
fraction =   new_df_records / total_records
print(f'The fraction {fraction}')


The fraction 0.9812202822125979


In [8]:
val_df = val_df[(val_df.duration >= 1) & (val_df.duration <= 60)]


One-Hot Encoding

In [9]:
#convert the two columns to string
category = ['PULocationID', 'DOLocationID']
new_df[category] = new_df[category].astype(str)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df[category] = new_df[category].astype(str)


In [10]:
val_df[category] = val_df[category].astype(str)


In [11]:
#apply one-hot encoding
train_dicts = new_df[category].to_dict(orient='records')
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
y_train = new_df['duration'].values


In [12]:
#apply one-hot encoding on validation data
val_dicts = val_df[category].to_dict(orient='records')

#transform the val data
X_val = dv.transform(val_dicts)

#predict the duration
y_val = val_df['duration'].values



Train and Evalute model

In [13]:
from sklearn.linear_model import LinearRegression
#import root mean square error
from sklearn.metrics import mean_squared_error
# from sklearn.metrics import root_mean_squared_error

from sklearn.linear_model import Lasso


In [14]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope


In [15]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)


Log one metric each time

In [20]:
# Define the objective function for hyperparameter optimization
def objective(params):
    # Start a new MLflow run
    with mlflow.start_run():
        # Set a tag for the model type
        mlflow.set_tag("model", "xgboost")
        
        # Log the hyperparameters being used
        mlflow.log_params(params)
        
        # Train the XGBoost model with the given parameters
        booster = xgb.train(
            params=params,                 # Hyperparameters for the model
            dtrain=train,                  # Training data
            num_boost_round=1000,          # Maximum number of boosting rounds
            evals=[(valid, 'validation')], # Evaluation dataset and its name
            early_stopping_rounds=50       # Early stopping after 50 rounds without improvement
        )
        
        # Make predictions on the validation set
        y_pred = booster.predict(valid)
        
        # Calculate the Root Mean Squared Error (RMSE) on the validation set
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        
        # Log the RMSE metric to MLflow
        mlflow.log_metric("rmse", rmse)

    # Return the RMSE as the loss and the status as successful
    return {'loss': rmse, 'status': STATUS_OK}


In [None]:
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:linear',
    'seed': 42
}

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=5,
    trials=Trials()
)


mlflow autologing

In [21]:
with mlflow.start_run():
    best_params = {
    'max_depth': 50,
    'learning_rate': 1,
    'reg_alpha': 0.1,
    'reg_lambda': 1,
    'min_child_weight':1,
    'objective': 'reg:linear',
    'seed': 42
    }
    
    mlflow.xgboost.autolog()
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=50,
        evals=[(valid, 'validation')],
        early_stopping_rounds=10
    )





[0]	validation-rmse:7.23540
[1]	validation-rmse:6.44067
[2]	validation-rmse:6.00743
[3]	validation-rmse:5.90210
[4]	validation-rmse:5.62946
[5]	validation-rmse:5.56095
[6]	validation-rmse:5.49465
[7]	validation-rmse:5.40951
[8]	validation-rmse:5.31606
[9]	validation-rmse:5.29363
[10]	validation-rmse:5.26692
[11]	validation-rmse:5.25443
[12]	validation-rmse:5.23968
[13]	validation-rmse:5.23724
[14]	validation-rmse:5.23247
[15]	validation-rmse:5.22992
[16]	validation-rmse:5.22767
[17]	validation-rmse:5.22628
[18]	validation-rmse:5.22216
[19]	validation-rmse:5.21956
[20]	validation-rmse:5.21848
[21]	validation-rmse:5.21824
[22]	validation-rmse:5.21743
[23]	validation-rmse:5.21715
[24]	validation-rmse:5.21650
[25]	validation-rmse:5.21608
[26]	validation-rmse:5.21580
[27]	validation-rmse:5.21571
[28]	validation-rmse:5.21574
[29]	validation-rmse:5.21588
[30]	validation-rmse:5.21594
[31]	validation-rmse:5.21596
[32]	validation-rmse:5.21585
[33]	validation-rmse:5.21576
[34]	validation-rmse:5.2



Model Registry

In [23]:
y_train = new_df['duration'].values

with mlflow.start_run():
    #experiment tag
    mlflow.set_tag("experiment tracking", "model registry")
    #log the data path
    mlflow.log_param("train-path", r"mlops-zoomcamp\01-intro\data\yellow_tripdata_2023-01.parquet")
    mlflow.log_param("val-path", r"mlops-zoomcamp\mlops-zoomcamp\01-intro\data\yellow_tripdata_2023-02.parquet")
    # log alpha param
    alpha = 0.1
    mlflow.log_param("alpha", alpha)
    #Train the model
    lr = Lasso(alpha=alpha)
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_val)
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    #log metric
    mlflow.log_metric("rmse", rmse)
    with open(r'model\lasso_reg.bin', 'wb') as f_out:
        pickle.dump((dv, lr), f_out)

    # log model as an artifcat 
    mlflow.log_artifact(local_path="model/lasso_reg.bin", artifact_path="models_pickle")

    

In [24]:
mlflow.xgboost.autolog(disable=True)


In [16]:
with mlflow.start_run():
    
    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    best_params = {
        'learning_rate': 0.09585355369315604,
        'max_depth': 30,
        'min_child_weight': 1.060597050922164,
        'objective': 'reg:linear',
        'reg_alpha': 0.018060244040060163,
        'reg_lambda': 0.011658731377413597,
        'seed': 42
    }

    mlflow.log_params(best_params)

    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=10,
        evals=[(valid, 'validation')],
        early_stopping_rounds=5
    )

    y_pred = booster.predict(valid)
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    mlflow.log_metric("rmse", rmse)

    with open("model/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact("model/preprocessor.b", artifact_path="preprocessor")

    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")




[0]	validation-rmse:9.66700
[1]	validation-rmse:9.32544
[2]	validation-rmse:9.02772
[3]	validation-rmse:8.77833
[4]	validation-rmse:8.52259
[5]	validation-rmse:8.33312
[6]	validation-rmse:8.17951
[7]	validation-rmse:8.00152
[8]	validation-rmse:7.89076
[9]	validation-rmse:7.79476


