## NYC Taxi Data Experiment Tracking

In [1]:
# Print the version of the Python interpreter found on the shell PATH.
!python -V

Python 3.11.8


In [2]:
import requests
import pickle

import pandas as pd

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.svm import LinearSVR

from tqdm import tqdm

In [4]:
# Show the working directory; the relative paths used below ('./data', 'mlflow.db') resolve against it.
! pwd

/Users/emelidral/Dev/hs-2024-industrial-ml


In [5]:
!mkdir data

In [7]:
%%bash
# List the working directory contents (e.g. to check that `data` was created).
ls

README.md
[34mdata[m[m
experiment_tracking.ipynb
requirements.txt


In [8]:
# (file name, destination directory) pairs of the trip-record files to download.
files = [('green_tripdata_2023-02.parquet', './data'), ('green_tripdata_2023-01.parquet', './data')]

print("Download files:")
for file, path in files:
    url=f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
    save_path=f"{path}/{file}"
    # Use the streamed response as a context manager so the connection is
    # released even if writing fails half-way through.
    with requests.get(url, stream=True, timeout=60) as resp:
        # Fail fast on 4xx/5xx instead of saving an error page as a .parquet file.
        resp.raise_for_status()
        # Content-Length can be absent; tqdm accepts total=None in that case.
        content_length = resp.headers.get("Content-Length")
        total = int(content_length) if content_length is not None else None
        with open(save_path, "wb") as handle, tqdm(
            desc=f"{file}",
            postfix=f"save to {save_path}",
            total=total,
            unit="B",
            unit_scale=True,
        ) as progress:
            # iter_content() defaults to chunk_size=1, i.e. one Python-level
            # iteration per byte. Read 64 KiB at a time and advance the bar by
            # the number of bytes actually written.
            for chunk in resp.iter_content(chunk_size=64 * 1024):
                handle.write(chunk)
                progress.update(len(chunk))

Download files:


green_tripdata_2023-02.parquet: 100%|█████████████| 1533740/1533740 [00:04<00:00, 367895.71it/s, save to ./data/green_tripdata_2023-02.parquet]
green_tripdata_2023-01.parquet: 100%|█████████████| 1427002/1427002 [00:03<00:00, 390224.96it/s, save to ./data/green_tripdata_2023-01.parquet]


In [9]:
# January file -> training frame, February file -> validation frame (read in that order).
train_raw_data, val_raw_data = map(
    pd.read_parquet,
    ('data/green_tripdata_2023-01.parquet', 'data/green_tripdata_2023-02.parquet'),
)

In [10]:
# Preview the first rows of the training frame.
train_raw_data.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2023-01-01 00:26:10,2023-01-01 00:37:11,N,1.0,166,143,1.0,2.58,14.9,1.0,0.5,4.03,0.0,,1.0,24.18,1.0,1.0,2.75
1,2,2023-01-01 00:51:03,2023-01-01 00:57:49,N,1.0,24,43,1.0,1.81,10.7,1.0,0.5,2.64,0.0,,1.0,15.84,1.0,1.0,0.0
2,2,2023-01-01 00:35:12,2023-01-01 00:41:32,N,1.0,223,179,1.0,0.0,7.2,1.0,0.5,1.94,0.0,,1.0,11.64,1.0,1.0,0.0
3,1,2023-01-01 00:13:14,2023-01-01 00:19:03,N,1.0,41,238,1.0,1.3,6.5,0.5,1.5,1.7,0.0,,1.0,10.2,1.0,1.0,0.0
4,1,2023-01-01 00:33:04,2023-01-01 00:39:02,N,1.0,41,74,1.0,1.1,6.0,0.5,1.5,0.0,0.0,,1.0,8.0,1.0,1.0,0.0


In [11]:
# Column dtypes and non-null counts of the training frame.
train_raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68211 entries, 0 to 68210
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   VendorID               68211 non-null  int64         
 1   lpep_pickup_datetime   68211 non-null  datetime64[us]
 2   lpep_dropoff_datetime  68211 non-null  datetime64[us]
 3   store_and_fwd_flag     63887 non-null  object        
 4   RatecodeID             63887 non-null  float64       
 5   PULocationID           68211 non-null  int64         
 6   DOLocationID           68211 non-null  int64         
 7   passenger_count        63887 non-null  float64       
 8   trip_distance          68211 non-null  float64       
 9   fare_amount            68211 non-null  float64       
 10  extra                  68211 non-null  float64       
 11  mta_tax                68211 non-null  float64       
 12  tip_amount             68211 non-null  float64       
 13  t

In [12]:
def process_dataframe(data):
    """Derive the trip duration and keep only trips lasting 1 to 60 minutes.

    Parameters
    ----------
    data : pandas.DataFrame
        Trip records containing ``lpep_pickup_datetime`` and
        ``lpep_dropoff_datetime`` columns (anything ``pd.to_datetime`` accepts).

    Returns
    -------
    pandas.DataFrame
        A new frame with both timestamp columns parsed, an appended
        ``duration`` column (minutes, float) and only the rows whose duration
        lies in ``[1, 60]``. The input frame is not modified.
    """
    pickup = pd.to_datetime(data.lpep_pickup_datetime)
    dropoff = pd.to_datetime(data.lpep_dropoff_datetime)

    # Vectorised equivalent of applying ``td.total_seconds() / 60`` row by row.
    duration = (dropoff - pickup).dt.total_seconds() / 60

    # ``assign`` builds a new frame, so the caller's DataFrame is left untouched
    # and calling this function repeatedly on the same input is idempotent.
    processed = data.assign(
        lpep_dropoff_datetime=dropoff,
        lpep_pickup_datetime=pickup,
        duration=duration,
    )

    # The location ID columns keep their original dtype: the former
    # ``astype(str, copy=False)`` calls discarded their result, so callers
    # already received the unconverted columns. Keeping that avoids changing
    # what downstream cells are fed.
    in_range = (processed.duration >= 1) & (processed.duration <= 60)
    return processed.loc[in_range]

In [13]:
# Numeric input columns.
num_features = [
    'trip_distance',
    'extra',
    'fare_amount',
]
# Pickup / drop-off location ID columns.
cat_features = [
    'PULocationID',
    'DOLocationID',
]

In [14]:
# Preprocess each split once and derive both features and target from the same
# frame, instead of running process_dataframe twice per dataset.
train_processed = process_dataframe(train_raw_data)
val_processed = process_dataframe(val_raw_data)

X_train = train_processed[num_features + cat_features]
X_val = val_processed[num_features + cat_features]

y_train = train_processed['duration']
y_val = val_processed['duration']

In [16]:
# Count missing values per feature column.
X_train.isnull().sum()

trip_distance    0
extra            0
fare_amount      0
PULocationID     0
DOLocationID     0
dtype: int64

## Simple model

In [18]:
# Baseline: ridge regression with default settings on the raw feature columns.
lr = Ridge()

lr.fit(X_train, y_train)

y_val_pred = lr.predict(X_val)
# Validation RMSE, taken as the square root of the MSE. This avoids the
# `squared=False` argument, which scikit-learn deprecated in 1.4 and removed in
# 1.6, while still working on older releases.
mean_squared_error(y_val, y_val_pred) ** 0.5

5.242476415412269

## MLflow

In [19]:
import mlflow

In [20]:
# Point MLflow at a SQLite database file (mlflow.db, relative to the working directory).
mlflow.set_tracking_uri("sqlite:///mlflow.db")
# Select the "test" experiment; per the logged output it is created when it does not exist yet.
mlflow.set_experiment("test")

2024/02/28 11:37:36 INFO mlflow.tracking.fluent: Experiment with name 'test' does not exist. Creating a new experiment.


<Experiment: artifact_location='/Users/emelidral/Dev/hs-2024-industrial-ml/mlruns/1', creation_time=1709120256919, experiment_id='1', last_update_time=1709120256919, lifecycle_stage='active', name='test', tags={}>

In [21]:
with mlflow.start_run():
    # Tag and parameters recorded with the run: workspace label and data paths.
    mlflow.set_tag("workspace", "inclass")
    mlflow.log_param("train_data_path", "data/green_tripdata_2023-01.parquet")
    mlflow.log_param("vl_data_path", "data/green_tripdata_2023-02.parquet")

    alpha = 0.5
    mlflow.log_param("alpha", alpha)

    # Pass alpha by keyword so the regularisation strength is explicit at the call site.
    lr = Lasso(alpha=alpha)

    lr.fit(X_train, y_train)
    y_val_pred = lr.predict(X_val)

    # RMSE as the square root of the MSE. This avoids `squared=False`, which
    # scikit-learn deprecated in 1.4 and removed in 1.6.
    rmse = mean_squared_error(y_val, y_val_pred) ** 0.5

    mlflow.log_metric("rmse", rmse)

## Hyperparameter Optimisation

In [22]:
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [41]:
# Wrap the feature frames and duration targets in DMatrix objects for xgb.train.
train = xgb.DMatrix(X_train, label=y_train)
val = xgb.DMatrix(X_val, label=y_val)

In [66]:
def objective(params):
    """Train one XGBoost model for a hyperopt trial and log it as an MLflow run.

    Relies on the notebook-level ``train`` / ``val`` DMatrix objects and on
    ``y_val``.

    Parameters
    ----------
    params : dict
        XGBoost training parameters for this trial; logged with
        ``mlflow.log_params`` and forwarded unchanged to ``xgb.train``.

    Returns
    -------
    dict
        ``{'loss': <validation RMSE>, 'status': STATUS_OK}``.
    """
    with mlflow.start_run():
        num_rounds = 200
        early_stopping_rounds = 50

        mlflow.set_tag("workspace", "inclass")
        mlflow.set_tag("model", "xgb_fixed")
        mlflow.log_params(params)
        mlflow.log_param("train_data_path", "data/green_tripdata_2023-01.parquet")
        mlflow.log_param("vl_data_path", "data/green_tripdata_2023-02.parquet")

        booster = xgb.train(
            params=params,
            dtrain=train,
            evals=[(val, "validation")],
            num_boost_round=num_rounds,
            early_stopping_rounds=early_stopping_rounds
        )

        y_val_pred = booster.predict(val)
        # RMSE as the square root of the MSE. This avoids `squared=False`, which
        # scikit-learn deprecated in 1.4 and removed in 1.6.
        rmse = mean_squared_error(y_val, y_val_pred) ** 0.5
        mlflow.log_metric("rmse", rmse)

        return {
            'loss': rmse,
            'status': STATUS_OK,
        }

In [54]:
# (rows, columns) of the training feature frame.
X_train.shape

(65946, 5)

In [67]:
# Hyperopt search space passed to fmin below. Despite the name this is not a
# fixed grid: the hp.* entries are distributions that are sampled per trial,
# while 'objective' and 'seed' are constants passed through to XGBoost.
grid_seach_params = {
    # Integer-valued: quniform draws in steps of 1, scope.int casts the float to int.
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'max_leaves': scope.int(hp.quniform('max_leaves', 6, 10, 1)),
    # loguniform bounds are given in log space, i.e. values fall in [e^-5, e^-1].
    'reg_lambda': hp.loguniform('reg_lambda', -5, -1),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'objective': 'reg:squarederror', # squared-error loss: (y - y_pred)^2
    'seed':111,
}

In [68]:
# Run 10 trials of `objective` over `grid_seach_params` using the TPE suggester;
# every trial opens its own MLflow run inside `objective`.
# NOTE(review): fmin presumably returns the best parameter assignment found, not
# a fitted model, so the name `best_model` may be misleading — confirm.
# NOTE(review): no random state is passed to fmin, so the sampled trials are
# presumably not reproducible across re-runs — confirm.
best_model = fmin(
    fn=objective,
    space=grid_seach_params,
    algo=tpe.suggest,
    max_evals=10,
    trials=Trials()
)

[0]	validation-rmse:7.41054                                                                                                                    
[1]	validation-rmse:6.19673                                                                                                                    
[2]	validation-rmse:5.48130                                                                                                                    
[3]	validation-rmse:5.05621                                                                                                                    
[4]	validation-rmse:4.79038                                                                                                                    
[5]	validation-rmse:4.61307                                                                                                                    
[6]	validation-rmse:4.50720                                                                                                             

## Autolog

In [71]:
# Hard-coded hyperparameters for the final training run.
# NOTE(review): 'max_leaves': 4 lies outside the 6-10 range of the search space
# defined in this notebook — confirm these values really came from the search.
best_params = {
    'max_depth':37,
    'max_leaves': 4,
    'reg_lambda': 0.11373501935624511,
    'reg_alpha': 0.036518723152379994,
    'objective': 'reg:squarederror', 
    'seed':111,
}

# Enable autologging before training; per the cell output MLflow then creates a
# run for the xgb.train call below without an explicit mlflow.start_run().
mlflow.xgboost.autolog()

# Same training setup as in `objective`: up to 200 rounds, early stopping after
# 50 rounds, evaluated on the validation DMatrix.
booster = xgb.train(
    params=best_params,
    dtrain=train,
    evals=[(val, "validation")],
    num_boost_round=200,
    early_stopping_rounds=50
)

2024/02/28 12:42:33 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '1c5ae6ec23d8422fbb6fb0ca76a05307', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


[0]	validation-rmse:7.51286
[1]	validation-rmse:6.36526
[2]	validation-rmse:5.62723
[3]	validation-rmse:5.19291
[4]	validation-rmse:4.92657
[5]	validation-rmse:4.76907
[6]	validation-rmse:4.65459
[7]	validation-rmse:4.58865
[8]	validation-rmse:4.54504
[9]	validation-rmse:4.50514
[10]	validation-rmse:4.47385
[11]	validation-rmse:4.45390
[12]	validation-rmse:4.41816
[13]	validation-rmse:4.38865
[14]	validation-rmse:4.37231
[15]	validation-rmse:4.35497
[16]	validation-rmse:4.34108
[17]	validation-rmse:4.33472
[18]	validation-rmse:4.32534
[19]	validation-rmse:4.31998
[20]	validation-rmse:4.31407
[21]	validation-rmse:4.29691
[22]	validation-rmse:4.28100
[23]	validation-rmse:4.27169
[24]	validation-rmse:4.26724
[25]	validation-rmse:4.25901
[26]	validation-rmse:4.24976
[27]	validation-rmse:4.24290
[28]	validation-rmse:4.24017
[29]	validation-rmse:4.23138
[30]	validation-rmse:4.22622
[31]	validation-rmse:4.22203
[32]	validation-rmse:4.22068
[33]	validation-rmse:4.21702
[34]	validation-rmse:4.2



## Load Model

In [73]:
# Model URI built from the run ID printed by the autologging message of the
# xgb.train cell. The ID is hard-coded, so it presumably has to be replaced
# after re-running that cell against a different tracking store.
logged_model_id = 'runs:/1c5ae6ec23d8422fbb6fb0ca76a05307/model'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model_id)

In [74]:
# Check the wrapper type returned by mlflow.pyfunc.load_model.
type(loaded_model)

mlflow.pyfunc.PyFuncModel

In [77]:
# Predict on the validation features with the reloaded pyfunc model.
y_val_pred_load_model =loaded_model.predict(X_val)

In [78]:
# Validation RMSE of the reloaded model, as the square root of the MSE. This
# avoids `squared=False`, which scikit-learn deprecated in 1.4 and removed in 1.6.
rmse = mean_squared_error(y_val, y_val_pred_load_model) ** 0.5

In [79]:
# Display the RMSE of the reloaded model's validation predictions.
rmse

3.9202611413076642

## MLflow Client

In [80]:
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

In [81]:
# Same SQLite tracking URI that was passed to mlflow.set_tracking_uri earlier in the notebook.
MLFLOW_URI = "sqlite:///mlflow.db"

In [82]:
# Client object for the tracking store at MLFLOW_URI; used by the cells below.
client = MlflowClient(MLFLOW_URI)

In [83]:
# create_experiment() raises when the name is already taken, which breaks a
# re-run against the same mlflow.db. Look the experiment up first and only
# create it when it is missing. The name string is kept verbatim so that an
# experiment created by an earlier execution is found again.
experiment_name = "New experimnet created from the python API"
existing_experiment = client.get_experiment_by_name(experiment_name)
new_experiment_id = (
    existing_experiment.experiment_id
    if existing_experiment is not None
    else client.create_experiment(name=experiment_name)
)
# Last expression of the cell: display the experiment ID, as before.
new_experiment_id

'2'

In [84]:
# Fetch up to 10 active runs of experiment '1' (the "test" experiment per the
# set_experiment output earlier in the notebook).
# experiment_ids is documented as a list of IDs; a bare string only works by
# accident for single-character IDs, so pass a one-element list.
runs = client.search_runs(
    experiment_ids=['1'],
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=10
)

In [100]:
# Validation RMSE of the first returned run. In the listing printed below only
# that (autologged) run has a "validation-rmse" key; the others expose "rmse",
# so this lookup would raise KeyError for them.
runs[0].data.metrics["validation-rmse"]

3.9202611363663564

In [102]:
# One line per run: its ID followed by the full metrics dict. The "rmse=" label
# is only a prefix; the printed value is every metric logged for the run.
for run in runs:
    print(f"run_id={run.info.run_id}, rmse={run.data.metrics}")

run_id=1c5ae6ec23d8422fbb6fb0ca76a05307, rmse={'validation-rmse': 3.9202611363663564, 'stopped_iteration': 199.0, 'best_iteration': 199.0}
run_id=e5f48f4505e54acba7969053f8bdcddb, rmse={'rmse': 3.9202611413076642}
run_id=f11a6decf30a4f81a1716e9705c92984, rmse={'rmse': 4.086666325205767}
run_id=724183a0fa9045f3990723687a72cdf7, rmse={'rmse': 3.8736398834404864}
run_id=c0162ccb9bc9483a892c5a5437b677a6, rmse={'rmse': 3.8796079918639896}
run_id=5e9324c9b56244abbfd49fbc3bb3e6aa, rmse={'rmse': 4.080169670113804}
run_id=0bebd74ae0d94d5cb7b470d09ed43dc8, rmse={'rmse': 3.7728073068070977}
run_id=8e1e5c73e11b43fd965a76c90eda794c, rmse={'rmse': 3.9440387616463783}
run_id=167ba9bbcb6a49f9a18bb05de9d207d5, rmse={'rmse': 3.8736311363642715}
run_id=18d2b51a6bff4f85b2a206c98973cdbf, rmse={'rmse': 3.6881730905210497}
