## NYC Taxi Data Experimnt Tracking

In [15]:
xgb.config(verbo)

[0;31mType:[0m        module
[0;31mString form:[0m <module 'xgboost.config' from '/opt/homebrew/Caskroom/miniconda/base/envs/m13/lib/python3.11/site-packages/xgboost/config.py'>
[0;31mFile:[0m        /opt/homebrew/Caskroom/miniconda/base/envs/m13/lib/python3.11/site-packages/xgboost/config.py
[0;31mSource:[0m     
[0;31m# pylint: disable=missing-function-docstring[0m[0;34m[0m
[0;34m[0m[0;34m"""Global configuration for XGBoost"""[0m[0;34m[0m
[0;34m[0m[0;32mimport[0m [0mctypes[0m[0;34m[0m
[0;34m[0m[0;32mimport[0m [0mjson[0m[0;34m[0m
[0;34m[0m[0;32mfrom[0m [0mcontextlib[0m [0;32mimport[0m [0mcontextmanager[0m[0;34m[0m
[0;34m[0m[0;32mfrom[0m [0mfunctools[0m [0;32mimport[0m [0mwraps[0m[0;34m[0m
[0;34m[0m[0;32mfrom[0m [0mtyping[0m [0;32mimport[0m [0mAny[0m[0;34m,[0m [0mCallable[0m[0;34m,[0m [0mDict[0m[0;34m,[0m [0mIterator[0m[0;34m,[0m [0mOptional[0m[0;34m,[0m [0mcast[0m[0;34m[0m
[0;34m[0m[0;34m[0m


In [None]:
!python -V

In [4]:
import requests
import pickle

import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.svm import LinearSVR

from tqdm import tqdm

In [None]:
! mkdir data

In [3]:
files = [('green_tripdata_2024-03.parquet', './data'), ('green_tripdata_2024-04.parquet', './data')]

print("Download files:")
for file, path in files:
    url=f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
    resp=requests.get(url, stream=True)
    save_path=f"{path}/{file}"
    with open(save_path, "wb") as handle:
        for data in tqdm(resp.iter_content(),
                        desc=f"{file}",
                        postfix=f"save to {save_path}",
                        total=int(resp.headers["Content-Length"])):
            handle.write(data)

Download files:


green_tripdata_2024-03.parquet: 100%|█| 1372372/1372372 [00:03<00:00, 442488.81it/s, save 
green_tripdata_2024-04.parquet: 100%|█| 1346502/1346502 [00:03<00:00, 433990.51it/s, save 


In [6]:
train_raw_data = pd.read_parquet('data/green_tripdata_2024-03.parquet')
val_raw_data = pd.read_parquet('data/green_tripdata_2024-04.parquet')

In [7]:
train_raw_data.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2024-03-01 00:10:52,2024-03-01 00:26:12,N,1.0,129,226,1.0,1.72,12.8,1.0,0.5,3.06,0.0,,1.0,18.36,1.0,1.0,0.0
1,2,2024-03-01 00:22:21,2024-03-01 00:35:15,N,1.0,130,218,1.0,3.25,17.7,1.0,0.5,0.0,0.0,,1.0,20.2,2.0,1.0,0.0
2,2,2024-03-01 00:45:27,2024-03-01 01:04:32,N,1.0,255,107,2.0,4.58,23.3,1.0,0.5,3.5,0.0,,1.0,32.05,1.0,1.0,2.75
3,1,2024-03-01 00:02:00,2024-03-01 00:23:45,N,1.0,181,71,1.0,0.0,22.5,0.0,1.5,0.0,0.0,,1.0,24.0,1.0,1.0,0.0
4,2,2024-03-01 00:16:45,2024-03-01 00:23:25,N,1.0,95,135,1.0,1.15,8.6,1.0,0.5,1.0,0.0,,1.0,12.1,1.0,1.0,0.0


In [8]:
def process_dataframe(data):
    data.lpep_dropoff_datetime = pd.to_datetime(data.lpep_dropoff_datetime)
    data.lpep_pickup_datetime = pd.to_datetime(data.lpep_pickup_datetime)

    data['duration'] = data.lpep_dropoff_datetime - data.lpep_pickup_datetime
    data.duration = data.duration.apply(lambda td: td.total_seconds() / 60)
    data = data[(data.duration >= 1) & (data.duration <= 90)]
    
    data['PULocationID'].astype(str, copy=False)
    data['DOLocationID'].astype(str, copy=False)
    
    return data

In [9]:
num_features = ['trip_distance', 'extra', 'fare_amount']
cat_features = ['PULocationID', 'DOLocationID']

In [10]:
X_train = process_dataframe(train_raw_data)[num_features + cat_features]
X_val = process_dataframe(val_raw_data)[num_features + cat_features] 

y_train = process_dataframe(train_raw_data)['duration']
y_val = process_dataframe(val_raw_data)['duration'] 

In [7]:
X_val.isnull().sum()

trip_distance    0
extra            0
fare_amount      0
PULocationID     0
DOLocationID     0
dtype: int64

## Simple Experimnet 

In [8]:
lr = Ridge()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

mean_squared_error(y_val, y_pred, squared=False)



5.907885082410718

In [9]:
np.mean(y_pred)

14.481896678592031

## MLflow tracking

In [8]:
import mlflow

In [9]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("green_taxi")

<Experiment: artifact_location='/Users/emelidral/Dev/m13_2024/experiments/mlruns/1', creation_time=1721062613079, experiment_id='1', last_update_time=1721062613079, lifecycle_stage='active', name='green_taxi', tags={}>

In [12]:
!ls -la  /Users/emelidral/Dev/m13_2024/experiments/mlruns/

total 0
drwxr-xr-x  4 emelidral  staff  128 15 Jul 20:12 [34m.[m[m
drwxr-xr-x  8 emelidral  staff  256 16 Jul 12:47 [34m..[m[m
drwxr-xr-x  2 emelidral  staff   64 15 Jul 20:12 [34m.trash[m[m
drwxr-xr-x  6 emelidral  staff  192 15 Jul 20:15 [34m1[m[m


In [25]:
! pwd

/Users/emelidral/Dev/m13_2024/experiments


In [13]:
with mlflow.start_run():
    mlflow.set_tag("workspace", "inclass")
    mlflow.log_param("model_name", "Lasso")
    mlflow.log_param("train_data", "data/green_tripdata_2024-03.parquet")

    alpha = 0.05
    lr = Lasso(alpha=alpha)

    mlflow.log_param("alpha", alpha)
    
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_val)

    rmse = mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)    

In [39]:
from hyperopt import fmin, tpe, hp
best = fmin(fn=lambda x: x ** 2,
    space=hp.uniform('x', -10, 10),
    algo=tpe.suggest,
    max_evals=100)
print(best)

100%|███████████| 100/100 [00:00<00:00, 2156.55trial/s, best loss: 1.3992400834395533e-05]
{'x': 0.0037406417677178783}


In [42]:
!pip install xgboost



In [2]:
import xgboost as xgb

In [3]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [11]:
train = xgb.DMatrix(X_train, label=y_train)
validation = xgb.DMatrix(X_val, label=y_val)

In [12]:
validation.get_label()

array([16.666666,  9.9     , 11.166667, ..., 11.2     , 12.      ,
        7.      ], dtype=float32)

In [13]:
grid_search = {
    'max_depth':scope.int(hp.quniform('max_depth', 4, 50, 1)),
    'reg_alpha':hp.loguniform('reg_alpha', -5, -1)
}

In [31]:
def objective(params):
    with mlflow.start_run():
        num_boost_round = 100
        early_stopping_rounds = 50
        mlflow.log_param('num_boost_round', num_boost_round) 
        mlflow.log_param('early_stopping_rounds', early_stopping_rounds) 
        mlflow.log_params(params)
        
        booster = xgb.train(
            params=params,
            dtrain=train,
            evals=[(validation, "validation")],
            num_boost_round=num_boost_round,
            early_stopping_rounds=early_stopping_rounds  
        )

        y_pred = booster.predict(validation)
        rmse = mean_squared_error(y_val, y_pred)
        mlflow.log_metric('rmse', rmse)
        return {'loss':rmse, 'status':STATUS_OK}

In [36]:
best = fmin(
    fn = objective,
    space=grid_search,
    algo=tpe.suggest,
    max_evals=10,
    trials = Trials())

[0]	validation-rmse:7.66969                                                               
[1]	validation-rmse:6.10889                                                               
[2]	validation-rmse:5.19475                                                               
[3]	validation-rmse:4.66806                                                               
[4]	validation-rmse:4.38434                                                               
[5]	validation-rmse:4.25302                                                               
[6]	validation-rmse:4.20246                                                               
[7]	validation-rmse:4.18548                                                               
[8]	validation-rmse:4.19285                                                               
[9]	validation-rmse:4.20390                                                               
[10]	validation-rmse:4.21793                                                              

In [41]:
best_params = {
    "max_depth":6,
    "reg_alpha":0.07465666333107646
}

mlflow.xgboost.autolog()

booster = xgb.train(
            params=best_params,
            dtrain=train,
            evals=[(validation, "validation")],
            num_boost_round=100,
            early_stopping_rounds=50  
        )

2024/07/15 20:15:11 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '156bf8a4b1474266bad2244e85f60161', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


[0]	validation-rmse:7.75239
[1]	validation-rmse:6.23769
[2]	validation-rmse:5.31209
[3]	validation-rmse:4.78718
[4]	validation-rmse:4.47202
[5]	validation-rmse:4.30772
[6]	validation-rmse:4.17385
[7]	validation-rmse:4.10846
[8]	validation-rmse:4.05886
[9]	validation-rmse:4.03666
[10]	validation-rmse:4.00153
[11]	validation-rmse:3.97287
[12]	validation-rmse:3.96543
[13]	validation-rmse:3.94011
[14]	validation-rmse:3.93313
[15]	validation-rmse:3.92033
[16]	validation-rmse:3.92146
[17]	validation-rmse:3.92034
[18]	validation-rmse:3.90524
[19]	validation-rmse:3.90141
[20]	validation-rmse:3.89943
[21]	validation-rmse:3.89284
[22]	validation-rmse:3.89436
[23]	validation-rmse:3.88783
[24]	validation-rmse:3.88382
[25]	validation-rmse:3.88711
[26]	validation-rmse:3.88973
[27]	validation-rmse:3.89018
[28]	validation-rmse:3.89055
[29]	validation-rmse:3.89335
[30]	validation-rmse:3.87959
[31]	validation-rmse:3.87042
[32]	validation-rmse:3.86457
[33]	validation-rmse:3.86259
[34]	validation-rmse:3.8



## Load Model

In [14]:
logged_model_id = 'runs:/156bf8a4b1474266bad2244e85f60161/model'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model_id)

In [15]:
type(loaded_model)

mlflow.pyfunc.PyFuncModel

In [16]:
y_val_pred_load_model = loaded_model.predict(X_val)

In [17]:
rmse = mean_squared_error(y_val, y_val_pred_load_model, squared=False)
rmse



3.8768980813085387

## Mlflow Client

In [18]:
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

In [19]:
MLFLOW_URI = "sqlite:///mlflow.db"

In [20]:
client = MlflowClient(MLFLOW_URI)

In [23]:
client.create_experiment(name = "New experimnet created from the python API")

'3'

In [24]:
client.get_experiment_by_name("green_taxi")

<Experiment: artifact_location='/Users/emelidral/Dev/m13_2024/experiments/mlruns/1', creation_time=1721062613079, experiment_id='1', last_update_time=1721062613079, lifecycle_stage='active', name='green_taxi', tags={}>

In [28]:
runs = client.search_runs(
    experiment_ids='1',
    max_results=10
)

In [29]:
len(runs)

10

In [30]:
for run in runs:
    print(run)

<Run: data=<RunData: metrics={'rmse': 34.90734333193174}, params={'alpha': '0.05',
 'model_name': 'Lasso',
 'train_data': 'data/green_tripdata_2024-03.parquet'}, tags={'mlflow.runName': 'angry-hare-180',
 'mlflow.source.name': '/opt/homebrew/Caskroom/miniconda/base/envs/m13/lib/python3.11/site-packages/ipykernel_launcher.py',
 'mlflow.source.type': 'LOCAL',
 'mlflow.user': 'emelidral',
 'workspace': 'inclass'}>, info=<RunInfo: artifact_uri='/Users/emelidral/Dev/m13_2024/experiments/mlruns/1/1e8c2ca3d01f4a0592a3589818e6504e/artifacts', end_time=1721133662165, experiment_id='1', lifecycle_stage='active', run_id='1e8c2ca3d01f4a0592a3589818e6504e', run_name='angry-hare-180', run_uuid='1e8c2ca3d01f4a0592a3589818e6504e', start_time=1721133662088, status='FINISHED', user_id='emelidral'>, inputs=<RunInputs: dataset_inputs=[]>>
<Run: data=<RunData: metrics={'best_iteration': 55.0,
 'stopped_iteration': 99.0,
 'validation-rmse': 3.8391727821808983}, params={'custom_metric': 'None',
 'early_stopp

In [31]:
for run in runs:
    print(f"run_id={run.info.run_id}, rmse={run.data.metrics}")

run_id=1e8c2ca3d01f4a0592a3589818e6504e, rmse={'rmse': 34.90734333193174}
run_id=156bf8a4b1474266bad2244e85f60161, rmse={'validation-rmse': 3.8391727821808983, 'stopped_iteration': 99.0, 'best_iteration': 55.0}
run_id=ddf21f2b704447a09222f1288179c56e, rmse={'validation-rmse': 4.0366592713094285}
run_id=eb5dce3d3d3b4d388ce1ec2892d83611, rmse={'validation-rmse': 3.8391727821808983, 'stopped_iteration': 99.0, 'best_iteration': 55.0}
run_id=c0568075b89646b692652e001a39a74a, rmse={'validation-rmse': 3.8391727821808983, 'stopped_iteration': 99.0, 'best_iteration': 55.0}
run_id=d7972c0e514a4833bf6c127fe38fea18, rmse={'rmse': 18.019460270301373}
run_id=2d75d87af3fe4c559c00145ae2cfe881, rmse={'rmse': 18.347664694490177}
run_id=2e04b61c569a438fa4321d0762f077d7, rmse={'rmse': 15.102428116729694}
run_id=d81c610f0ac543ac81a88ea8b290a933, rmse={'rmse': 18.191008022858792}
run_id=5459119afb674356ac12ce570597bf18, rmse={'rmse': 14.63216015221906}


In [56]:
for run in runs:
    if 'validation-rmse' in run.data.metrics:
        print(f"run_id={run.info.run_id}, rmse={run.data.metrics['validation-rmse']}")

run_id=156bf8a4b1474266bad2244e85f60161, rmse=3.8391727821808983


In [57]:
logged_model_id = 'runs:/156bf8a4b1474266bad2244e85f60161/model'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model_id)