## NYC Taxi Data Experimnt Tracking

In [1]:
!python -V

Python 3.9.12


In [2]:
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [3]:
train_raw_data = pd.read_parquet('green_tripdata_2021-01.parquet')
val_raw_data = pd.read_parquet('green_tripdata_2021-02.parquet')

In [4]:
train_raw_data.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2021-01-01 00:15:56,2021-01-01 00:19:52,N,1.0,43,151,1.0,1.01,5.5,0.5,0.5,0.0,0.0,,0.3,6.8,2.0,1.0,0.0
1,2,2021-01-01 00:25:59,2021-01-01 00:34:44,N,1.0,166,239,1.0,2.53,10.0,0.5,0.5,2.81,0.0,,0.3,16.86,1.0,1.0,2.75
2,2,2021-01-01 00:45:57,2021-01-01 00:51:55,N,1.0,41,42,1.0,1.12,6.0,0.5,0.5,1.0,0.0,,0.3,8.3,1.0,1.0,0.0
3,2,2020-12-31 23:57:51,2021-01-01 00:04:56,N,1.0,168,75,1.0,1.99,8.0,0.5,0.5,0.0,0.0,,0.3,9.3,2.0,1.0,0.0
4,2,2021-01-01 00:16:36,2021-01-01 00:16:40,N,2.0,265,265,3.0,0.0,-52.0,0.0,-0.5,0.0,0.0,,-0.3,-52.8,3.0,1.0,0.0


In [5]:
train_raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76518 entries, 0 to 76517
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   VendorID               76518 non-null  int64         
 1   lpep_pickup_datetime   76518 non-null  datetime64[ns]
 2   lpep_dropoff_datetime  76518 non-null  datetime64[ns]
 3   store_and_fwd_flag     40471 non-null  object        
 4   RatecodeID             40471 non-null  float64       
 5   PULocationID           76518 non-null  int64         
 6   DOLocationID           76518 non-null  int64         
 7   passenger_count        40471 non-null  float64       
 8   trip_distance          76518 non-null  float64       
 9   fare_amount            76518 non-null  float64       
 10  extra                  76518 non-null  float64       
 11  mta_tax                76518 non-null  float64       
 12  tip_amount             76518 non-null  float64       
 13  t

In [6]:
def process_dataframe(data):
    data.lpep_dropoff_datetime = pd.to_datetime(data.lpep_dropoff_datetime)
    data.lpep_pickup_datetime = pd.to_datetime(data.lpep_pickup_datetime)

    data['duration'] = data.lpep_dropoff_datetime - data.lpep_pickup_datetime
    data.duration = data.duration.apply(lambda td: td.total_seconds() / 60)
    data = data[(data.duration >= 1) & (data.duration <= 60)]
    
    data['PULocationID'].astype(str, copy=False)
    data['DOLocationID'].astype(str, copy=False)
    
    return data

In [7]:
num_features = ['trip_distance', 'extra', 'fare_amount']
cat_features = ['PULocationID', 'DOLocationID']

In [8]:
X_train = process_dataframe(train_raw_data)[num_features + cat_features]
X_val = process_dataframe(val_raw_data)[num_features + cat_features] 

y_train = process_dataframe(train_raw_data)['duration']
y_val = process_dataframe(val_raw_data)['duration'] 

In [35]:
x_dict = X_train.to_dict()

In [36]:
x_dict

{'trip_distance': {0: 1.01,
  1: 2.53,
  2: 1.12,
  3: 1.99,
  7: 0.45,
  9: 12.19,
  10: 3.39,
  11: 6.69,
  12: 2.34,
  13: 5.48,
  14: 0.9,
  16: 2.08,
  17: 4.64,
  18: 1.68,
  19: 0.68,
  20: 2.7,
  21: 29.07,
  23: 2.78,
  24: 2.25,
  25: 0.0,
  26: 1.03,
  27: 0.65,
  28: 2.51,
  29: 7.57,
  30: 3.64,
  31: 0.63,
  32: 1.5,
  33: 2.83,
  34: 5.03,
  35: 3.13,
  36: 2.24,
  37: 2.11,
  38: 1.6,
  39: 0.77,
  40: 1.46,
  41: 5.19,
  42: 10.05,
  44: 0.77,
  46: 4.1,
  47: 1.07,
  48: 0.0,
  49: 0.0,
  50: 2.86,
  51: 3.56,
  52: 0.59,
  53: 5.82,
  54: 2.2,
  55: 1.96,
  56: 8.7,
  57: 2.14,
  58: 0.0,
  59: 0.0,
  60: 1.3,
  61: 5.83,
  63: 4.69,
  64: 2.01,
  65: 0.43,
  66: 0.95,
  67: 2.73,
  68: 1.55,
  69: 0.57,
  70: 3.52,
  72: 0.0,
  73: 5.09,
  74: 4.76,
  75: 6.67,
  76: 6.25,
  77: 0.09,
  78: 28.26,
  80: 9.18,
  81: 0.0,
  82: 0.0,
  83: 1.01,
  84: 1.22,
  85: 1.18,
  86: 3.61,
  87: 0.62,
  88: 3.43,
  92: 1.65,
  93: 1.03,
  94: 0.0,
  95: 0.64,
  96: 0.65,
  97: 

In [33]:
new_x_frame = pd.DataFrame.from_dict(x_dict)

In [34]:
new_x_frame

Unnamed: 0,trip_distance,extra,fare_amount,PULocationID,DOLocationID
0,1.01,0.50,5.50,43,151
1,2.53,0.50,10.00,166,239
2,1.12,0.50,6.00,41,42
3,1.99,0.50,8.00,168,75
7,0.45,0.50,3.50,75,75
...,...,...,...,...,...
76513,17.63,2.75,56.23,81,90
76514,18.36,0.00,46.66,35,213
76515,2.50,2.75,18.95,74,69
76516,14.48,2.75,48.87,168,215


In [None]:
X_val.isnull().sum()

## Simple Experimnet 

In [None]:
lr = Ridge()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

mean_squared_error(y_val, y_pred, squared=False)

## MLflow tracking

In [None]:
import mlflow

In [None]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-data-experiment")

In [None]:
with mlflow.start_run():
    mlflow.set_tag("workspace", "in_class")
    mlflow.log_param("train_data_name", "green_tripdata_2021-01.parquet")
    mlflow.log_param("validation_data_name", "green_tripdata_2021-02.parquet")
    
    alpha = 0.99
    mlflow.log_param("alpha", alpha)
    
    lr = Lasso(alpha)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    
    mlflow.log_metric("rmse", rmse)

## Hyperparameters Optimization

In [None]:
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [None]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-data-experiment")

In [None]:
train = xgb.DMatrix(X_train, label=y_train)
validation = xgb.DMatrix(X_val, label=y_val)

In [None]:
def objective(params):
    with mlflow.start_run():
        num_boost_round = 500
        early_stopping_rounds = 50
        
        mlflow.log_params(params)
        mlflow.log_param('num_boost_round', num_boost_round)
        mlflow.log_param('early_stopping_rounds', early_stopping_rounds)
        mlflow.log_param('train_data_name', 'green_tripdata_2021-01.parquet')
        mlflow.log_param('validation_data_name', 'green_tripdata_2021-02.parquet')
        mlflow.set_tag('model', 'xgboost')

        booster = xgb.train(
            params = params,
            dtrain = train,
            evals = [(validation, "validation")],
            num_boost_round = num_boost_round,
            early_stopping_rounds = early_stopping_rounds
        )
        
        y_pred = booster.predict(validation)
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        mlflow.log_metric('rmse', rmse)
        return {'loss': rmse, 'status': STATUS_OK}

In [None]:
grid_search = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child': hp.loguniform('min_child', -1, 3),
    'seed': 111,
    'objective': 'reg:linear'
}

In [None]:
best_model = fmin(
    fn=objective,
    space=grid_search,
    algo=tpe.suggest,
    max_evals=30,
    trials=Trials()
)

## Train the Best Model

In [None]:
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [None]:
import mlflow

In [None]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-data-experiment")

In [None]:
#We took the best params from the MLflow interface and copien them here

best_params = {
    'max_depth': 5,
    'min_child': 19.345653147972058,
    'objective': 'reg:linear',
    'reg_alpha': 0.031009193638004067,
    'reg_lambda': 0.013053945835415701,
    'seed': 111
}

mlflow.xgboost.autolog()

booster = xgb.train(
    params = best_params,
    dtrain = train,
    evals = [(validation, "validation")],
    num_boost_round = 500,
    early_stopping_rounds = 50,
)

In [15]:

best_params = {
    'max_depth': 5,
    'min_child': 19.345653147972058,
    'objective': 'reg:linear',
    'reg_alpha': 0.031009193638004067,
    'reg_lambda': 0.013053945835415701,
    'seed': 111
}

In [22]:
key="seedfgg"
if best_params.get(key):
    print(best_params.get(key))
else:
    print("no value")

no value


In [29]:
pd.DataFrame.from_dict([best_params])

Unnamed: 0,max_depth,min_child,objective,reg_alpha,reg_lambda,seed
0,5,19.345653,reg:linear,0.031009,0.013054,111


In [None]:
y_pred = booster.predict(validation)

rmse = mean_squared_error(y_val, y_pred, squared=False)
rmse

## Model Logging 

In [None]:
import pickle

In [None]:
with open('models/moodel.bin', 'wb') as f_out:
    pickle.dump(booster, f_out)

In [None]:
with open('preprocessing/process_dataframe.bin', 'wb') as f_out:
    pickle.dump(process_dataframe, f_out)

In [None]:
mlflow.set_experiment("nyc-data-experiment")
with mlflow.start_run():
    best_params = {
    'max_depth': 5,
    'min_child': 19.345653147972058,
    'objective': 'reg:linear',
    'reg_alpha': 0.031009193638004067,
    'reg_lambda': 0.013053945835415701,
    'seed': 111
    }
    
    mlflow.log_params(best_params)
    mlflow.log_param('train_data_name', 'green_tripdata_2021-01.parquet')
    mlflow.log_param('validation_data_name', 'green_tripdata_2021-02.parquet')
    mlflow.set_tag('model', 'xgboost')
    
    booster = xgb.train(
    params = best_params,
    dtrain = train,
    evals = [(validation, "validation")],
    num_boost_round = 500,
    early_stopping_rounds = 50,
    )
    
    mlflow.xgboost.log_model(booster, artifact_path='mlflow_models')
    mlflow.log_artifact('preprocessing/process_dataframe.bin', artifact_path='preprocessing')
    

## Load Model

In [None]:
logged_model = 'runs:/b7a0f13c85694ac4b3615b03ed49444c/mlflow_models'
loaded_model = mlflow.pyfunc.load_model(logged_model)

In [None]:
type(loaded_model)

In [None]:
y_preds = loaded_model.predict(X_val)

In [None]:
mean_squared_error(y_preds, y_val, squared=False)

In [None]:
y_preds

In [None]:
print(loaded_model.metadata.get_model_info())

## Sklearn Models

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [None]:
mlflow.sklearn.autolog()

for algorithm in (LinearSVR, RandomForestRegressor, GradientBoostingRegressor):
    with mlflow.start_run():
        mlflow.log_param('train_data_name', 'green_tripdata_2021-01.parquet')
        mlflow.log_param('validation_data_name', 'green_tripdata_2021-02.parquet')
        mlflow.log_artifact('preprocessing/process_dataframe.bin', artifact_path='preprocessing')
        model = algorithm()
        model.fit(X_train, y_train)
        
        preds = model.predict(X_val)
        rmse = mean_squared_error(preds, y_val, squared=False)
        mlflow.log_metric("rmse", rmse)
        

## MLflow Client

In [49]:
from mlflow.tracking import MlflowClient

In [50]:
MLFLOW_URI = "sqlite:///mlflow.db"

In [51]:
client = MlflowClient(MLFLOW_URI)

In [52]:
client.list_experiments()

[<Experiment: artifact_location='./mlruns/0', experiment_id='0', lifecycle_stage='active', name='Default', tags={}>,
 <Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='nyc-data-experiment', tags={}>]

In [53]:
client.create_experiment(name = 'new-experimet')

'2'

In [79]:
client.list_experiments()

[<Experiment: artifact_location='./mlruns/0', experiment_id='0', lifecycle_stage='active', name='Default', tags={}>,
 <Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='nyc-data-experiment', tags={}>,
 <Experiment: artifact_location='./mlruns/2', experiment_id='2', lifecycle_stage='active', name='new-experimet', tags={}>]

In [55]:
from mlflow.entities import ViewType

In [56]:
?client.search_runs

In [73]:
runs = client.search_runs(
    experiment_ids='1',
    run_view_type=ViewType.ACTIVE_ONLY,
    filter_string='metrics.rmse < 7',
    max_results=5,
    order_by=["metrics.rmse ASC"],
)

In [76]:
for run in runs:
    print(f"run_id:{run.info.run_id}, metrics:{run.data.metrics['rmse']}")

run_id:b79ad0979b234d11a28224256a8730ba, metrics:5.161396262083112
run_id:90e0d6850cef402aa7b57ece26b05ca4, metrics:5.16352672201635
run_id:2cb7f7201b3840c5b209a8f22c29e580, metrics:5.173790146236089
run_id:547f118ce077413385a5065ac67a88ed, metrics:5.239710707379752
run_id:daa68af503514b00aa9dfc8a9c404366, metrics:5.247568537737903
