# Using Hyperopt

In [1]:
# Standard imports
import numpy as np
import pandas as pd
from pydantic import BaseModel, ValidationError
import yaml

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

# Built-in library
import itertools
import re
import json
import typing as tp
import logging

import warnings

warnings.filterwarnings("error")

# for saving the pipeline
import joblib

# MLFlow
import mlflow

# from Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Binarizer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics, set_config

# Pipeline Display
set_config(display="text")

# from feature-engine
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
)

from feature_engine.transformation import (
    LogTransformer,
    YeoJohnsonTransformer,
)

from feature_engine.selection import DropFeatures

# Custom Imports
from data_manager import load_data, validate_input
import feat_engineering as fe
from schema import (
    InputSchema,
    ValidateTrainingData,
    ModelConfig,
    MLFlowConfig,
    ConfigVars,
)
import utilities as util

# pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Black code formatter (Optional)
%load_ext lab_black
# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
# Read the two monthly trip-record files: the 2022-01 file becomes the
# training frame and the 2022-02 file the test frame.
train_data, test_data = (
    load_data(parquet_path)
    for parquet_path in (
        "data/yellow_tripdata_2022-01.parquet",
        "data/yellow_tripdata_2022-02.parquet",
    )
)

# Report the dimensions of both frames, then preview the training rows.
print(f"Shape of: \ntrain_data: {train_data.shape}\ntest_data: {test_data.shape}\n")

train_data.head()

Shape of: 
train_data: (2406155, 20)
test_data: (2901257, 20)



Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,trip_duration
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,N,142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0,2.93492
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,N,236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0,2.24071
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0,2.299581
3,2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,N,114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0,2.400619
4,2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,N,68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0,3.651437


### Load Config

In [3]:
# Location of the YAML file holding the model and MLflow settings.
fp = "config.yml"

# Parse with the safe loader so only plain YAML types are constructed.
with open(fp, mode="r") as file:
    config_file = yaml.safe_load(file)

# Both sections are built from the same parsed mapping.
model_section = ModelConfig(**config_file)
mlflow_section = MLFlowConfig(**config_file)

config = ConfigVars(model_config=model_section, mlflow_config=mlflow_section)

In [4]:
# Separate the target column from the remaining columns.
target_name = config.model_config.TARGET
X = train_data.drop(columns=[target_name])
y = train_data[target_name]

# Hold out a validation subset; its size and the seed come from the config.
split_kwargs = {
    "test_size": config.model_config.TEST_SIZE,
    "random_state": config.model_config.RANDOM_STATE,
}
X_train, X_validate, y_train, y_validate = train_test_split(X, y, **split_kwargs)

X_train.shape, X_validate.shape

((2165539, 19), (240616, 19))

In [5]:
# Shorthand for the model section of the loaded configuration.
model_cfg = config.model_config

# Ordered (name, step) pairs; the last entry is the estimator.
pipeline_steps = [
    # ----- Select input features -----
    ("input vars", fe.SelectFeatures(features=model_cfg.INPUT_FEATURES)),
    # ----- Add NaN flags -----
    (
        "add na_flag",
        AddMissingIndicator(missing_only=True, variables=model_cfg.NUM_VARS_WF_NA),
    ),
    # ----- Impute NaNs (median) -----
    (
        "impute num_vars",
        MeanMedianImputer(
            imputation_method="median", variables=model_cfg.NUM_VARS_WF_NA
        ),
    ),
    # ----- Create new features from the temporal variable -----
    ("cal day_of_week", fe.CalculateDayOfWeek(feature=model_cfg.TEMPORAL_VAR)),
    ("cal hour_of_day", fe.CalculateHourOfDay(feature=model_cfg.TEMPORAL_VAR)),
    # ----- Select features -----
    ("important vars", fe.SelectFeatures(features=model_cfg.IMPORTANT_FEATURES)),
    # ----- Drop features -----
    ("drop vars", DropFeatures(features_to_drop=model_cfg.VARS_TO_DROP)),
    # ----- Transform features (natural log) -----
    (
        "log transformation",
        LogTransformer(variables=model_cfg.VARS_TO_LOG_TRANSFORM, base="e"),
    ),
    # ----- Scale features -----
    ("scale data", StandardScaler()),
    # ----- Linear model -----
    ("linear model", LinearRegression()),
]

pipe = Pipeline(steps=pipeline_steps)

pipe

Pipeline(steps=[('input vars',
                 SelectFeatures(features=['DOLocationID', 'payment_type',
                                          'PULocationID', 'RatecodeID',
                                          'total_amount',
                                          'tpep_pickup_datetime',
                                          'trip_distance', 'VendorID'])),
                ('add na_flag', AddMissingIndicator(variables=['RatecodeID'])),
                ('impute num_vars',
                 MeanMedianImputer(variables=['RatecodeID'])),
                ('cal day_of_week',
                 CalculateDayOfWeek(feature='...
                                          'hour_of_day', 'payment_type',
                                          'PULocationID', 'RatecodeID',
                                          'RatecodeID_na', 'total_amount',
                                          'tpep_pickup_datetime',
                                          'trip_distance', 'VendorID'])),
 

In [6]:
# Switch warnings from "error" (set in the imports cell) to "ignore".
warnings.filterwarnings("ignore")  # Required

# Module-level logger used by the cells below.
logger = logging.getLogger(__name__)

# Log records are rendered as: LEVEL :: timestamp :: message
delim = "::"
format_ = f"%(levelname)s {delim} %(asctime)s {delim} %(message)s"
logging.basicConfig(format=format_, level=logging.INFO)

In [7]:
logger.info("===== Loading Processor Pipeline =====")

# Path of the serialized preprocessing pipeline.
# joblib deserialization is pickle-based, so only load files you trust.
fp = "models/processor_pipeline.joblib"

with open(fp, mode="rb") as file:
    preprocessing_pipeline = joblib.load(file)

logger.info("===== Loading Processor Pipeline Done!=====")

INFO :: 2022-12-17 22:31:57,333 :: ===== Loading Processor Pipeline =====
INFO :: 2022-12-17 22:31:57,337 :: ===== Loading Processor Pipeline Done!=====


In [8]:
# Column labels applied to the transformed output.
# NOTE(review): these labels are hard-coded; they are assumed to match the
# columns produced by the loaded pipeline - confirm if the pipeline changes.
columns = [
    "day_of_week",
    "DOLocationID",
    "hour_of_day",
    "payment_type",
    "PULocationID",
    "RatecodeID",
    "RatecodeID_na",
    "total_amount",
    "trip_distance",
    "VendorID",
]

# Run the preprocessing pipeline over X and wrap the result in a labelled frame.
transformed_X = pd.DataFrame(preprocessing_pipeline.transform(X), columns=columns)

transformed_X.head(3)

Unnamed: 0,day_of_week,DOLocationID,hour_of_day,payment_type,PULocationID,RatecodeID,RatecodeID_na,total_amount,trip_distance,VendorID
0,5,236,5,1,142,1.0,0,3.088767,1.335001,1
1,5,42,5,1,236,1.0,0,2.587764,0.741937,1
2,5,166,5,1,166,1.0,0,2.357073,-0.030459,2


## Using Hyperopt

In [10]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
from hyperopt.pyll import scope

#### hp.quniform

```Python
hp.quniform(label, low, high, q)

```

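`hp.quniform` returns a value like `round(uniform(low, high) / q) * q`.
It is suitable for a discrete value with respect to which the objective is
still somewhat "smooth", but which should be bounded both above and below.

The sampled value is a float (e.g. `7.0`), so wrap it in `scope.int` when the
estimator expects an integer hyperparameter: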

```python
"max_depth": scope.int(hp.quniform("max_depth", 2, 10, 1)),
```

In [11]:
# Defining Objective function whose loss we have to minimize
def objective(args):
    """Hyperopt objective: cross-validate the sampled model and return its loss.

    Parameters
    ----------
    args : dict
        One sample from the search space. ``args["type"]`` selects the
        estimator (``"rf_model"`` or ``"linear_model"``) and ``args["params"]``
        holds keyword arguments applied to it with ``set_params``.

    Returns
    -------
    dict
        ``{"loss": 1 - median(cv scores), "status": STATUS_OK}`` when all five
        folds succeed, or ``{"status": STATUS_FAIL}`` when any fold fails.

    Raises
    ------
    ValueError
        If ``args["type"]`` is not one of the supported model types.
    """
    model_type = args.get("type")

    if model_type == "rf_model":
        estimator = RandomForestRegressor()
    elif model_type == "linear_model":
        estimator = LinearRegression()
    else:
        # Previously this fell through and failed later with an unbound name.
        raise ValueError(f"Unknown model type: {model_type!r}")

    estimator.set_params(**args["params"])

    # No `scoring` is passed, so the estimator's own `score` is used
    # (R^2 for these regressors). A failed fold is recorded as NaN rather than
    # a numeric score: a numeric placeholder such as 0.99 would make a crashing
    # configuration look like a near-perfect model.
    scores = cross_val_score(
        estimator, transformed_X, y, cv=5, n_jobs=-1, error_score=np.nan
    )

    if np.isnan(scores).any():
        # Imported here because only STATUS_OK is imported at notebook level.
        from hyperopt import STATUS_FAIL

        # Failed trials carry no loss, so hyperopt excludes them from the best result.
        return {"status": STATUS_FAIL}

    return {"loss": 1 - np.median(scores), "status": STATUS_OK}

In [12]:
# Defining Search Space
# The top-level choice picks the model family; each option carries the
# keyword arguments that `objective` applies to the estimator.
space = hp.choice(
    "classifiers",
    [
        # Linear regression has no hyperparameters to tune here.
        {"type": "linear_model", "params": {}},
        {
            "type": "rf_model",
            "params": {
                # Tree depth is an ordered integer, so sample it with quniform
                # (integers 4..14) instead of an unordered hp.choice; scope.int
                # casts the sampled float to the int scikit-learn expects.
                "max_depth": scope.int(hp.quniform("max_depth", 4, 14, 1)),
                "criterion": hp.choice("criterion", ["squared_error"]),
            },
        },
    ],
)

In [13]:
# Metadata for the hyperparameter-search experiment.
exp_dict = dict(
    experiment_name="hyperopt",
    run_name="003",
    model_name="multi-models",
    tracking_uri="sqlite:///mlflow.db",
)

# Wrap the plain mapping in the project's Experiment container.
exp_details = util.Experiment(**exp_dict)

exp_details

Experiment(experiment_name='hyperopt', run_name='003', model_name='multi-models', tracking_uri='sqlite:///mlflow.db')

In [14]:
# Putting it together: minimise `objective` over `space` with the TPE
# algorithm, for 10 evaluations.
search_trials = Trials()  # store for the per-evaluation results of this search
best_classifier = fmin(
    objective,
    space,
    algo=tpe.suggest,
    max_evals=10,
    trials=search_trials,
)
# mlflow.sklearn.log_model(estimator, "model")

  0%|                                                                               | 0/10 [00:00<?, ?trial/s, best loss=?]

INFO :: 2022-12-17 22:33:41,169 :: build_posterior_wrapper took 0.002681 seconds
INFO :: 2022-12-17 22:33:41,170 :: TPE using 0 trials


 10%|█████                                             | 1/10 [14:56<2:14:24, 896.09s/trial, best loss: 0.0855583212338894]

INFO :: 2022-12-17 22:48:37,259 :: build_posterior_wrapper took 0.002706 seconds
INFO :: 2022-12-17 22:48:37,261 :: TPE using 1/1 trials with best loss 0.085558


 20%|██████████▍                                         | 2/10 [15:01<49:37, 372.24s/trial, best loss: 0.0855583212338894]

INFO :: 2022-12-17 22:48:42,810 :: build_posterior_wrapper took 0.001385 seconds
INFO :: 2022-12-17 22:48:42,811 :: TPE using 2/2 trials with best loss 0.085558


 30%|███████████████▌                                    | 3/10 [15:06<23:49, 204.27s/trial, best loss: 0.0855583212338894]

INFO :: 2022-12-17 22:48:47,193 :: build_posterior_wrapper took 0.001050 seconds
INFO :: 2022-12-17 22:48:47,194 :: TPE using 3/3 trials with best loss 0.085558


 40%|████████████████████▊                               | 4/10 [15:10<12:32, 125.43s/trial, best loss: 0.0855583212338894]

INFO :: 2022-12-17 22:48:51,751 :: build_posterior_wrapper took 0.001230 seconds
INFO :: 2022-12-17 22:48:51,752 :: TPE using 4/4 trials with best loss 0.085558


 50%|██████████████████████████                          | 5/10 [25:56<26:05, 313.12s/trial, best loss: 0.0855583212338894]

INFO :: 2022-12-17 22:59:37,683 :: build_posterior_wrapper took 0.001473 seconds
INFO :: 2022-12-17 22:59:37,684 :: TPE using 5/5 trials with best loss 0.085558


 60%|███████████████████████████████▏                    | 6/10 [26:01<13:52, 208.19s/trial, best loss: 0.0855583212338894]

INFO :: 2022-12-17 22:59:42,189 :: build_posterior_wrapper took 0.001222 seconds
INFO :: 2022-12-17 22:59:42,190 :: TPE using 6/6 trials with best loss 0.085558


 70%|████████████████████████████████████▍               | 7/10 [26:05<07:05, 141.73s/trial, best loss: 0.0855583212338894]

INFO :: 2022-12-17 22:59:47,082 :: build_posterior_wrapper took 0.001202 seconds
INFO :: 2022-12-17 22:59:47,083 :: TPE using 7/7 trials with best loss 0.085558


 80%|█████████████████████████████████████████▌          | 8/10 [37:10<10:16, 308.31s/trial, best loss: 0.0855583212338894]

INFO :: 2022-12-17 23:10:52,057 :: build_posterior_wrapper took 0.002366 seconds
INFO :: 2022-12-17 23:10:52,057 :: TPE using 8/8 trials with best loss 0.085558


 90%|██████████████████████████████████████████████▊     | 9/10 [37:15<03:33, 213.28s/trial, best loss: 0.0855583212338894]

INFO :: 2022-12-17 23:10:56,406 :: build_posterior_wrapper took 0.011352 seconds
INFO :: 2022-12-17 23:10:56,407 :: TPE using 9/9 trials with best loss 0.085558


100%|███████████████████████████████████████████████████| 10/10 [48:36<00:00, 291.60s/trial, best loss: 0.0855583212338894]


In [15]:
# fmin reports hp.choice selections as option indices; map its result back
# onto the search space to recover the concrete configuration.
best_params = space_eval(space=space, hp_assignment=best_classifier)

# Keyword arguments of the winning configuration
best_params.get("params")

{'criterion': 'squared_error', 'max_depth': 13}

### Train And Track The Experiment With MLFlow

In [16]:
# Split the preprocessed features with the same size and seed as before.
# NOTE(review): `X` is rebound here from the raw features to the transformed
# frame, so re-running the earlier transform cell afterwards would feed it
# already-transformed data.
y = train_data[config.model_config.TARGET]
X = transformed_X.copy()

split_kwargs = {
    "test_size": config.model_config.TEST_SIZE,
    "random_state": config.model_config.RANDOM_STATE,
}
X_train, X_validate, y_train, y_validate = train_test_split(X, y, **split_kwargs)

X_train.shape, X_validate.shape

((2165539, 10), (240616, 10))

In [19]:
# Bundle the train/validation splits for the experiment runner.
training_data = util.TrainingData(
    X_train=X_train, X_validate=X_validate, y_train=y_train, y_validate=y_validate
)

# Build the estimator family that actually won the search. The search space
# covers both a linear model and a random forest, so always instantiating
# RandomForestRegressor would train (and register as "rf_model") a default
# forest whenever the linear model was the best configuration.
estimator_by_type = {
    "rf_model": RandomForestRegressor,
    "linear_model": LinearRegression,
}
best_type = best_params.get("type")
if best_type not in estimator_by_type:
    raise ValueError(f"Unknown model type in best_params: {best_type!r}")

clf = estimator_by_type[best_type](**best_params.get("params"))

# Metadata for the final training run; the model name follows the selected
# family ("rf_model" when the random forest wins).
exp_dict = {
    "experiment_name": "new_experiment",
    "run_name": "004",
    "model_name": best_type,
    "tracking_uri": "sqlite:///mlflow.db",
}

exp_details = util.Experiment(**exp_dict)

exp_details

Experiment(experiment_name='new_experiment', run_name='004', model_name='rf_model', tracking_uri='sqlite:///mlflow.db')

In [21]:
# Hand the estimator, the data splits and the experiment metadata to the
# project's experiment runner.
util.run_experiment(
    training_data=training_data,
    estimator=clf,
    experiment=exp_details,
)

INFO :: 2022-12-17 23:23:12,304 :: Context impl SQLiteImpl.
INFO :: 2022-12-17 23:23:12,305 :: Will assume non-transactional DDL.
2022/12/17 23:23:12 INFO mlflow.tracking.fluent: Experiment with name 'new_experiment' does not exist. Creating a new experiment.


 Model name: rf_model
  RMSE: 0.18995432746166002
  MSE: 0.03608264652141156
  MAE: 0.1334863171800785
  R2: 0.9052024409063282


Successfully registered model 'rf_model'.
2022/12/17 23:36:26 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: rf_model, version 1
Created version '1' of model 'rf_model'.
