In [2]:
import os
from os import walk
import gc
import warnings
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

import mlflow
import mlflow.sklearn

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import mixed_precision
from tensorflow.keras.layers.experimental import preprocessing

import optuna

from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    confusion_matrix,
    classification_report,
    accuracy_score
)
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

import lightgbm
from lightgbm import LGBMRegressor

In [3]:
#mixed_precision.set_global_policy('mixed_float16')
# physical_devices = tf.config.list_physical_devices('GPU') 
# tf.config.experimental.set_memory_growth(physical_devices[0], True)

## Loading and processing data

In [4]:
df = pd.read_csv('project1_output.csv')

In [5]:
# we'll get a subset of our dataset in order to make experiments faster
df = df[:1000]
df = df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

#### Converting cols to their appropriate types again because we lost it on the csv export

In [6]:
df["product_category_name"] = df["product_category_name"].astype('category')
df["order_status"] = df["order_status"].astype('category')
df["review_score"] = df["review_score"].astype('category')
df["payment_type"] = df["payment_type"].astype('category')
df["customer_zip_code_prefix"] = df["customer_zip_code_prefix"].astype('category')
df["customer_city"] = df["customer_city"].astype('category')
df["customer_state"] = df["customer_state"].astype('category')
df["seller_zip_code_prefix"] = df["seller_zip_code_prefix"].astype('category')
df["seller_city"] = df["seller_city"].astype('category')
df["seller_state"] = df["seller_state"].astype('category')

df["product_name_lenght"] = df["product_name_lenght"].astype('int64')
df["product_description_lenght"] = df["product_description_lenght"].astype('int64')
df["product_photos_qty"] = df["product_photos_qty"].astype('int64')
df["payment_installments"] = df["payment_installments"].astype('int64')
df["payment_sequential"] = df["payment_sequential"].astype('int64')

In [7]:
df.dtypes

order_id                           object
order_item_id                       int64
product_id                         object
seller_id                          object
shipping_limit_date                object
price                             float64
freight_value                     float64
product_category_name            category
product_name_lenght                 int64
product_description_lenght          int64
product_photos_qty                  int64
product_weight_g                  float64
product_length_cm                 float64
product_height_cm                 float64
product_width_cm                  float64
customer_id                        object
order_status                     category
order_purchase_timestamp           object
order_approved_at                  object
order_delivered_carrier_date       object
order_delivered_customer_date      object
order_estimated_delivery_date      object
review_id                          object
review_score                     c

#### Droping unnecessary columns
We only want to work with numerical values.

In [8]:
df = df.select_dtypes(exclude=['object'])

In [9]:
df.columns

Index(['order_item_id', 'price', 'freight_value', 'product_category_name',
       'product_name_lenght', 'product_description_lenght',
       'product_photos_qty', 'product_weight_g', 'product_length_cm',
       'product_height_cm', 'product_width_cm', 'order_status', 'review_score',
       'payment_sequential', 'payment_type', 'payment_installments',
       'payment_value', 'customer_zip_code_prefix', 'customer_city',
       'customer_state', 'seller_zip_code_prefix', 'seller_city',
       'seller_state', 'payment_value_norm', 'volume'],
      dtype='object')

We'll also drop the `payment_value` column because our model would simply infer our target value from it by subtracting it from the `price` column.

In [10]:
df = df.drop(columns=['payment_value'])

### Feature engineering

Here we'll one-hot encode all of our categorical columns, and then drop the original ones

In [11]:
df = pd.get_dummies(df)
df = df.select_dtypes(exclude=['category'])

Even though we generated over 22000 columns this way, we believe that our model will be powerful enough to filter out any unecessary data.

## Picking column for prediction

We chose the `freight_value` column so we can perform a regression in order to try to find it's value based on all of the columns we have available.

In [12]:
TARGET_VALUE = 'freight_value'

In [13]:
target_col = df[TARGET_VALUE]

In [14]:
target_col

0      13.29
1      19.93
2      17.87
3      12.79
4      18.14
       ...  
995    74.99
996    34.98
997    17.03
998    19.07
999    15.59
Name: freight_value, Length: 1000, dtype: float64

In [15]:
df = df.drop(columns=[TARGET_VALUE])

## Separating prediction and test data

We'll split our data in a 60/20/20 ratio.

In [16]:
def get_x_data():
    # input 
    train, val, test = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])
    
    return train, val, test

In [17]:
def get_y_data():
    # output
    train_labels, val_labels, test_labels = (
        np.split(
            target_col, 
            [int(.6*len(target_col)), int(.8*len(target_col))])
    )
    
    return train_labels, val_labels, test_labels

In [18]:
# deleting our initial df so we can free up some RAM
# del df

# Picking 4 ML algorithms

We'll use the following 4 algorithms:

1. Linear regression
2. Multilayer perceptron (a shallow one)
3. random forests
4. lightgbm/xgboost

### Metrics function

In [19]:
# Evaluate metrics
def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2

### Enabling MLFlow autologging

In [20]:
mlflow.sklearn.autolog()
mlflow.tensorflow.autolog()
mlflow.lightgbm.autolog()



## Linear regression
Let's start off with linear regression, which is the most simple algorithm in our selection, and will serve as a baseline for the following algorithms.

In [21]:
def linear_regression(trial):
    train, test, val = get_x_data()
    train_labels, val_labels, test_labels = get_y_data()
    
    warnings.filterwarnings("ignore")
    np.random.seed(40)

    # Start an MLflow run; the "with" keyword ensures we'll close the run even if this cell crashes
    with mlflow.start_run(run_name="Linear Regression"):
        reg = LinearRegression()
        reg.fit(train, train_labels)

        predictions = reg.predict(val)

        (rmse, mae, r2) = eval_metrics(val_labels, predictions)

        # Print out model metrics
        print("Linear regression model")
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        # Log mlflow attributes for mlflow UI
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)
        #mlflow.sklearn.log_model(reg, "model")
        #modelpath = "./mlflow/freight_value/model-linear-reg"
        #mlflow.sklearn.save_model(reg, modelpath)
        
        gc.collect()
        
        return rmse

In [22]:
study = optuna.create_study()
study.optimize(linear_regression, n_trials=1)

[32m[I 2021-08-15 23:34:25,954][0m A new study created in memory with name: no-name-dab5cdf6-d3af-4a3d-bc5e-cd04a2250b40[0m
[32m[I 2021-08-15 23:34:26,940][0m Trial 0 finished with value: 90.99431207401365 and parameters: {}. Best is trial 0 with value: 90.99431207401365.[0m


Linear regression model
  RMSE: 90.99431207401365
  MAE: 65.309461906497
  R2: -38.3482098714733


## Multilayer Perceptron

In [23]:
def mlp(trial):
    train, test, val = get_x_data()
    train_labels, val_labels, test_labels = get_y_data()
    
    # hyper-parameters to test
    params = {
        "hidden_units": trial.suggest_int("hidden_units", 3, 15),
        "lr": trial.suggest_float("lr", 1e-5, 1e-3, log=True),
        "epochs": trial.suggest_int("epochs", 10, 50)
    }
    
    warnings.filterwarnings("ignore")
    np.random.seed(40)
    
    # Start an MLflow run
    with mlflow.start_run(run_name="MLP"):
        normalizer = preprocessing.Normalization(axis=-1)
        normalizer.adapt(np.array(train))
        
        mlp_model = tf.keras.Sequential([
            normalizer,
            layers.Dense(units=params["hidden_units"]),
            layers.Dense(units=params["hidden_units"]),
            layers.Dense(units=params["hidden_units"]),
            layers.Dense(units=1),
        ])

        mlp_model.summary()
        
        mlp_model.compile(
            optimizer=tf.optimizers.Adam(learning_rate=params["lr"]),
            loss='mean_squared_error'
        )

        history = mlp_model.fit(
            train, train_labels,
            validation_data=(test, test_labels),
            epochs=params["epochs"]
        )
        
        predictions = mlp_model.predict(val)

        (rmse, mae, r2) = eval_metrics(val_labels, predictions)

        # Print out model metrics
        print("MLP model")
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        # Log mlflow attributes for mlflow UI
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)
        mlflow.log_params(trial.params)
        mlflow.set_tags(
            {
                "estimator_name":"MultiLayerPerceptron",
                "estimator_class":"Keras"
            }
        )
        #mlflow.tensorflow.log_model(mlp_model, "model")
        #modelpath = "./mlflow/freight_value/model-mlp"
        #mlflow.tensorflow.save_model(mlp_model, modelpath)
        tf.keras.backend.clear_session()

        gc.collect()
        
        return rmse

### Using optuna to optimize MLP's hyperparameters

In [24]:
study = optuna.create_study()
study.optimize(mlp, n_trials=10)

[32m[I 2021-08-15 23:34:26,967][0m A new study created in memory with name: no-name-3da30d36-2ebb-49f4-a393-e89192289a9f[0m


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 1855)              3711      
_________________________________________________________________
dense (Dense)                (None, 13)                24128     
_________________________________________________________________
dense_1 (Dense)              (None, 13)                182       
_________________________________________________________________
dense_2 (Dense)              (None, 13)                182       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 14        
Total params: 28,217
Trainable params: 24,506
Non-trainable params: 3,711
_________________________________________________________________
Epoch 1/48
Epoch 2/48
Epoch 3/48
Epoch 4/48
Epoch 5/48
Epoch 6/48
Epoch 7/48
Epoch 8/48
Epoch 9/48
Epoch 10/48
Epo

[32m[I 2021-08-15 23:34:32,260][0m Trial 0 finished with value: 762881.3161777382 and parameters: {'hidden_units': 13, 'lr': 1.2645740678991583e-05, 'epochs': 48}. Best is trial 0 with value: 762881.3161777382.[0m


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 1855)              3711      
_________________________________________________________________
dense (Dense)                (None, 8)                 14848     
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 72        
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 72        
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 9         
Total params: 18,712
Trainable params: 15,001
Non-trainable params: 3,711
_________________________________________________________________
Epoch 1/47
Epoch 2/47
Epoch 3/47
Epoch 4/47
Epoch 5/47
Epoch 6/47
Epoch 7/47
Epoch 8/47
Epoch 9/47
Epoch 10/47
Epo

[32m[I 2021-08-15 23:34:36,799][0m Trial 1 finished with value: 243418.12581651183 and parameters: {'hidden_units': 8, 'lr': 1.3018700865964896e-05, 'epochs': 47}. Best is trial 1 with value: 243418.12581651183.[0m


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 1855)              3711      
_________________________________________________________________
dense (Dense)                (None, 7)                 12992     
_________________________________________________________________
dense_1 (Dense)              (None, 7)                 56        
_________________________________________________________________
dense_2 (Dense)              (None, 7)                 56        
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 8         
Total params: 16,823
Trainable params: 13,112
Non-trainable params: 3,711
_________________________________________________________________
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epo

[32m[I 2021-08-15 23:34:40,388][0m Trial 2 finished with value: 532486.7892691911 and parameters: {'hidden_units': 7, 'lr': 1.921014348470924e-05, 'epochs': 30}. Best is trial 1 with value: 243418.12581651183.[0m


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 1855)              3711      
_________________________________________________________________
dense (Dense)                (None, 11)                20416     
_________________________________________________________________
dense_1 (Dense)              (None, 11)                132       
_________________________________________________________________
dense_2 (Dense)              (None, 11)                132       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 12        
Total params: 24,403
Trainable params: 20,692
Non-trainable params: 3,711
_________________________________________________________________
Epoch 1/49
Epoch 2/49
Epoch 3/49
Epoch 4/49
Epoch 5/49
Epoch 6/49
Epoch 7/49
Epoch 8/49
Epoch 9/49
Epoch 10/49
Epo

[32m[I 2021-08-15 23:34:45,176][0m Trial 3 finished with value: 731815.3302318369 and parameters: {'hidden_units': 11, 'lr': 6.693371085081771e-05, 'epochs': 49}. Best is trial 1 with value: 243418.12581651183.[0m


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 1855)              3711      
_________________________________________________________________
dense (Dense)                (None, 3)                 5568      
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 12        
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 12        
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 4         
Total params: 9,307
Trainable params: 5,596
Non-trainable params: 3,711
_________________________________________________________________
Epoch 1/28
Epoch 2/28
Epoch 3/28
Epoch 4/28
Epoch 5/28
Epoch 6/28
Epoch 7/28
Epoch 8/28
Epoch 9/28
Epoch 10/28
Epoch

[32m[I 2021-08-15 23:34:48,813][0m Trial 4 finished with value: 656167.3836285687 and parameters: {'hidden_units': 3, 'lr': 9.128537121695166e-05, 'epochs': 28}. Best is trial 1 with value: 243418.12581651183.[0m


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 1855)              3711      
_________________________________________________________________
dense (Dense)                (None, 6)                 11136     
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 42        
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 42        
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 7         
Total params: 14,938
Trainable params: 11,227
Non-trainable params: 3,711
_________________________________________________________________
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epo

[32m[I 2021-08-15 23:34:52,789][0m Trial 5 finished with value: 409633.76713693945 and parameters: {'hidden_units': 6, 'lr': 1.439895271967439e-05, 'epochs': 35}. Best is trial 1 with value: 243418.12581651183.[0m


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 1855)              3711      
_________________________________________________________________
dense (Dense)                (None, 11)                20416     
_________________________________________________________________
dense_1 (Dense)              (None, 11)                132       
_________________________________________________________________
dense_2 (Dense)              (None, 11)                132       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 12        
Total params: 24,403
Trainable params: 20,692
Non-trainable params: 3,711
_________________________________________________________________
Epoch 1/24
Epoch 2/24
Epoch 3/24
Epoch 4/24
Epoch 5/24
Epoch 6/24
Epoch 7/24
Epoch 8/24
Epoch 9/24
Epoch 10/24
Epo

[32m[I 2021-08-15 23:34:56,060][0m Trial 6 finished with value: 568249.6463792608 and parameters: {'hidden_units': 11, 'lr': 0.00012740202766212662, 'epochs': 24}. Best is trial 1 with value: 243418.12581651183.[0m


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 1855)              3711      
_________________________________________________________________
dense (Dense)                (None, 7)                 12992     
_________________________________________________________________
dense_1 (Dense)              (None, 7)                 56        
_________________________________________________________________
dense_2 (Dense)              (None, 7)                 56        
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 8         
Total params: 16,823
Trainable params: 13,112
Non-trainable params: 3,711
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INF

[32m[I 2021-08-15 23:34:58,486][0m Trial 7 finished with value: 414396.67002250714 and parameters: {'hidden_units': 7, 'lr': 0.0001882715981204829, 'epochs': 10}. Best is trial 1 with value: 243418.12581651183.[0m


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 1855)              3711      
_________________________________________________________________
dense (Dense)                (None, 10)                18560     
_________________________________________________________________
dense_1 (Dense)              (None, 10)                110       
_________________________________________________________________
dense_2 (Dense)              (None, 10)                110       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 11        
Total params: 22,502
Trainable params: 18,791
Non-trainable params: 3,711
_________________________________________________________________
Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
Epoch 8/45
Epoch 9/45
Epoch 10/45
Epo

[32m[I 2021-08-15 23:35:02,935][0m Trial 8 finished with value: 587205.4544958103 and parameters: {'hidden_units': 10, 'lr': 4.871522981572731e-05, 'epochs': 45}. Best is trial 1 with value: 243418.12581651183.[0m


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 1855)              3711      
_________________________________________________________________
dense (Dense)                (None, 5)                 9280      
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 30        
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 30        
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 6         
Total params: 13,057
Trainable params: 9,346
Non-trainable params: 3,711
_________________________________________________________________
Epoch 1/22
Epoch 2/22
Epoch 3/22
Epoch 4/22
Epoch 5/22
Epoch 6/22
Epoch 7/22
Epoch 8/22
Epoch 9/22
Epoch 10/22
Epoc

[32m[I 2021-08-15 23:35:06,035][0m Trial 9 finished with value: 1198343.9028469082 and parameters: {'hidden_units': 5, 'lr': 0.0004429587786509501, 'epochs': 22}. Best is trial 1 with value: 243418.12581651183.[0m


## Random Forest

In [25]:
def random_forest(trial):
    train, test, val = get_x_data()
    train_labels, val_labels, test_labels = get_y_data()
    
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 150),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 5),
    }
    
    warnings.filterwarnings("ignore")
    np.random.seed(40)
    
    with mlflow.start_run(run_name="Random Forest"):
        rf = RandomForestRegressor(
            max_depth=params["max_depth"],
            n_estimators=params["n_estimators"],
            min_samples_split=params["min_samples_split"],
            random_state=0
        )
        rf.fit(train, train_labels)
        
        predictions = rf.predict(val)
        
        (rmse, mae, r2) = eval_metrics(val_labels, predictions)
        
        print("Random Forest model")
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)
        
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)
        mlflow.log_params(trial.params)
        
        gc.collect()
        
        return rmse

### Using optuna to optimize Random Forest's hyperparameters

In [26]:
study = optuna.create_study()
study.optimize(random_forest, n_trials=10)

[32m[I 2021-08-15 23:35:06,054][0m A new study created in memory with name: no-name-a12e8918-f381-4cd1-af42-eded737eec6d[0m
[32m[I 2021-08-15 23:35:07,671][0m Trial 0 finished with value: 15.028769774006358 and parameters: {'n_estimators': 134, 'max_depth': 4, 'min_samples_split': 4}. Best is trial 0 with value: 15.028769774006358.[0m


Random Forest model
  RMSE: 15.028769774006358
  MAE: 9.24944312378477
  R2: -0.07335491700964591


[32m[I 2021-08-15 23:35:09,357][0m Trial 1 finished with value: 15.616548157282168 and parameters: {'n_estimators': 83, 'max_depth': 9, 'min_samples_split': 5}. Best is trial 0 with value: 15.028769774006358.[0m


Random Forest model
  RMSE: 15.616548157282168
  MAE: 9.487153705419797
  R2: -0.15895500840076604


[32m[I 2021-08-15 23:35:10,631][0m Trial 2 finished with value: 15.302392666498529 and parameters: {'n_estimators': 61, 'max_depth': 7, 'min_samples_split': 2}. Best is trial 0 with value: 15.028769774006358.[0m


Random Forest model
  RMSE: 15.302392666498529
  MAE: 9.34541157704586
  R2: -0.11279501375190004


[32m[I 2021-08-15 23:35:12,477][0m Trial 3 finished with value: 15.679531718904316 and parameters: {'n_estimators': 89, 'max_depth': 10, 'min_samples_split': 2}. Best is trial 0 with value: 15.028769774006358.[0m


Random Forest model
  RMSE: 15.679531718904316
  MAE: 9.502150244145167
  R2: -0.16832229146841438


[32m[I 2021-08-15 23:35:14,356][0m Trial 4 finished with value: 15.244090230273448 and parameters: {'n_estimators': 132, 'max_depth': 6, 'min_samples_split': 3}. Best is trial 0 with value: 15.028769774006358.[0m


Random Forest model
  RMSE: 15.244090230273448
  MAE: 9.315441265976306
  R2: -0.10433162280311103


[32m[I 2021-08-15 23:35:15,849][0m Trial 5 finished with value: 14.912817434180642 and parameters: {'n_estimators': 141, 'max_depth': 3, 'min_samples_split': 3}. Best is trial 5 with value: 14.912817434180642.[0m


Random Forest model
  RMSE: 14.912817434180642
  MAE: 9.20049717834161
  R2: -0.056856175344866866


[32m[I 2021-08-15 23:35:17,307][0m Trial 6 finished with value: 15.583816889660739 and parameters: {'n_estimators': 61, 'max_depth': 9, 'min_samples_split': 5}. Best is trial 5 with value: 14.912817434180642.[0m


Random Forest model
  RMSE: 15.583816889660739
  MAE: 9.492343794300764
  R2: -0.154101911225468
Random Forest model
  RMSE: 14.869604418574337
  MAE: 9.190870189117215
  R2: -0.050740124708194445


[32m[I 2021-08-15 23:35:18,658][0m Trial 7 finished with value: 14.869604418574337 and parameters: {'n_estimators': 89, 'max_depth': 3, 'min_samples_split': 2}. Best is trial 7 with value: 14.869604418574337.[0m
[32m[I 2021-08-15 23:35:20,508][0m Trial 8 finished with value: 15.641466298625234 and parameters: {'n_estimators': 91, 'max_depth': 9, 'min_samples_split': 4}. Best is trial 7 with value: 14.869604418574337.[0m


Random Forest model
  RMSE: 15.641466298625234
  MAE: 9.476235779929917
  R2: -0.16265647231419078


[32m[I 2021-08-15 23:35:21,669][0m Trial 9 finished with value: 14.989840814110146 and parameters: {'n_estimators': 74, 'max_depth': 4, 'min_samples_split': 2}. Best is trial 7 with value: 14.869604418574337.[0m


Random Forest model
  RMSE: 14.989840814110146
  MAE: 9.24752355687862
  R2: -0.06780150526396178


## Gradient Boosting with LightGBM

In [28]:
def gradient_boosting(trial):
    train, test, val = get_x_data()
    train_labels, val_labels, test_labels = get_y_data()
    
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 150),
        "num_leaves": trial.suggest_int("num_leaves", 25, 35),
        "max_depth": trial.suggest_int("max_depth", 3, 10)
    }
    
    warnings.filterwarnings("ignore")
    np.random.seed(40)
    
    with mlflow.start_run(run_name="Gradient Boosting"):
#         model = LGBMRegressor(
#             max_depth=params["max_depth"],
#             n_estimators=params["n_estimators"],
#             num_leaves=params["num_leaves"],
#         )
        model = XGBRegressor(
            max_depth=params["max_depth"],
            n_estimators=params["n_estimators"],
        )
        model.fit(train, train_labels)
        
        predictions = model.predict(test)
        print('Prediction: %.3f' % predictions[0])
        
        (rmse, mae, r2) = eval_metrics(val_labels, predictions)

        print("LGBM model")
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        # Log mlflow attributes for mlflow UI
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)
        mlflow.log_params(trial.params)
        mlflow.set_tags(
            {
                "estimator_class":"LightGBM",
                "estimator_name":"Gradient Boosting"
            }
        )
        mlflow.sklearn.log_model(model, "model")
        
        gc.collect()
        
        return rmse

### Using optuna to optimize Gradient Boosting's hyperparameters

In [29]:
study = optuna.create_study()
study.optimize(gradient_boosting, n_trials=10)

[32m[I 2021-08-15 23:36:06,187][0m A new study created in memory with name: no-name-b6c9b334-f886-4dd1-b657-7d00750392df[0m
[32m[I 2021-08-15 23:36:09,574][0m Trial 0 finished with value: 16.125656800791422 and parameters: {'n_estimators': 110, 'num_leaves': 32, 'max_depth': 5}. Best is trial 0 with value: 16.125656800791422.[0m


Prediction: 26.947
LGBM model
  RMSE: 16.125656800791422
  MAE: 10.054394902801514
  R2: -0.23575197196477937


[32m[I 2021-08-15 23:36:12,045][0m Trial 1 finished with value: 15.68434129076245 and parameters: {'n_estimators': 83, 'num_leaves': 29, 'max_depth': 4}. Best is trial 1 with value: 15.68434129076245.[0m


Prediction: 20.498
LGBM model
  RMSE: 15.68434129076245
  MAE: 9.609959377098084
  R2: -0.16903914856935187


[32m[I 2021-08-15 23:36:13,564][0m Trial 2 finished with value: 15.294295479565868 and parameters: {'n_estimators': 57, 'num_leaves': 35, 'max_depth': 3}. Best is trial 2 with value: 15.294295479565868.[0m


Prediction: 18.420
LGBM model
  RMSE: 15.294295479565868
  MAE: 9.32500130119324
  R2: -0.11161766514939697
Prediction: 27.618
LGBM model
  RMSE: 16.950889388448477
  MAE: 10.797867583084107
  R2: -0.365467811661512


[32m[I 2021-08-15 23:36:20,882][0m Trial 3 finished with value: 16.950889388448477 and parameters: {'n_estimators': 119, 'num_leaves': 25, 'max_depth': 8}. Best is trial 2 with value: 15.294295479565868.[0m
[32m[I 2021-08-15 23:36:26,684][0m Trial 4 finished with value: 16.85151561892738 and parameters: {'n_estimators': 143, 'num_leaves': 32, 'max_depth': 7}. Best is trial 2 with value: 15.294295479565868.[0m


Prediction: 36.101
LGBM model
  RMSE: 16.85151561892738
  MAE: 10.72845723590851
  R2: -0.3495047624625385


[32m[I 2021-08-15 23:36:28,683][0m Trial 5 finished with value: 15.73250800796823 and parameters: {'n_estimators': 60, 'num_leaves': 25, 'max_depth': 5}. Best is trial 2 with value: 15.294295479565868.[0m


Prediction: 24.977
LGBM model
  RMSE: 15.73250800796823
  MAE: 9.729811142158509
  R2: -0.17623042799174415


[32m[I 2021-08-15 23:36:31,599][0m Trial 6 finished with value: 16.071414224059467 and parameters: {'n_estimators': 95, 'num_leaves': 27, 'max_depth': 5}. Best is trial 2 with value: 15.294295479565868.[0m


Prediction: 25.012
LGBM model
  RMSE: 16.071414224059467
  MAE: 9.953289933586122
  R2: -0.22745244835557177


[32m[I 2021-08-15 23:36:34,660][0m Trial 7 finished with value: 16.096463090082608 and parameters: {'n_estimators': 100, 'num_leaves': 35, 'max_depth': 5}. Best is trial 2 with value: 15.294295479565868.[0m


Prediction: 26.178
LGBM model
  RMSE: 16.096463090082608
  MAE: 10.0064317401886
  R2: -0.23128163874871466


[32m[I 2021-08-15 23:36:43,535][0m Trial 8 finished with value: 16.878638207733626 and parameters: {'n_estimators': 140, 'num_leaves': 34, 'max_depth': 10}. Best is trial 2 with value: 15.294295479565868.[0m


Prediction: 30.312
LGBM model
  RMSE: 16.878638207733626
  MAE: 10.67378908352852
  R2: -0.3538523261100506
Prediction: 30.309
LGBM model
  RMSE: 16.875874308867214
  MAE: 10.67277538971901
  R2: -0.35340897226929746


[32m[I 2021-08-15 23:36:52,312][0m Trial 9 finished with value: 16.875874308867214 and parameters: {'n_estimators': 139, 'num_leaves': 35, 'max_depth': 10}. Best is trial 2 with value: 15.294295479565868.[0m


## Selecting best model