In [1]:
import os
from os import walk
import gc
import warnings
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import mlflow
import mlflow.sklearn

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import mixed_precision
from tensorflow.keras.layers.experimental import preprocessing

import optuna

from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    confusion_matrix,
    classification_report,
    accuracy_score
)
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

import lightgbm
from lightgbm import LGBMRegressor

In [2]:
#mixed_precision.set_global_policy('mixed_float16')
physical_devices = tf.config.list_physical_devices('GPU') 
tf.config.experimental.set_memory_growth(physical_devices[0], True)

## Loading and processing data

In [3]:
df = pd.read_csv('project1_output.csv')

In [4]:
# we'll get a subset of our dataset in order to make experiments faster
#df = df[:1000]

#### Converting cols to their appropriate types again because we lost it on the csv export

In [5]:
df["product_category_name"] = df["product_category_name"].astype('category')
df["order_status"] = df["order_status"].astype('category')
df["review_score"] = df["review_score"].astype('category')
df["payment_type"] = df["payment_type"].astype('category')
df["customer_zip_code_prefix"] = df["customer_zip_code_prefix"].astype('category')
df["customer_city"] = df["customer_city"].astype('category')
df["customer_state"] = df["customer_state"].astype('category')
df["seller_zip_code_prefix"] = df["seller_zip_code_prefix"].astype('category')
df["seller_city"] = df["seller_city"].astype('category')
df["seller_state"] = df["seller_state"].astype('category')

df["product_name_lenght"] = df["product_name_lenght"].astype('int64')
df["product_description_lenght"] = df["product_description_lenght"].astype('int64')
df["product_photos_qty"] = df["product_photos_qty"].astype('int64')
df["payment_installments"] = df["payment_installments"].astype('int64')
df["payment_sequential"] = df["payment_sequential"].astype('int64')

In [6]:
df.dtypes

order_id                           object
order_item_id                       int64
product_id                         object
seller_id                          object
shipping_limit_date                object
price                             float64
freight_value                     float64
product_category_name            category
product_name_lenght                 int64
product_description_lenght          int64
product_photos_qty                  int64
product_weight_g                  float64
product_length_cm                 float64
product_height_cm                 float64
product_width_cm                  float64
customer_id                        object
order_status                     category
order_purchase_timestamp           object
order_approved_at                  object
order_delivered_carrier_date       object
order_delivered_customer_date      object
order_estimated_delivery_date      object
review_id                          object
review_score                     c

#### Droping unnecessary columns
We only want to work with numerical values.

In [7]:
df = df.select_dtypes(exclude=['object'])

In [8]:
df.columns

Index(['order_item_id', 'price', 'freight_value', 'product_category_name',
       'product_name_lenght', 'product_description_lenght',
       'product_photos_qty', 'product_weight_g', 'product_length_cm',
       'product_height_cm', 'product_width_cm', 'order_status', 'review_score',
       'payment_sequential', 'payment_type', 'payment_installments',
       'payment_value', 'customer_zip_code_prefix', 'customer_city',
       'customer_state', 'seller_zip_code_prefix', 'seller_city',
       'seller_state', 'payment_value_norm', 'volume'],
      dtype='object')

We'll also drop the `payment_value` column because our model would simply infer our target value from it by subtracting it from the `price` column.

In [9]:
df = df.drop(columns=['payment_value'])

### Feature engineering

Here we'll one-hot encode all of our categorical columns, and then drop the original ones

In [10]:
df = pd.get_dummies(df)
df = df.select_dtypes(exclude=['category'])

Even though we generated over 22000 columns this way, we believe that our model will be powerful enough to filter out any unecessary data.

## Picking column for prediction

We chose the `freight_value` column so we can perform a regression in order to try to find it's value based on all of the columns we have available.

In [11]:
TARGET_VALUE = 'freight_value'

In [12]:
target_col = df[TARGET_VALUE]

In [13]:
target_col

0         13.29
1         19.93
2         17.87
3         12.79
4         18.14
          ...  
118290    43.41
118291    36.53
118292    16.95
118293     8.72
118294    12.79
Name: freight_value, Length: 118295, dtype: float64

In [14]:
df = df.drop(columns=[TARGET_VALUE])

## Separating prediction and test data

We'll split our data in a 60/20/20 ratio.

In [15]:
# input 
train, val, test = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])

# output
train_labels, val_labels, test_labels = (
    np.split(
        target_col, 
        [int(.6*len(target_col)), int(.8*len(target_col))])
)

In [16]:
# deleting our initial df so we can free up some RAM
del df

# Picking 4 ML algorithms

We'll use the following 4 algorithms:

1. Linear regression
2. Multilayer perceptron (a shallow one)
3. random forests
4. lightgbm/xgboost

### Metrics function

In [17]:
# Evaluate metrics
def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2

### Enabling MLFlow autologging

In [18]:
mlflow.sklearn.autolog()
mlflow.tensorflow.autolog()
mlflow.lightgbm.autolog()

## Linear regression
Let's start off with linear regression, which is the most simple algorithm in our selection, and will serve as a baseline for the following algorithms.

In [19]:
def linear_regression(trial):
    
    warnings.filterwarnings("ignore")
    np.random.seed(40)

    # Start an MLflow run; the "with" keyword ensures we'll close the run even if this cell crashes
    with mlflow.start_run():
        reg = LinearRegression()
        reg.fit(train, train_labels)

        predictions = reg.predict(val)

        (rmse, mae, r2) = eval_metrics(val_labels, predictions)

        # Print out model metrics
        print("Linear regression model")
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        # Log mlflow attributes for mlflow UI
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)
        #mlflow.sklearn.log_model(reg, "model")
        #modelpath = "./mlflow/freight_value/model-linear-reg"
        #mlflow.sklearn.save_model(reg, modelpath)
        
        gc.collect()
        
        return rmse

In [20]:
study = optuna.create_study()
study.optimize(linear_regression, n_trials=1)

[32m[I 2021-08-15 13:00:10,409][0m A new study created in memory with name: no-name-a48463a4-b709-4ee6-835e-680b420e836a[0m
[32m[I 2021-08-15 13:48:40,186][0m Trial 0 finished with value: 10522467.054732079 and parameters: {}. Best is trial 0 with value: 10522467.054732079.[0m


Linear regression model
  RMSE: 10522467.054732079
  MAE: 829174.278560314
  R2: -432433458073.49365


## Multilayer Perceptron

In [21]:
def mlp(trial):
    # hyper-parameters to test
    params = {
        "hidden_units": trial.suggest_int("hidden_units", 3, 15),
        "lr": trial.suggest_float("lr", 1e-5, 1e-3, log=True),
        "epochs": trial.suggest_int("epochs", 10, 50)
    }
    
    warnings.filterwarnings("ignore")
    np.random.seed(40)
    
    # Start an MLflow run
    with mlflow.start_run():
        normalizer = preprocessing.Normalization(axis=-1)
        normalizer.adapt(np.array(train))
        
        mlp_model = tf.keras.Sequential([
            normalizer,
            layers.Dense(units=params["hidden_units"]),
            layers.Dense(units=params["hidden_units"]),
            layers.Dense(units=params["hidden_units"]),
            layers.Dense(units=1),
        ])

        mlp_model.summary()
        
        mlp_model.compile(
            optimizer=tf.optimizers.Adam(learning_rate=params["lr"]),
            loss='mean_squared_error'
        )

        history = mlp_model.fit(
            train, train_labels,
            validation_data=(test, test_labels),
            epochs=params["epochs"]
        )
        
        predictions = mlp_model.predict(val)

        (rmse, mae, r2) = eval_metrics(val_labels, predictions)

        # Print out model metrics
        print("MLP model")
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        # Log mlflow attributes for mlflow UI
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)
        mlflow.log_params(trial.params)
        mlflow.set_tags(
            {
                "estimator_name":"MultiLayerPerceptron",
                "estimator_class":"Keras"
            }
        )
        #mlflow.tensorflow.log_model(mlp_model, "model")
        #modelpath = "./mlflow/freight_value/model-mlp"
        #mlflow.tensorflow.save_model(mlp_model, modelpath)
        tf.keras.backend.clear_session()

        gc.collect()
        
        return rmse

### Using optuna to optimize MLP's hyperparameters

In [22]:
study = optuna.create_study()
study.optimize(mlp, n_trials=10)

[32m[I 2021-08-15 13:48:40,193][0m A new study created in memory with name: no-name-e6812a48-272e-41c7-9619-8305691fca36[0m


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 22094)             44189     
_________________________________________________________________
dense (Dense)                (None, 6)                 132570    
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 42        
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 42        
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 7         
Total params: 176,850
Trainable params: 132,661
Non-trainable params: 44,189
_________________________________________________________________
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35


[32m[I 2021-08-15 13:53:14,323][0m Trial 0 finished with value: 29070.070794112642 and parameters: {'hidden_units': 6, 'lr': 0.00013783166250126664, 'epochs': 35}. Best is trial 0 with value: 29070.070794112642.[0m


MLP model
  RMSE: 29070.070794112642
  MAE: 5329.234663779358
  R2: -3300473.017016375
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 22094)             44189     
_________________________________________________________________
dense (Dense)                (None, 9)                 198855    
_________________________________________________________________
dense_1 (Dense)              (None, 9)                 90        
_________________________________________________________________
dense_2 (Dense)              (None, 9)                 90        
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 10        
Total params: 243,234
Trainable params: 199,045
Non-trainable params: 44,189
_________________________________________________________________
Epoch 1/28
Epoch 2/28
Ep

[32m[I 2021-08-15 13:57:24,403][0m Trial 1 finished with value: 45358.496752358995 and parameters: {'hidden_units': 9, 'lr': 6.593713512987263e-05, 'epochs': 28}. Best is trial 0 with value: 29070.070794112642.[0m


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 22094)             44189     
_________________________________________________________________
dense (Dense)                (None, 3)                 66285     
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 12        
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 12        
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 4         
Total params: 110,502
Trainable params: 66,313
Non-trainable params: 44,189
_________________________________________________________________
Epoch 1/49
Epoch 2/49
Epoch 3/49
Epoch 4/49
Epoch 5/49
Epoch 6/49
Epoch 7/49
Epoch 8/49
Epoch 9/49
Epoch 10/49
E

[32m[I 2021-08-15 14:03:35,261][0m Trial 2 finished with value: 151125.27687811205 and parameters: {'hidden_units': 3, 'lr': 2.097040663215729e-05, 'epochs': 49}. Best is trial 0 with value: 29070.070794112642.[0m


MLP model
  RMSE: 151125.27687811205
  MAE: 27749.002557147298
  R2: -89198664.79674345
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 22094)             44189     
_________________________________________________________________
dense (Dense)                (None, 11)                243045    
_________________________________________________________________
dense_1 (Dense)              (None, 11)                132       
_________________________________________________________________
dense_2 (Dense)              (None, 11)                132       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 12        
Total params: 287,510
Trainable params: 243,321
Non-trainable params: 44,189
_________________________________________________________________
Epoch 1/25
Epoch 2/25
E

[32m[I 2021-08-15 14:06:55,356][0m Trial 3 finished with value: 9273.360933062278 and parameters: {'hidden_units': 11, 'lr': 0.0009227598608909978, 'epochs': 25}. Best is trial 3 with value: 9273.360933062278.[0m


MLP model
  RMSE: 9273.360933062278
  MAE: 1782.856692734742
  R2: -335859.14126271976
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 22094)             44189     
_________________________________________________________________
dense (Dense)                (None, 5)                 110475    
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 30        
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 30        
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 6         
Total params: 154,730
Trainable params: 110,541
Non-trainable params: 44,189
_________________________________________________________________
Epoch 1/30
Epoch 2/30
Ep

[32m[I 2021-08-15 14:10:49,679][0m Trial 4 finished with value: 145372.27099876106 and parameters: {'hidden_units': 5, 'lr': 1.8466579518334734e-05, 'epochs': 30}. Best is trial 3 with value: 9273.360933062278.[0m


MLP model
  RMSE: 145372.27099876106
  MAE: 26760.511192538313
  R2: -82536734.82604657
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 22094)             44189     
_________________________________________________________________
dense (Dense)                (None, 4)                 88380     
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 20        
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 20        
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 5         
Total params: 132,614
Trainable params: 88,425
Non-trainable params: 44,189
_________________________________________________________________
Epoch 1/25
Epoch 2/25
Ep

[32m[I 2021-08-15 14:14:09,425][0m Trial 5 finished with value: 8578.914172656961 and parameters: {'hidden_units': 4, 'lr': 0.0007616910471088858, 'epochs': 25}. Best is trial 5 with value: 8578.914172656961.[0m


MLP model
  RMSE: 8578.914172656961
  MAE: 1564.2424523670736
  R2: -287440.0464267149
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 22094)             44189     
_________________________________________________________________
dense (Dense)                (None, 11)                243045    
_________________________________________________________________
dense_1 (Dense)              (None, 11)                132       
_________________________________________________________________
dense_2 (Dense)              (None, 11)                132       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 12        
Total params: 287,510
Trainable params: 243,321
Non-trainable params: 44,189
_________________________________________________________________
Epoch 1/12
Epoch 2/12
Ep

[32m[I 2021-08-15 14:16:00,255][0m Trial 6 finished with value: 8939.52775311599 and parameters: {'hidden_units': 11, 'lr': 0.0007082725917421973, 'epochs': 12}. Best is trial 5 with value: 8578.914172656961.[0m


MLP model
  RMSE: 8939.52775311599
  MAE: 1619.947932606869
  R2: -312113.0314446522
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 22094)             44189     
_________________________________________________________________
dense (Dense)                (None, 12)                265140    
_________________________________________________________________
dense_1 (Dense)              (None, 12)                156       
_________________________________________________________________
dense_2 (Dense)              (None, 12)                156       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 13        
Total params: 309,654
Trainable params: 265,465
Non-trainable params: 44,189
_________________________________________________________________
Epoch 1/23
Epoch 2/23
Epoc

[32m[I 2021-08-15 14:19:05,099][0m Trial 7 finished with value: 22629.200746911454 and parameters: {'hidden_units': 12, 'lr': 0.00015603515983063726, 'epochs': 23}. Best is trial 5 with value: 8578.914172656961.[0m


MLP model
  RMSE: 22629.200746911454
  MAE: 4114.311136843061
  R2: -1999964.8019575395
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 22094)             44189     
_________________________________________________________________
dense (Dense)                (None, 7)                 154665    
_________________________________________________________________
dense_1 (Dense)              (None, 7)                 56        
_________________________________________________________________
dense_2 (Dense)              (None, 7)                 56        
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 8         
Total params: 198,974
Trainable params: 154,785
Non-trainable params: 44,189
_________________________________________________________________
Epoch 1/49
Epoch 2/49
E

[32m[I 2021-08-15 14:25:11,041][0m Trial 8 finished with value: 67290.53372148509 and parameters: {'hidden_units': 7, 'lr': 1.0473562849069252e-05, 'epochs': 49}. Best is trial 5 with value: 8578.914172656961.[0m


MLP model
  RMSE: 67290.53372148509
  MAE: 12090.93651303665
  R2: -17684470.49075366
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 22094)             44189     
_________________________________________________________________
dense (Dense)                (None, 7)                 154665    
_________________________________________________________________
dense_1 (Dense)              (None, 7)                 56        
_________________________________________________________________
dense_2 (Dense)              (None, 7)                 56        
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 8         
Total params: 198,974
Trainable params: 154,785
Non-trainable params: 44,189
_________________________________________________________________
Epoch 1/46
Epoch 2/46
Epo

[32m[I 2021-08-15 14:30:56,392][0m Trial 9 finished with value: 30724.261575992754 and parameters: {'hidden_units': 7, 'lr': 0.00011209682434636478, 'epochs': 46}. Best is trial 5 with value: 8578.914172656961.[0m


MLP model
  RMSE: 30724.261575992754
  MAE: 5587.881144781618
  R2: -3686777.5077823913


## Random Forest

In [23]:
def random_forest(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 150),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 5),
    }
    
    warnings.filterwarnings("ignore")
    np.random.seed(40)
    
    with mlflow.start_run():
        rf = RandomForestRegressor(
            max_depth=params["max_depth"],
            n_estimators=params["n_estimators"],
            min_samples_split=params["min_samples_split"],
            random_state=0
        )
        rf.fit(train, train_labels)
        
        predictions = rf.predict(val)
        
        (rmse, mae, r2) = eval_metrics(val_labels, predictions)
        
        print("Random Forest model")
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)
        
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)
        mlflow.log_params(trial.params)
        
        gc.collect()
        
        return rmse

### Using optuna to optimize Random Forest's hyperparameters

In [24]:
study = optuna.create_study()
study.optimize(random_forest, n_trials=10)

[32m[I 2021-08-15 14:30:56,404][0m A new study created in memory with name: no-name-538256ab-8067-462e-b0d8-20241441c7cb[0m
[32m[I 2021-08-15 14:55:07,954][0m Trial 0 finished with value: 16.01800413886942 and parameters: {'n_estimators': 80, 'max_depth': 9, 'min_samples_split': 2}. Best is trial 0 with value: 16.01800413886942.[0m


Random Forest model
  RMSE: 16.01800413886942
  MAE: 8.763791506249913
  R2: -0.0020766497823212493


[32m[I 2021-08-15 15:03:32,842][0m Trial 1 finished with value: 16.00618181870716 and parameters: {'n_estimators': 60, 'max_depth': 4, 'min_samples_split': 4}. Best is trial 1 with value: 16.00618181870716.[0m


Random Forest model
  RMSE: 16.00618181870716
  MAE: 8.766755456279734
  R2: -0.0005980012564812398


[32m[I 2021-08-15 15:09:10,070][0m Trial 2 finished with value: 16.004510507402266 and parameters: {'n_estimators': 52, 'max_depth': 3, 'min_samples_split': 4}. Best is trial 2 with value: 16.004510507402266.[0m


Random Forest model
  RMSE: 16.004510507402266
  MAE: 8.767637353258664
  R2: -0.0003890540557585087


[32m[I 2021-08-15 15:27:57,379][0m Trial 3 finished with value: 16.009817491732 and parameters: {'n_estimators': 110, 'max_depth': 5, 'min_samples_split': 4}. Best is trial 2 with value: 16.004510507402266.[0m


Random Forest model
  RMSE: 16.009817491732
  MAE: 8.767908043798007
  R2: -0.0010526081523261066


[32m[I 2021-08-15 15:52:24,326][0m Trial 4 finished with value: 16.018543658213876 and parameters: {'n_estimators': 81, 'max_depth': 9, 'min_samples_split': 2}. Best is trial 2 with value: 16.004510507402266.[0m


Random Forest model
  RMSE: 16.018543658213876
  MAE: 8.764184076726282
  R2: -0.0021441549268359505


[32m[I 2021-08-15 16:12:57,097][0m Trial 5 finished with value: 16.007022061868554 and parameters: {'n_estimators': 149, 'max_depth': 4, 'min_samples_split': 3}. Best is trial 2 with value: 16.004510507402266.[0m


Random Forest model
  RMSE: 16.007022061868554
  MAE: 8.767472594139605
  R2: -0.0007030566288142026


[32m[I 2021-08-15 16:38:02,361][0m Trial 6 finished with value: 16.012359364315188 and parameters: {'n_estimators': 123, 'max_depth': 6, 'min_samples_split': 3}. Best is trial 2 with value: 16.004510507402266.[0m


Random Forest model
  RMSE: 16.012359364315188
  MAE: 8.766966330425877
  R2: -0.0013705068640099682


[32m[I 2021-08-15 17:11:44,094][0m Trial 7 finished with value: 16.023377669128543 and parameters: {'n_estimators': 112, 'max_depth': 9, 'min_samples_split': 3}. Best is trial 2 with value: 16.004510507402266.[0m


Random Forest model
  RMSE: 16.023377669128543
  MAE: 8.767897510034356
  R2: -0.002749092160128752


[32m[I 2021-08-15 17:35:59,234][0m Trial 8 finished with value: 16.009428236186576 and parameters: {'n_estimators': 142, 'max_depth': 5, 'min_samples_split': 4}. Best is trial 2 with value: 16.004510507402266.[0m


Random Forest model
  RMSE: 16.009428236186576
  MAE: 8.767004935778605
  R2: -0.0010039304528941528


[32m[I 2021-08-15 17:46:42,480][0m Trial 9 finished with value: 16.007012685276973 and parameters: {'n_estimators': 62, 'max_depth': 5, 'min_samples_split': 3}. Best is trial 2 with value: 16.004510507402266.[0m


Random Forest model
  RMSE: 16.007012685276973
  MAE: 8.76435357727949
  R2: -0.0007018842457098273


## Gradient Boosting with LightGBM

In [25]:
def gradient_boosting(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 150),
        "num_leaves": trial.suggest_int("num_leaves", 25, 35),
        "max_depth": trial.suggest_int("max_depth", 3, 10)
    }
    
    warnings.filterwarnings("ignore")
    np.random.seed(40)
    
    with mlflow.start_run():
        model = LGBMRegressor(
            max_depth=params["max_depth"],
            n_estimators=params["n_estimators"],
            num_leaves=params["num_leaves"],
        )
        model.fit(train, train_labels)
        
        predictions = model.predict(test)
        print('Prediction: %.3f' % predictions[0])
        
        (rmse, mae, r2) = eval_metrics(val_labels, predictions)

        print("LGBM model")
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        # Log mlflow attributes for mlflow UI
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)
        mlflow.log_params(trial.params)
        mlflow.set_tags(
            {
                "estimator_class":"LightGBM",
                "estimator_name":"Gradient Boosting"
            }
        )
        mlflow.sklearn.log_model(model, "model")
        
        gc.collect()
        
        return rmse

### Using optuna to optimize Gradient Boosting's hyperparameters

In [26]:
study = optuna.create_study()
study.optimize(gradient_boosting, n_trials=10)

[32m[I 2021-08-15 17:46:42,488][0m A new study created in memory with name: no-name-1911296e-828b-41a8-bd2b-892cb54c52f7[0m
[33m[W 2021-08-15 17:46:55,907][0m Trial 0 failed because of the following error: LightGBMError('Do not support special JSON characters in feature name.')
Traceback (most recent call last):
  File "/home/igor/.local/lib/python3.9/site-packages/optuna/study/_optimize.py", line 213, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-25-fce15f02147c>", line 17, in gradient_boosting
    model.fit(train, train_labels)
  File "/usr/lib/python3.9/site-packages/lightgbm/sklearn.py", line 818, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "/usr/lib/python3.9/site-packages/lightgbm/sklearn.py", line 683, in fit
    self._Booster = train(params, train_set,
  File "/usr/lib/python3.9/site-packages/lightgbm/engine.py", line 228, in train
    booster = Booster(params=params, train_set=train_set)
  File "/usr/lib

LightGBMError: Do not support special JSON characters in feature name.