In [35]:
import os
from os import walk
import gc
import warnings
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

import mlflow
import mlflow.sklearn

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import mixed_precision
from tensorflow.keras.layers.experimental import preprocessing

import optuna

from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    confusion_matrix,
    classification_report,
    accuracy_score
)
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

import lightgbm
from lightgbm import LGBMRegressor

In [37]:
#mixed_precision.set_global_policy('mixed_float16')
# physical_devices = tf.config.list_physical_devices('GPU') 
# tf.config.experimental.set_memory_growth(physical_devices[0], True)

## Loading and processing data

In [38]:
df = pd.read_csv('project1_output.csv')

In [39]:
# we'll get a subset of our dataset in order to make experiments faster
df = df[:1000]
df = df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

#### Converting cols to their appropriate types again because we lost it on the csv export

In [40]:
df["product_category_name"] = df["product_category_name"].astype('category')
df["order_status"] = df["order_status"].astype('category')
df["review_score"] = df["review_score"].astype('category')
df["payment_type"] = df["payment_type"].astype('category')
df["customer_zip_code_prefix"] = df["customer_zip_code_prefix"].astype('category')
df["customer_city"] = df["customer_city"].astype('category')
df["customer_state"] = df["customer_state"].astype('category')
df["seller_zip_code_prefix"] = df["seller_zip_code_prefix"].astype('category')
df["seller_city"] = df["seller_city"].astype('category')
df["seller_state"] = df["seller_state"].astype('category')

df["product_name_lenght"] = df["product_name_lenght"].astype('int64')
df["product_description_lenght"] = df["product_description_lenght"].astype('int64')
df["product_photos_qty"] = df["product_photos_qty"].astype('int64')
df["payment_installments"] = df["payment_installments"].astype('int64')
df["payment_sequential"] = df["payment_sequential"].astype('int64')

In [41]:
df.dtypes

order_id                           object
order_item_id                       int64
product_id                         object
seller_id                          object
shipping_limit_date                object
price                             float64
freight_value                     float64
product_category_name            category
product_name_lenght                 int64
product_description_lenght          int64
product_photos_qty                  int64
product_weight_g                  float64
product_length_cm                 float64
product_height_cm                 float64
product_width_cm                  float64
customer_id                        object
order_status                     category
order_purchase_timestamp           object
order_approved_at                  object
order_delivered_carrier_date       object
order_delivered_customer_date      object
order_estimated_delivery_date      object
review_id                          object
review_score                     c

#### Droping unnecessary columns
We only want to work with numerical values.

In [42]:
df = df.select_dtypes(exclude=['object'])

In [43]:
df.columns

Index(['order_item_id', 'price', 'freight_value', 'product_category_name',
       'product_name_lenght', 'product_description_lenght',
       'product_photos_qty', 'product_weight_g', 'product_length_cm',
       'product_height_cm', 'product_width_cm', 'order_status', 'review_score',
       'payment_sequential', 'payment_type', 'payment_installments',
       'payment_value', 'customer_zip_code_prefix', 'customer_city',
       'customer_state', 'seller_zip_code_prefix', 'seller_city',
       'seller_state', 'payment_value_norm', 'volume'],
      dtype='object')

We'll also drop the `payment_value` column because our model would simply infer our target value from it by subtracting it from the `price` column.

In [44]:
df = df.drop(columns=['payment_value'])

### Feature engineering

Here we'll one-hot encode all of our categorical columns, and then drop the original ones

In [45]:
df = pd.get_dummies(df)
df = df.select_dtypes(exclude=['category'])

Even though we generated over 22000 columns this way, we believe that our model will be powerful enough to filter out any unecessary data.

## Picking column for prediction

We chose the `freight_value` column so we can perform a regression in order to try to find it's value based on all of the columns we have available.

In [46]:
TARGET_VALUE = 'freight_value'

In [47]:
target_col = df[TARGET_VALUE]

In [48]:
target_col

0      13.29
1      19.93
2      17.87
3      12.79
4      18.14
       ...  
995    74.99
996    34.98
997    17.03
998    19.07
999    15.59
Name: freight_value, Length: 1000, dtype: float64

In [49]:
df = df.drop(columns=[TARGET_VALUE])

## Separating prediction and test data

We'll split our data in a 60/20/20 ratio.

In [50]:
def get_x_data():
    # input 
    train, val, test = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])
    
    return train, val, test

In [51]:
def get_y_data():
    # output
    train_labels, val_labels, test_labels = (
        np.split(
            target_col, 
            [int(.6*len(target_col)), int(.8*len(target_col))])
    )
    
    return train_labels, val_labels, test_labels

In [53]:
# deleting our initial df so we can free up some RAM
# del df

# Picking 4 ML algorithms

We'll use the following 4 algorithms:

1. Linear regression
2. Multilayer perceptron (a shallow one)
3. random forests
4. lightgbm/xgboost

### Metrics function

In [54]:
# Evaluate metrics
def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2

### Enabling MLFlow autologging

In [55]:
mlflow.sklearn.autolog()
mlflow.tensorflow.autolog()
mlflow.lightgbm.autolog()



## Linear regression
Let's start off with linear regression, which is the most simple algorithm in our selection, and will serve as a baseline for the following algorithms.

In [68]:
def linear_regression(trial):
    train, test, val = get_x_data()
    train_labels, val_labels, test_labels = get_y_data()
    
    warnings.filterwarnings("ignore")
    np.random.seed(40)

    # Start an MLflow run; the "with" keyword ensures we'll close the run even if this cell crashes
    with mlflow.start_run(run_name="Linear Regression"):
        reg = LinearRegression()
        reg.fit(train, train_labels)

        predictions = reg.predict(val)

        (rmse, mae, r2) = eval_metrics(val_labels, predictions)

        # Print out model metrics
        print("Linear regression model")
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        # Log mlflow attributes for mlflow UI
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)
        #mlflow.sklearn.log_model(reg, "model")
        #modelpath = "./mlflow/freight_value/model-linear-reg"
        #mlflow.sklearn.save_model(reg, modelpath)
        
        gc.collect()
        
        return rmse

In [69]:
study = optuna.create_study()
study.optimize(linear_regression, n_trials=1)

[32m[I 2021-08-15 23:18:20,072][0m A new study created in memory with name: no-name-1ab990b0-3c12-43b9-adb1-14d021113dc0[0m
[32m[I 2021-08-15 23:18:20,796][0m Trial 0 finished with value: 29112902109.472824 and parameters: {}. Best is trial 0 with value: 29112902109.472824.[0m


Linear regression model
  RMSE: 29112902109.472824
  MAE: 5182253386.249226
  R2: -4.027796194382081e+18


## Multilayer Perceptron

In [71]:
def mlp(trial):
    train, test, val = get_x_data()
    train_labels, val_labels, test_labels = get_y_data()
    
    # hyper-parameters to test
    params = {
        "hidden_units": trial.suggest_int("hidden_units", 3, 15),
        "lr": trial.suggest_float("lr", 1e-5, 1e-3, log=True),
        "epochs": trial.suggest_int("epochs", 10, 50)
    }
    
    warnings.filterwarnings("ignore")
    np.random.seed(40)
    
    # Start an MLflow run
    with mlflow.start_run(run_name="MLP"):
        normalizer = preprocessing.Normalization(axis=-1)
        normalizer.adapt(np.array(train))
        
        mlp_model = tf.keras.Sequential([
            normalizer,
            layers.Dense(units=params["hidden_units"]),
            layers.Dense(units=params["hidden_units"]),
            layers.Dense(units=params["hidden_units"]),
            layers.Dense(units=1),
        ])

        mlp_model.summary()
        
        mlp_model.compile(
            optimizer=tf.optimizers.Adam(learning_rate=params["lr"]),
            loss='mean_squared_error'
        )

        history = mlp_model.fit(
            train, train_labels,
            validation_data=(test, test_labels),
            epochs=params["epochs"]
        )
        
        predictions = mlp_model.predict(val)

        (rmse, mae, r2) = eval_metrics(val_labels, predictions)

        # Print out model metrics
        print("MLP model")
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        # Log mlflow attributes for mlflow UI
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)
        mlflow.log_params(trial.params)
        mlflow.set_tags(
            {
                "estimator_name":"MultiLayerPerceptron",
                "estimator_class":"Keras"
            }
        )
        #mlflow.tensorflow.log_model(mlp_model, "model")
        #modelpath = "./mlflow/freight_value/model-mlp"
        #mlflow.tensorflow.save_model(mlp_model, modelpath)
        tf.keras.backend.clear_session()

        gc.collect()
        
        return rmse

### Using optuna to optimize MLP's hyperparameters

In [72]:
study = optuna.create_study()
study.optimize(mlp, n_trials=10)

[32m[I 2021-08-15 23:18:42,752][0m A new study created in memory with name: no-name-e2da339c-7d05-4dee-aaf1-126681e17aa8[0m


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 1855)              3711      
_________________________________________________________________
dense (Dense)                (None, 11)                20416     
_________________________________________________________________
dense_1 (Dense)              (None, 11)                132       
_________________________________________________________________
dense_2 (Dense)              (None, 11)                132       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 12        
Total params: 24,403
Trainable params: 20,692
Non-trainable params: 3,711
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epo

[32m[I 2021-08-15 23:18:48,471][0m Trial 0 finished with value: 624921.3328407971 and parameters: {'hidden_units': 11, 'lr': 1.9125171219773786e-05, 'epochs': 50}. Best is trial 0 with value: 624921.3328407971.[0m


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 1855)              3711      
_________________________________________________________________
dense (Dense)                (None, 14)                25984     
_________________________________________________________________
dense_1 (Dense)              (None, 14)                210       
_________________________________________________________________
dense_2 (Dense)              (None, 14)                210       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 15        
Total params: 30,130
Trainable params: 26,419
Non-trainable params: 3,711
_________________________________________________________________
Epoch 1/41
Epoch 2/41
Epoch 3/41
Epoch 4/41
Epoch 5/41
Epoch 6/41
Epoch 7/41
Epoch 8/41
Epoch 9/41
Epoch 10/41
Epo

[32m[I 2021-08-15 23:18:52,891][0m Trial 1 finished with value: 2219201.960081642 and parameters: {'hidden_units': 14, 'lr': 0.00015671785938101582, 'epochs': 41}. Best is trial 0 with value: 624921.3328407971.[0m


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 1855)              3711      
_________________________________________________________________
dense (Dense)                (None, 9)                 16704     
_________________________________________________________________
dense_1 (Dense)              (None, 9)                 90        
_________________________________________________________________
dense_2 (Dense)              (None, 9)                 90        
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 10        
Total params: 20,605
Trainable params: 16,894
Non-trainable params: 3,711
_________________________________________________________________
Epoch 1/19
Epoch 2/19
Epoch 3/19
Epoch 4/19
Epoch 5/19
Epoch 6/19
Epoch 7/19
Epoch 8/19
Epoch 9/19
Epoch 10/19
Epo

[32m[I 2021-08-15 23:18:56,484][0m Trial 2 finished with value: 3182215.5674827946 and parameters: {'hidden_units': 9, 'lr': 0.0006868347812350454, 'epochs': 19}. Best is trial 0 with value: 624921.3328407971.[0m


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 1855)              3711      
_________________________________________________________________
dense (Dense)                (None, 13)                24128     
_________________________________________________________________
dense_1 (Dense)              (None, 13)                182       
_________________________________________________________________
dense_2 (Dense)              (None, 13)                182       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 14        
Total params: 28,217
Trainable params: 24,506
Non-trainable params: 3,711
_________________________________________________________________
Epoch 1/44
Epoch 2/44
Epoch 3/44
Epoch 4/44
Epoch 5/44
Epoch 6/44
Epoch 7/44
Epoch 8/44
Epoch 9/44
Epoch 10/44
Epo

[32m[I 2021-08-15 23:19:01,620][0m Trial 3 finished with value: 4673590.187370653 and parameters: {'hidden_units': 13, 'lr': 0.00042389811603947005, 'epochs': 44}. Best is trial 0 with value: 624921.3328407971.[0m


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 1855)              3711      
_________________________________________________________________
dense (Dense)                (None, 10)                18560     
_________________________________________________________________
dense_1 (Dense)              (None, 10)                110       
_________________________________________________________________
dense_2 (Dense)              (None, 10)                110       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 11        
Total params: 22,502
Trainable params: 18,791
Non-trainable params: 3,711
_________________________________________________________________
Epoch 1/31
Epoch 2/31
Epoch 3/31
Epoch 4/31
Epoch 5/31
Epoch 6/31
Epoch 7/31
Epoch 8/31
Epoch 9/31
Epoch 10/31
Epo

[32m[I 2021-08-15 23:19:07,711][0m Trial 4 finished with value: 400518.0905780176 and parameters: {'hidden_units': 10, 'lr': 5.471646247400232e-05, 'epochs': 31}. Best is trial 4 with value: 400518.0905780176.[0m


MLP model
  RMSE: 400518.0905780176
  MAE: 275878.06405936053
  R2: -762326049.939212
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 1855)              3711      
_________________________________________________________________
dense (Dense)                (None, 11)                20416     
_________________________________________________________________
dense_1 (Dense)              (None, 11)                132       
_________________________________________________________________
dense_2 (Dense)              (None, 11)                132       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 12        
Total params: 24,403
Trainable params: 20,692
Non-trainable params: 3,711
_________________________________________________________________
Epoch 1/36
Epoch 2/36
Epoch 

[32m[I 2021-08-15 23:19:13,782][0m Trial 5 finished with value: 2305463.1729115774 and parameters: {'hidden_units': 11, 'lr': 0.00020729701325138694, 'epochs': 36}. Best is trial 4 with value: 400518.0905780176.[0m


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 1855)              3711      
_________________________________________________________________
dense (Dense)                (None, 13)                24128     
_________________________________________________________________
dense_1 (Dense)              (None, 13)                182       
_________________________________________________________________
dense_2 (Dense)              (None, 13)                182       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 14        
Total params: 28,217
Trainable params: 24,506
Non-trainable params: 3,711
_________________________________________________________________
Epoch 1/31
Epoch 2/31
Epoch 3/31
Epoch 4/31
Epoch 5/31
Epoch 6/31
Epoch 7/31
Epoch 8/31
Epoch 9/31
Epoch 10/31
Epo

[32m[I 2021-08-15 23:19:18,337][0m Trial 6 finished with value: 1994348.617255427 and parameters: {'hidden_units': 13, 'lr': 0.00019116809598850103, 'epochs': 31}. Best is trial 4 with value: 400518.0905780176.[0m


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 1855)              3711      
_________________________________________________________________
dense (Dense)                (None, 8)                 14848     
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 72        
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 72        
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 9         
Total params: 18,712
Trainable params: 15,001
Non-trainable params: 3,711
_________________________________________________________________
Epoch 1/23
Epoch 2/23
Epoch 3/23
Epoch 4/23
Epoch 5/23
Epoch 6/23
Epoch 7/23
Epoch 8/23
Epoch 9/23
Epoch 10/23
Epo

[32m[I 2021-08-15 23:19:22,068][0m Trial 7 finished with value: 466616.8559451672 and parameters: {'hidden_units': 8, 'lr': 0.0001038309063156014, 'epochs': 23}. Best is trial 4 with value: 400518.0905780176.[0m


MLP model
  RMSE: 466616.8559451672
  MAE: 320527.0998730875
  R2: -1034706870.4072714
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 1855)              3711      
_________________________________________________________________
dense (Dense)                (None, 15)                27840     
_________________________________________________________________
dense_1 (Dense)              (None, 15)                240       
_________________________________________________________________
dense_2 (Dense)              (None, 15)                240       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 16        
Total params: 32,047
Trainable params: 28,336
Non-trainable params: 3,711
_________________________________________________________________
Epoch 1/42
Epoch 2/42
Epoch

[32m[I 2021-08-15 23:19:26,337][0m Trial 8 finished with value: 835167.256258795 and parameters: {'hidden_units': 15, 'lr': 4.701475632072753e-05, 'epochs': 42}. Best is trial 4 with value: 400518.0905780176.[0m


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 1855)              3711      
_________________________________________________________________
dense (Dense)                (None, 11)                20416     
_________________________________________________________________
dense_1 (Dense)              (None, 11)                132       
_________________________________________________________________
dense_2 (Dense)              (None, 11)                132       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 12        
Total params: 24,403
Trainable params: 20,692
Non-trainable params: 3,711
_________________________________________________________________
Epoch 1/49
Epoch 2/49
Epoch 3/49
Epoch 4/49
Epoch 5/49
Epoch 6/49
Epoch 7/49
Epoch 8/49
Epoch 9/49
Epoch 10/49
Epo

[32m[I 2021-08-15 23:19:31,031][0m Trial 9 finished with value: 4351368.467343371 and parameters: {'hidden_units': 11, 'lr': 0.0002207810122763302, 'epochs': 49}. Best is trial 4 with value: 400518.0905780176.[0m


## Random Forest

In [75]:
def random_forest(trial):
    train, test, val = get_x_data()
    train_labels, val_labels, test_labels = get_y_data()
    
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 150),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 5),
    }
    
    warnings.filterwarnings("ignore")
    np.random.seed(40)
    
    with mlflow.start_run(run_name="Random Forest"):
        rf = RandomForestRegressor(
            max_depth=params["max_depth"],
            n_estimators=params["n_estimators"],
            min_samples_split=params["min_samples_split"],
            random_state=0
        )
        rf.fit(train, train_labels)
        
        predictions = rf.predict(val)
        
        (rmse, mae, r2) = eval_metrics(val_labels, predictions)
        
        print("Random Forest model")
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)
        
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)
        mlflow.log_params(trial.params)
        
        gc.collect()
        
        return rmse

### Using optuna to optimize Random Forest's hyperparameters

In [76]:
study = optuna.create_study()
study.optimize(random_forest, n_trials=10)

[32m[I 2021-08-15 23:20:15,810][0m A new study created in memory with name: no-name-da75c8d1-75bb-4408-bc26-0aefd78c9d87[0m


Random Forest model
  RMSE: 15.620222965775662
  MAE: 9.476533890732815
  R2: -0.15950051164022394


[32m[I 2021-08-15 23:20:18,390][0m Trial 0 finished with value: 15.620222965775662 and parameters: {'n_estimators': 126, 'max_depth': 10, 'min_samples_split': 5}. Best is trial 0 with value: 15.620222965775662.[0m
[32m[I 2021-08-15 23:20:19,991][0m Trial 1 finished with value: 15.144502563772498 and parameters: {'n_estimators': 53, 'max_depth': 5, 'min_samples_split': 4}. Best is trial 1 with value: 15.144502563772498.[0m


Random Forest model
  RMSE: 15.144502563772498
  MAE: 9.326731791358458
  R2: -0.0899498430691672


[32m[I 2021-08-15 23:20:22,371][0m Trial 2 finished with value: 15.583823974990308 and parameters: {'n_estimators': 121, 'max_depth': 9, 'min_samples_split': 2}. Best is trial 1 with value: 15.144502563772498.[0m


Random Forest model
  RMSE: 15.583823974990308
  MAE: 9.444342100634671
  R2: -0.15410296067238027


[32m[I 2021-08-15 23:20:25,325][0m Trial 3 finished with value: 15.523249094305234 and parameters: {'n_estimators': 136, 'max_depth': 8, 'min_samples_split': 3}. Best is trial 1 with value: 15.144502563772498.[0m


Random Forest model
  RMSE: 15.523249094305234
  MAE: 9.41808969746865
  R2: -0.14514831916709037


[32m[I 2021-08-15 23:20:28,067][0m Trial 4 finished with value: 15.604549195735412 and parameters: {'n_estimators': 113, 'max_depth': 9, 'min_samples_split': 5}. Best is trial 1 with value: 15.144502563772498.[0m


Random Forest model
  RMSE: 15.604549195735412
  MAE: 9.453054180235778
  R2: -0.15717472840418956


[32m[I 2021-08-15 23:20:30,048][0m Trial 5 finished with value: 15.201076053667675 and parameters: {'n_estimators': 124, 'max_depth': 6, 'min_samples_split': 5}. Best is trial 1 with value: 15.144502563772498.[0m


Random Forest model
  RMSE: 15.201076053667675
  MAE: 9.290590158400661
  R2: -0.09810824091884562


[32m[I 2021-08-15 23:20:32,525][0m Trial 6 finished with value: 15.579354482519875 and parameters: {'n_estimators': 132, 'max_depth': 9, 'min_samples_split': 4}. Best is trial 1 with value: 15.144502563772498.[0m


Random Forest model
  RMSE: 15.579354482519875
  MAE: 9.44939450851409
  R2: -0.15344105448056888


[32m[I 2021-08-15 23:20:35,295][0m Trial 7 finished with value: 15.511107924459566 and parameters: {'n_estimators': 129, 'max_depth': 8, 'min_samples_split': 2}. Best is trial 1 with value: 15.144502563772498.[0m


Random Forest model
  RMSE: 15.511107924459566
  MAE: 9.410296689013812
  R2: -0.14335771424653876


[32m[I 2021-08-15 23:20:37,005][0m Trial 8 finished with value: 15.564385324854749 and parameters: {'n_estimators': 93, 'max_depth': 8, 'min_samples_split': 2}. Best is trial 1 with value: 15.144502563772498.[0m


Random Forest model
  RMSE: 15.564385324854749
  MAE: 9.42176292040156
  R2: -0.15122559091557242


[32m[I 2021-08-15 23:20:39,238][0m Trial 9 finished with value: 15.394790328522102 and parameters: {'n_estimators': 141, 'max_depth': 7, 'min_samples_split': 4}. Best is trial 1 with value: 15.144502563772498.[0m


Random Forest model
  RMSE: 15.394790328522102
  MAE: 9.36167843320635
  R2: -0.1262739615836599


## Gradient Boosting with LightGBM

In [77]:
def gradient_boosting(trial):
    train, test, val = get_x_data()
    train_labels, val_labels, test_labels = get_y_data()
    
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 150),
        "num_leaves": trial.suggest_int("num_leaves", 25, 35),
        "max_depth": trial.suggest_int("max_depth", 3, 10)
    }
    
    warnings.filterwarnings("ignore")
    np.random.seed(40)
    
    with mlflow.start_run(run_name="Gradient Boosting"):
        model = LGBMRegressor(
            max_depth=params["max_depth"],
            n_estimators=params["n_estimators"],
            num_leaves=params["num_leaves"],
        )
        model.fit(train, train_labels)
        
        predictions = model.predict(test)
        print('Prediction: %.3f' % predictions[0])
        
        (rmse, mae, r2) = eval_metrics(val_labels, predictions)

        print("LGBM model")
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        # Log mlflow attributes for mlflow UI
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)
        mlflow.log_params(trial.params)
        mlflow.set_tags(
            {
                "estimator_class":"LightGBM",
                "estimator_name":"Gradient Boosting"
            }
        )
        mlflow.sklearn.log_model(model, "model")
        
        gc.collect()
        
        return rmse

### Using optuna to optimize Gradient Boosting's hyperparameters

In [78]:
study = optuna.create_study()
study.optimize(gradient_boosting, n_trials=10)

[32m[I 2021-08-15 23:21:35,704][0m A new study created in memory with name: no-name-d3440b49-befb-4233-846f-5ead8bf6fead[0m
[33m[W 2021-08-15 23:21:35,857][0m Trial 0 failed because of the following error: PermissionError(13, 'Permission denied')
Traceback (most recent call last):
  File "/Users/gabriela/.pyenv/versions/3.7.7/lib/python3.7/site-packages/optuna/study/_optimize.py", line 213, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-77-6423dee78832>", line 43, in gradient_boosting
    mlflow.sklearn.log_model(model, "model")
  File "/Users/gabriela/.pyenv/versions/3.7.7/lib/python3.7/site-packages/mlflow/sklearn/__init__.py", line 319, in log_model
    await_registration_for=await_registration_for,
  File "/Users/gabriela/.pyenv/versions/3.7.7/lib/python3.7/site-packages/mlflow/models/model.py", line 188, in log
    mlflow.tracking.fluent.log_artifacts(local_path, artifact_path)
  File "/Users/gabriela/.pyenv/versions/3.7.7/lib/python3.7/site-packages/m

Prediction: 18.843
LGBM model
  RMSE: 15.203447732259674
  MAE: 9.947592102003982
  R2: -0.09845092231379082


PermissionError: [Errno 13] Permission denied: '/gabriela'