In [1]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.metrics import mean_absolute_error

In [2]:
# Load the beers table from the project's data directory (path is relative to this notebook).
beers_csv_path = '../data/beers.csv'
df = pd.read_csv(beers_csv_path)

In [3]:
# `ibu` is the prediction target, so rows without it cannot be used for training or evaluation.
# Reassigning (instead of mutating in place) keeps the cell safe to re-run.
df = df.dropna(subset=['ibu'])

In [4]:
# Count the missing values that remain in each column after dropping rows without `ibu`.
null_mask = df.isna()
null_mask.sum()

id           0
name         0
abv          0
ibu          0
target_fg    0
target_og    0
ebc          4
srm          4
ph           1
dtype: int64

# Treinamento do Modelo

Neste notebook iremos realizar o treinamento de um modelo para prever o **ibu** das cervejas.

In [5]:
# Target: the `ibu` column.
# Features: every other column except `id` and `name`, which are excluded from the model inputs.
target_col = 'ibu'
y = df[target_col]
X = df.drop(columns=['id', 'name', target_col])

In [6]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,abv,target_fg,target_og,ebc,srm,ph
0,4.5,1010.0,1044.0,20.0,10.0,4.4
1,4.1,1010.0,1041.7,15.0,15.0,4.4
2,4.2,1007.0,1040.0,8.0,4.0,3.2
3,6.3,1012.0,1060.0,30.0,15.0,4.4
4,7.2,1027.0,1069.0,10.0,5.0,4.4


In [7]:
# Preview the first five target values.
y.head(n=5)

0    60.0
1    41.5
2     8.0
3    55.0
4    59.0
Name: ibu, dtype: float64

### Separação entre treino e teste.

In [8]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [9]:
# Sanity check: shapes of the train/test features and targets, in that order.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((214, 6), (106, 6), (214,), (106,))

### Preenchimento dos valores faltantes

In [10]:
# Replace every NaN with the sentinel constant -99.
# The imputer is fitted on the training split only, so nothing is learned from the test rows.
imp_const = SimpleImputer(
    missing_values=np.nan,
    strategy='constant',
    fill_value=-99,
)
imp_const.fit(X_train)
X_train_transformed = imp_const.transform(X_train)

In [11]:
# Apply the already-fitted imputer to the test features (transform only, no refit).
X_test_transformed = imp_const.transform(X=X_test)

### Treinamento do modelo

Escolhi modelos baseados em árvores, pois, além de geralmente terem melhores resultados, também são robustos a outliers e não exigem normalização das variáveis.

#### Random Forest

In [12]:
# Random forest: 100 trees, each at most 5 levels deep, trained on all available cores.
rf_params = dict(
    n_estimators=100,
    max_depth=5,
    n_jobs=-1,
    random_state=42,
)
rf = RandomForestRegressor(**rf_params)
rf.fit(X_train_transformed, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=5, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [13]:
# Predict `ibu` for the imputed test features with the fitted random forest.
y_pred_rf = rf.predict(X=X_test_transformed)

In [14]:
# Mean absolute error of the random forest on the held-out test split.
mae_rf = mean_absolute_error(y_true=y_test, y_pred=y_pred_rf)
print("O erro Absoluto médio do modelo é:", mae_rf)

O erro Absoluto médio do modelo é: 21.516850808043003


#### XGB

In [15]:
# XGBoost regressor configured with the same tree count, depth limit and seed as the random forest.
xgb_params = dict(
    n_estimators=100,
    max_depth=5,
    n_jobs=-1,
    random_state=42,
)
xgb = XGBRegressor(**xgb_params)
xgb.fit(X_train_transformed, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=5,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=-1, num_parallel_tree=1,
             objective='reg:squarederror', random_state=42, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [16]:
# Predict `ibu` for the imputed test features with the fitted XGBoost model.
y_pred_xgb = xgb.predict(X_test_transformed)

In [17]:
# Mean absolute error of the XGBoost model on the held-out test split.
mae_xgb = mean_absolute_error(y_true=y_test, y_pred=y_pred_xgb)
print("O erro Absoluto médio do modelo é:", mae_xgb)

O erro Absoluto médio do modelo é: 20.478275456518496


#### Catboost

In [18]:
# CatBoost regressor configured with the same tree count, depth limit and seed as the other models.
cbr_params = dict(
    n_estimators=100,
    max_depth=5,
    random_state=42,
)
cbr = CatBoostRegressor(**cbr_params)
# verbose=False suppresses the per-iteration training log.
cbr.fit(X_train_transformed, y_train, verbose=False)

<catboost.core.CatBoostRegressor at 0x7f45f2cbe910>

In [19]:
# Predict `ibu` for the imputed test features with the fitted CatBoost model.
y_pred_cbr = cbr.predict(X_test_transformed)

In [20]:
# Mean absolute error of the CatBoost model on the held-out test split.
mae_cbr = mean_absolute_error(y_true=y_test, y_pred=y_pred_cbr)
print("O erro Absoluto médio do modelo é:", mae_cbr)

O erro Absoluto médio do modelo é: 21.289806123130475


## Conclusão

In [21]:
# One-row comparison table: test-set MAE of each model, side by side.
mae_by_model = {
    'MAE Random Forest': mae_rf,
    'MAE XGBoost': mae_xgb,
    'MAE Catboost': mae_cbr,
}
pd.DataFrame([mae_by_model])

Unnamed: 0,MAE Random Forest,MAE XGBoost,MAE Catboost
0,21.516851,20.478275,21.289806


Com isso, podemos concluir que o modelo vencedor é o **XGBoost**, pois possui o menor erro absoluto médio (MAE) no conjunto de teste.