In [23]:
# Boosting technique

# Install tidymodels only when it is missing, so that re-running this cell
# does not download and reinstall the package every time.
if (!requireNamespace("tidymodels", quietly = TRUE)) {
  install.packages("tidymodels")
}

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



In [24]:
library(tidymodels)

In [25]:
# Load the used-car dataset; the relative path is resolved against the
# current working directory.
dados <- read.csv(file = "automoveis_usados.csv")

In [26]:
# Preview the first six rows to check column names and types.
head(dados, n = 6L)

Unnamed: 0_level_0,Estilo,Ano,Potencia_motor,Cilindros_motor,Consumo_estrada_km,Valor
Unnamed: 0_level_1,<chr>,<int>,<dbl>,<dbl>,<dbl>,<int>
1,SUV 4 portas,2005,275,6,28.96812,29695
2,seda,2016,170,4,61.15492,30495
3,minivan de passageiros,2016,248,6,45.06152,37650
4,seda,2015,138,4,57.93624,16170
5,seda,1991,162,4,32.1868,2000
6,picape cabine estendida,2012,152,4,37.01482,19299


In [27]:
# Fix the RNG seed so the random train/test partition is reproducible.
set.seed(6578)

# Put 80% of the rows in the training partition; the rest go to testing.
df_split <- dados %>%
  initial_split(prop = 0.8)

# Show the split summary (training / testing / total row counts).
df_split

<Training/Testing/Total>
<8655/2164/10819>

In [28]:
# Extract the two partitions of the split as separate data frames.
df_treino <- df_split %>% training()
df_teste <- df_split %>% testing()

In [29]:
# Seed the RNG so the fold assignment below is reproducible.
set.seed(4556)

# 5-fold cross-validation resamples built from the training partition only.
df_folds <- vfold_cv(data = df_treino, v = 5)

In [33]:
# Install the XGBoost package only when it is missing, so that re-running
# this cell does not reinstall it every time. It is attached with library()
# in the next cell.
if (!requireNamespace("xgboost", quietly = TRUE)) {
  install.packages("xgboost")
}

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



In [34]:
library(xgboost)

In [35]:
# Boosting model specification
# - boost_tree(trees = 100): boosted tree ensemble with 100 trees
#   (i.e. 100 boosting iterations)
# - set_engine("xgboost"): fit with the XGBoost engine
# - set_mode("regression"): the outcome is numeric
boosting <- boost_tree(trees = 100) %>%
  set_engine("xgboost") %>%
  set_mode("regression")

In [36]:
# Fit the boosting specification on the training partition, predicting
# Valor from Estilo, Ano and Potencia_motor.
set.seed(4556)

modelo_boosting <- fit(
  boosting,
  formula = Valor ~ Estilo + Ano + Potencia_motor,
  data = df_treino
)

In [37]:
# Display the fitted model: the underlying xgb.train() call, the parameters
# it ran with, and the per-iteration training RMSE log.
modelo_boosting

parsnip model object

##### xgb.Booster
raw: 349.6 Kb 
call:
  xgboost::xgb.train(params = list(eta = 0.3, max_depth = 6, gamma = 0, 
    colsample_bytree = 1, colsample_bynode = 1, min_child_weight = 1, 
    subsample = 1), data = x$data, nrounds = 100, watchlist = x$watchlist, 
    verbose = 0, nthread = 1, objective = "reg:squarederror")
params (as set within xgb.train):
  eta = "0.3", max_depth = "6", gamma = "0", colsample_bytree = "1", colsample_bynode = "1", min_child_weight = "1", subsample = "1", nthread = "1", objective = "reg:squarederror", validate_parameters = "TRUE"
xgb.attributes:
  niter
callbacks:
  cb.evaluation.log()
# of features: 18 
niter: 100
nfeatures : 18 
evaluation_log:
  iter training_rmse
 <num>         <num>
     1     23356.511
     2     16914.813
   ---           ---
    99      3729.702
   100      3725.600

In [38]:
# Generate predictions for the held-out test partition.
predicoes <- modelo_boosting %>%
  predict(new_data = df_teste)

In [39]:
# Append the predictions column-wise to the test rows so that the observed
# and predicted values sit side by side.
predicoes_boosting <- df_teste %>%
  bind_cols(predicoes)

In [40]:
#' Compute the default `metrics()` summary for a set of predictions.
#'
#' @param predicoes Vector of predicted values.
#' @param valores_reais Vector of observed values, aligned element by
#'   element with `predicoes`.
#' @return The result of `metrics()`, computed with the observed values as
#'   the truth column and the predictions as the estimate column.
calcular_metricas <- function(predicoes, valores_reais) {
  # Pair both vectors in one table so they can be referenced by column name.
  tibble(predicao = predicoes, real = valores_reais) %>%
    metrics(truth = real, estimate = predicao)
}

In [41]:
# Test-set performance of the boosting model
# - RMSE: root mean squared error
# - MAE: mean absolute error
# - R²: coefficient of determination
metricas_boosting <- calcular_metricas(
  predicoes = predicoes_boosting[[".pred"]],
  valores_reais = predicoes_boosting[["Valor"]]
)
metricas_boosting

.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
rmse,standard,4483.2484389
rsq,standard,0.9201857
mae,standard,3024.2435636


In [42]:
# Estimate model performance with cross-validation (resampling) over the
# folds built from the training partition.
set.seed(4456)

fits_cv <- boosting %>%
  fit_resamples(
    preprocessor = Valor ~ Estilo + Ano + Potencia_motor,
    resamples = df_folds,
    metrics = metric_set(rmse, mae, rsq)
  )

In [43]:
# Average each metric across the cross-validation folds.
media_metricas_boosting <- fits_cv %>%
  collect_metrics()
media_metricas_boosting

.metric,.estimator,mean,n,std_err,.config
<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
mae,standard,2972.4183138,5,28.596628075,pre0_mod0_post0
rmse,standard,4506.4203139,5,81.098190087,pre0_mod0_post0
rsq,standard,0.9203949,5,0.002740729,pre0_mod0_post0


In [44]:
# Exploring the impact of the hyperparameters

# Try a larger ensemble: 300 trees instead of 100.
set.seed(4456)

boosting_300 <- boost_tree(trees = 300) %>%
  set_engine("xgboost") %>%
  set_mode("regression")

fits_cv <- boosting_300 %>%
  fit_resamples(
    preprocessor = Valor ~ Estilo + Ano + Potencia_motor,
    resamples = df_folds,
    metrics = metric_set(rmse, mae, rsq)
  )

# Cross-validated averages for the 300-tree model.
# NOTE: `fits_cv` is reassigned here, replacing the 100-tree results.
media_metricas_boosting_300 <- fits_cv %>%
  collect_metrics()
media_metricas_boosting_300

.metric,.estimator,mean,n,std_err,.config
<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
mae,standard,2913.5514533,5,28.725372725,pre0_mod0_post0
rmse,standard,4487.6969849,5,79.79690809,pre0_mod0_post0
rsq,standard,0.9211871,5,0.002688116,pre0_mod0_post0


In [45]:
# Tuning the learning rate

# The learning rate controls the size of the step taken at each boosting
# iteration. Smaller values tend to generalize better but need more trees.
# The 0.2 used here is below the eta = 0.3 reported for the default model.
set.seed(4456)

boosting_final <- boost_tree(trees = 300, learn_rate = 0.2) %>%
  set_engine("xgboost") %>%
  set_mode("regression")

fits_cv <- boosting_final %>%
  fit_resamples(
    preprocessor = Valor ~ Estilo + Ano + Potencia_motor,
    resamples = df_folds,
    metrics = metric_set(rmse, mae, rsq)
  )

# Cross-validated averages for the 300-tree, learn_rate = 0.2 model.
media_metricas_boosting_final <- fits_cv %>%
  collect_metrics()
media_metricas_boosting_final

.metric,.estimator,mean,n,std_err,.config
<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
mae,standard,2899.472767,5,26.232357937,pre0_mod0_post0
rmse,standard,4454.513983,5,77.445477682,pre0_mod0_post0
rsq,standard,0.922277,5,0.002603691,pre0_mod0_post0
