In [None]:
library(tidyverse)
library(tidymodels)
library(vroom)
library(embed)
library(bonsai)
library(lightgbm)

# Read the competition's training and test sets from the Kaggle input folder.
train <- vroom(file = "/kaggle/input/allstate-claims-severity/train.csv")
test <- vroom(file = "/kaggle/input/allstate-claims-severity/test.csv")


## Build Recipe
After loading the libraries and necessary data, we need to build our recipe. I removed ID, combined categorical variables that were too infrequent, target encoded categorical variables, removed variables that were highly correlated with each other, normalized numeric variables, and removed any variables that had zero variance.

In [None]:
# Preprocessing recipe: predict `loss` from every other column of `train`.
# Steps are applied in the order they are listed.
my_recipe <- recipe(loss ~ ., data = train) %>%
  # Drop the row identifier; it is not predictive
  step_rm(id) %>%
  # Combine categorical levels that are too infrequent (threshold 0.001)
  step_other(all_nominal_predictors(), threshold = 0.001) %>%
  # Target-encode the categorical predictors against `loss`
  step_lencode_mixed(all_nominal_predictors(), outcome = vars(loss)) %>%
  # Remove constant predictors BEFORE the correlation filter and scaling:
  # a zero-variance column has no defined correlation and would be divided
  # by a standard deviation of zero when normalized
  step_zv(all_predictors()) %>%
  # Remove predictors that are highly correlated with other predictors
  step_corr(all_numeric_predictors(), threshold = 0.6) %>%
  # Normalize the remaining numeric predictors
  step_normalize(all_numeric_predictors())

## Find Optimal Tuning Parameters
After building our recipe, we need to build our model. For the boosted tree we tune three parameters: tree depth, number of trees, and the learn rate. We will tune each of these parameters using cross-validation. The engine is LightGBM (you can also use XGBoost) and the mode is regression. We are optimizing our tuning parameters using Mean Absolute Error (MAE), because that is the metric the competition uses to score models.

In [None]:
# Boosted-tree regression spec on the LightGBM engine. The three
# hyperparameters are left as tune() placeholders for the grid search.
boost_model <- boost_tree(
  trees = tune(),
  tree_depth = tune(),
  learn_rate = tune()
) %>%
  set_engine("lightgbm") %>%
  set_mode("regression")

# Workflow that bundles the model spec with the preprocessing recipe.
Boost_wf <- workflow() %>%
  add_model(boost_model) %>%
  add_recipe(my_recipe)

## CV tune, finalize and predict here and save results
## Grid of values to tune over.
## grid_regular() crosses `levels` values of every parameter, so three
## parameters at 3 levels give 3^3 = 27 candidate combinations.
tuning_grid <- grid_regular(tree_depth(),
                            trees(),
                            learn_rate(),
                            levels = 3)

## Split data for 5-fold cross-validation (a single repeat).
## Fix the RNG seed first so the fold assignment, and therefore the
## tuning result, is reproducible between runs.
set.seed(2024)
folds <- vfold_cv(train, v = 5, repeats = 1)

## Run the CV: fit every grid candidate on every fold.
## Only MAE is computed; select_best() below looks this metric up by name,
## so it must be part of the metric set.
CV_results <- Boost_wf %>%
  tune_grid(resamples = folds,
            grid = tuning_grid,
            metrics = metric_set(mae))

# Find the best tuning parameters (lowest cross-validated MAE).
# `metric` is passed by name: recent tune releases reject a positional
# metric with "`...` must be empty".
bestTune <- CV_results %>%
  select_best(metric = "mae")



## Make Predictions
Now that we have the optimal tuning parameters in "bestTune", we can feed that into our final workflow to generate predictions.

In [None]:
# Plug the selected hyperparameters into the workflow, then refit it on the
# full training set.
final_wf <- fit(
  finalize_workflow(Boost_wf, bestTune),
  data = train
)

# Predict on the test set with the fitted workflow.
Allstate_preds <- final_wf %>%
  predict(new_data = test)

# Format the submission: one row per test id with its predicted loss.
# `.pred` is used as-is; no outcome transformation is applied in the
# pipeline shown here, so there is nothing to back-transform.
submission <- Allstate_preds %>%
  mutate(id = test$id, loss = .pred) %>%
  select(id, loss) # select by name rather than by column position

# Write the submission as a comma-delimited file.
vroom_write(submission, "submission.csv", delim = ",")