In [1]:
library(tidyverse)
library(tidymodels)
library(bonsai)
tidymodels_prefer()

# Predictor column names in the competition data (kept for reference).
x_feats <- c(
    "X_Minimum", "X_Maximum", "Y_Minimum", "Y_Maximum", "Pixels_Areas",
    "X_Perimeter", "Y_Perimeter", "Sum_of_Luminosity",
    "Minimum_of_Luminosity", "Maximum_of_Luminosity", "Length_of_Conveyer",
    "TypeOfSteel_A300", "TypeOfSteel_A400", "Steel_Plate_Thickness",
    "Edges_Index", "Empty_Index", "Square_Index", "Outside_X_Index",
    "Edges_X_Index", "Edges_Y_Index", "Outside_Global_Index", "LogOfAreas",
    "Log_X_Index", "Log_Y_Index", "Orientation_Index", "Luminosity_Index",
    "SigmoidOfAreas"
)
# Fault indicator columns; these are collapsed into a single factor below.
Y_feats <- c(
    "Pastry", "Z_Scratch", "K_Scatch", "Stains", "Dirtiness", "Bumps",
    "Other_Faults"
)

# Reshape the indicator columns into one `faults` factor column.
# Only indicator cells equal to 1 are kept, so a row with no positive
# indicator is dropped and a row with several positive indicators appears
# once per fault.
train_df <- read_csv("/kaggle/input/playground-series-s4e3/train.csv") %>%
    # all_of() makes it explicit that Y_feats is an external character vector,
    # not a column name (bare external vectors are deprecated in tidyselect).
    pivot_longer(cols = all_of(Y_feats), names_to = "faults") %>%
    filter(value == 1) %>%
    select(-value) %>%
    mutate(faults = as.factor(faults))

test_df <- read_csv("/kaggle/input/playground-series-s4e3/test.csv")
glimpse(train_df)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.4.4     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.1.1 ──

[32m✔[39m [34mbroom       [39m 1.0.5     [32m✔[39m [34mrsample     [39

Rows: 18,422
Columns: 29
$ id                    [3m[90m<dbl>[39m[23m 0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 13, 14, 15, 1…
$ X_Minimum             [3m[90m<dbl>[39m[23m 584, 808, 39, 781, 1540, 1009, 596, 1673, 893, 1…
$ X_Maximum             [3m[90m<dbl>[39m[23m 590, 816, 192, 789, 1560, 1033, 607, 1687, 907, …
$ Y_Minimum             [3m[90m<dbl>[39m[23m 909972, 728350, 2212076, 3353146, 618457, 899231…
$ Y_Maximum             [3m[90m<dbl>[39m[23m 909977, 728372, 2212144, 3353173, 618502, 899307…
$ Pixels_Areas          [3m[90m<dbl>[39m[23m 16, 433, 11388, 210, 521, 409, 204, 571, 54, 289…
$ X_Perimeter           [3m[90m<dbl>[39m[23m 8, 20, 705, 16, 72, 22, 21, 38, 6, 37, 27, 14, 3…
$ Y_Perimeter           [3m[90m<dbl>[39m[23m 5, 54, 420, 29, 67, 26, 21, 57, 15, 42, 29, 13, …
$ Sum_of_Luminosity     [3m[90m<dbl>[39m[23m 2274, 44478, 1311391, 3202, 48231, 47513, 22478,…
$ Minimum_of_Luminosity [3m[90m<dbl>[39m[23m 113, 70, 29, 114, 82, 86, 89, 77, 8

In [2]:
# LightGBM boosted-tree classifier; only loss_reduction is marked for tuning.
# Other candidates currently left untuned (disabled):
#         trees = tune(),
#         tree_depth = tune(),
#         learn_rate = tune(),
#         min_n = tune(),
tune_spec <- boost_tree(
    mode = "classification",
    loss_reduction = tune()
) %>%
    set_engine("lightgbm")

In [3]:
# Regular grid with 3 levels over loss_reduction, the only tuned parameter.
# Disabled parameter ranges, matching the disabled tune() entries:
#     trees(), tree_depth(), learn_rate(), min_n()
grid <- grid_regular(loss_reduction(), levels = 3)

In [4]:
# Fix the RNG seed so the 3-fold split of the training data is reproducible.
set.seed(1234)
folds <- train_df %>%
    vfold_cv(v = 3)

In [5]:
# Workflow: predict `faults` from every column except `id`, using tune_spec.
wf <- workflow() %>%
    add_formula(faults ~ . - id) %>%
    add_model(tune_spec)

In [6]:
# Evaluate every grid candidate on the cross-validation folds.
res <- tune_grid(wf, resamples = folds, grid = grid)

In [7]:
# Pick the candidate with the best ROC AUC across the resamples.
# `metric` is passed by name: newer tune releases require the dots to be
# empty, so a positional metric is no longer accepted.
best <- res %>%
  select_best(metric = "roc_auc")

best

loss_reduction,.config
<dbl>,<chr>
5.623413e-05,Preprocessor1_Model2


In [8]:
# Plug the selected hyperparameter value into the workflow.
final_wf <- finalize_workflow(wf, best)

In [9]:
# Fit the finalized workflow on the full training data.
final_fit <- fit(final_wf, data = train_df)

In [10]:
# Class probabilities for the test set, one column per fault level.
p <- final_fit %>%
    predict(new_data = test_df, type = "prob")

In [11]:
# Strip the ".pred_" prefix so the columns carry the plain fault names.
# The pattern is a regex: the dot is escaped and the match is anchored to the
# start, so only the literal leading prefix is removed.
names(p) <- str_replace(names(p), "^\\.pred_", "")
p %>% glimpse

Rows: 12,814
Columns: 7
$ Bumps        [3m[90m<dbl>[39m[23m 0.1594106955, 0.1402409796, 0.3039801090, 0.4148920324, 0…
$ Dirtiness    [3m[90m<dbl>[39m[23m 0.0066266866, 0.3815004536, 0.0053476400, 0.0066008073, 0…
$ K_Scatch     [3m[90m<dbl>[39m[23m 0.006601115, 0.009488231, 0.051726131, 0.001156029, 0.001…
$ Other_Faults [3m[90m<dbl>[39m[23m 0.27186119, 0.21413990, 0.58094457, 0.44675154, 0.3558481…
$ Pastry       [3m[90m<dbl>[39m[23m 0.5543466967, 0.2469835466, 0.0019098005, 0.1290015051, 0…
$ Stains       [3m[90m<dbl>[39m[23m 1.655978e-04, 1.667007e-04, 5.134799e-04, 3.174355e-04, 7…
$ Z_Scratch    [3m[90m<dbl>[39m[23m 0.0009880220, 0.0074801861, 0.0555782659, 0.0012806471, 0…


In [12]:
# Write the submission with `id` first, followed by the fault probability
# columns in the order listed in Y_feats. all_of() errors if any expected
# fault column is missing from the predictions.
# NOTE(review): presumably this matches the competition's sample submission
# layout; confirm against sample_submission.csv.
p %>%
    mutate(id = test_df$id) %>%
    select(id, all_of(Y_feats)) %>%
    write_csv("submission.csv")