In [1]:
library(tidyverse)
library(tidymodels)
library(readxl)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.3     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.5.0

“package ‘ggplot2’ was built under R version 4.0.1”
“package ‘tibble’ was built under R version 4.0.2”
“package ‘tidyr’ was built under R version 4.0.2”
“package ‘dplyr’ was built under R version 4.0.2”
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

“package ‘tidymodels’ was built under R version 4.0.2”
── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 0.1.1 ──

[32m✔

In [2]:
raisin_data <- read_csv("https://github.com/dylankim990/DSCI100_Group_2/raw/main/data/Raisin_Dataset.csv") %>% mutate(Class = as_factor(Class))

Parsed with column specification:
cols(
  Area = [32mcol_double()[39m,
  MajorAxisLength = [32mcol_double()[39m,
  MinorAxisLength = [32mcol_double()[39m,
  Eccentricity = [32mcol_double()[39m,
  ConvexArea = [32mcol_double()[39m,
  Extent = [32mcol_double()[39m,
  Perimeter = [32mcol_double()[39m,
  Class = [31mcol_character()[39m
)



In [4]:
set.seed(5805)
raisin_split <- initial_split(raisin_data, prop = 0.75 ,strata = Class)
raisin_training <- training(raisin_split)
raisin_testing <- testing(raisin_split)

#choose columns first? 
#choose K; cross-validation
#choose predictor 

In [31]:
raisin_subset <- raisin_training %>%
  select(Area,
         MajorAxisLength, 
         MinorAxisLength, 
         Eccentricity, 
         ConvexArea, 
         Extent, 
         Perimeter, 
         Class)

set.seed(5805)

names <- colnames(select(raisin_subset, -Class))

# creating an empty tibble which will store the accuracy results
accuracies <- tibble(size = integer(), 
                     model_string = character(), 
                     accuracy = numeric(),
                     neighbor = numeric())

knn_spec <- nearest_neighbor(weight_func = "rectangular", 
                             neighbors = tune()) %>%
     set_engine("kknn") %>%
     set_mode("classification")

# 5-fold cross-validation
raisin_vfold <- vfold_cv(raisin_subset, v = 5, strata = Class)

# total # of predictors
n_total <- length(names)

# creating to store selected predictors
selected <- c()

#loops to try all the models
# for every size from 1 to the total number of predictors
for (i in 1:n_total) {
    # for every predictor still not added yet
    accs <- list()
    models <- list()
    neighborKs <- list()

#inner loop for model string for different combinations of predictors
    for (j in 1:length(names)) {
        # create a model string for the combinations of predictors
        preds_new <- c(selected, names[[j]])
        model_string <- paste("Class", "~", paste(preds_new, collapse="+"))
        
# created a recipe from the model string
        raisin_recipe <- recipe(as.formula(model_string), 
                                data = raisin_subset) %>%
                          step_scale(all_predictors()) %>%
                          step_center(all_predictors())
        
# tune the KNN classifier with these predictors, 
# and collect the accuracy for the best K
        acc_workflow <- workflow() %>%
          add_recipe(raisin_recipe) %>%
          add_model(knn_spec) %>%
          tune_grid(resamples = raisin_vfold, grid = 10) %>%
          collect_metrics() %>%
          filter(.metric == "accuracy")
        acc <- summarize(acc_workflow, mx = max(mean))
        acc <- acc$mx %>% unlist()
        
        neighborK <- acc_workflow %>%
            arrange(mean) %>%
            slice(1)
        neighborK <- select(ks, neighbors) %>% unlist()

# adding the result to the dataframe             
        accs[[j]] <- acc
        models[[j]] <- model_string
        neighborKs[[j]] <- neighborK
    }
    jstar <- which.max(unlist(accs))
        accuracies <- accuracies %>% 
          add_row(size = i, 
                  model_string = models[[jstar]], 
                  accuracy = accs[[jstar]],
                  neighbor = neighborKs[[jstar]])
        selected <- c(selected, names[[jstar]])
        names <- names[-jstar]
}
accuracies

size,model_string,accuracy,neighbor
<int>,<chr>,<dbl>,<dbl>
1,Class ~ Perimeter,0.8507902,2
2,Class ~ Perimeter+Area,0.8597015,2
3,Class ~ Perimeter+Area+Extent,0.8610623,2
4,Class ~ Perimeter+Area+Extent+ConvexArea,0.862511,2
5,Class ~ Perimeter+Area+Extent+ConvexArea+MajorAxisLength,0.8596137,2
6,Class ~ Perimeter+Area+Extent+ConvexArea+MajorAxisLength+Eccentricity,0.8610843,2
7,Class ~ Perimeter+Area+Extent+ConvexArea+MajorAxisLength+Eccentricity+MinorAxisLength,0.8581651,2
