In [7]:
# Load the packages used in this notebook: tidyverse supplies the data
# reading/wrangling verbs and tidymodels the modelling workflow below.
library(tidyverse)
library(repr)
library(tidymodels)
library(RColorBrewer)
library(cowplot)
# Limit how many rows of a table are rendered in notebook output.
options(repr.matrix.max.rows = 6)

# Fix the RNG seed so the random steps further down (train/test split and
# cross-validation fold assignment) are repeatable when run top to bottom.
set.seed(1)

In [2]:
# Read the diamonds data from the project_proposal GitHub repository and
# keep only the columns used for modelling.
url <- "https://raw.githubusercontent.com/cpan0/project_proposal/main/diamonds.csv"
diamonds <- read_csv(url) %>%
  # response (price) plus the predictors of interest
  select(carat, cut, color, clarity, price) %>%
  mutate(
    # level order for cut runs from "Ideal" down to "Fair"
    cut = factor(
      cut,
      levels = c("Ideal", "Premium", "Very Good", "Good", "Fair")
    ),
    # level order for clarity runs from "IF" (internally flawless)
    # down to "I1" (imperfect)
    clarity = factor(
      clarity,
      levels = c("IF", "VVS1", "VVS2", "VS1", "VS2", "SI1", "SI2", "I1")
    )
  )
head(diamonds)

Parsed with column specification:
cols(
  carat = [32mcol_double()[39m,
  cut = [31mcol_character()[39m,
  color = [31mcol_character()[39m,
  clarity = [31mcol_character()[39m,
  depth = [32mcol_double()[39m,
  table = [32mcol_double()[39m,
  x = [32mcol_double()[39m,
  y = [32mcol_double()[39m,
  z = [32mcol_double()[39m,
  price = [32mcol_double()[39m
)



carat,cut,color,clarity,price
<dbl>,<fct>,<chr>,<fct>,<dbl>
0.23,Ideal,E,SI2,326
0.21,Premium,E,SI1,326
0.23,Good,E,VS1,327
0.29,Premium,I,VS2,334
0.31,Good,J,SI2,335
0.24,Very Good,J,VVS2,336


In [3]:
# Split the data 75% / 25% into training and test sets, stratifying on
# price so both sets are sampled across the price range.
diamonds_split <- diamonds %>%
  initial_split(prop = 0.75, strata = price)
diamonds_train <- diamonds_split %>% training()
diamonds_test <- diamonds_split %>% testing()

In [30]:
# Cross-validation setup for choosing the number of neighbours (K)

# Preprocessing recipe: price is the response and every other column is a
# predictor; carat is scaled and then centred.
diamonds_recipe <- recipe(price ~ ., data = diamonds_train) %>%
  step_scale(carat) %>%
  step_center(carat)

# Model specification: K-nearest-neighbours regression with rectangular
# weights; `neighbors` is left as a tuning parameter.
diamonds_spec <- nearest_neighbor(
  neighbors = tune(),
  weight_func = "rectangular"
) %>%
  set_engine("kknn") %>%
  set_mode("regression")

# 5-fold cross-validation on the training set, stratified on price
diamonds_vfold <- vfold_cv(diamonds_train, v = 5, strata = price)

# Bundle model and preprocessing into a single workflow
diamonds_workflow <- workflow() %>%
  add_model(diamonds_spec) %>%
  add_recipe(diamonds_recipe)



In [14]:
# Candidate values of K: 1, 11, 21, ..., 91
gridvals <- tibble(neighbors = seq(from = 1, to = 100, by = 10))

# Fit the workflow for every candidate K on each cross-validation fold and
# collect the per-K summary metrics.
diamonds_results <- diamonds_workflow %>%
  tune_grid(resamples = diamonds_vfold, grid = gridvals) %>%
  collect_metrics()

# Choose the K with the smallest mean cross-validated RMSE. This is derived
# from the tuning results instead of being hard-coded, so it stays correct
# if the grid, the seed, or the data change.
k_min <- diamonds_results %>%
  filter(.metric == "rmse") %>%
  arrange(mean) %>%
  slice(1) %>%
  pull(neighbors)


In [15]:
# Keep only the RMSE rows and sort ascending by mean RMSE, so the
# best-performing K is listed first.
diamonds_results <- arrange(
  filter(diamonds_results, .metric == "rmse"),
  mean
)
diamonds_results

neighbors,.metric,.estimator,mean,n,std_err,.config
<dbl>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
4,rmse,standard,713.9447,5,10.023857,Model04
5,rmse,standard,717.6793,5,10.194558,Model05
3,rmse,standard,724.6521,5,9.437013,Model03
⋮,⋮,⋮,⋮,⋮,⋮,⋮
13,rmse,standard,845.2177,5,14.52978,Model13
14,rmse,standard,860.6817,5,14.39916,Model14
15,rmse,standard,876.8761,5,13.75277,Model15


In [26]:
#duplicated(diamonds)

In [28]:
#diamonds[!duplicated(diamonds), ]  # trailing comma selects rows; without it the logical vector is treated as a column subscript

ERROR: Error: Must subset columns with a valid subscript vector.
[34mℹ[39m Logical subscripts must match the size of the indexed input.
[31m✖[39m Input has size 5 but subscript `!duplicated(diamonds)` has size 53940.
