# DSCI 100 Review

In [1]:
# Core tidyverse packages (dplyr, tidyr, ggplot2, ...) for data wrangling
library(tidyverse)
# tidymodels meta-package: attaches the modelling packages (rsample, broom, ...)
library(tidymodels)

# presumably the source of the `penguins` data set used below -- confirm
library(palmerpenguins)
# Display option: appears to cap the rows shown for printed tables at 6
options(repr.matrix.max.rows = 6)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──

✔ broom        1.0.6     ✔ rsample

In [2]:
# Step 1: Remove every row that contains a missing value
penguins_clean <- drop_na(penguins)

# Step 2: Hold out 20% of the rows for testing.
# The seed is set immediately before the split so the partition is reproducible.
set.seed(123)
penguin_split <- initial_split(penguins_clean, prop = 0.8)
train_data <- training(penguin_split)
test_data <- testing(penguin_split)

# Step 3: Recipe -- model species from the four body measurements and
# normalize every predictor
penguin_recipe <- recipe(
  species ~ bill_length_mm + bill_depth_mm + flipper_length_mm + body_mass_g,
  data = train_data
) |>
  step_normalize(all_predictors())

# Step 4: K-nearest-neighbours classifier with K = 5, backed by kknn
knn_spec <- nearest_neighbor(neighbors = 5) |>
  set_mode("classification") |>
  set_engine("kknn")

# Step 5: Bundle the preprocessing recipe and the model specification
penguin_workflow <- workflow() |>
  add_recipe(penguin_recipe) |>
  add_model(knn_spec)

# Step 6: Train the workflow on the training rows only
knn_fit <- fit(penguin_workflow, data = train_data)

# Step 7: Predict the held-out rows; predictions come first, followed by the
# original test columns so the true species sits alongside `.pred_class`
predictions <- bind_cols(predict(knn_fit, test_data), test_data)

# Step 8: Compare predicted and true species on the test set
predictions |>
  metrics(truth = species, estimate = .pred_class)

.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
accuracy,multiclass,0.9850746
kap,multiclass,0.9770312
