In [None]:
library(tidyverse)
library(tidymodels)
library(themis)
library(dplyr)
# Limit how many rows the notebook prints for a data frame.
# The option read by the `repr` package is `repr.matrix.max.rows`; the previous
# name (`repr.matrix.rows`) is not a recognised option and was silently ignored.
options(repr.matrix.max.rows = 5)

# Remote locations of the two raw CSV files used by this analysis.
sessions_url <- "https://raw.githubusercontent.com/elilyliu/Group-4-project/refs/heads/main/sessions.csv"
players_url <- "https://raw.githubusercontent.com/elilyliu/Group-4-project/refs/heads/main/players.csv"

# Read the player table and rename its `Age` column to lower-case `age`.
players <- rename(read_csv(players_url), age = Age)

# Read the session table unchanged.
sessions <- read_csv(sessions_url)

# Display both tables in the notebook.
players
sessions

In [None]:
# Clean the player table and derive the response variable.
#   * `experience` -> factor with the explicit level order listed below.
#   * `name`, `gender` -> factors (levels in order of first appearance).
#   * `contributor_type` -> "High" (> 3 played hours), "Medium" (> 1 and <= 3),
#     or "Low" (<= 1), stored as a factor.
#   * Rows containing an NA in any column are dropped.
players <- players |>
    mutate(
        experience = factor(
            experience,
            levels = c("Beginner", "Amateur", "Regular", "Pro", "Veteran")
        ),
        name = as_factor(name),
        # Fix: convert `gender` itself. It was previously built from `name`,
        # which replaced every gender value with the player's name.
        # (Assumes the raw table has a `gender` column.)
        gender = as_factor(gender),
        # Conditions are evaluated in order; a missing `played_hours` matches
        # none of them and yields NA, which drop_na() removes below.
        contributor_type = case_when(
            played_hours > 3 ~ "High",
            played_hours <= 3 & played_hours > 1 ~ "Medium",
            played_hours <= 1 ~ "Low"
        ),
        contributor_type = factor(contributor_type)
    ) |>
    drop_na()

# Fix: the stray `session_counts` reference was removed; no object with that
# name is created anywhere above, so evaluating it stopped the cell with an
# "object not found" error.
players

In [None]:
# Keep only the modelling columns: the two predictors and the response.
# `experience` is carried over under the name `experience_label` and then
# replaced by the numeric code of its factor level.
selected_players <- players |>
  select(experience_label = experience, age, contributor_type) |>
  mutate(experience_label = as.numeric(experience_label))

selected_players

In [None]:
# Tune the number of neighbours (K) of a K-nearest-neighbours classifier that
# predicts `contributor_type` from the remaining columns of `selected_players`.

# Fix: the train/test split, the cross-validation folds and the upsampling all
# draw random numbers. Seeding the generator makes the accuracies and the plot
# below reproducible from run to run.
set.seed(2024)

# 70/30 train/test split, stratified on the response.
player_split <- initial_split(selected_players, prop = 0.7, strata = contributor_type)
player_train <- training(player_split)
player_test <- testing(player_split)

# Preprocessing: normalise every predictor, then upsample the response classes.
# `skip = TRUE` is passed so that the upsampling step is skipped when the
# recipe is applied to new data (per the recipes `skip` argument).
# NOTE(review): `over_ratio = 2` requests a class ratio above 1, i.e. the other
# classes may be upsampled beyond the size of the largest class -- confirm this
# is intended rather than `over_ratio = 1`.
player_recipe <- recipe(contributor_type ~ ., data = player_train) |>
    step_normalize(all_predictors()) |>
    step_upsample(contributor_type, over_ratio = 2, skip = TRUE)

# KNN classifier with unweighted ("rectangular") votes; K is left to be tuned.
knn_tune <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
    set_engine("kknn") |>
    set_mode("classification")

# Only 3 folds are used because the training set is small.
player_vfold <- vfold_cv(player_train, v = 3, strata = contributor_type)

# Candidate values of K: 1, 2, ..., 10.
k_vals <- tibble(neighbors = seq(from = 1, to = 10, by = 1))

# Evaluate every candidate K on the folds and keep only the accuracy rows.
knn_results <- workflow() |>
    add_recipe(player_recipe) |>
    add_model(knn_tune) |>
    tune_grid(resamples = player_vfold, grid = k_vals) |>
    collect_metrics() |>
    filter(.metric == "accuracy")

# Mean cross-validated accuracy as a function of K.
cross_val_plot <- ggplot(knn_results, aes(x = neighbors, y = mean)) +
    geom_point() +
    geom_line() +
    labs(
        x = "Number of neighbors (K)",
        y = "Accuracy",
        title = "Accuracy vs Number of Neighbors"
    )

knn_results

cross_val_plot