# **Predicting player engagement based on age**

### Introduction

### Method

In [None]:
# Load the packages used throughout the notebook.
library(tidyverse)
library(tidymodels)
library(dplyr)
library(RColorBrewer)
# Fix the random number generator seed once, up front, so that later random
# steps in the notebook give reproducible results.
set.seed(42)

In [None]:
# Read the players data set from a commit-pinned raw GitHub URL and preview
# the first rows. Note: despite its name, `players_url` holds the loaded data
# frame, not the URL string.
players_url <- read_csv(
    file = "https://raw.githubusercontent.com/emma-chow/DSCI-Final-Project/70bbf2c6fcb0a1fd395c3b650eb82c00067f8953/players.csv"
)
players_url |>
    head()

In [None]:
# Count the missing values (NA) in every column of the raw data.
# vapply() replaces sapply() so the result is guaranteed to be an integer
# vector (one count per column, named by column) rather than a type that
# depends on the input.
players_missing <- vapply(
    players_url,
    function(column) sum(is.na(column)),
    integer(1)
)
players_missing

In [None]:
# Keep only the rows that have no missing value in any column, then show the
# column types and a preview of the cleaned data.
players_data <- drop_na(players_url)
players_data |>
    glimpse()

In [None]:
# Per-column summary of the cleaned data.
players_summary <- summary(players_data)
players_summary

In [None]:
# One-row table with the minimum, mean, median and maximum of `played_hours`.
players_hours_stats <- summarise(
    players_data,
    played_hours_min = min(played_hours),
    played_hours_mean = mean(played_hours),
    played_hours_median = median(played_hours),
    played_hours_max = max(played_hours)
)
players_hours_stats

In [None]:
# Narrow the data down to the three columns used in the rest of the analysis.
players <- select(players_data, subscribe, Age, played_hours)
players |>
    head()

In [None]:
#change name
plot_1 <- players |>
    ggplot(aes(x = Age, y = played_hours)) +
    geom_point(alpha = 0.7) +
    geom_hline(yintercept = 2, linetype = "dashed", colour = "blue") +
    geom_hline(yintercept = 15, linetype = "dashed", colour = "blue") +
    labs(x = "Age of player (years)", y = "Hours played (hours)", 
         title = "The relationship between the age of the player and playing hours", 
         subtitle = "where the hours played is split into low, medium and high by horizontal lines") +
    theme(text = element_text(size = 12))
plot_1

In [None]:
# Bin `played_hours` into an ordered three-level label:
#   Low    : played_hours < 2
#   Medium : 2 <= played_hours < 15
#   High   : played_hours >= 15
# findInterval() returns how many of the cut points (2, 15) are <= the value,
# i.e. 0, 1 or 2, which is then mapped onto the labels.
players_engagement <- mutate(
    players,
    engagement_level = factor(
        findInterval(played_hours, c(2, 15)),
        levels = 0:2,
        labels = c("Low", "Medium", "High")
    )
)
players_engagement |>
    head()

In [None]:
#remove this later plssss
players_proportions <- players_engagement |> 
    group_by(engagement_level) |>
    summarize(n = n()) |>
    mutate(percent = 100*n/nrow(players_engagement))
players_proportions

In [None]:
# 70/30 train/test split, stratified on `engagement_level`.
players_split <- initial_split(
    players_engagement,
    strata = engagement_level,
    prop = 0.70
)
players_train <- players_split |> training()
players_test <- players_split |> testing()

players_train |> head()
players_test |> head()

In [None]:
# K-nearest-neighbours classifier with a fixed K of 5 and unweighted
# ("rectangular") votes.
knn_spec <- nearest_neighbor(neighbors = 5, weight_func = "rectangular") |>
    set_mode("classification") |>
    set_engine("kknn")

# Preprocessing: predict `engagement_level` from `Age`, scaling and then
# centering the predictor using the training data.
players_recipe <- recipe(engagement_level ~ Age, data = players_train) |>
    step_scale(all_predictors()) |>
    step_center(all_predictors())

# Bundle the model and the recipe into a workflow and fit it on the training
# set.
knn_fit <- workflow() |>
    add_model(knn_spec) |>
    add_recipe(players_recipe) |>
    fit(data = players_train)

knn_fit

In [None]:
# Seed set immediately before creating the folds so the resampling is
# reproducible.
set.seed(1)
players_vfold <- vfold_cv(players_train, strata = engagement_level, v = 5)

# Candidate numbers of neighbours: 1, 2, ..., 15.
k_vals <- tibble(neighbors = seq(1, 15, by = 1))

# Same KNN specification as before, but with `neighbors` left as a tuning
# parameter.
knn_tune <- nearest_neighbor(neighbors = tune(), weight_func = "rectangular") |>
    set_mode("classification") |>
    set_engine("kknn")

# Evaluate every candidate K with 5-fold cross-validation. Note: despite its
# name, `players_fit` holds the collected metrics table, not a fitted model.
players_fit <- workflow() |>
    add_model(knn_tune) |>
    add_recipe(players_recipe) |>
    tune_grid(grid = k_vals, resamples = players_vfold) |>
    collect_metrics()

In [None]:
set.seed(2)

# Keep only the accuracy rows of the cross-validation metrics.
k_acc <- players_fit |>
    filter(.metric == "accuracy")

# Line plot of the mean cross-validated accuracy for each number of
# neighbours.
# TODO: replace the "Figure x" placeholder with the final figure number.
accuracy_vs_k <- ggplot(k_acc, aes(x = neighbors, y = mean)) +
    geom_point() +
    geom_line() +
    labs(x = "Neighbors", y = "Accuracy Estimate") +
    ggtitle("Figure x: Neighbours vs accuracy estimate")
accuracy_vs_k

In [None]:
# Number of neighbours with the highest mean cross-validated accuracy: sort
# by accuracy (descending), take the `neighbors` column and keep its first
# entry.
best_k <- k_acc |>
    arrange(desc(mean)) |>
    pull(neighbors) |>
    head(1)
best_k

In [None]:
set.seed(3)

# KNN specification using the selected number of neighbours.
knn_spec_best <- nearest_neighbor(neighbors = best_k, weight_func = "rectangular") |>
    set_mode("classification") |>
    set_engine("kknn")

# Refit the workflow on the full training set with the selected K.
players_fit_best <- workflow() |>
    add_model(knn_spec_best) |>
    add_recipe(players_recipe) |>
    fit(data = players_train)

players_fit_best

In [None]:
# Predict the engagement level for the test set and attach the predictions
# as leading columns in front of the original test rows.
players_test_predictions <- bind_cols(
    predict(players_fit_best, players_test),
    players_test
)