In [20]:
library(repr)
library(tidyverse)
library(dplyr)
library(tidyverse)
library(tidymodels)
library(cowplot)

── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.1.1 ──

[32m✔[39m [34mbroom       [39m 1.0.6     [32m✔[39m [34mrsample     [39m 1.2.1
[32m✔[39m [34mdials       [39m 1.3.0     [32m✔[39m [34mtune        [39m 1.1.2
[32m✔[39m [34minfer       [39m 1.0.7     [32m✔[39m [34mworkflows   [39m 1.1.4
[32m✔[39m [34mmodeldata   [39m 1.4.0     [32m✔[39m [34mworkflowsets[39m 1.0.1
[32m✔[39m [34mparsnip     [39m 1.2.1     [32m✔[39m [34myardstick   [39m 1.3.1
[32m✔[39m [34mrecipes     [39m 1.1.0     

── [1mConflicts[22m ───────────────────────────────────────── tidymodels_conflicts() ──
[31m✖[39m [34mscales[39m::[32mdiscard()[39m masks [34mpurrr[39m::discard()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m   masks [34mstats[39m::filter()
[31m✖[39m [34mrecipes[39m::[32mfixed()[39m  masks [34mstringr[39m::fixed()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m      masks [34mstats[39m::lag()
[31m✖[39m [3

In [5]:
players <-read_csv("data/players.csv") |>
select(experience, subscribe, played_hours, gender, age)
players

[1mRows: [22m[34m196[39m [1mColumns: [22m[34m9[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (4): experience, hashedEmail, name, gender
[32mdbl[39m (2): played_hours, age
[33mlgl[39m (3): subscribe, individualId, organizationName

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


experience,subscribe,played_hours,gender,age
<chr>,<lgl>,<dbl>,<chr>,<dbl>
Pro,TRUE,30.3,Male,9
Veteran,TRUE,3.8,Male,17
Veteran,FALSE,0.0,Male,17
Amateur,TRUE,0.7,Female,21
Regular,TRUE,0.1,Male,21
Amateur,TRUE,0.0,Female,17
Regular,TRUE,0.0,Female,19
Amateur,FALSE,0.0,Male,21
Amateur,TRUE,0.1,Male,17
Veteran,TRUE,0.0,Female,22


In [18]:
#data wrangling
# The age groups are divided into smaller groups for young ages as game server is likely having more young participants.
# played hour group has a <=1 group as great amount of players probably just signed up but not yet played with played hour 0.0
age_group <-function(age)
    {if (age >65) 
    {return ("seniors")}
     else if (age >34)
    {return ("working pop")}
     else if (age >25)
    {return ("young working pop")}
     else if (age > 18)
    {return ("uni age")}
     else
         {return ("teenager")}}


played_hour_group <-function(played_hours)
    {if (played_hours >30) 
    {return ("long")}
     else if (played_hours>10)
    {return ("medium")}
     else if (played_hours>1)
    {return ("short")}
     else
    {return ("very short")}}


players_newcol <- players |>
mutate(age_group = sapply(age, age_group)) |>
mutate(played_hour_group = sapply(played_hours, played_hour_group))
players_newcol

experience,subscribe,played_hours,gender,age,age_group,played_hour_group
<chr>,<lgl>,<dbl>,<chr>,<dbl>,<chr>,<chr>
Pro,TRUE,30.3,Male,9,teenager,long
Veteran,TRUE,3.8,Male,17,teenager,short
Veteran,FALSE,0.0,Male,17,teenager,very short
Amateur,TRUE,0.7,Female,21,uni age,very short
Regular,TRUE,0.1,Male,21,uni age,very short
Amateur,TRUE,0.0,Female,17,teenager,very short
Regular,TRUE,0.0,Female,19,uni age,very short
Amateur,FALSE,0.0,Male,21,uni age,very short
Amateur,TRUE,0.1,Male,17,teenager,very short
Veteran,TRUE,0.0,Female,22,uni age,very short


In [21]:
set.seed(999)
players_split <- initial_split(players_newcol, prop=0.7, strata = played_hour_group)
players_training <-training(players_split)
players_testing <-testing(players_split)

In [34]:
set.seed(999)
players_model<- nearest_neighbor(weight_func="rectangular", neighbor=tune()) |>
set_engine ("kknn") |>
set_mode("regression")

players_recipe <-recipe(played_hours ~experience, subscribe, gender, age_group, data = players_training)
players_recipe



[36m──[39m [1mRecipe[22m [36m──────────────────────────────────────────────────────────────────────[39m



── Inputs 

Number of variables by role

outcome:   1
predictor: 1



In [96]:
set.seed(999)
#the neighbors2 generate a long warning message that I am not sure whether it affects final outcome.
# small folds for small data
players_vfold <- vfold_cv(players_training, v=2)
gridvals <-tibble(neighbors=seq(from=1, to=10, by=1))

neighbors2<-workflow() |>
            add_recipe(players_recipe) |>
            add_model(players_model) |>
            tune_grid(resamples=players_vfold,grid=gridvals)
neighbors2_results<-collect_metrics(neighbors2)
neighbors2_results

neighbors,.metric,.estimator,mean,n,std_err,.config
<dbl>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
1,rmse,standard,26.244316928,2,1.695758128,Preprocessor1_Model01
1,rsq,standard,0.021230984,2,0.014954422,Preprocessor1_Model01
2,rmse,standard,25.522233953,2,2.638677107,Preprocessor1_Model02
2,rsq,standard,0.008467247,2,0.007618218,Preprocessor1_Model02
3,rmse,standard,25.293195688,2,2.600965858,Preprocessor1_Model03
3,rsq,standard,0.006124863,2,0.005726337,Preprocessor1_Model03
4,rmse,standard,30.006778429,2,0.004045404,Preprocessor1_Model04
4,rsq,standard,0.043347962,2,0.002118601,Preprocessor1_Model04
5,rmse,standard,27.794896312,2,1.137639312,Preprocessor1_Model05
5,rsq,standard,0.042083131,2,0.001139176,Preprocessor1_Model05


In [98]:
#find min rmse
# dont know why the mean simply declines all the way for v=3 to 5, maybe too small data so overfitting.
#k = 20 has lowest mean when testing range, however the data set is small using k=20 seems irrational.
best_k<-neighbors2_results |>
filter(.metric == "rmse") |>
slice_min(mean, n=1) 
best_k

neighbors,.metric,.estimator,mean,n,std_err,.config
<dbl>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
3,rmse,standard,25.2932,2,2.600966,Preprocessor1_Model03


In [99]:
best_k_model<-nearest_neighbor(weight_func="rectangular", neighbor=3) |>
set_engine ("kknn") |>
set_mode("regression")

players_workflow<-workflow() |>
            add_recipe(players_recipe) |>
            add_model(best_k_model)
players_workflow

══ Workflow ════════════════════════════════════════════════════════════════════
[3mPreprocessor:[23m Recipe
[3mModel:[23m nearest_neighbor()

── Preprocessor ────────────────────────────────────────────────────────────────
0 Recipe Steps

── Model ───────────────────────────────────────────────────────────────────────
K-Nearest Neighbor Model Specification (regression)

Main Arguments:
  neighbors = 3
  weight_func = rectangular

Computational engine: kknn 
