# **TITLE**

### Introduction

### Method

### Discussion

In [1]:
library(tidyverse)
library(tidymodels)
library(dplyr)
library(RColorBrewer)
# Fix the random number generator seed once, up front, so that the randomised
# steps later in this notebook (the train/test split and the cross-validation
# folds) start from a known RNG state.
# NOTE(review): this only gives reproducible results when the cells are run
# top to bottom in a fresh session — confirm before relying on exact numbers.
set.seed(42)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──

[32m✔[39m [34mbroom       [39m 1.0.6     [32m✔[39m [34mrsample     [39

In [2]:
# Download the players table from a pinned commit of the project repository
# and parse it with readr. Despite its name, `players_url` holds the parsed
# data frame, not the URL string; the name is kept because later cells use it.
players_url <-
  "https://raw.githubusercontent.com/emma-chow/DSCI-Final-Project/70bbf2c6fcb0a1fd395c3b650eb82c00067f8953/players.csv" |>
  read_csv()

# Preview the first rows to check that the file was parsed as expected.
players_url |>
  head()

[1mRows: [22m[34m196[39m [1mColumns: [22m[34m7[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (4): experience, hashedEmail, name, gender
[32mdbl[39m (2): played_hours, Age
[33mlgl[39m (1): subscribe

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


experience,subscribe,hashedEmail,played_hours,name,gender,Age
<chr>,<lgl>,<chr>,<dbl>,<chr>,<chr>,<dbl>
Pro,True,f6daba428a5e19a3d47574858c13550499be23603422e6a0ee9728f8b53e192d,30.3,Morgan,Male,9
Veteran,True,f3c813577c458ba0dfef80996f8f32c93b6e8af1fa939732842f2312358a88e9,3.8,Christian,Male,17
Veteran,False,b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3c5a9d2118eb7ccbb28,0.0,Blake,Male,17
Amateur,True,23fe711e0e3b77f1da7aa221ab1192afe21648d47d2b4fa7a5a659ff443a0eb5,0.7,Flora,Female,21
Regular,True,7dc01f10bf20671ecfccdac23812b1b415acd42c2147cb0af4d48fcce2420f3e,0.1,Kylie,Male,21
Amateur,True,f58aad5996a435f16b0284a3b267f973f9af99e7a89bee0430055a44fa92f977,0.0,Adrian,Female,17


In [3]:
# Count the missing values in every column of the raw players table.
# vapply() is used instead of sapply() so the result is always a named
# integer vector (one count per column); sapply() silently changes its return
# type depending on the input (e.g. it returns a list for an empty table).
players_missing <- players_url |>
  vapply(function(x) sum(is.na(x)), FUN.VALUE = integer(1))
players_missing

In [4]:
# Keep only complete rows: drop_na() with no column arguments removes every
# row that has a missing value in any column.
players_data <- drop_na(players_url)

# Show the column types and the first few values of the cleaned table.
players_data |>
  glimpse()

Rows: 194
Columns: 7
$ experience   <chr> "Pro", "Veteran", "Veteran", "Amateur", "Regular", "Amate…
$ subscribe    <lgl> TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, T…
$ hashedEmail  <chr> "f6daba428a5e19a3d47574858c13550499be23603422e6a0ee9728f8…
$ played_hours <dbl> 30.3, 3.8, 0.0, 0.7, 0.1, 0.0, 0.0, 0.0, 0.1, 0.0, 1.6, 0…
$ name         <chr> "Morgan", "Christian", "Blake", "Flora", "Kylie", "Adrian…
$ gender       <chr> "Male", "Male", "Male", "Female", "Male", "Female", "Fema…
$ Age          <dbl> 9, 17, 17, 21, 21, 17, 19, 21, 17, 22, 23, 17, 25, 22, 17…


In [5]:
# Per-column overview of the cleaned data: length/class for character
# columns, TRUE/FALSE counts for the logical column, and the usual
# quantile summary for the numeric columns.
players_summary <- summary(players_data)
players_summary

  experience        subscribe       hashedEmail         played_hours    
 Length:194         Mode :logical   Length:194         Min.   :  0.000  
 Class :character   FALSE:52        Class :character   1st Qu.:  0.000  
 Mode  :character   TRUE :142       Mode  :character   Median :  0.100  
                                                       Mean   :  5.905  
                                                       3rd Qu.:  0.600  
                                                       Max.   :223.100  
     name              gender               Age       
 Length:194         Length:194         Min.   : 8.00  
 Class :character   Class :character   1st Qu.:17.00  
 Mode  :character   Mode  :character   Median :19.00  
                                       Mean   :20.52  
                                       3rd Qu.:22.00  
                                       Max.   :50.00  

In [6]:
# One-row table with the minimum, mean, median and maximum of `played_hours`
# across all players in the cleaned data.
players_hours_stats <- summarise(
  players_data,
  played_hours_min = min(played_hours),
  played_hours_mean = mean(played_hours),
  played_hours_median = median(played_hours),
  played_hours_max = max(played_hours)
)
players_hours_stats

played_hours_min,played_hours_mean,played_hours_median,played_hours_max
<dbl>,<dbl>,<dbl>,<dbl>
0,5.904639,0.1,223.1


In [7]:
# Narrow the table to the three columns used from here on:
# subscription status, age and hours played.
players <- select(players_data, subscribe, Age, played_hours)

# Preview the reduced table.
players |>
  head()

subscribe,Age,played_hours
<lgl>,<dbl>,<dbl>
True,9,30.3
True,17,3.8
False,17,0.0
True,21,0.7
True,21,0.1
True,17,0.0


In [8]:
# Bucket every player into an engagement level based on hours played:
#   played_hours <  10        -> "Low"
#   10 <= played_hours < 50   -> "Medium"
#   played_hours >= 50        -> "High"
# findInterval() returns how many of the cut points (10, 50) are <= the value,
# i.e. 0, 1 or 2, which is then mapped onto the three labels.
players_engagement <- mutate(
  players,
  engagement_level = factor(
    findInterval(played_hours, c(10, 50)),
    levels = 0:2,
    labels = c("Low", "Medium", "High")
  )
)

# Preview the table with the new column.
players_engagement |>
  head()

subscribe,Age,played_hours,engagement_level
<lgl>,<dbl>,<dbl>,<fct>
True,9,30.3,Medium
True,17,3.8,Low
False,17,0.0,Low
True,21,0.7,Low
True,21,0.1,Low
True,17,0.0,Low


In [9]:
# Randomly split the data 70% / 30% into training and testing sets,
# stratifying on `engagement_level`.
players_split <- players_engagement |>
  initial_split(prop = 0.70, strata = engagement_level)

# Extract the two partitions from the split object.
players_train <- players_split |>
  training()
players_test <- players_split |>
  testing()

# Preview both partitions.
players_train |>
  head()
players_test |>
  head()

subscribe,Age,played_hours,engagement_level
<lgl>,<dbl>,<dbl>,<fct>
True,9,30.3,Medium
True,17,3.8,Low
False,17,0.0,Low
True,21,0.7,Low
True,21,0.1,Low
True,17,0.0,Low


subscribe,Age,played_hours,engagement_level
<lgl>,<dbl>,<dbl>,<fct>
True,19,0.0,Low
False,21,0.0,Low
True,22,0.0,Low
True,23,1.6,Low
True,17,0.0,Low
True,17,0.5,Low


In [14]:
# Build 5 cross-validation folds from the training set only, stratified on
# `engagement_level` in the same way as the train/test split.
players_vfold <- players_train |>
  vfold_cv(v = 5, strata = engagement_level)