# Loading the Data

In [1]:
library(tidyverse)
library(tidymodels)
library(GGally)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.1.1 ──

[32m✔[39m [34mbroom       [39m 1.0.6     [32m✔[39m [34mrsample     [39

In [None]:
# Read the raw play-session log from disk.
sessions <- read_csv("data/sessions.csv")

In [None]:
# Report how many session records were loaded.
numrows <- nrow(sessions)
sprintf("Number of Observations: %s", numrows)

# Wrangling and Cleaning the Data

Using the previously loaded sessions file, the data will have to be tidied so that it can be used more effectively later.

To start, we'll rename the hashedEmail column to be consistent with the other columns and separate the starting and ending date and time into their own columns.

In [None]:
# Split each "date time" stamp into its own date column and time column.
# The raw stamp columns are first given temporary *_time1 names to prevent
# errors in separate(), whose output reuses the start_time / end_time names.
sessions_tidy <- sessions |>
  rename(
    hashed_email = hashedEmail,
    start_time1 = start_time,
    end_time1 = end_time
  ) |>
  separate(start_time1, into = c("start_date", "start_time"), sep = " ") |>
  separate(end_time1, into = c("end_date", "end_time"), sep = " ")
head(sessions_tidy, n = 3)

Great, next we'll have to convert the start and end dates and times to a usable form. We'll do that by creating a new column called weekday and a column called month, made by converting our date to a Date type, and then we'll convert our times to numerical values. Finally, we'll create a length (in minutes) column by subtracting the start time from the end time. We'll also count how many sessions were active at the moment each session started, which gives us the player count we want to predict.

In [None]:
# Derive calendar features and a decimal time of day for every session.
# Dates are parsed as day/month/year.
sessions_wrangled <- sessions_tidy |>
  # weekday and month names of the day each session started
  mutate(
    weekday = as.factor(weekdays(as.Date(start_date, format = "%d/%m/%Y"))),
    month = as.factor(months(as.Date(start_date, format = "%d/%m/%Y")))
  ) |>
  # start time as fractional hours since midnight (e.g. 18:30 -> 18.5)
  separate(start_time, into = c("start_hour", "start_minute"), sep = ":") |>
  mutate(
    start_time_dbl = as.numeric(start_hour) + as.numeric(start_minute) / 60
  ) |>
  # end time as fractional hours since midnight
  separate(end_time, into = c("end_hour", "end_minute"), sep = ":") |>
  mutate(
    end_time_dbl = as.numeric(end_hour) + as.numeric(end_minute) / 60
  )

# Player count = number of sessions whose [start, end] interval contains the
# moment this session started.
#
# Start and end are placed on one absolute scale (hours since the Date
# origin) rather than comparing time of day within a single start_date.
# Comparing time of day alone drops every session that runs past midnight,
# because its end time of day is smaller than its start time of day, and it
# also misses sessions that began the previous day and are still running.
#
# Sessions with a missing end time are not counted as active (na.rm = TRUE),
# so one incomplete record cannot turn other rows' counts into NA. A session
# with a missing start keeps an NA count.
#
# No rowwise() is used, so the result is an ordinary ungrouped tibble.
sessions_count <- sessions_wrangled |>
  mutate(
    start_abs = as.numeric(as.Date(start_date, format = "%d/%m/%Y")) * 24 +
      start_time_dbl,
    end_abs = as.numeric(as.Date(end_date, format = "%d/%m/%Y")) * 24 +
      end_time_dbl,
    player_count = vapply(
      start_abs,
      \(t) {
        if (is.na(t)) {
          NA_integer_
        } else {
          sum(start_abs <= t & end_abs >= t, na.rm = TRUE)
        }
      },
      integer(1)
    )
  ) |>
  # select only the columns used for plotting and modelling
  select(player_count, weekday, month, start_time_dbl)
head(sessions_count, 3)

# Summary of the Data

In [None]:
# Bar chart with one bar per month; bar height is the number of rows
# (sessions) in that month.
month_plot <- ggplot(sessions_count, aes(x = month)) +
  geom_bar() +
  labs(x = "Month", y = "Total Player Count")
month_plot

In [None]:
# Bar chart with one bar per day of the week; bar height is the number of
# rows (sessions) that started on that day.
weekday_plot <- sessions_count |>
  ggplot(aes(x = weekday)) + # aes() and ggplot() must both be closed here
  geom_bar() +
  labs(x = "Day of the Week", y = "Total Player Count")
weekday_plot

# ignore this 
# ggplot(aes((x = reorder(weekday,weekday, function(x)-length(x))))) 

# Visualization of the dataset

# Data Analysis

To analyse the data, we'll try and create a linear regression model to predict the player count at a given time using the weekday, month and the time of day as predictors.

First, we'll split the data into a training set (60%) and a testing set (40%), stratified by player count.

In [None]:
set.seed(1111) # DO NOT REMOVE

# 60/40 train/test partition, stratified on the response so both parts see a
# similar spread of player counts.
sessions_split <- initial_split(
  sessions_count,
  prop = 0.6,
  strata = player_count
)

sessions_testing <- testing(sessions_split)
sessions_training <- training(sessions_split)

In [None]:
set.seed(1111) # DO NOT REMOVE

# Use every remaining column of the training set as a predictor.
sessions_recipe <- recipe(player_count ~ ., data = sessions_training)

# Ordinary least-squares linear regression.
sessions_spec <- linear_reg() |>
  set_engine("lm") |>
  set_mode("regression")

# Bundle the recipe and model, then fit on the training set.
sessions_fit <- workflow() |>
  add_model(sessions_spec) |>
  add_recipe(sessions_recipe) |>
  fit(data = sessions_training)

In [None]:
set.seed(1111) # DO NOT REMOVE

# Root mean squared prediction error of the fitted model on the held-out
# testing set.
sessions_rmspe <- predict(sessions_fit, sessions_testing) |>
  bind_cols(sessions_testing) |>
  metrics(truth = player_count, estimate = .pred) |>
  filter(.metric == "rmse") |>
  pull(.estimate)

sessions_rmspe

# Visualization of the Analysis 

 # Enlarge the notebook plotting area before drawing the pair plot.
 options(repr.plot.width = 12, repr.plot.height = 8)
 # Pairwise plots of every training column, semi-transparent, larger text.
 sessions_pairplot <- ggpairs(sessions_training, mapping = aes(alpha = 0.4)) +
     theme(text = element_text(size = 20))

sessions_pairplot

Note: rendering this pair plot causes the kernel to crash, so it might not be used.

TODO: look for patterns across month and date using `geom_point()`.

# References

Resources that helped with handling the date format:

https://www.geeksforgeeks.org/convert-date-to-day-of-week-in-r/

https://stackoverflow.com/questions/7439977/changing-date-format-in-r