In [None]:
library(tidyverse)

# Read the two raw CSV tables from the data/ directory, then preview the
# column names and types of each one.
players <- read_csv(file = "data/players.csv")
sessions <- read_csv(file = "data/sessions.csv")
glimpse(players)
glimpse(sessions)

In [None]:
# Wrangling - Clean players dataset
# Keep only rows with a known subscription flag and a non-missing, strictly
# positive play time, then turn the three categorical columns into factors.
players_clean <- players |>
    filter(!is.na(subscribe)) |>
    filter(!is.na(played_hours)) |>
    filter(played_hours > 0) |>
    mutate(across(c(subscribe, gender, experience), factor))
glimpse(players_clean)

In [None]:
# Wrangling - Clean sessions dataset
# Split each timestamp into a date part and an hour/minute clock part, then
# derive the session length in minutes. Rows whose duration is missing or not
# strictly positive are dropped at the end.
sessions_clean <- sessions |>
    separate(start_time, into = c("start_date", "start_clock"), sep = " ") |>
    separate(end_time, into = c("end_date", "end_clock"), sep = " ") |>
    separate(start_clock, into = c("start_hour", "start_minute"), sep = ":", convert = TRUE) |>
    separate(end_clock, into = c("end_hour", "end_minute"), sep = ":", convert = TRUE) |>
    mutate(
        # Clock-only difference; negative when the end clock is earlier than
        # the start clock.
        duration_min = (end_hour - start_hour) * 60 + (end_minute - start_minute),
        # A session that ends on a different date with an earlier clock time
        # has crossed midnight. Add one day so it is measured correctly
        # instead of being discarded by the filter below.
        # NOTE(review): assumes no session lasts 24 hours or longer - confirm.
        duration_min = if_else(
            start_date != end_date & duration_min < 0,
            duration_min + 24 * 60,
            duration_min
        )
    ) |>
    filter(duration_min > 0)
glimpse(sessions_clean)


In [None]:
# Summarize sessions per player
# One row per hashedEmail: how many cleaned sessions the player has and the
# average length of those sessions in minutes.
sessions_summary <- sessions_clean |>
    group_by(hashedEmail) |>
    summarise(
        n_sessions = n(),
        mean_session_min = mean(duration_min),
        .groups = "drop"
    )
glimpse(sessions_summary)

In [None]:
# Join players + session summaries
# Left join keeps every cleaned player; players with no matching row in
# sessions_summary get NA for n_sessions and mean_session_min.
players_final <- left_join(
    players_clean,
    sessions_summary,
    by = "hashedEmail"
)
glimpse(players_final)

In [None]:
# Summary stats

# Overall
# Single-row summary of play time (mean, median, standard deviation) together
# with the share of players whose subscribe value is "TRUE".
players_final |>
    summarise(
        mean_hours = mean(played_hours),
        median_hours = median(played_hours),
        sd_hours = sd(played_hours),
        subscription_rate = mean(subscribe == "TRUE")
    )

# Subscription rate by experience
# Share of players with subscribe equal to "TRUE" within each experience level.
players_final |>
    group_by(experience) |>
    summarise(
        subscription_rate = mean(subscribe == "TRUE"),
        .groups = "drop"
    )

# Subscription rate by gender
# Share of players with subscribe equal to "TRUE" within each gender category.
players_final |>
    group_by(gender) |>
    summarise(
        subscription_rate = mean(subscribe == "TRUE"),
        .groups = "drop"
    )

In [None]:
# Histogram of played hours for all players, using 5-hour bins.
# The notebook plot-size options are spelled with dots (repr.plot.height /
# repr.plot.width, measured in inches); the underscore spellings are not
# recognised, so the previous call had no effect on the figure size.
options(repr.plot.height = 20, repr.plot.width = 20)
ggplot(players_final, aes(x = played_hours)) +
    # boundary = 0 starts the first bin at zero instead of centring it on zero,
    # so no bar extends into negative hours.
    geom_histogram(binwidth = 5, boundary = 0, fill = "blue", color = "white") +
    labs(x = "Played Hours", y = "Count", title = "Distribution of Player Hours") +
    theme(text = element_text(size = 20))

In [None]:
# Histogram of played hours zoomed in on the 0-10 hour range, using 1-hour bins.
# The notebook plot-size options are spelled with dots (repr.plot.height /
# repr.plot.width, measured in inches); the underscore spellings are not
# recognised, so the previous call had no effect on the figure size.
options(repr.plot.height = 20, repr.plot.width = 20)
ggplot(players_final, aes(x = played_hours)) +
    # boundary = 0 aligns bins to whole hours (0-1, 1-2, ...) rather than
    # centring the first bin on zero.
    geom_histogram(binwidth = 1, boundary = 0, fill = "blue", color = "white") +
    # coord_cartesian zooms the view without removing rows, so bin counts are
    # computed from the full data.
    coord_cartesian(xlim = c(0, 10)) +
    labs(x = "Played Hours", y = "Count", title = "Distribution of Player Hours") +
    theme(text = element_text(size = 20))