In [1]:
# Cell 2: Data Preparation

install.packages(c("tidyverse", "lubridate", "tidymodels"))

library(tidyverse)
library(lubridate)
library(recipes)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.2     [32m✔[39m [34mtibble   [39m 3.3.0
[32m✔[39m [34mlubridate[39m 1.9.4     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.1.0     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors

Attaching package: ‘recipes’


The following object is masked from ‘package:stringr’:

    fixed


The following object is masked from ‘package:stats’:

    step




In [2]:
print("Starting data preparation...")

# --- 1. Load Data ---
# MODIFIED: Paths now point to the 'data/' subfolder
tryCatch({
    players <- read_csv("data/players.csv")
    sessions <- read_csv("data/sessions.csv")
}, error = function(e) {
    stop("ERROR: Make sure you have a 'data' folder containing 'players.csv' and 'sessions.csv'.")
})

[1] "Starting data preparation..."


[1mRows: [22m[34m196[39m [1mColumns: [22m[34m9[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (4): experience, hashedEmail, name, gender
[32mdbl[39m (2): played_hours, age
[33mlgl[39m (3): subscribe, individualId, organizationName

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m1535[39m [1mColumns: [22m[34m5[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (3): hashedEmail, start_time, end_time
[32mdbl[39m (2): original_start_time, original_end_time

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this messag

In [3]:
# --- 2. Wrangle & Aggregate Sessions ---
# Convert times and calculate session length in hours
sessions_data <- sessions |>
  mutate(
    start_time = dmy_hm(start_time, tz = "UTC"),
    end_time = dmy_hm(end_time, tz = "UTC"),
    session_length_hours = as.numeric(end_time - start_time) / 3600
  ) |>
  filter(!is.na(session_length_hours)) # Drop invalid rows

In [4]:
# Aggregate by player
sessions_agg <- sessions_data |>
  group_by(hashedEmail) |>
  summarize(
    total_sessions = n(),
    total_played_hours = sum(session_length_hours, na.rm = TRUE),
    average_session_length = mean(session_length_hours, na.rm = TRUE)
  )

In [5]:
# --- 3. Join & Clean ---
players_clean <- players |>
  select(hashedEmail, experience, subscribe, gender, age)

combined_data <- players_clean |>
  left_join(sessions_agg, by = "hashedEmail") |>
  # Replace NAs with 0 for players who have no session data
  mutate(
    total_sessions = ifelse(is.na(total_sessions), 0, total_sessions),
    total_played_hours = ifelse(is.na(total_played_hours), 0, total_played_hours),
    average_session_length = ifelse(is.na(average_session_length), 0, average_session_length)
  ) |>
  # Drop rows with NAs in key columns we need for clustering
  drop_na(age, experience)

In [6]:
# --- 4. K-Means Clustering ---
features_for_clustering <- c("age", "total_played_hours", "total_sessions", "average_session_length")
cluster_data <- combined_data |> select(all_of(features_for_clustering))

# Scale the data before clustering
scaler <- preProcess(cluster_data, method = c("center", "scale"))
cluster_data_scaled <- predict(scaler, cluster_data)

# Run K-Means
set.seed(42) # for reproducibility
kmeans_model <- kmeans(cluster_data_scaled, centers = 4, nstart = 25)

ERROR: Error in preProcess(cluster_data, method = c("center", "scale")): could not find function "preProcess"


In [None]:
# --- 5. Add Persona Names & Save ---
all_data_final <- combined_data |>
  mutate(
    cluster = kmeans_model$cluster,
    # Assign names based on the cluster (you can inspect kmeans_model$centers)
    persona = as.factor(case_when(
      cluster == 1 ~ "Casual Players",
      cluster == 2 ~ "Deep Divers",
      cluster == 3 ~ "Anomalies",
      cluster == 4 ~ "Power Users"
    ))
  )

# Save the final, clean file to the MAIN folder
write_csv(all_data_final, "minecraft_dashboard_data.csv")

print("Success! 'minecraft_dashboard_data.csv' is ready.")
print("You can now open your Python notebook '02_Dashboard.ipynb'.")