You can write notes & updates here:

.
.



In [None]:
#loading libraries & data files
library(tidyverse)
library(tidymodels)
library(lubridate)

# read the players and sessions tables from the project's GitHub repository
players_data <- read_csv(
  file = "https://raw.githubusercontent.com/achan919/dsci-final-project/refs/heads/main/players.csv"
)
sessions_data <- read_csv(
  file = "https://raw.githubusercontent.com/achan919/dsci-final-project/refs/heads/main/sessions.csv"
)

In [None]:
#tidying data
#tidy data for players data: store the categorical columns as factors
players_data <- players_data |>
  mutate(
    experience = as.factor(experience),
    gender = as.factor(gender)
  )

# tidy date data: parse the "day/month/year hour:minute" strings in start_time
# and end_time and store them as seconds since the epoch in the columns
# original_start_time / original_end_time (replacing any existing columns of
# those names).
# tz is pinned to UTC so the result does not depend on the machine's local time
# zone: without it as.POSIXct() parses in the local zone, where daylight-saving
# changes may invalidate a time or shift a session length by an hour.
# NOTE(review): only the difference between the two values is used below, so
# any fixed zone gives the same session lengths; the absolute values are only
# meaningful if the recorded times really are UTC - confirm before using them.
sessions_data_time <- sessions_data |>
  mutate(
    original_end_time = as.numeric(
      as.POSIXct(end_time, format = "%d/%m/%Y %H:%M", tz = "UTC")
    ),
    original_start_time = as.numeric(
      as.POSIXct(start_time, format = "%d/%m/%Y %H:%M", tz = "UTC")
    )
  )

# calculate session length from start and end time (in seconds);
# a session whose start or end could not be parsed gets NA here
sessions_data_plustime <- sessions_data_time |>
  mutate(time = original_end_time - original_start_time)

# diff_time holds the session length as a plain numeric column
sessions_data_numeric_time <- sessions_data_plustime |>
  mutate(diff_time = as.numeric(time))

#actual tidy data for use: one row per session with the player key, the
#original time strings and the session length in seconds
sessions_data_tidy <- sessions_data_numeric_time |>
  select(hashedEmail, start_time, end_time, diff_time)

Merging the players and sessions data sets

In [None]:
#merge the tidy sessions data with the players data on hashedEmail.
#all = TRUE makes this a full outer join: rows from either table that have no
#match in the other are kept, with NA in the columns of the missing side.
#(TRUE is spelled out because T is an ordinary variable that can be reassigned.)
merged_all_data <- merge(
  sessions_data_tidy,
  players_data,
  by = "hashedEmail",
  all = TRUE
)

#merged data for people with at least one session
merged_sessions_data <- merged_all_data |>
  filter(!is.na(start_time))

#data for people with no sessions (people in players data, but not sessions data)
merged_nosessions_data <- merged_all_data |>
  filter(is.na(start_time))
head(merged_sessions_data)

Tidying the sessions data so that, for each player (by name), we know the total number of sessions, the total time spent playing, and the mean and median session length.

In [None]:
#tidying sessions
sessions_names_only <- select(merged_sessions_data, name, diff_time)

#number of sessions, total time, mean time & median time for each player name.
#Everything is computed in one grouped summary so each statistic is guaranteed
#to sit on the row of the name it belongs to; binding separately computed
#tables side by side only works if all of them come back with the same rows in
#the same order.
#Rows without a name are dropped, and sessions whose length is NA are ignored
#by the time statistics (na.rm = TRUE); sessions_num still counts every session
#of a named player.
#NOTE(review): grouping is by name, so two players sharing a name would be
#pooled into one row - confirm names are unique, or key on hashedEmail instead.
sessions_by_name <- sessions_names_only |>
  filter(!is.na(name)) |>
  group_by(name) |>
  summarize(
    sessions_num = n(),
    total_time = sum(diff_time, na.rm = TRUE),
    mean_time = mean(diff_time, na.rm = TRUE),
    median_time = median(diff_time, na.rm = TRUE)
  ) |>
  as.data.frame() # plain data frame, as the previous bind_cols() result was

#the individual pieces, derived from the summary above so they always agree
#with it (kept in case later cells refer to them)
sessions_count <- select(sessions_by_name, name, n = sessions_num)
total_time <- select(sessions_by_name, name, diff_time = total_time)
total_time_summary_stats <- select(sessions_by_name, name, mean_time, median_time)

#players data for all players with sessions with additional variable number of sessions, total time, mean time & median time
#all.x = TRUE keeps every row of sessions_by_name even without a match in players_data
sessions_allvars <- merge(
  sessions_by_name,
  players_data,
  by = "name",
  all.x = TRUE
)
head(sessions_allvars)