loads data 
wrangles and cleans the data to the format necessary for the planned analysis
# Parse the session start/end timestamps (day/month/year hour:minute, read as
# UTC), derive calendar features from the start time, then split each
# timestamp into its own date column and time-of-day column.
tidy_sessions <- sessions |>
  mutate(
    start_time = as.POSIXct(start_time, format = "%d/%m/%Y %H:%M", tz = "UTC"),
    end_time = as.POSIXct(end_time, format = "%d/%m/%Y %H:%M", tz = "UTC"),
    # hour(), month() and week() are lubridate helpers; they must run while
    # start_time is still a date-time, i.e. before it is split below.
    hour_of_day = hour(start_time),
    month_of_year = month(start_time, label = TRUE),
    week_of_year = week(start_time)
  ) |>
  # Render both timestamps with an explicit format so every non-missing value
  # has a date part and a time part. Relying on the implicit character
  # conversion inside separate() can omit the time for midnight values.
  mutate(
    across(c(start_time, end_time), \(x) format(x, "%Y-%m-%d %H:%M:%S"))
  ) |>
  # Use distinct output names for the two date columns: writing both splits
  # into a column called "date" lets the second separate() silently replace
  # the start date with the end date.
  separate(
    col = start_time,
    into = c("start_date", "start_time"),
    sep = " "
  ) |>
  separate(
    col = end_time,
    into = c("end_date", "end_time"),
    sep = " "
  )

performs a summary of the data set that is relevant for exploratory data analysis related to the planned analysis 
creates a visualization of the dataset that is relevant for exploratory data analysis related to the planned analysis
performs the data analysis
creates a visualization of the analysis 


# Loading the Data

In [35]:
library(tidyverse)

In [3]:
# Read the session records from the CSV file in the data folder.
sessions <- read_csv("data/sessions.csv")

Rows: 1535 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): hashedEmail, start_time, end_time
dbl (2): original_start_time, original_end_time

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [4]:
# Count the rows (observations) in the sessions table and report the total.
numrows <- nrow(sessions)
sprintf("Number of Observations: %s", numrows)

# Wrangling and Cleaning the Data

The data in the previously loaded sessions file will have to be tidied so that it can be used more effectively later.

To start, we'll rename the hashedEmail column to be consistent with the other columns and separate the start and end date and time into their own columns.

In [36]:
# Tidy the raw sessions table: give the e-mail hash a snake_case name and
# split each "date time" string into its own date column and time column.
sessions_tidy <- sessions |>
  rename(hashed_email = hashedEmail) |>
  # Park the raw timestamp columns under temporary names so the new time
  # columns can take over the names start_time and end_time.
  rename(start_time1 = start_time, end_time1 = end_time) |>
  separate(
    col = start_time1,
    into = c("start_date", "start_time"),
    sep = " "
  ) |>
  separate(
    col = end_time1,
    into = c("end_date", "end_time"),
    sep = " "
  )
sessions_tidy |>
  head(5)

hashed_email,start_date,start_time,end_date,end_time,original_start_time,original_end_time
<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
bfce39c89d6549f2bb94d8064d3ce69dc3d7e72b38f431d8aa0c4bf95ccee6bf,30/06/2024,18:12,30/06/2024,18:24,1719770000000.0,1719770000000.0
36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f575d4acc9cf487c4686,17/06/2024,23:33,17/06/2024,23:46,1718670000000.0,1718670000000.0
f8f5477f5a2e53616ae37421b1c660b971192bd8ff77e3398304c7ae42581fdc,25/07/2024,17:34,25/07/2024,17:57,1721930000000.0,1721930000000.0
bfce39c89d6549f2bb94d8064d3ce69dc3d7e72b38f431d8aa0c4bf95ccee6bf,25/07/2024,03:22,25/07/2024,03:58,1721880000000.0,1721880000000.0
36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f575d4acc9cf487c4686,25/05/2024,16:01,25/05/2024,16:12,1716650000000.0,1716650000000.0
bfce39c89d6549f2bb94d8064d3ce69dc3d7e72b38f431d8aa0c4bf95ccee6bf,23/06/2024,15:08,23/06/2024,17:10,1719160000000.0,1719160000000.0
fd6563a4e0f6f4273580e5fedbd8dda64990447aea5a33cbb5e894a3867ca44d,15/04/2024,07:12,15/04/2024,07:21,1713170000000.0,1713170000000.0
ad6390295640af1ed0e45ffc58a53b2d9074b0eea694b16210addd44d7c81f83,21/09/2024,02:13,21/09/2024,02:30,1726880000000.0,1726890000000.0
96e190b0bf3923cd8d349eee467c09d1130af143335779251492eb4c2c058a5f,21/06/2024,02:31,21/06/2024,02:49,1718940000000.0,1718940000000.0
36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f575d4acc9cf487c4686,16/05/2024,05:13,16/05/2024,05:52,1715840000000.0,1715840000000.0


Great, next we'll have to convert the start and end dates and times to a usable form. We'll do that by creating a new column called weekday and a column called month, made by converting our date to a Date type, and then we'll convert our times to numerical values (decimal hours). Finally, we'll create a length (in minutes) column by subtracting the start time from the end time.

In [45]:
# Build the analysis table: weekday of the session start, start and end time
# of day as decimal hours, and the session length in minutes.
sessions_wrangled <- sessions_tidy |>
  # Full start/end instants. The length must come from these rather than from
  # the time of day alone, otherwise a session that runs past midnight gets a
  # negative length. Both are parsed in the same fixed zone (UTC) so the
  # difference is not affected by daylight-saving shifts.
  # NOTE(review): the time zone the data was recorded in is not visible here.
  mutate(
    start_at = as.POSIXct(
      paste(start_date, start_time),
      format = "%d/%m/%Y %H:%M",
      tz = "UTC"
    ),
    end_at = as.POSIXct(
      paste(end_date, end_time),
      format = "%d/%m/%Y %H:%M",
      tz = "UTC"
    )
  ) |>
  # get weekdays (names follow the session's locale)
  mutate(
    weekday = as.factor(weekdays(as.Date(start_date, format = "%d/%m/%Y")))
  ) |>
  # convert start_time to decimal hours
  separate(start_time, into = c("start_hour", "start_minute"), sep = ":") |>
  mutate(
    start_time_dbl = as.numeric(start_hour) + (as.numeric(start_minute) / 60)
  ) |>
  # convert end_time to decimal hours
  separate(end_time, into = c("end_hour", "end_minute"), sep = ":") |>
  mutate(
    end_time_dbl = as.numeric(end_hour) + (as.numeric(end_minute) / 60)
  ) |>
  # get length of session in minutes from the full date-times
  mutate(length = as.numeric(difftime(end_at, start_at, units = "mins"))) |>
  # select only important columns
  select(weekday, start_time_dbl, end_time_dbl, length)
head(sessions_wrangled, 5)

weekday,start_time_dbl,end_time_dbl,length
<fct>,<dbl>,<dbl>,<dbl>
Sunday,18.2,18.4,12
Monday,23.55,23.766667,13
Thursday,17.566667,17.95,23
Thursday,3.366667,3.966667,36
Saturday,16.016667,16.2,11


# Summary of the Data

# Visualization of the dataset

# Data Analysis

# Visualization of the Analysis 