added workflow script
stineb committed May 22, 2023
1 parent 3fc0364 commit b263444
Showing 2 changed files with 201 additions and 0 deletions.
151 changes: 151 additions & 0 deletions add_material/full_workflow_no_caret.R
@@ -0,0 +1,151 @@
# This script demonstrates the implementation of a full workflow "by hand".
# Used in Care Session 2.

library(tidyverse)  # provides read_csv(), dplyr, drop_na(), tibble(), ggplot2

set.seed(1982)

# Read and wrangle data ---------------------
# (no pre-processing yet: centering and scaling are done inside the CV loop below)

# option 1: read an already prepared file (no wrangling needed: cleaning, variable selection, etc.)
# df <- read.csv(paste0(here::here(), "/data/df_daily_exercise_supervisedmlii.csv"))

# option 2 (alternative): read the raw FLUXNET data and wrangle it
df <- read_csv(paste0(here::here(), "/data/FLX_CH-Dav_FLUXNET2015_FULLSET_DD_1997-2014_1-3.csv")) |>

  # select only the variables we are interested in
  dplyr::select(TIMESTAMP,
                GPP_NT_VUT_REF,    # the target
                ends_with("_QC"),  # quality control info
                ends_with("_F"),   # includes all meteorological covariates
                -contains("JSB")   # weird useless variable
                ) |>

  # convert to a nice date object
  dplyr::mutate(TIMESTAMP = lubridate::ymd(TIMESTAMP)) |>

  # set all -9999 to NA
  # note: `dplyr::na_if(-9999)` applied to a whole data frame is no longer
  # supported by newer dplyr versions, hence the `across()` form is used
  dplyr::mutate(dplyr::across(where(is.numeric), ~na_if(., -9999))) |>

  # retain only data based on >=80% good-quality measurements
  # overwrite bad data with NA (not dropping rows)
  dplyr::mutate(GPP_NT_VUT_REF = ifelse(NEE_VUT_REF_QC < 0.8, NA, GPP_NT_VUT_REF),
                TA_F    = ifelse(TA_F_QC    < 0.8, NA, TA_F),
                SW_IN_F = ifelse(SW_IN_F_QC < 0.8, NA, SW_IN_F),
                LW_IN_F = ifelse(LW_IN_F_QC < 0.8, NA, LW_IN_F),
                VPD_F   = ifelse(VPD_F_QC   < 0.8, NA, VPD_F),
                PA_F    = ifelse(PA_F_QC    < 0.8, NA, PA_F),
                P_F     = ifelse(P_F_QC     < 0.8, NA, P_F),
                WS_F    = ifelse(WS_F_QC    < 0.8, NA, WS_F)) |>

  # drop QC variables (no longer needed)
  dplyr::select(-ends_with("_QC")) |>
  drop_na()

# Split data into train and test ---------------------
# (skipped here: the full data set is used for cross-validation below)

# Specify target and predictor variables ---------------------
# (the "by hand" equivalent of allocating "roles" to variables in a pre-processing recipe)
nam_target <- "GPP_NT_VUT_REF"
nams_predictors <- c("TA_F", "SW_IN_F", "VPD_F")

# Hyperparameter loop ---------------------
vec_k <- c(2, 5, 10, 15, 20, 25, 30, 35, 40, 60, 100, 200, 300)

# specify number of cross-validation folds
n_folds <- 5

# initialise vector of average loss (mean across resamples), one element per hyperparameter choice
loss_avg <- c()

for (i_k in seq_along(vec_k)){

  # re-shuffle rows in data frame (not strictly required)
  df <- df[sample(nrow(df)), ]

  # create folds:
  # determine row indices to be allocated to each fold
  # each fold takes in 1/n_folds of the total number of rows
  nrows_per_fold <- ceiling(nrow(df) / n_folds)
  idx <- rep(seq_len(n_folds), each = nrows_per_fold)
  resample_folds <- split(1:nrow(df), idx[1:nrow(df)])  # yields a list of vectors containing the row indices

  ## resample loop ---------------------
  # initialise vector of loss per resample
  loss_vec <- c()

  for (i_resample in seq(n_folds)){

    ## Split into validation and (remaining) train set ---------------------
    # validation set
    df_valid <- df[resample_folds[[i_resample]], c(nam_target, nams_predictors)]

    # remaining training set
    df_train <- df[-resample_folds[[i_resample]], c(nam_target, nams_predictors)]

    ## Apply pre-processing ---------------------
    # center and scale based on training data parameters
    mean_byvar <- c()
    sd_byvar <- c()
    df_train_cs <- df_train * NA
    df_valid_cs <- df_valid * NA

    # center and scale each predictor
    for (ivar in nams_predictors){

      # determine mean and sd for centering and scaling
      mymean <- mean(df_train[[ivar]], na.rm = TRUE)
      mysd   <- sd(df_train[[ivar]], na.rm = TRUE)

      # center and scale training data
      df_train_cs[, ivar] <- df_train[[ivar]] - mymean
      df_train_cs[, ivar] <- df_train_cs[[ivar]] / mysd

      # center and scale validation data
      # important: use parameters (mean and sd) determined on training data
      df_valid_cs[, ivar] <- df_valid[[ivar]] - mymean
      df_valid_cs[, ivar] <- df_valid_cs[[ivar]] / mysd
    }

    # add unmodified target variable
    df_valid_cs[, nam_target] <- df_valid[[nam_target]]
    df_train_cs[, nam_target] <- df_train[[nam_target]]

    ## Fit model on train set ---------------------
    # train a k-nearest-neighbours regression using the scaled training data
    mod <- caret::knnreg(df_train_cs[, nams_predictors],
                         df_train_cs[[nam_target]],  # `[[` returns a vector, as expected by knnreg()
                         k = vec_k[i_k]
                         )

    ## Predict on validation set ---------------------
    # on the scaled validation set!
    df_valid_cs$pred <- predict(mod, newdata = df_valid_cs[, nams_predictors])

    ## Get loss from prediction on validation set ---------------------
    # here, the mean absolute error on the validation data
    loss_vec[i_resample] <- mean(abs(df_valid_cs$pred - df_valid_cs[[nam_target]]), na.rm = TRUE)

  }

  # Average loss across resamples ---------------------
  loss_avg[i_k] <- mean(loss_vec, na.rm = TRUE)

}

# Pick best hyperparameter based on average loss across resamples ---------------------
best_k <- vec_k[which.min(loss_avg)]

# print result
print(paste("The best choice of hyperparameter k is:", best_k))

# visualise
library(ggplot2)
df_plot <- tibble(
  k = vec_k,
  loss = loss_avg
)

ggplot(data = df_plot, aes(k, loss)) +
  geom_point() +
  geom_line() +
  theme_classic() +
  labs(title = "Validation loss", subtitle = "Mean absolute error, mean across folds")
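
# Fit final model with the selected k ---------------------
# A minimal sketch of a possible final step: refit the k-nearest-neighbours model
# on the full pre-processed data using `best_k`. For brevity, scale() is used here
# instead of the manual centering and scaling applied within the CV loop above.
mod_final <- caret::knnreg(scale(df[, nams_predictors]),
                           df[[nam_target]],
                           k = best_k)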
50 changes: 50 additions & 0 deletions add_material/mypseudo_solutions_stepwise-regression.Rmd
@@ -0,0 +1,50 @@
This was written in class for demonstration. No guarantee that this is correct ;-).

```{r}
# specify the target
#
# initialise vector of selected variables, the set of candidate predictors, and the reference AIC
all_vars_selected <- c()
remaining_candidates <- preds_all
AIC_last <- Inf

for (pred in preds_all){
  #
  # initialise candidate models
  linmod_candidate <- list()
  for (pred_candidate in remaining_candidates){
    # specify model formula
    # fit model
    # extract R-squared
  }
  # determine highest R-squared and corresponding variable name -> var_selected, linmod_current
  # update vector of selected variables
  all_vars_selected <- c(all_vars_selected, var_selected)
  # drop selected variable (var_selected) from candidate predictors (remaining_candidates)
  remaining_candidates <- remaining_candidates[-which(remaining_candidates == var_selected)]
  # determine whether AIC improved
  AIC_current <- AIC(linmod_current)
  # if it hasn't improved, quit; otherwise remember the current AIC for the next iteration
  if (AIC_current > AIC_last) break
  AIC_last <- AIC_current
}
```
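
A minimal runnable sketch of the forward-selection logic above. The data frame `df`, the target name `GPP_NT_VUT_REF`, and the candidate predictor names in `preds_all` are assumptions chosen here for illustration; adapt them to the actual exercise data.

```{r}
# forward stepwise selection: add the predictor with the highest R-squared at each
# step, and stop once the AIC of the selected model no longer improves
target    <- "GPP_NT_VUT_REF"                                         # assumed target column
preds_all <- c("TA_F", "SW_IN_F", "LW_IN_F", "VPD_F", "PA_F", "WS_F") # assumed candidate predictors

all_vars_selected    <- c()
remaining_candidates <- preds_all
AIC_last    <- Inf
linmod_best <- NULL

for (i in seq_along(preds_all)){

  # fit one candidate model per remaining predictor and record its R-squared
  rsq <- c()
  linmod_candidate <- list()
  for (pred_candidate in remaining_candidates){
    forml <- as.formula(paste(target, "~", paste(c(all_vars_selected, pred_candidate), collapse = " + ")))
    linmod_candidate[[pred_candidate]] <- lm(forml, data = df)
    rsq[pred_candidate] <- summary(linmod_candidate[[pred_candidate]])$r.squared
  }

  # pick the candidate with the highest R-squared
  var_selected   <- names(which.max(rsq))
  linmod_current <- linmod_candidate[[var_selected]]

  # stop if the AIC did not improve; otherwise accept the variable and continue
  AIC_current <- AIC(linmod_current)
  if (AIC_current > AIC_last) break
  AIC_last    <- AIC_current
  linmod_best <- linmod_current
  all_vars_selected    <- c(all_vars_selected, var_selected)
  remaining_candidates <- setdiff(remaining_candidates, var_selected)
}

all_vars_selected
```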
