In [None]:
library(tidyverse)
library(caret)
library(doParallel)
library(xtable)

In [None]:
# Start a socket (PSOCK) cluster with 16 workers and register it as the
# parallel backend; presumably this lets caret::train() run its resampling
# in parallel — confirm.
# NOTE(review): the worker count is hard-coded and no stopCluster(cl) appears
# in the visible cells — confirm the cluster is shut down elsewhere.
cl <- makePSOCKcluster(16)
registerDoParallel(cl)

### Data Loading

In [None]:
# Evaluate data_load.R in this session; presumably it defines
# load_from_path(), which is called in the data-loading cell — confirm.
source("data_load.R")

In [None]:
# Directory holding the input CSV files.
path <- "../data"
# Full paths of the CSV files in `path`. `pattern` is a regular expression,
# so the dot is escaped and the match is anchored to the end of the name.
# NOTE(review): `files` is not used in the visible cells; kept in case later
# code relies on it.
files <- list.files(path = path, pattern = "\\.csv$", full.names = TRUE)
# Load the data through load_from_path() (defined outside this cell), turn
# `multi` into a factor and keep only the listed max_peak settings.
df <- load_from_path(path) %>%
    mutate(multi = factor(multi)) %>%
    filter(max_peak %in% c(38, 40, 42, 45, 50, 60))

In [None]:
# Fix the RNG seed of this session so random draws are repeatable.
# NOTE(review): random numbers drawn on the parallel workers may not be
# governed by this seed — confirm if exact reproducibility matters.
set.seed(1)

In [None]:
# Target number of training rows per resample (10 weeks of 7 days); divided
# by the group size to obtain the training fraction in train_resamples().
N_TRAINING_DAYS <- 10 * 7
# Number of random train/test partitions drawn per parameter combination.
N_RESAMPLES <- 20

In [None]:
# Names of the predictor columns handed to the model: the raw C estimate,
# the quantile and moment summaries, and the eight FFT features in both
# their plain ("fftK") and "n"-suffixed ("fftKn") variants.
train_cols <- c(
    "C",
    paste0("q", c(0, 10, 25, 50, 75, 90, 100)),
    "mean", "std", "var",
    paste0("fft", 1:8),
    paste0("fft", 1:8, "n")
)

In [None]:
# Fit a model with caret using method "gbm".
#
# train.x: predictors, passed to train() as `x`.
# train.y: outcome, passed to train() as `y`.
# Returns the object produced by train(), tuned with 5-fold cross-validation
# repeated twice.
gbm <- function(train.x, train.y) {
    fitControl <- trainControl(method = "repeatedcv", number = 5, repeats = 2)
    # Spell out FALSE: `F` is an ordinary variable that can be reassigned,
    # which would silently flip this flag.
    train(
        x = train.x,
        y = train.y,
        method = "gbm",
        trControl = fitControl,
        verbose = FALSE
    )
}

In [None]:
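# Disabled example: fit a single model on one training frame.
# NOTE: `df_train` is not defined in the visible cells; create it before
# re-enabling these lines.
# train.x <- df_train[, train_cols] %>% as.data.frame()
# train.y <- df_train %>% .[["R"]]
# trained_model <- gbm(train.x, train.y)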

In [None]:
# For one specific parameter combination (of max_peak, ...): estimate how much
# a learned correction improves the estimate of the maximum C on held-out rows.
#
# x:           data frame of predictors; must contain a column `C`.
# R:           correction target, as a single-column data frame (or a vector).
# C_reference: reference C values, aligned row by row with `x`.
# m:           function(train.x, train.y) returning a fitted model that
#              predict() accepts.
#
# Draws N_RESAMPLES random partitions with a training fraction of
# N_TRAINING_DAYS / nrow(x). Returns a list with one numeric per partition:
#   |C_real - max(C)| - |C_real - max(C - prediction)|
# evaluated on the rows outside the partition, so positive values mean the
# correction brought the estimated maximum closer to the reference.
train_resamples <- function(x, R, C_reference, m) {
    target <- if (is.data.frame(R)) R[[1]] else R
    n_rows <- nrow(x)
    stopifnot(length(target) == n_rows, length(C_reference) == n_rows)
    # Without spare rows there is no test split and every max() below would be
    # taken over an empty set.
    if (n_rows <= N_TRAINING_DAYS) {
        stop("train_resamples() needs more than ", N_TRAINING_DAYS,
             " rows to leave a test split, got ", n_rows, call. = FALSE)
    }

    # NOTE(review): createDataPartition() stratifies a numeric `y`, so the
    # training size is presumably only approximately N_TRAINING_DAYS — confirm.
    train_splits <- createDataPartition(seq_len(n_rows),
                                        times = N_RESAMPLES,
                                        p = N_TRAINING_DAYS / n_rows,
                                        list = TRUE)

    lapply(train_splits, function(train) {
        # Train on the partition, evaluate on everything outside it.
        # drop = FALSE keeps a data frame even if `x` has a single column.
        held_out <- x[-train, , drop = FALSE]
        trained_model <- m(x[train, , drop = FALSE], target[train])
        predictions <- predict(trained_model, held_out)

        C_real <- max(C_reference[-train])               # reference maximum
        C_non_corrected <- max(held_out$C)               # raw estimate
        C_corrected <- max(held_out$C - predictions)     # corrected estimate

        abs(C_real - C_non_corrected) - abs(C_real - C_corrected)
    })
}

In [None]:
# Run the resampled GBM evaluation for the rows of one nested group.
#
# df_group: data frame holding the `train_cols` predictors plus the columns
#           `R` (target) and `C_reference`.
# Returns whatever train_resamples() returns for this group.
train_on_df_group <- function(df_group) {
    predictors <- as.data.frame(df_group[, train_cols])
    target <- as.data.frame(select(df_group, R))
    train_resamples(predictors, target, df_group$C_reference, gbm)
}

In [None]:
# Working subset for the model comparison: drop the `day` and `multi`
# columns, discard rows without a C value, and keep two peak settings and
# four aggregation intervals.
df_subset <- df %>%
    select(-c(day, multi)) %>%
    filter(max_peak %in% c(40, 38)) %>%
    filter(aggregation_interval %in% c(300, 900, 1800, 3600)) %>%
    filter(!is.na(C))

In [None]:
# Evaluate the model once per (max_peak, aggregation_interval,
# aggregation_type) combination: nest() collects each group's rows into a
# `data` list-column and train_on_df_group() is applied to every nested
# frame. `improvements` holds that function's result for each group.
results <- df_subset %>%
    group_by(max_peak, aggregation_interval, aggregation_type) %>%
    nest() %>%
    mutate(improvements = map(data, train_on_df_group))

In [None]:
# Summarise the per-resample improvements into one row per
# (max_peak, aggregation_interval), with a median and a maximum column for
# every aggregation_type.
table_var <- results %>%
    select(max_peak, aggregation_interval, aggregation_type, improvements) %>%
    # Name the list-column explicitly; a bare unnest() is deprecated.
    unnest(improvements) %>%
    # Flatten the unnested per-resample values into a plain numeric column.
    mutate(improvements = as.numeric(improvements)) %>%
    group_by(max_peak, aggregation_interval, aggregation_type) %>%
    summarize(median_improvement = median(improvements),
              max_improvement = max(improvements)) %>%
    # Drop the grouping left over by summarize() before reshaping.
    ungroup() %>%
    # gather()/spread() are kept deliberately: switching to the pivot_*
    # functions could reorder the output columns, and the table is exported
    # without a header row.
    gather(variable, value, median_improvement, max_improvement) %>%
    unite(tmp, aggregation_type, variable) %>%
    spread(tmp, value)

In [None]:
# Display the summary table in the notebook output.
table_var

In [None]:
# Export the rows of the summary table as LaTeX: no row names, no header row
# and no surrounding tabular environment (only.contents), using booktabs
# rules.
# Create the target directory first; writing fails if it does not exist.
dir.create("export", showWarnings = FALSE, recursive = TRUE)
print(xtable(table_var),
      include.rownames = FALSE,
      include.colnames = FALSE,
      only.contents = TRUE,
      booktabs = TRUE,
      # Rule after the fourth data row; presumably separates the two
      # max_peak settings — confirm against the table layout.
      hline.after = 4,
      file = "export/table_model_comparison.tex")