In [118]:
install.packages("caret")

Installing package into ‘/usr/local/lib/R/3.5/site-library’
(as ‘lib’ is unspecified)
also installing the dependencies ‘numDeriv’, ‘SQUAREM’, ‘lava’, ‘prodlim’, ‘iterators’, ‘gower’, ‘ipred’, ‘timeDate’, ‘foreach’, ‘ModelMetrics’, ‘recipes’



In [119]:
library(dplyr)
library(plsRglm)
library(caret)

Loading required package: lattice
Loading required package: ggplot2


In [108]:
get_default_hyperparameters <- function(){
    hyperparameters <- list()
    attr(hyperparameters, "ensemble_size") <- 2
    attr(hyperparameters, "n_attempts") <- 3
    attr(hyperparameters, "single_model_K") <- 10
    attr(hyperparameters, "pls_components") <- 2
    attr(hyperparameters, "pls_model") <- "pls-glm-gaussian"
    attr(hyperparameters, "use_wvc") <- TRUE
    attr(hyperparameters, "retest_predictors") <- FALSE
    return(hyperparameters)
}

In [135]:
#this function does all preprocessing (normalization etc.)
#and defines new features by applying transformations
preprocess <- function(predictors, hyperparameters=get_default_hyperparameters()){
    
    znorm <- function(x)  (x - mean(x))/ sd(x)
    predictors <- apply(predictors, 2, znorm)
    
    return(predictors)
    
}

In [None]:
run_ensemble_stacked <- function(ensemble){
    
}

In [159]:
run_ensemble <- function(ensemble_models, predictors, hyperparameters=get_default_hyperparameters(), verbose=FALSE){
    ensemble_size <- length(ensemble_models)
    predictions <- vector("list", ensemble_size)
    for (n in 1:ensemble_size){
        if(verbose){
            print(paste("running model", n, "/", ensemble_size, sep=" "))
        }
        predictions[[n]] <- predict(ensemble_models[[n]], newData=predictors)
    }
    predictions <- t(Reduce(rbind, predictions))
    print(predictions)
    return(predictions)
}

In [150]:
train_ensemble <- function(ensemble_definition, predictors, response, hyperparameters=get_default_hyperparameters(), verbose=FALSE){
    ensemble_size <- length(ensemble_definition)
    ensemble_models <- vector("list", ensemble_size)
    n_comp <- attr(hyperparameters, "pls_components")
    modele <- attr(hyperparameters, "pls_model")
    
    for (n in 1:ensemble_size){
        if(verbose){
            print(paste("training model", n, "/", ensemble_size, sep=" "))
        }
        train_X <- predictors[,ensemble_definition[[n]]]
        train_Y <- response
        ensemble_models[[n]] <- plsRglm(train_Y, train_X, nt=n_comp, modele=modele, verbose=FALSE)
    }
    
    return(ensemble_models)
}

In [151]:
find_ensemble <- function(predictors, response, hyperparameters=get_default_hyperparameters(), verbose=FALSE){
    
    
    ensemble_size <- attr(hyperparameters, "ensemble_size")
    ensemble = vector("list", ensemble_size)
    
    for (n in 1:ensemble_size){
    
        if(verbose){
            print(paste("finding model", n, "/", ensemble_size, sep=" "))
        }
        ensemble[[n]] <- find_single_model(predictors, response, hyperparameters=hyperparameters, verbose=verbose)
    }
    return(ensemble)
}

In [153]:
#find a single model
#to serve as part of an ensemble
#return set of predictors used for that model
find_single_model <- function(predictors, response, hyperparameters=get_default_hyperparameters(), verbose=FALSE){
    
    K <- attr(hyperparameters, "single_model_K")
    CV_folds   <- get_CV_folds(nrow(predictors), K)
    train_indices <- CV_folds[[1]]
    test_indices <- CV_folds[[2]]
    n_attempts <- attr(hyperparameters, "n_attempts")
    n_comp <- attr(hyperparameters, "pls_components")
    modele <- attr(hyperparameters, "pls_model")
    retest <- attr(hyperparameters, "retest_predictors")
    use_wvc <- FALSE#attr(hyperparameters, "use_wvc")
    
    if(use_wvc){
        plsRglm_func = PLS_glm_wvc
    }
    else{
        plsRglm_func = plsRglm
    }
    
    predictor_set <- c()
    best_predictor_set <- c()
    tested_predictors <- c()
    best_score <- Inf
    
    for(n in 1:n_attempts){
        
        
        current_predictor_set <- best_predictor_set
        
        #for the first iteration, run the empty model
        #to get performance of intercept
        #after first iteration, add predictors to model
        if(n > 1){
            all_predictors <- colnames(predictors)
            unused_predictors <- c()
            for(j in 1:length(all_predictors)){
                p <- all_predictors[[j]]
                if(!( p %in% current_predictor_set) && (retest || !(p %in% tested_predictors))){
                    unused_predictors <- c(unused_predictors, p)
                }
            }
            p <- sample(unused_predictors, 2, replace=FALSE)
            current_predictor_set <- c(current_predictor_set, p)
            tested_predictors <- p
        }
        if(verbose){
            print(paste("Evaluating model:", paste(current_predictor_set, sep=" ", collapse=", "), sep=" "))
        }
        
        oof_scores <- rep(0, K)
        
        for(k in 1:K){
            if(length(current_predictor_set)==0){
                #empty model - just use mean as intercept
                train_Y <- response[train_indices[[k]]]
                test_Y  <- response[test_indices[[k]]] 
                prediction <- mean(train_Y)
                oof_scores[k] <- mean(abs(test_Y - prediction))
            }
            else{
                train_X <- predictors[train_indices[[k]],current_predictor_set]
                test_X  <- predictors[test_indices[[k]],current_predictor_set]
                train_Y <- response[train_indices[[k]]]
                test_Y  <- response[test_indices[[k]]] 

                modplsglm  <- plsRglm_func(train_Y, train_X, dataPredictY=test_X, 
                                           nt=n_comp, modele=modele, verbose=FALSE)
            
                oof_scores[k] <- mean(abs(test_Y - modplsglm$ValsPredictY))
            }
        }
        
        
        #if the model is an improvement over previous best, update it
        final_score <- mean(oof_scores)
        
        if(final_score < best_score){
            output_message = "Keeping model"
            best_score <- final_score
            best_predictor_set <- current_predictor_set
        }
        else{
            output_message = "Discarding model"
        }
        if(verbose){
            print(paste("Model score =", final_score, sep=" "))
            print(output_message)
        }
    }

    
    #return description of this model
    #as a list of predictor names (strings)
    return(best_predictor_set)
}

In [160]:
dev_script()

[1] 1 2
[1] "top-level CV fold  1"
[[1]]
[1] "fixed.acidity" "file.number=1" "citric.acid"   "sulphates"    

[[2]]
[1] "citric.acid"          "density"              "free.sulfur.dioxide" 
[4] "total.sulfur.dioxide"

         init         
1    5.717825 6.121781
2    5.799923 5.380052
3    5.907681 6.183749
4    5.731008 5.647665
5    5.659370 5.953233
6    5.901808 6.211812
7    5.801618 5.698197
8    6.003457 6.378650
9    5.866710 6.083473
10   5.810899 5.625581
11   6.083244 5.984851
12   6.042557 5.837471
13   5.881462 6.238999
14   5.832617 6.069918
15   5.788808 5.728619
16   5.583590 5.424622
17   5.676230 5.548477
18   5.913148 5.482490
19   5.879766 6.344506
20   5.584338 5.505817
21   5.556904 5.648567
22   6.163410 6.107148
23   5.607707 5.414077
24   5.484043 5.435368
25   5.820474 5.799029
26   5.976312 5.943026
27   5.543369 5.564617
28   5.992178 5.662057
29   5.495470 5.257261
30   5.906593 6.109881
31   5.688915 5.877862
32   5.680360 5.950845
33   5.938367 5.717773
3

NULL

In [5]:
find_or_download_data <- function (main_url, files){

    for (i in 1:length(files)){
        if(!file.exists(files[i])){
            print(paste("Could not find", files[i], "- attempting to download", sep=" " ))
            download.file(paste(main_url, files[i],sep="") , files[i], "auto", quiet = FALSE)
        }
    }
    for (i in 1:length(files)){
        if(!file.exists(files[i])){
            print(paste("Could not find or download", files[i], "- training will fail", sep=" " ))
            return(0)
        }
    }
    return(1)
}

In [6]:
get_CV_folds <- function(N, K){
    train_indices <- vector("list", K)
    test_indices  <- vector("list", K)
    random_permutation <- sample(N, N, replace=FALSE)
    for (i in 1:K){
        start <- floor(N*(i-1)/K) + 1
        stop  <- floor(N*i/K)
        test_indices[[i]] <-  random_permutation[start:stop]
        train_indices[[i]] <- random_permutation[-(start:stop)]
    }
    return(list(train_indices, test_indices))
}

In [155]:
#script for evaluating different approaches
dev_script <- function(){
    main_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/"
    files <- c("winequality-red.csv", "winequality-white.csv")
    response_name <- "quality"
    K <- 5
    #number of folds for topmost level of cross-validation
    
    have_data <- find_or_download_data(main_url, files)
    if(!have_data){
        print("Could not find or download data - returning null")
        return(NULL)
    }
    else{
        single_datasets <- vector("list", length(files))
        for (i in 1:length(files)){
            single_datasets[[i]] <- read.csv(files[i], header=TRUE, sep=";")
            single_datasets[[i]]$file.number <- i
        }
        data <- cbind(single_datasets)
        data <- bind_rows(single_datasets)
        #data = full dataset (merged across all files)
        
        #make one-hot encoding of file number
        u <- unique(data$file.number)
        print(u)
        for (val in u[1:(length(u)-1)]){
            data[paste("file.number=",val,sep="")] <- (data$file.number==val)*1.0
        }
        data$file.number <- NULL
        
        response <- data[[response_name]]
        predictors <- data
        predictors[[response_name]] <- NULL
        
        
        predictors <- preprocess(predictors)
        
        CV_indices <- get_CV_folds(nrow(data), K)
        train_indices <- CV_indices[[1]]
        test_indices  <- CV_indices[[2]]
        
        for (i in 1:K){
            print(paste('top-level CV fold ', i, sep=' '))
            #first step: find an ensemble of models
            #by searching the space of possible models
            #(this is the space of 2^P possible subsets of all predictors)
            ensemble_definitions <- find_ensemble(predictors[train_indices[[i]],], response[train_indices[[i]]])
            print(ensemble_definitions)
            #now train the ensemble on the same data
            ensemble_models <- train_ensemble(ensemble_definitions, predictors[train_indices[[i]],], response[train_indices[[i]]])
            predictions <- run_ensemble(ensemble_models, predictors[test_indices[[i]],])
        }
        
        
    }
    return(NULL)
}