In [5]:
library(dplyr)
library(ggplot2)
library(GGally)
library(reshape2)
library(gridExtra)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Attaching package: ‘GGally’

The following object is masked from ‘package:dplyr’:

    nasa


Attaching package: ‘gridExtra’

The following object is masked from ‘package:dplyr’:

    combine



This notebook contains R code for performing classification with ensembles of polynomial regression models. The procedure for generating and running an ensemble is as follows:

Split data into cross-validation folds (e.g. 5 folds)
    Set one fold aside as ensemble hold-out, rest are ensemble development data
    Initialize ensemble as empty list
    For each model in ensemble:
        Initialize model (e.g. all variables present at power 1, all interactions present)
        For each step of model improvement:
            If the model has not been evaluated before: leave it unchanged, so the next steps evaluate baseline                       performance  
            Else: Make a random change to the model (change the poynomial degree of a variable, or add/drop an                       interaction)
            Split ensemble development data into cross-validation folds (e.g. 10 folds), 
                set one fold aside as model hold-out, rest are training data
            Fit model to training data, evaluate on model hold-out data
            If performance on model hold-out data is better than previous version of model, keep the change
       Add model to ensemble
   Fit each model in ensemble to entire development data
   Compute prediction of each model in ensemble on ensemble hold-out
   Take mean of these predictions -> this is ensemble prediction on hold-out
   
            
Note that we develop a different ensemble for each fold of the cross-validation folds.


In [71]:
#script for defining settings/hyperparameters and running ensemble classification
hp <- get_default_hyperparameters()

attr(hp, "verbose") <- TRUE

attr(hp, "test_initial_model") <- TRUE
errors_1 <- evaluate_ensemble(hp)
attr(hp, "ensemble_size") <- 1
attr(hp, "top_level_repeats") <- 10
attr(hp, "test_initial_model") <- FALSE
errors_2 <- evaluate_ensemble(hp)

print(paste("mean error of baseline model   = ", mean(errors_1), sep=" "))
print(paste("mean error of ensemble         = ", mean(errors_2), sep=" "))
print(paste("SEM of error of baseline model = ", sd(errors_1)/sqrt(length(errors_1)), sep=" "))
print(paste("SEM of error of ensemble       = ", sd(errors_2)/sqrt(length(errors_2)), sep=" "))

[1] "Model definitions:"
[[1]]
[[1]][[1]]
 [1] 1 1 0 1 1 1 1 1 1 2 1 1

[[1]][[2]]
  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [38] 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [75] 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[112] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0


[[2]]
[[2]][[1]]
 [1] 1 1 2 1 1 1 1 2 1 1 1 1

[[2]][[2]]
  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [75] 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
[112] 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0


[[3]]
[[3]][[1]]
 [1] 1 1 1 1 1 1 1 1 1 2 2 1

[[3]][[2]]
  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 [75] 0 0 0 0

[1] "mean error of baseline model   =  0.570006915271802"
[1] "mean error of ensemble         =  0.561367379094108"
[1] "SEM of error of baseline model =  0.00356751251098776"
[1] "SEM of error of ensemble       =  0.0014630142723505"


In [93]:
hp <- get_default_hyperparameters()

attr(hp, "verbose") <- TRUE

attr(hp, "test_initial_model") <- TRUE
errors_1 <- evaluate_ensemble(hp)
attr(hp, "test_initial_model") <- FALSE
attr(hp, "ensemble_size") <- 5
attr(hp, "top_level_repeats") <- 2
attr(hp, "return_all_model_outputs") <- TRUE
output_list_2 <- evaluate_ensemble(hp)
errors_2 <- output_list_2[[1]]
ensemble_error_list <- output_list_2[[2]]
single_model_error_list <- output_list_2[[3]]


print(paste("mean error of baseline model      = ", mean(errors_1), sep=" "))
print(paste("mean error of ensemble            = ", mean(errors_2), sep=" "))
print(paste("SEM of error of baseline model    = ", sd(errors_1)/sqrt(length(errors_1)), sep=" "))
print(paste("SEM of error of ensemble          = ", sd(errors_2)/sqrt(length(errors_2)), sep=" "))

#examine difference between error of entire ensemble and error of individual models

print("difference between ensemble-averaged prediction and single-model predictions:")
print("")

for(i in 1:length(ensemble_error_list)){
    print(paste("error of ensemble                 = ", ensemble_error_list[[i]], sep=" "))
    print("errors of single models           =")
    print(single_model_error_list[[i]])
    print(paste("mean difference ensemble - single = ", 
                mean(ensemble_error_list[[i]] - single_model_error_list[[i]]), sep=" "))
}

[1] "Model definitions:"
[[1]]
[[1]][[1]]
 [1] 1 1 1 1 1 1 2 1 1 2 1 1

[[1]][[2]]
  [1] 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [75] 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0
[112] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0


[[2]]
[[2]][[1]]
 [1] 1 1 1 1 1 3 1 0 1 1 1 1

[[2]][[2]]
  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0
[112] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0


[[3]]
[[3]][[1]]
 [1] 1 1 0 1 1 2 1 1 1 1 1 1

[[3]][[2]]
  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [38] 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
 [75] 0 0 0 0

[1] "mean error of baseline model      =  0.569987118714443"
[1] "mean error of ensemble            =  0.562673670182696"
[1] "SEM of error of baseline model    =  0.00701308606170869"
[1] "SEM of error of ensemble          =  0.00263084775357356"
[1] "difference between ensemble-averaged prediction and single-model predictions:"
[1] "error of ensemble                 =  0.562779667010216"
[1] "errors of single models           = \n"
 [1] 0.5623772 0.5687355 0.5659111 0.5628077 0.5711465 0.5683064 0.5681874
 [8] 0.5661382 0.5663612 0.5668976
[1] "mean difference ensemble - single =  -0.00390719342035793"
[1] "error of ensemble                 =  0.55045954372604"
[1] "errors of single models           = \n"
 [1] 0.5471491 0.5534135 0.5474602 0.5553511 0.5525944 0.5569561 0.5572278
 [8] 0.5559926 0.5532047 0.5566944
[1] "mean difference ensemble - single =  -0.00314484574588236"
[1] "error of ensemble                 =  0.570507182944093"
[1] "errors of single models           = \n"
 [1

In [80]:
#return the default hyperparameter object
get_default_hyperparameters <- function(){
    
    hyperparameters <- list()
    
    #dataset-specific settings:
    
    attr(hyperparameters, "main_url") <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/"
    attr(hyperparameters, "files")    <- c("winequality-red.csv", "winequality-white.csv")
    attr(hyperparameters, "response_name") <- "quality"
    
    attr(hyperparameters, "verbose") <- FALSE
    
    
    #hyperparameters of the ensemble:
    
    attr(hyperparameters, "test_initial_model") <- FALSE
    #if this is set to TRUE, then do not attempt to improve linear models or form ensemble
    #instead just fit a single linear model to data, with powers given by initial settings below
    
    attr(hyperparameters, "initial_power") <- 1
    attr(hyperparameters, "initial_interaction") <- 0
    #polynomial powers for variables and interactions in the initial linear model
    #both set to 1: initial model has all variables and all interactions
    #both set to 0: initial model is empty (has only intercept)
    #variables = 1, interactions = 0 : initial model has all variables, no interactions
    #(these are probably the only sensible settings)
    
    attr(hyperparameters, "top_level_repeats") <- 1
    #number of repeats to perform at top level (evaluating entire ensemble-forming strategy)
    
    attr(hyperparameters, "ensemble_K")  <- 5
    #number of folds of cross-validation for evaluating entire ensembles
    
    
    attr(hyperparameters, "ensemble_size") <- 10
    #number of models in ensemble
    attr(hyperparameters, "n_attempts") <- 20
    #number of attempts made to improve a single regression model
    #(by changing the polynomial degreee of variables, or adding/dropping interactions)
    
    attr(hyperparameters, "single_model_K") <- 5
    #number of folds of cross-validation for evaluating single models
    
    attr(hyperparameters, "max_power") <- 4
    #max polynomial power to raise a variable to 
    attr(hyperparameters, "weight_by_scores") <- FALSE
    #whether to weight the single models in the ensemble by their performance on out-of-fold data
    #when taking the average over all models
    
    attr(hyperparameters, "P_change_interactions") <- 0.5
    #probability of changing an interaction in a step of model adjustment
    #as opposed to changing a variable power
    
    attr(hyperparameters, "return_all_model_outputs") <- FALSE
    #if FALSE, return the ensemble average (this is usually what we want)
    #if TRUE, return a matrix consisting of the predictions of each individual model
    return(hyperparameters)
}

In [96]:
#main function for evaluating different approaches
# hyperparameters : list with settings and parameters for ensemble generation
# returns: vector of scores of ensembles on hold-out data
evaluate_ensemble <- function(hyperparameters){

    verbose <- attr(hyperparameters, "verbose")
    K <- attr(hyperparameters, "ensemble_K")
    R <- attr(hyperparameters, "top_level_repeats")
    
    main_url <- attr(hyperparameters, "main_url")
    files    <- attr(hyperparameters, "files")
    response_name <- attr(hyperparameters, "response_name")
    
    have_data <- find_or_download_data(main_url, files)
    if(!have_data){
        print("Could not find or download data - returning null")
        return(NULL)
    }
    else{
        
        data <- read_data(files)
        
        data <- preprocess(data, response_name)
        
        CV_indices <- get_CV_folds(nrow(data), K)
        train_indices <- CV_indices[[1]]
        test_indices  <- CV_indices[[2]]
        
        error_matrix = matrix(0, R, K)
        #matrix of errors of all ensembles
        #dimensions: R (number of top-level repeats) x K (number of CV folds)
        
        
        #these are used if attr(hyperparameters, "return_all_model_outputs") = TRUE
        ensemble_error_list = list()
        single_model_error_list = list()
        
        for (r in 1:R){
            if(verbose){
                #print(paste('Top-level repeat', r, sep=' '))
            }
            for (k in 1:K){
                if(verbose){
                    #print(paste('Top-level CV fold', k, sep=' '))
                }
                if(attr(hyperparameters, "test_initial_model")){
                    #just fit one linear model to original predictors
                    #this is for purposes of comparison with different ensembles
                    initial_power       <- attr(hyperparameters, "initial_power")
                    initial_interaction <- attr(hyperparameters, "initial_interaction")
                    
                    npred <- ncol(data)-1
                    powers <- rep(initial_power, npred)
                    interactions <- rep(initial_interaction, npred*npred)
                    
                    predictions <- polyreg(data, powers, interactions, train_indices[[k]], test_indices[[k]])
                    error_matrix[r,k] <- mean(abs(data[test_indices[[k]],]$y-predictions))
                    
                }
                else{
                    #first step: find an ensemble of models
                    #by searching the space of possible models
                    #(this is the space of 2^P possible subsets of all predictors)
                    outputs <- find_ensemble(data[train_indices[[k]],])
                    errors <- outputs[[1]]
                    ensemble_definitions <- outputs[[2]]
                    if(verbose){
                        print('Model definitions:')
                        print(ensemble_definitions)
                        print('Errors:')
                        print(errors)
                    }
                    #now train the ensemble on the same data
                    
                    
                    if(attr(hyperparameters, "return_all_model_outputs")){
                        #returned full matrix with individual predictions of all models
                        output_list <- run_ensemble(ensemble_definitions, data, train_indices[[k]], test_indices[[k]], scores, hyperparameters)
                        predictions <- output_list[[1]]
                        #these are the final predictions, already averaged over ensemble
                        single_model_matrix <- output_list[[2]]
                        ensemble_error <- mean(abs(predictions - data$y[test_indices[[k]]]))
                        single_model_error <- rep(0, dim(single_model_matrix)[2])
                        for(u in 1:length(single_model_error)){
                            single_model_error[u] <- mean(abs(single_model_matrix[,u] - data$y[test_indices[[k]]]))
                        }
                        ensemble_error_list[[length(ensemble_error_list)+1]] <- ensemble_error
                        single_model_error_list[[length(single_model_error_list)+1]] <- single_model_error

                    }
                    else{
                        predictions <- run_ensemble(ensemble_definitions, data, train_indices[[k]], test_indices[[k]], scores, hyperparameters)
                    }
                    error_matrix[r,k] <- mean(abs(predictions - data$y[test_indices[[k]]]))
                }
            }
        }
    }
    
    
    dim(error_matrix) <- NULL
    if(attr(hyperparameters, "return_all_model_outputs")){
        return(list(error_matrix, ensemble_error_list, single_model_error_list))
    }
    else{
        return(error_matrix)
    }
}

In [81]:
#z-score predictors
preprocess <- function(data, response_name, hyperparameters=get_default_hyperparameters()){
    
    #separate into response and predictors
    response <- data[[response_name]]
    predictors <- data
    predictors[[response_name]] <- NULL
        
    #z-score predictors
    znorm <- function(x)  (x - mean(x))/ sd(x)
    predictors <- apply(predictors, 2, znorm)
    predictors <- as.data.frame(predictors)
        
    #combine into single matrix again
    data <- cbind(response, predictors)
    colnames(data)[1] <- "y"
    
    return(data)
    
}

In [82]:
#run a prespecified ensemble on a dataset
#train the models on datapoints given by train_indices
#make predictions on datapoints given by test_indices, return these predictions
run_ensemble <- function(ensemble_models, data, train_indices, test_indices, scores=NULL, hyperparameters=get_default_hyperparameters(), verbose=FALSE){
    
    ensemble_size <- length(ensemble_models)
    predictions <- vector("list", ensemble_size)
    for (n in 1:ensemble_size){
        if(verbose){
            print(paste("running model", n, "/", ensemble_size, sep=" "))
        }
        
        models <- ensemble_models[[n]]
        
        powers <- models[[1]]
        interactions <- models[[2]]
        
        predictions[[n]] <- polyreg(data, powers, interactions, train_indices, test_indices)
    }
    predictions_matrix <- t(Reduce(rbind, predictions))
    #matrix of all predictions from single models
    #rows correspond to datapoints, columns correspond to individual models
    
    weights = rep(1/length(ensemble_models), length(ensemble_models))
    if(length(scores)>0 && attr(hyperparameters, "weight_by_scores")){
        weights = -log(scores)
        weights = weights/sum(weights)
    }
    else{
        #assume NULL was passed for scores
        weights = rep(1/length(ensemble_models), length(ensemble_models))
    }
    weighted_avg <- rep(0, length(test_indices))
    for (n in 1:ensemble_size){    
        weighted_avg <- weighted_avg + weights[n]*predictions_matrix[,n]
    }
    
    
    if(attr(hyperparameters, "return_all_model_outputs")){
        return(list(weighted_avg, predictions_matrix))
    }
    else{
        return(weighted_avg)
    }
}

In [49]:
#generate ensemble by generating multiple single models
find_ensemble <- function(data, hyperparameters=get_default_hyperparameters(), verbose=FALSE){
     
    ensemble_size <- attr(hyperparameters, "ensemble_size")
    ensemble = vector("list", ensemble_size)
    scores = rep(0, ensemble_size)
    
    for (n in 1:ensemble_size){
    
        if(verbose){
            print(paste("finding model", n, "/", ensemble_size, sep=" "))
        }
        outputs <- find_single_model(data, hyperparameters=hyperparameters, verbose=verbose)
        scores[n] <- outputs[[1]]
        ensemble[[n]] <- outputs[2:3]
    }
    return(list(scores, ensemble))
}

In [50]:
#function for polynomial-regression-based prediction
# data   : data matrix
# powers : vector of polynomial powers that each variable is raised to
#          power = 0 : left out of model
# interactions : vector of interactions
#                if there are P predictors, i > j  and interactions[(i-1)*P + j] > 0, 
#                then the interaction between predictors i and j is in the model
#                (irrespective of the polynomial powers of predictors i and j)
#train_indices : indices of rows to use for training model
#test_indices  : indices of rows to use for testing/prediction
polyreg <- function(data, powers, interactions, train_indices, test_indices){
    
    response <- data$y
    predictors <- data
    predictors[["y"]] <- NULL
    pred_poly_matrix <- NULL
    for(p in 1:ncol(predictors)){
        if(powers[[p]] > 0){
            m <- poly(predictors[[p]], powers[[p]])
            colnames(m) <- paste(colnames(predictors)[p], colnames(m), sep="")
            if(is.null(pred_poly_matrix)){
                pred_poly_matrix <- m
            }
            else{
                pred_poly_matrix <- cbind(pred_poly_matrix, m)
            }
        }
    }
    for(p1 in 1:ncol(predictors)){
        for(p2 in 1:ncol(predictors)){

            if(p1>p2 && interactions[(p1-1)*ncol(predictors) + p2] > 0){
                pred_poly_matrix <- cbind(predictors[[p1]]*predictors[[p2]], pred_poly_matrix)
                colnames(pred_poly_matrix)[1] <- paste(colnames(predictors)[p1], colnames(predictors)[p2], sep="x")
            
            }
        }
    }
    full_data <- as.data.frame(cbind(response, pred_poly_matrix))
    colnames(full_data)[1] <- "y"
    
    #now the full_data dataframe includes all the powers and interactions that we want
    #so the formula is just y ~. , i.e. regress y onto just the columns of the dataframe as they are, 
    #do not add any more powers/interactions    
    model  <- lm(y ~ ., data=full_data[train_indices,])
    
    options(warn=-1)      #turn off warnings
    prediction <- predict(model, newdata=full_data[test_indices,])
    options(warn=1) 
            
    return(prediction)
}

In [60]:
#find a single model
#to serve as part of an ensemble
#by starting with initial polynomial regression model 
#and iteratively attempting to improve it
find_single_model <- function(data, hyperparameters=get_default_hyperparameters(), verbose=FALSE){
    
    K <- attr(hyperparameters, "single_model_K")
    CV_folds   <- get_CV_folds(nrow(data), K)
    train_indices <- CV_folds[[1]]
    test_indices <- CV_folds[[2]]
    
    n_attempts <- attr(hyperparameters, "n_attempts")
    max_power  <- attr(hyperparameters, "max_power")
    initial_power <- attr(hyperparameters, "initial_power")
    initial_interaction <- attr(hyperparameters, "initial_interaction")
    do_interaction <- attr(hyperparameters, "P_change_interactions")
    
    response <- data$y
    predictors <- data
    predictors[["y"]] <- NULL
    n_predictors <- ncol(predictors)
    
    current_powers <- rep(initial_power, n_predictors)
    best_powers <- rep(initial_power, n_predictors)
    current_interactions <- rep(initial_interaction, n_predictors*n_predictors)
    best_interactions <- rep(initial_interaction, n_predictors*n_predictors)
    is_dummy <- rep(FALSE, n_predictors)
    for(i in 1:length(is_dummy)){
        is_dummy[i] <- length(unique(predictors[[i]])) <= 2
    }
    
    best_score <- Inf
    
    null_model_score <- 0
    
    for(n in 1:n_attempts){
        
        
        if(n == 1){
            #do nothing here in the first iteration
        }
        else{
            #in iterations after the first, make random changes to the model
            
            
            if(runif(1) < do_interaction){
                #adjust one randomly chosen interaction
                ps <- sample(1:n_predictors, 2, replace=FALSE)
                p1 <- max(ps)
                p2 <- min(ps)
                index <- (p1-1)*n_predictors + p2
                if(current_interactions[index]==1){
                    current_interactions[index]<-0
                }
                else{
                    current_interactions[index]<-1  
                }
            }
            else{
                #adjust power of a randomly chosen variable
                index <- sample(1:n_predictors, 1)
                sign  <- sample(c(-1, 1), 1)
                if(current_powers[index] == max_power && sign==1){
                    sign <- -1
                }
                if(current_powers[index]==0 && sign==-1){
                    sign <-  1
                }
                if(is_dummy[index] && current_powers[index] == 1){
                    sign <- -1
                }
                current_powers[index] <- current_powers[index] + sign
            }
        }
        if(verbose){
            print(paste("Evaluating model:", paste(current_powers, sep=" ", collapse=", "), sep=" "))
        }
        
        oof_scores <- rep(0, K)
        
        for(k in 1:K){
            if(sum(current_powers)==0){
                #empty model - just use mean as intercept
                train_Y <- response[train_indices[[k]]]
                test_Y  <- response[test_indices[[k]]] 
                prediction <- mean(train_Y)
                oof_scores[k] <- mean(abs(test_Y - prediction))
            }
            else{
                
                y_pred <- polyreg(data, current_powers, current_interactions, train_indices[[k]], test_indices[[k]])
            
                oof_scores[k] <- mean(abs(response[test_indices[[k]]] - y_pred))
            }

        }#end loop over folds
        
        if(n==1){
            starting_model_score <- mean(oof_scores)
        }
        
        #if the model is an improvement over previous best, update it
        final_score <- mean(oof_scores)
        
        if(final_score < best_score){
            output_message = "Keeping model"
            best_score <- final_score
            best_powers <- current_powers
            best_interactions <- current_interactions
        }
        else{
            output_message = "Discarding model"
            current_powers <- best_powers
            current_interactions <- best_interactions
        }
        if(verbose){
            print(paste("Model score =", final_score, sep=" "))
            print(output_message)
        }
    }
    
    #return description of this model
    #as a list with: CV score, vector of powers, vector of interactions
    return(list(best_score, best_powers, best_interactions))
}

In [52]:
#check if data files are present in local directory
#if they are not, try to download
#return 1 if files are now present locally (either found or successfully downloaded)
#return 0 otherwise
#if this returns 1, we expect a subsequent call to read_data to succeed
find_or_download_data <- function (main_url, files){

    for (i in 1:length(files)){
        if(!file.exists(files[i])){
            print(paste("Could not find", files[i], "- attempting to download", sep=" " ))
            download.file(paste(main_url, files[i],sep="") , files[i], "auto", quiet = FALSE)
        }
    }
    for (i in 1:length(files)){
        if(!file.exists(files[i])){
            print(paste("Could not find or download", files[i], "- training will fail", sep=" " ))
            return(0)
        }
    }
    return(1)
}

In [53]:
#read data from .csv files to data.frame
#merge contents of multiple .csv into one dataframe (binds rows)
#add a one-hot encoding of the file number
read_data <- function(files){
    single_datasets <- vector("list", length(files))
    for (i in 1:length(files)){
        single_datasets[[i]] <- read.csv(files[i], header=TRUE, sep=";")
        single_datasets[[i]]$file.number <- i
    }
    data <- bind_rows(single_datasets) 
    #bind_rows : dplyr function
    
    #data is now full dataset (merged across all files)
        
    #make one-hot encoding of file number
    u <- unique(data$file.number)
    for (val in u[1:(length(u)-1)]){
        data[paste("file.number.",val,sep="")] <- (data$file.number==val)*1.0
    }
    data$file.number <- NULL
    #remove initial (categorical) file number column
    return(data)
}

In [54]:
#randomly sample indices of crossvalidation folds
# N  : number of data points
# K  : number of folds
get_CV_folds <- function(N, K){
    train_indices <- vector("list", K)
    test_indices  <- vector("list", K)
    random_permutation <- sample(N, N, replace=FALSE)
    for (i in 1:K){
        start <- floor(N*(i-1)/K) + 1
        stop  <- floor(N*i/K)
        test_indices[[i]] <-  random_permutation[start:stop]
        train_indices[[i]] <- random_permutation[-(start:stop)]
    }
    return(list(train_indices, test_indices))
}

In [55]:
#define shorter names for this specific dataset
#to help plotting
short_names <- function(){
    sn <- list(fixed.acidity="fix.acd",
               volatile.acidity="vol.acd",
                citric.acid="cit.acd",
                residual.sugar="res.sug",
                chlorides="chlor",
                free.sulfur.dioxide="fr.SO2",
                total.sulfur.dioxide="tot.SO2",
                density="dens",
                pH="pH",
                sulphates="sulph",
                alcohol="alc",
                file.number.1="red")
    return(sn)
}

In [56]:
#functions for EDA
library(gridExtra)
library(ggplot2)
library(reshape2)


reorder_cormat <- function(cormat){
    # Use correlation between variables as distance
    dd <- as.dist((1-cormat)/2)
    hc <- hclust(dd)
    cormat <-cormat[hc$order, hc$order]
}

get_breaks <- function(x, margin, n_breaks){
    m <- margin*(max(x) - min(x))
    
    return(seq(min(x) + m, max(x) - m, (max(x) - min(x) - 2*m)/(n_breaks-1)))
}

#main function for EDA
run_eda <- function(to_show, hyperparameters, corr_cutoff=0.5, boxplot_predictor_groups=3){
    main_url <- attr(hyperparameters, "main_url")
    files    <- attr(hyperparameters, "files")
    response_name <- attr(hyperparameters, "response_name")
    
    have_data <- find_or_download_data(main_url, files)
    if(!have_data){
        print("Could not find or download data - returning null")
        return(NULL)
    }
    else{
        data <- read_data(files)
        
        #plot relationship between each predictor and response
        predictor_names <- colnames(data)[colnames(data) != response_name]
        
        if(to_show=="pairs"){
        #pairs plot
            
            sn <- short_names()
            labels <- list()
            for (i in 1:length(predictor_names)){
                labels[i] <- sn[[predictor_names[i]]]
            }
        
            pairs(data[,predictor_names], labels=labels, panel=points, pch = 16, cex = .5, xaxt = "n", yaxt = "n")
            
            #alternative: ggally pairs plot
            #this does not handle the large number of variables well
        
            ## code for ggally pairs plot:
            #pairplot_columns <- 1:length(colnames(data))
            #pairplot_columns <- pairplot_columns[colnames(data) != response_name]
            #pairplot <- ggpairs(data, columns=pairplot_columns) + 
            #  ggtitle("Pairs plot")
            #print(pairplot)
        }
        else if(to_show=="corr_heatmap"){
        #correlation heatmap
        
            predictors <- data[,predictor_names]
            colnames(predictors) <- labels
        
            cormat <- round(cor(predictors),2)
            cormat <- reorder_cormat(cormat)
        
            cormat[lower.tri(cormat)]<- NA
        
            # Melt the correlation matrix
            melted_cormat <- melt(cormat, na.rm = TRUE)
            # Heatmap
            ggplot(data = melted_cormat, aes(Var2, Var1, fill = value))+
                 geom_tile(color = "white")+
                 scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                 midpoint = 0, limit = c(-1,1), space = "Lab", 
                 name="Pearson\nCorrelation") +
                 theme_minimal()+ 
                 theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                 size = 12, hjust = 1))+
                 coord_fixed()
        }
        else if(to_show=="corr_values"){
        #print pairs of variables with largest absolute correlation values

            melted_cormat <- melt(cormat, na.rm=TRUE)
            melted_cormat <- melted_cormat[order(abs(melted_cormat$value)),]
            print(melted_cormat[abs(melted_cormat$value) > corr_cutoff & melted_cormat$Var1 != melted_cormat$Var2,])
        }
        else if(to_show=="response_hist"){
        #plot histogram of response values
            par(pin=c(4,3))
            vals <- data[[response_name]]
            h <- hist(vals, xlab=response_name, ylab="count", breaks=seq(min(vals)-0.5, max(vals) + 0.5, 1), 
                 main=paste("Histogram of ", response_name, sep=" "),ylim=c(0, 3000))
            #plot(h, )
            scale <- length(vals)
            curve(dnorm(x, mean=mean(vals), sd=sd(vals))*scale, add=TRUE, col="darkblue", lwd=2)

        }
        else if(to_show=="boxplots_grouped_by_response"){
        #plot series of boxplots grouped by values of response (on x axis)
        #only works if response takes discrete values
        #has some limitations, but can be useful
            
            label_format <- function(x) sprintf("%.2f", x)
            breaks <- function(x) get_breaks(x, 0.05, 4)
            
            nplot <- length(predictor_names)
            nrow <- floor(sqrt(nplot))
            ncol <- ceiling(nplot/nrow)
            plots <- list()
            for(i in 1:length(predictor_names)){
                x <- predictor_names[i]
                if(length(unique(data[[x]])) > 2){
                    #exclude any one-hot encodings
                    plots[[i]] <- ggplot(data, aes_string(group=response_name, y = x, x = response_name)) + 
                        geom_boxplot(outlier.size=0.15) + geom_smooth(aes(group=1), method="lm") + 
                        scale_y_continuous(labels=label_format) + 
                        theme(axis.text.x = element_text(color = "grey20", size = 8, angle = 90, hjust = .5, vjust = .5, face = "plain")) +
                        coord_flip()
                    #have to add aes(group=1) to make smoothing work
                }
            }
            grid.arrange(grobs=plots, ncol=ncol)
        }
        else if(to_show=="boxplots_predictor_on_x"){
        #plot series of boxplots grouped by values of predictor (on x axis)  
        #divide predictor into quantiles for grouping
        #this tends to give ugly plots with issues that make them hard to interpret
        #not used further
            qstep <- 1/boxplot_predictor_groups
            nplot <- length(predictor_names)
            nrow <- floor(sqrt(nplot))
            ncol <- ceiling(nplot/nrow)
            plots <- list()
            for(i in 1:length(predictor_names)){
                x <- predictor_names[i]
                if(length(unique(data[[x]])) > 2){
                    #exclude any one-hot encodings
                    quantile_boundaries <- quantile(data[[x]], probs=seq(0,1,qstep))
                    quantile_groups <- rep(0, length(data[[x]]))
                    for (q in quantile_boundaries){
                        quantile_groups <- quantile_groups + 1.0*(data[[x]] > q)
                    }
                    plots[[i]] <- ggplot(data, aes_string(group=quantile_groups, y = response_name, x = x)) + 
                    geom_boxplot(outlier.size=0.15) + geom_smooth(aes(group=1), method="lm") 
                        
                    #have to add aes(group=1) to make smoothing work
                }
            }
            grid.arrange(grobs=plots, ncol=ncol)
        }
        else if(to_show=="separate_regressions"){
            
            label_format <- function(x) sprintf("%.2f", x)
            
            nplot <- length(predictor_names)
            nrow <- floor(sqrt(nplot))
            ncol <- ceiling(nplot/nrow)
            plots <- list()
            for(i in 1:length(predictor_names)){
                x <- predictor_names[i]
                if(length(unique(data[[x]])) > 2){
                    #exclude any one-hot encodings
                    plots[[i]] <- ggplot(data, aes_string(y = x, x = response_name)) + 
                        geom_count() + geom_smooth(aes(group=1), method="lm") + theme(legend.position = "none") +
                        scale_y_continuous(labels=label_format) + 
                        theme(axis.text.x = element_text(color = "grey20", size = 8, angle = 90, hjust = .5, vjust = .5, face = "plain")) +
                        coord_flip()
                    #have to add aes(group=1) to make smoothing work
                }
            }
            grid.arrange(grobs=plots, ncol=ncol)
        } 
    }
}