In [1]:
library(dplyr)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



In [7]:
#for simplicity, have one hyperparameters object that contains all variables
#
# The settings are not stored as list elements: the returned list is empty and
# every setting is attached to it as an attribute, to be read back with
# attr(hyperparameters, "<name>").
get_default_hyperparameters <- function(){

    defaults <- list(
        #dataset-specific settings:
        main_url          = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/",
        files             = c("winequality-red.csv", "winequality-white.csv"),
        response_name     = "quality",

        #number of folds of cross-validation for evaluating entire ensembles
        ensemble_K        = 5,
        #number of repeats to perform at top level (evaluating entire ensemble-forming strategy)
        top_level_repeats = 10,
        ensemble_size     = 2,
        n_attempts        = 3,
        #number of folds of cross-validation for finding single models
        single_model_K    = 10,
        retest_predictors = FALSE,
        weight_by_scores  = TRUE
    )

    #copy every default onto the (empty) carrier list as an attribute
    hyperparameters <- list()
    for (setting in names(defaults)){
        attr(hyperparameters, setting) <- defaults[[setting]]
    }
    return(hyperparameters)
}

In [36]:
#this function does all preprocessing (normalization etc.)
#and defines new features by applying transformations
#
# Arguments:
#   data            - data frame containing the response column and the
#                     (numeric) predictor columns
#   response_name   - name of the response column inside `data`
#   hyperparameters - accepted for a uniform interface; not read by this function
# Returns:
#   a data frame whose first column "y" is the unmodified response, followed by
#   the z-scored predictors under their original column names
preprocess <- function(data, response_name, hyperparameters=get_default_hyperparameters()){

    #fail early: with a missing response column the first predictor would
    #silently be renamed to "y" further down
    if(!(response_name %in% colnames(data))){
        stop(paste0("preprocess: response column '", response_name, "' not found in data"), call.=FALSE)
    }

    #separate into response and predictors
    response <- data[[response_name]]
    predictors <- data
    predictors[[response_name]] <- NULL

    #z-score predictors
    #a constant column has sd 0 (and a single row has sd NA): dividing would
    #fill the column with NaN/NA, so such columns are only centred
    znorm <- function(x){
        centered <- x - mean(x)
        spread <- sd(x)
        if(is.na(spread) || spread == 0){
            return(centered)
        }
        centered/spread
    }
    #column-wise lapply keeps the data frame (and its column names) intact,
    #avoiding the matrix round trip that apply() forces on a data frame
    predictors[] <- lapply(predictors, znorm)

    #define new features
    #(none defined yet)

    #combine into single data frame again, response first
    #check.names=FALSE keeps non-syntactic names such as "file.number=1"
    data <- data.frame(y=response, predictors, check.names=FALSE)

    return(data)

}

In [74]:
# Predict with every model of an ensemble and combine the predictions into one
# weighted average per row of `data`.
#
# Arguments:
#   ensemble_models - list of fitted models; each must work with predict(model, newdata=data)
#   data            - data frame to predict on
#   scores          - optional numeric vector, one score per model; a model's
#                     weight is proportional to -log(score)
#   hyperparameters - settings object; only the attribute "weight_by_scores" is
#                     read (a missing attribute is treated as TRUE)
#   verbose         - print progress per model
# Returns:
#   numeric vector with one combined prediction per row of `data`
run_ensemble <- function(ensemble_models, data, scores=NULL, hyperparameters=get_default_hyperparameters(), verbose=FALSE){

    weight_by_scores <- attr(hyperparameters, "weight_by_scores")
    if(is.null(weight_by_scores)){
        weight_by_scores <- TRUE
    }

    ensemble_size <- length(ensemble_models)
    if(ensemble_size == 0){
        stop("run_ensemble: ensemble_models is empty", call.=FALSE)
    }

    predictions <- vector("list", ensemble_size)
    for (n in seq_len(ensemble_size)){
        if(verbose){
            print(paste("running model", n, "/", ensemble_size, sep=" "))
        }
        predictions[[n]] <- predict(ensemble_models[[n]], newdata=data)
    }
    #one column per model; cbind also yields a proper (rows x 1) matrix for a
    #single model, where t(Reduce(rbind, ...)) would give a 1 x rows matrix
    predictions_matrix <- do.call(cbind, predictions)

    #equal weights unless usable scores are supplied and weighting is enabled
    weights <- rep(1/ensemble_size, ensemble_size)
    if(isTRUE(weight_by_scores) && length(scores) > 0){
        if(length(scores) != ensemble_size){
            stop("run_ensemble: need exactly one score per model", call.=FALSE)
        }
        #scores >= 1 would give zero or negative weights: clamp them at zero
        raw_weights <- pmax(-log(scores), 0)
        total <- sum(raw_weights)
        #all-zero or non-finite weights cannot be normalised; keep equal weights
        if(is.finite(total) && total > 0){
            weights <- raw_weights/total
        }
    }

    weighted_avg <- drop(predictions_matrix %*% weights)

    return(weighted_avg)
}

In [66]:
# Fit one linear model per entry of an ensemble definition.
#
# Arguments:
#   ensemble_definition - list; each element is a character vector of predictor
#                         column names (an empty or NULL element means
#                         "intercept only")
#   data                - data frame with the response in column "y" and the
#                         predictors under the names used in the definition
#   hyperparameters     - accepted for a uniform interface; not read by this function
#   verbose             - print progress per model
# Returns:
#   list of fitted lm objects, in the same order as `ensemble_definition`
train_ensemble <- function(ensemble_definition, data, hyperparameters=get_default_hyperparameters(), verbose=FALSE){
    ensemble_size <- length(ensemble_definition)
    ensemble_models <- vector("list", ensemble_size)

    for (n in seq_len(ensemble_size)){
        if(verbose){
            print(paste("training model", n, "/", ensemble_size, sep=" "))
        }
        modelspec <- ensemble_definition[[n]]
        if(length(modelspec) == 0){
            #no predictors selected: the model is just the mean of y
            ensemble_models[[n]] <- lm(y ~ 1, data=data)
        }
        else{
            #drop=FALSE keeps a data frame (with column names) even for a
            #single predictor, which lm needs to resolve the "." in the formula
            train_data <- data[, c("y", modelspec), drop=FALSE]
            #main effects plus all pairwise interactions of the chosen predictors
            ensemble_models[[n]] <- lm(y ~ (.)^2, data=train_data)
        }
    }

    return(ensemble_models)
}

In [47]:
# Search for `ensemble_size` single models by calling find_single_model
# repeatedly on the same data.
#
# Arguments:
#   data            - data frame handed unchanged to find_single_model
#   hyperparameters - settings object; "ensemble_size" is read here, and the
#                     whole object is forwarded to find_single_model
#   verbose         - print progress, also forwarded to find_single_model
# Returns:
#   list(scores, ensemble): a numeric vector with the score of each model and
#   a list with the predictor set of each model, both of length ensemble_size
find_ensemble <- function(data, hyperparameters=get_default_hyperparameters(), verbose=FALSE){


    ensemble_size <- attr(hyperparameters, "ensemble_size")
    ensemble <- vector("list", ensemble_size)
    scores <- rep(0, ensemble_size)

    for (n in seq_len(ensemble_size)){

        if(verbose){
            print(paste("finding model", n, "/", ensemble_size, sep=" "))
        }
        outputs <- find_single_model(data, hyperparameters=hyperparameters, verbose=verbose)
        scores[n] <- outputs[[1]]
        #single-bracket assignment of a wrapped value: `ensemble[[n]] <- NULL`
        #would delete the slot when the best model has no predictors, leaving
        #`ensemble` shorter than `scores`
        ensemble[n] <- list(outputs[[2]])
    }
    return(list(scores, ensemble))
}

In [63]:
#find a single model
#to serve as part of an ensemble
#return set of predictors used for that model
#
# The search is a randomised forward selection: attempt 1 scores the empty
# (mean-only) model; every later attempt adds up to two randomly drawn, not yet
# used predictors to the best set found so far and keeps the larger set only if
# its cross-validated mean absolute error is lower.
#
# Arguments:
#   data            - data frame with the response in column "y"; all other
#                     columns are candidate predictors
#   hyperparameters - settings object; reads "single_model_K" (number of CV
#                     folds), "n_attempts" (number of candidate models scored)
#                     and "retest_predictors" (may a rejected predictor be
#                     drawn again?)
#   verbose         - print every candidate model and its score
# Returns:
#   list(relative_score, best_predictor_set): the best score divided by the
#   score of the empty model, and a character vector of predictor names
#   (character(0) when the empty model was best)
find_single_model <- function(data, hyperparameters=get_default_hyperparameters(), verbose=FALSE){

    K <- attr(hyperparameters, "single_model_K")
    CV_folds   <- get_CV_folds(nrow(data), K)
    train_indices <- CV_folds[[1]]
    test_indices <- CV_folds[[2]]
    n_attempts <- attr(hyperparameters, "n_attempts")
    retest <- isTRUE(attr(hyperparameters, "retest_predictors"))

    response <- data$y
    predictors <- data
    predictors[["y"]] <- NULL
    all_predictors <- colnames(predictors)

    #character(0) rather than NULL, so that the result can be stored in a list
    #slot without deleting that slot
    best_predictor_set <- character(0)
    tested_predictors <- character(0)
    best_score <- Inf

    null_model_score <- 0

    for(n in seq_len(n_attempts)){


        current_predictor_set <- best_predictor_set

        #for the first iteration, run the empty model
        #to get performance of intercept
        #after first iteration, add predictors to model
        if(n > 1){
            unused_predictors <- all_predictors[!(all_predictors %in% current_predictor_set)]
            if(!retest){
                unused_predictors <- unused_predictors[!(unused_predictors %in% tested_predictors)]
            }
            if(length(unused_predictors) == 0){
                #nothing left to try: further attempts would only repeat the best model
                break
            }
            #draw two new predictors, or the single one that is left;
            #indexing via sample.int is safe for a candidate vector of length 1
            n_new <- min(2, length(unused_predictors))
            p <- unused_predictors[sample.int(length(unused_predictors), n_new, replace=FALSE)]
            current_predictor_set <- c(current_predictor_set, p)
            #accumulate (not overwrite), otherwise only the latest draw would
            #be protected from retesting
            tested_predictors <- c(tested_predictors, p)
        }
        if(verbose){
            print(paste("Evaluating model:", paste(current_predictor_set, sep=" ", collapse=", "), sep=" "))
        }

        oof_scores <- rep(0, K)


        for(k in seq_len(K)){
            train_Y <- response[train_indices[[k]]]
            test_Y  <- response[test_indices[[k]]]
            if(length(current_predictor_set)==0){
                #empty model - just use mean as intercept
                prediction <- mean(train_Y)
            }
            else{
                #drop=FALSE keeps a named data frame even for a single predictor
                train_X <- predictors[train_indices[[k]], current_predictor_set, drop=FALSE]
                test_X  <- predictors[test_indices[[k]], current_predictor_set, drop=FALSE]

                train_data <- data.frame(y=train_Y, train_X, check.names=FALSE)
                test_data <- data.frame(y=test_Y, test_X, check.names=FALSE)

                #main effects plus all pairwise interactions
                model  <- lm(y ~ (.)^2, data=train_data)
                prediction <- predict(model, newdata=test_data)
            }
            #out-of-fold mean absolute error
            oof_scores[k] <- mean(abs(test_Y - prediction))

        }#end loop over folds

        final_score <- mean(oof_scores)

        if(length(current_predictor_set)==0){
            null_model_score <- final_score
        }


        #if the model is an improvement over previous best, update it
        if(final_score < best_score){
            output_message = "Keeping model"
            best_score <- final_score
            best_predictor_set <- current_predictor_set
        }
        else{
            output_message = "Discarding model"
        }
        if(verbose){
            print(paste("Model score =", final_score, sep=" "))
            print(output_message)
        }
    }

    #score relative to the empty model; if that baseline is not a positive
    #number the ratio is undefined, so report 1 ("no better than the baseline")
    if(isTRUE(null_model_score > 0)){
        relative_score <- best_score/null_model_score
    }
    else{
        relative_score <- 1
    }

    #return description of this model
    #as a list of predictor names (strings)
    return(list(relative_score, best_predictor_set))
}

In [75]:
# NOTE(review): dev_script() is not defined in any cell shown here - presumably it was defined in an earlier notebook session; confirm before re-running this cell
dev_script()

[1] 1 2
[1] "top-level CV fold  1"
[[1]]
[1] "alcohol"              "sulphates"            "total.sulfur.dioxide"
[4] "file.number=1"       

[[2]]
[1] "residual.sugar"   "volatile.acidity" "fixed.acidity"    "chlorides"       

[1] 0.8654326 0.9152374
[1] "score = 0.60021809939634"
[1] "top-level CV fold  2"
[[1]]
[1] "total.sulfur.dioxide" "sulphates"            "residual.sugar"      
[4] "density"             

[[2]]
[1] "pH"                  "citric.acid"         "free.sulfur.dioxide"
[4] "sulphates"          

[1] 0.8951830 0.9808707
[1] "score = 0.607297712538401"
[1] "top-level CV fold  3"
[[1]]
[1] "sulphates"            "residual.sugar"       "total.sulfur.dioxide"
[4] "chlorides"           

[[2]]
[1] "chlorides"        "file.number=1"    "volatile.acidity" "density"         

[1] 0.9352931 0.9128431
[1] "score = 0.626131561927111"
[1] "top-level CV fold  4"
[[1]]
[1] "file.number=1"       "free.sulfur.dioxide" "fixed.acidity"      
[4] "alcohol"            

[[2]]
[1] "densi

NULL

In [9]:
# Make sure every file in `files` exists locally, downloading missing ones.
#
# Arguments:
#   main_url - URL prefix; the download URL of a file is main_url followed
#              directly by the file name
#   files    - character vector of file names (also used as local paths)
# Returns:
#   1 if all files exist after the download attempts, 0 otherwise
find_or_download_data <- function (main_url, files){

    for (f in files){
        if(!file.exists(f)){
            print(paste("Could not find", f, "- attempting to download", sep=" " ))
            #a failed download must not abort the function: the caller relies on
            #the 0/1 return value, so turn the error into a non-zero status
            status <- tryCatch(
                download.file(paste0(main_url, f), f, method="auto", quiet = FALSE),
                error = function(e){
                    print(paste("Download of", f, "failed:", conditionMessage(e), sep=" "))
                    return(1)
                }
            )
            if(!identical(as.integer(status), 0L)){
                #remove a partial file so it is not mistaken for valid data below
                unlink(f)
            }
        }
    }
    for (f in files){
        if(!file.exists(f)){
            print(paste("Could not find or download", f, "- training will fail", sep=" " ))
            return(0)
        }
    }
    return(1)
}

In [10]:
# Split the indices 1..N at random into K cross-validation folds.
#
# Arguments:
#   N - number of observations
#   K - number of folds, 1 <= K <= N
# Returns:
#   list(train_indices, test_indices): two lists of length K; for fold i the
#   test indices are a contiguous slice of one random permutation of 1..N and
#   the train indices are the rest of that permutation
get_CV_folds <- function(N, K){
    #with K > N some folds would be empty, and `start:stop` would then count
    #downwards and select the wrong observations - reject such input instead
    if(!is.numeric(N) || length(N) != 1 || !is.numeric(K) || length(K) != 1 ||
       is.na(N) || is.na(K) || N < 1 || K < 1 || K > N){
        stop("get_CV_folds: need single numbers with 1 <= K <= N", call.=FALSE)
    }

    train_indices <- vector("list", K)
    test_indices  <- vector("list", K)
    random_permutation <- sample(N, N, replace=FALSE)

    #boundaries[i] is the last position of fold i-1, so fold i covers
    #positions boundaries[i]+1 .. boundaries[i+1] of the permutation
    boundaries <- floor(N*(0:K)/K)
    for (i in seq_len(K)){
        held_out <- (boundaries[i] + 1):boundaries[i + 1]
        test_indices[[i]] <-  random_permutation[held_out]
        train_indices[[i]] <- random_permutation[-held_out]
    }
    return(list(train_indices, test_indices))
}

In [43]:
#script for evaluating different approaches
#
# Loads (downloading if necessary) and merges the data files, preprocesses the
# merged data and runs a K-fold cross-validation in which an ensemble is
# searched and trained on the training part of each fold and scored by mean
# absolute error on the held-out part. The score of every fold is printed.
#
# Arguments:
#   hyperparameters - settings object; "main_url", "files", "response_name" and
#                     "ensemble_K" are read here, and the object is forwarded
#                     to the search, training and prediction steps
# Returns:
#   NULL (results are only printed)
evaluate_ensemble <- function(hyperparameters){


    verbose <- FALSE

    #take all settings from the hyperparameters object instead of relying on
    #identically named variables in the global environment
    main_url      <- attr(hyperparameters, "main_url")
    files         <- attr(hyperparameters, "files")
    response_name <- attr(hyperparameters, "response_name")
    K             <- attr(hyperparameters, "ensemble_K")
    if(is.null(main_url) || is.null(files) || is.null(response_name) || is.null(K)){
        stop("evaluate_ensemble: hyperparameters must define main_url, files, response_name and ensemble_K", call.=FALSE)
    }

    have_data <- find_or_download_data(main_url, files)
    if(!have_data){
        print("Could not find or download data - returning null")
        return(NULL)
    }
    else{
        single_datasets <- vector("list", length(files))
        for (i in seq_along(files)){
            single_datasets[[i]] <- read.csv(files[i], header=TRUE, sep=";")
            #remember which file a row came from
            single_datasets[[i]]$file.number <- i
        }
        data <- bind_rows(single_datasets)
        #data = full dataset (merged across all files)

        #make one-hot encoding of file number
        #the last level is left out as the reference level; seq_len keeps the
        #loop empty when there is only one file
        u <- unique(data$file.number)
        print(u)
        for (val in u[seq_len(length(u)-1)]){
            data[paste("file.number=",val,sep="")] <- (data$file.number==val)*1.0
        }
        data$file.number <- NULL

        data <- preprocess(data, response_name, hyperparameters)

        CV_indices <- get_CV_folds(nrow(data), K)
        train_indices <- CV_indices[[1]]
        test_indices  <- CV_indices[[2]]

        for (i in seq_len(K)){
            if(verbose){
                print(paste('Top-level CV fold ', i, sep=' '))
            }
            fold_train <- data[train_indices[[i]],]
            fold_test  <- data[test_indices[[i]],]

            #first step: find an ensemble of models
            #by searching the space of possible models
            #(this is the space of 2^P possible subsets of all predictors)
            outputs <- find_ensemble(fold_train, hyperparameters=hyperparameters, verbose=verbose)
            scores <- outputs[[1]]
            ensemble_definitions <- outputs[[2]]
            if(verbose){
                print('Model definitions:')
                print(ensemble_definitions)
                print('Scores:')
                print(scores)
            }
            #now train the ensemble on the same data
            ensemble_models <- train_ensemble(ensemble_definitions, fold_train, hyperparameters=hyperparameters, verbose=verbose)
            predictions <- run_ensemble(ensemble_models, fold_test, scores, hyperparameters)
            #mean absolute error on the held-out fold
            score <- mean(abs(predictions - fold_test$y))
            print(paste("Ensemble score =",score,sep=" "))

        }

    }
    return(NULL)
}

In [None]:
#top level code: define hyperparameters, run ensemble
