# Multiple Imputation with Ranger
## Without Leaking Information into Test
This function imputes missing values in both the training and the test portion of a data set.
It imputes the training column with the fewest missing values first, then repeats column by column until the training set is complete.
Each fitted ranger model is kept and used to make blind predictions that fill the corresponding missing values in the test set.

### NA_cleanup(input_data, test_index)
#### input_data <br>A dataframe of both train & test, for imputing
#### test_index <br>The vector of row indices indicating the test set
### Example
function_results = NA_cleanup(input_data = input_1,
                              test_index = 1:20)

In [1]:
#' Impute missing values in train and test data with ranger models
#'
#' Splits `input_data` into train and test rows, adds one 0/1 `<name>_NA_flag`
#' column for every column that has NAs in train, and then imputes the data
#' column by column. Every imputation model is fitted on training rows only
#' and is then used to predict the missing values of the test rows.
#'
#' The function calls `ranger()` and `bind_cols()` unqualified, so the
#' packages providing them (ranger, dplyr) must be attached by the caller
#' whenever the data actually contains NAs.
#'
#' @param input_data A data frame holding both train and test rows.
#'   Character columns are converted to factors before the split.
#' @param test_index Vector of row indices that form the test set; all other
#'   rows form the training set.
#' @return A list with three elements: `train_data_imputed`,
#'   `test_data_imputed` and `imputer_details`. `imputer_details` holds, per
#'   output column, `init_NA_ct`, `column_class`, `ranger_object`,
#'   `input_columns` and `running_NA_count`.
NA_cleanup <- function(input_data, test_index) {

  # Number of NAs in each column of a data frame.
  count_NAs <- function(df) {
    vapply(seq_len(ncol(df)),
           function(j) sum(is.na(df[[j]])),
           FUN.VALUE = numeric(1))
  }

  ### Convert strings to factors
  for (i in seq_len(ncol(input_data))) {
    if (is.character(input_data[[i]])) {
      input_data[[i]] <- as.factor(input_data[[i]])
    }
  }

  ### Split train / test
  fn_ts_1 <- input_data[test_index, , drop = FALSE]
  if (length(test_index) == 0) {
    # A negative empty index would select zero rows instead of all rows.
    fn_tr_1 <- input_data
  } else {
    fn_tr_1 <- input_data[-test_index, , drop = FALSE]
  }

  ### Which columns had NAs in train to begin with
  Init_NAs_per_column <- count_NAs(fn_tr_1)
  which_Init_NA_columns <- which(Init_NAs_per_column > 0)

  ### NA flag columns (only needed when train actually has NAs)
  if (length(which_Init_NA_columns) > 0) {
    flag_names <- paste0(names(fn_tr_1)[which_Init_NA_columns], "_NA_flag")

    # Build a 0/1 frame marking the NA cells of the initially-incomplete
    # columns. drop = FALSE keeps a single flagged column as a data frame so
    # the logical-matrix assignment still lines up.
    make_flags <- function(df) {
      flags <- as.data.frame(matrix(data = 0,
                                    nrow = nrow(df),
                                    ncol = length(which_Init_NA_columns)))
      names(flags) <- flag_names
      flags[is.na(df[, which_Init_NA_columns, drop = FALSE])] <- 1
      flags
    }

    fn_tr_1 <- bind_cols(fn_tr_1, make_flags(fn_tr_1))
    fn_ts_1 <- bind_cols(fn_ts_1, make_flags(fn_ts_1))
  }

  ### Per-column bookkeeping; slots 3 and 4 are filled once a model is fitted
  imputer_details <- lapply(seq_len(ncol(fn_tr_1)), function(i) {
    na_ct <- sum(is.na(fn_tr_1[[i]]))
    list(init_NA_ct = na_ct,
         column_class = class(fn_tr_1[[i]]),
         ranger_object = NULL,
         input_columns = NULL,
         running_NA_count = na_ct)
  })

  ### Columns that have NAs in test but not in train must be imputed first
  test_NAs_per_column <- count_NAs(fn_ts_1)
  which_NAs_columns_test <- which(test_NAs_per_column > 0)

  # setdiff() instead of x[-which(...)]: the negative-index form returns an
  # empty vector whenever nothing matches, which silently skipped these columns.
  columns_with_NAs_only_in_test <- setdiff(which_NAs_columns_test,
                                           which_Init_NA_columns)

  while (length(columns_with_NAs_only_in_test) > 0) {
    i <- columns_with_NAs_only_in_test[1]

    # Predictors: every column that is complete in train AND in test
    which_columns_for_train <- setdiff(
      seq_len(ncol(fn_tr_1)),
      c(which_Init_NA_columns, columns_with_NAs_only_in_test)
    )
    if (length(which_columns_for_train) == 0) {
      stop(paste0("No complete columns available to impute test column ", i))
    }

    imputer_model <- ranger(
      data = fn_tr_1[, c(which_columns_for_train, i), drop = FALSE],
      dependent.variable.name = names(fn_ts_1)[i]
    )

    column_predictions <- predict(
      imputer_model,
      data = fn_ts_1[, which_columns_for_train, drop = FALSE]
    )$predictions

    ### Add back in the imputed values
    NA_rows <- which(is.na(fn_ts_1[[i]]))
    fn_ts_1[NA_rows, i] <- column_predictions[NA_rows]

    imputer_details[[i]][[3]] <- imputer_model
    imputer_details[[i]][[4]] <- names(fn_tr_1)[which_columns_for_train]

    columns_with_NAs_only_in_test <- columns_with_NAs_only_in_test[-1]
  }

  ##### Back to imputing train data
  NAs_per_column <- vapply(imputer_details,
                           function(d) d[[5]],
                           FUN.VALUE = numeric(1))

  ### The main while loop: impute NAs column by column, fewest NAs first
  while (any(NAs_per_column > 0)) {
    min_NAs <- min(NAs_per_column[NAs_per_column > 0])
    column_to_impute <- which(NAs_per_column == min_NAs)[1]
    complete_columns <- which(NAs_per_column == 0)
    predictor_names <- names(fn_tr_1)[complete_columns]

    data_with_1_column_to_impute <-
      fn_tr_1[, c(complete_columns, column_to_impute), drop = FALSE]

    # Fit on the training rows where the target is observed
    imputer_model <- ranger(
      data = na.omit(data_with_1_column_to_impute),
      dependent.variable.name = names(fn_tr_1)[column_to_impute]
    )

    column_predictions <- predict(
      imputer_model,
      data = fn_tr_1[, complete_columns, drop = FALSE]
    )$predictions

    ### Add back in the imputed values (train)
    tmp <- which(is.na(fn_tr_1[[column_to_impute]]))
    if (length(tmp) > 0) {
      fn_tr_1[tmp, column_to_impute] <- column_predictions[tmp]
    }

    ### Same model, blind predictions for the test rows
    if (nrow(fn_ts_1) > 0) {
      column_predictions_test <- predict(
        imputer_model,
        data = fn_ts_1[, predictor_names, drop = FALSE]
      )$predictions

      tmp_ts <- which(is.na(fn_ts_1[[column_to_impute]]))
      if (length(tmp_ts) > 0) {
        # Must index the TEST predictions; the train predictions belong to
        # different rows.
        fn_ts_1[tmp_ts, column_to_impute] <- column_predictions_test[tmp_ts]
      }
    }

    ### Add model back to output object
    remaining_NAs <- sum(is.na(fn_tr_1[[column_to_impute]]))
    imputer_details[[column_to_impute]][[3]] <- imputer_model
    imputer_details[[column_to_impute]][[4]] <- predictor_names
    imputer_details[[column_to_impute]][[5]] <- remaining_NAs
    if (remaining_NAs > 0) {
      stop(paste0("Problem encountered: Impute left NA's in column ", column_to_impute))
    }

    NAs_per_column[column_to_impute] <- remaining_NAs
  }

  ### Name list elements
  names(imputer_details) <- names(fn_tr_1)

  ### Last step: prepare final output object
  final_output <- list(fn_tr_1, fn_ts_1, imputer_details)
  names(final_output) <- c("train_data_imputed", "test_data_imputed", "imputer_details")
  return(final_output)

}