In [6]:
import os

# Directory that the R cells below write their AUC result files into.
folder = "auc_many_predictors"

try:
    os.mkdir(folder)
except OSError as err:
    # mkdir raises OSError both when the directory is already there (normal on
    # a re-run of the notebook) and for real failures such as missing
    # permissions, so check which case this is instead of guessing.
    if os.path.isdir(folder):
        print ("The directory %s already exists; results will be written there" % folder)
    else:
        print ("Creation of the directory %s failed: %s" % (folder, err))
else:
    print ("Successfully created the directory %s" % folder)

Successfully created the directory auc_many_predictors


In [7]:
# Load the rpy2 IPython extension so that cells starting with the %%R cell
# magic are executed by R instead of Python.
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [8]:
%%R
# :::::::::
# R kernel
# :::::::::

# R packages loaded by the analysis cells of this notebook.
required_packages <- c("readxl", "writexl", "randomForestSRC", "riskRegression",
                       "survival", "cmprsk", "prodlim")

# Install only what is missing, so that re-running the notebook does not
# download and rebuild packages that are already available.
is_installed <- vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
missing_packages <- required_packages[!is_installed]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

_____________________________________________________________________________________________________________
Evaluate Random Forest on the validation data
_____________________________________________________________________________________________________________

In [9]:
%%R
# :::::::::
# R kernel
# :::::::::

library("readxl")
library("writexl")
library("randomForestSRC")
library("riskRegression")
library("survival")
library("cmprsk")
library("prodlim")

# Evaluate random survival forests of several depths on the validation split of
# every run and write the time-dependent AUC of each event type to
# auc_many_predictors/<cvd|ncvd>_auc_validation_<run>.xlsx.

cohort <- read_excel("kihd_time_to_event.xlsx")
N_runs <- 10

# Settings shared by all runs.
node_depths <- seq(5, 30, 5)          # one forest is grown per value of `nodedepth`
eval_times  <- seq(1, 30, 1)          # time points passed to Score()
causes      <- c(cvd = 1, ncvd = 2)   # output-file prefix -> event code of interest

# NOTE(review): rfsrc() is called without `seed`, so the forests (and therefore
# the AUC values) are presumably not reproducible between executions - confirm
# whether a fixed seed is wanted.

for (i in seq_len(N_runs))
{
  run_id <- i - 1   # the split files are numbered from 0

  split_files <- c(
    train = paste0("splitting_658_predictors/training_valid_", run_id, ".xlsx"),
    test  = paste0("splitting_658_predictors/validation_", run_id, ".xlsx")
  )

  # Read each split and attach the outcome columns taken from `cohort`.
  splits <- lapply(split_files, function(path) {
    predictors <- read_excel(path)
    # The first column is used as a row pointer into `cohort`; the "+ 1"
    # suggests it is 0-based - TODO confirm against the code that wrote the splits.
    colnames(predictors)[1] <- 'index'
    cohort_rows <- predictors$index + 1
    # Presumably converts follow-up time from days to years - verify the unit.
    predictors['time'] <- cohort[cohort_rows, 'time']/365.25
    predictors['event'] <- cohort[cohort_rows, 'event']
    # Drop the pointer so that it is not used as a predictor by `~ .`.
    predictors['index'] <- NULL
    predictors
  })
  kihd_train <- splits$train
  kihd_test <- splits$test

  ## --------------------------------------------------------------------------------------------------------------------------------
  ## CVD output (cause 1), then nonCVD output (cause 2)
  ## --------------------------------------------------------------------------------------------------------------------------------

  for (outcome in names(causes))
  {
    cause_code <- causes[[outcome]]

    # One forest per depth, all grown for the current event of interest.
    forests <- lapply(node_depths, function(depth) {
      rfsrc(Surv(time, event) ~ ., na.action = 'na.impute',
            kihd_train, splitrule = 'logrank', ntree = 250,
            nodedepth = depth, case.wt = NULL, cause = cause_code)
    })
    names(forests) <- paste0("Random Forest (depth = ", node_depths, ")")

    # Score the same event the forests were grown for, so that the ncvd_* file
    # really contains the nonCVD evaluation rather than a second cause-1 one.
    score_test <- Score(forests,
                        formula = Hist(time,event)~1,
                        data=kihd_test, times = eval_times, cause = cause_code, null.model = FALSE,
                        plots = "calibration", metrics=c("auc","brier"),
                        summary = "risks")

    auc_result_file <- paste0("auc_many_predictors/", outcome, "_auc_validation_", run_id, ".xlsx")
    write_xlsx(as.data.frame(score_test$AUC$score), auc_result_file)
  }

}

____________________________________________________________________________________________________________________
Evaluate Random Forest on the training and test data
____________________________________________________________________________________________________________________

In [10]:
%%R
# :::::::::
# R kernel
# :::::::::

library("readxl")
library("writexl")
library("randomForestSRC")
library("riskRegression")
library("survival")
library("cmprsk")
library("prodlim")

# Evaluate random survival forests of several depths on the training and test
# split of every run and write the time-dependent AUC of each event type to
# auc_many_predictors/<cvd|ncvd>_auc_<training|test>_<run>.xlsx.

cohort <- read_excel("kihd_time_to_event.xlsx")
N_runs <- 10

# Settings shared by all runs.
node_depths <- seq(5, 30, 5)          # one forest is grown per value of `nodedepth`
eval_times  <- seq(1, 30, 1)          # time points passed to Score()
causes      <- c(cvd = 1, ncvd = 2)   # output-file prefix -> event code of interest

# NOTE(review): rfsrc() is called without `seed`, so the forests (and therefore
# the AUC values) are presumably not reproducible between executions - confirm
# whether a fixed seed is wanted.

for (i in seq_len(N_runs))
{
  run_id <- i - 1   # the split files are numbered from 0

  split_files <- c(
    train = paste0("splitting_658_predictors/training_", run_id, ".xlsx"),
    test  = paste0("splitting_658_predictors/test_", run_id, ".xlsx")
  )

  # Read each split and attach the outcome columns taken from `cohort`.
  splits <- lapply(split_files, function(path) {
    predictors <- read_excel(path)
    # The first column is used as a row pointer into `cohort`; the "+ 1"
    # suggests it is 0-based - TODO confirm against the code that wrote the splits.
    colnames(predictors)[1] <- 'index'
    cohort_rows <- predictors$index + 1
    # Presumably converts follow-up time from days to years - verify the unit.
    predictors['time'] <- cohort[cohort_rows, 'time']/365.25
    predictors['event'] <- cohort[cohort_rows, 'event']
    # Drop the pointer so that it is not used as a predictor by `~ .`.
    predictors['index'] <- NULL
    predictors
  })
  kihd_train <- splits$train
  kihd_test <- splits$test

  ## --------------------------------------------------------------------------------------------------------------------------------
  ## CVD output (cause 1), then nonCVD output (cause 2)
  ## --------------------------------------------------------------------------------------------------------------------------------

  for (outcome in names(causes))
  {
    cause_code <- causes[[outcome]]

    # One forest per depth, all grown for the current event of interest.
    forests <- lapply(node_depths, function(depth) {
      rfsrc(Surv(time, event) ~ ., na.action = 'na.impute',
            kihd_train, splitrule = 'logrank', ntree = 250,
            nodedepth = depth, case.wt = NULL, cause = cause_code)
    })
    names(forests) <- paste0("Random Forest (depth = ", node_depths, ")")

    # Score the same event the forests were grown for, so that the ncvd_* files
    # really contain the nonCVD evaluation rather than a second cause-1 one.
    score_train <- Score(forests,
                         formula = Hist(time,event)~1,
                         data=kihd_train, times = eval_times, cause = cause_code, null.model = FALSE,
                         plots = "calibration", metrics=c("auc","brier"),
                         summary = "risks")

    score_test <- Score(forests,
                        formula = Hist(time,event)~1,
                        data=kihd_test, times = eval_times, cause = cause_code, null.model = FALSE,
                        plots = "calibration", metrics=c("auc","brier"),
                        summary = "risks")

    auc_result_file <- paste0("auc_many_predictors/", outcome, "_auc_training_", run_id, ".xlsx")
    write_xlsx(as.data.frame(score_train$AUC$score), auc_result_file)

    auc_result_file <- paste0("auc_many_predictors/", outcome, "_auc_test_", run_id, ".xlsx")
    write_xlsx(as.data.frame(score_test$AUC$score), auc_result_file)
  }

}