# Lecture 4 - Nested CV for CART model

## Load the CCHS data from the epi7913A package

In [1]:
# magrittr is attached so the %>% pipe is available to the notebook.
library(magrittr)

# Draw a random 10% of the rows of the CCHS data shipped with epi7913A.
full_data <- dplyr::slice_sample(epi7913A::cchs, prop = 0.1)

# Preview the first rows of the sample.
head(full_data)

Unnamed: 0_level_0,age,sex,CANHEARTbin,householdsize,education,maritalstatus,immigration,houseincome
Unnamed: 0_level_1,<fct>,<fct>,<int>,<fct>,<fct>,<fct>,<int>,<fct>
1,3,1,1,5,2,3,0,1
2,5,2,1,1,4,2,0,2
3,3,2,1,2,4,3,0,3
4,7,1,0,2,4,3,0,1
5,4,2,0,4,1,2,0,2
6,3,1,0,5,4,3,1,1


## Define the outcome variable

In [3]:
# Name of the outcome column in full_data. It is passed to the model-fitting
# calls and used to pull the observed outcomes for the log-loss computation.
# NOTE(review): the preview above shows this column as <int> with 0/1 values;
# confirm it is binary across the full data.
voutcome <- "CANHEARTbin"

## Iterate using 5-fold cross-validation (outer loop), calling the 5-fold CV to tune the model each time (inner loop)

In [4]:
# Nested cross-validation.
# Outer loop: 5 folds, each held out once as a test set and scored by log loss.
# Inner loop: sdgm::cart.bestmodel.bin() is called on the outer training rows
# to pick the model that is then evaluated on the held-out fold.

# create a cluster
cl <- parallel::makeCluster(5)

# export variables to all nodes in cluster
parallel::clusterExport(cl, c("full_data", "voutcome"), envir = environment())

# Outer test folds: a list of row-index vectors, one per fold.
outer_folds <- caret::createFolds(full_data[, voutcome], k = 5)

# Score every outer fold on the workers. The `finally` clause guarantees the
# worker processes are shut down even when a fold raises an error.
fold_ll <- tryCatch(
  parallel::parSapply(cl, outer_folds, function(x) {
    testInds <- x
    # seq_len() is safe even if full_data has zero rows (1:0 would give c(1, 0)).
    trnInds <- setdiff(seq_len(nrow(full_data)), testInds)

    train_data <- full_data[trnInds, ]
    test_data <- full_data[testInds, ]

    best_model <- sdgm::cart.bestmodel.bin(train_data, voutcome, n_iter = 5)

    preds <- predict(best_model, test_data)

    if (!is.null(preds)) {
      test_ll <- MLmetrics::LogLoss(preds, test_data[, voutcome])
    } else {
      test_ll <- NA_real_
      print("Logloss calculation failed")
    }

    # Return the score explicitly: without this the failure branch would return
    # the value of print() (a string), turning the whole result into character.
    test_ll
  }),
  finally = parallel::stopCluster(cl)
)

# Surface failed folds on the master process, since output printed on the
# workers is not necessarily visible here.
if (anyNA(fold_ll)) {
  warning(
    sum(is.na(fold_ll)), " outer fold(s) produced no log loss and were ",
    "excluded from the mean",
    call. = FALSE
  )
}

# na.rm belongs to mean(), not print(): average over the folds that succeeded.
ll.mean <- mean(fold_ll, na.rm = TRUE)

print(ll.mean)

[1] 0.6133663


## Now train the final model

In [6]:
  # Fit the final model on the whole sample with the same tuning routine that
  # was applied to each outer training set during nested CV.
  final_model <- sdgm::cart.bestmodel.bin(full_data, voutcome, n_iter = 5)

[1] "Logloss on CCHS Data is: 0.622235019456334"
