## Neural Networks

First, we have to prepare the data:

In [1]:
# Load the training data and drop the column that is not used here.
# Columns are removed with logical masks: `df[, -which(cond)]` selects *zero*
# columns when nothing matches, because `-integer(0)` is an empty index.
data <- read.csv(file.path("..", "data", "training_data.csv"))
data <- data[, !(names(data) %in% "SWEETORSOUR"), drop = FALSE]
data$Intensity <- as.numeric(as.factor(data$Intensity))

# Indices of the constant (zero-variance) columns.
# This can be reused when preparing future test data.
var0 <- which(apply(data, 2, var) == 0)
if (length(var0) > 0) {
    # Only drop when there is something to drop (see the note above).
    data <- data[, -var0, drop = FALSE]
}

set.seed(199)
# Validation-set approach: 2/3 of the rows are used for training,
# the remaining rows for validation.
idx.train <- sample(nrow(data), nrow(data) * 2 / 3)
is.response <- names(data) %in% "VALENCE.PLEASANTNESS"

# Separate the response from the predictors in both subsets.
data.train <- data[idx.train, ]
datax.train <- data.train[, !is.response, drop = FALSE]
datay.train <- data.train$VALENCE.PLEASANTNESS

data.test <- data[-idx.train, ]
datax.test <- data.test[, !is.response, drop = FALSE]
datay.test <- as.matrix(data.test$VALENCE.PLEASANTNESS)

# Verify whether some predictors now have zero variance within a subset.
train.var <- (apply(datax.train, 2, var) != 0)
test.var <- (apply(datax.test, 2, var) != 0)

# Keep the same columns in both sets, and only those varying in both.
keep.cols <- which(test.var & train.var)
datax.train <- datax.train[, keep.cols, drop = FALSE]
datax.test <- datax.test[, keep.cols, drop = FALSE]
ncol(data.test)

# keras expects matrices, not data frames.
datax.train <- as.matrix(datax.train)
datax.test <- as.matrix(datax.test)

First we define the neural network model, with the option of adding a kernel regularizer or a callback.

In [5]:
library(keras)
use_condaenv("r-tensorflow")

# Baseline model.
#
# Fit a dense feed-forward regression network and report its errors.
#
# Arguments:
#   datax.train, datay.train  predictors (matrix) and response used for fitting
#   datax.test, datay.test    predictors (matrix) and response of the held-out set
#   callback            forwarded to keras `fit()` as `callbacks` (NULL for none)
#   kernel_regularizer  applied to every dense layer (NULL for none)
#   print               if TRUE, show the model summary, the keras loss on the
#                       training data and the training-history plot
#   printresult         if TRUE, print the training and test RMSE
#
# Returns the test RMSE (useful for cross-validation).
fitnn <- function(datax.train, datay.train, datax.test, datay.test,
                  callback = NULL, kernel_regularizer = NULL,
                  print = FALSE, printresult = TRUE) {
    # Root mean squared error. The errors are squared *before* averaging;
    # squaring the mean error instead would let positive and negative
    # errors cancel out.
    rmse <- function(predicted, observed) {
        sqrt(mean((as.numeric(predicted) - as.numeric(observed))^2))
    }

    use_session_with_seed(24)    # enables reproducibility
    set.seed(24)

    model <- keras_model_sequential() %>%
        layer_dense(units = 64, activation = "relu",
                    kernel_regularizer = kernel_regularizer,
                    input_shape = ncol(datax.train)) %>%
        layer_dense(units = 64, activation = "relu",
                    kernel_regularizer = kernel_regularizer) %>%
        layer_dropout(rate = 0.6) %>%
        layer_dense(units = 32, activation = "relu",
                    kernel_regularizer = kernel_regularizer) %>%
        layer_dropout(rate = 0.3) %>%    # dropout layers limit overfitting
        layer_dense(units = 1, activation = "linear",
                    kernel_regularizer = kernel_regularizer)

    model %>% compile(
        loss = "mse",
        optimizer = "adam"    # this tunes the learning rate
    )
    if (print) model %>% summary()

    history <- model %>% fit(
        datax.train,
        datay.train,
        epochs = 100,           # training stops before 100 epochs if an early-stopping callback is used
        callbacks = callback,   # spelled out: `callback =` only worked through partial matching
        verbose = 0,
        batch_size = 100,       # reducing batch_size reduces the RMSE (but increases computation time)
        validation_split = 0.2
    )
    scores <- model %>% evaluate(datax.train, datay.train, verbose = 0)
    if (print) print(scores)

    # Use the model.
    ypred <- model %>% predict(datax.train)
    ypred.test <- model %>% predict(datax.test)

    # Estimate the training and test error.
    training_RMSE <- rmse(ypred, datay.train)
    test_RMSE <- rmse(ypred.test, datay.test)
    if (printresult) {
        print(paste0("Training RMSE:", training_RMSE))
        print(paste0(" Test RMSE:", test_RMSE))
    }

    if (print) show(plot(history))

    test_RMSE
}

Because neural networks have some intrinsic variability, we will estimate the test error by 10-fold cross-validation.

In [4]:
#define cross-validation function

library(tidymodels)

# Partition the prepared data set into the 10 cross-validation folds.
validation_data <- vfold_cv(data = data, v = 10)

# Fit the network on one cross-validation fold and return its validation RMSE.
#
# Arguments:
#   fold                  one split, as stored in the `splits` column of the
#                         object returned by `vfold_cv()`
#   callbacknn            forwarded to `fitnn()` as `callback`
#   kernel_regularizernn  forwarded to `fitnn()` as `kernel_regularizer`
#
# Returns the value returned by `fitnn()` for this fold.
nn_fit_and_evaluate <- function(fold, callbacknn = NULL, kernel_regularizernn = NULL) {
    response <- "VALENCE.PLEASANTNESS"

    # `analysis()` returns the training data of the fold, `assessment()` its
    # validation set; each is extracted once.
    training <- analysis(fold)
    validation <- assessment(fold)

    # Split into predictors and response to meet the `fitnn()` interface.
    # The response column is located in the fold's own data (not in a global
    # data frame), using a logical mask that is safe when nothing matches.
    trainingx <- as.matrix(training[, !(names(training) %in% response), drop = FALSE])
    trainingy <- as.matrix(training[[response]])
    validation_setx <- as.matrix(validation[, !(names(validation) %in% response), drop = FALSE])
    validation_sety <- as.matrix(validation[[response]])

    # Fit the model and return its validation error.
    fitnn(datax.train = trainingx,
          datay.train = trainingy,
          datax.test = validation_setx,
          datay.test = validation_sety,
          callback = callbacknn,
          kernel_regularizer = kernel_regularizernn,
          printresult = FALSE)
}

# Estimate the test error on every fold and return the mean.
#
# Arguments:
#   validation_data       object with a `splits` element holding the folds
#   callbacknn            forwarded to `nn_fit_and_evaluate()` (NULL for none)
#   kernel_regularizernn  forwarded to `nn_fit_and_evaluate()` (NULL for none)
#
# `vapply()` guarantees that every fold yields exactly one number; `sapply()`
# would silently change its return type on empty or malformed results.
cross_validation_error <- function(validation_data, callbacknn = NULL, kernel_regularizernn = NULL) {
    fold_errors <- vapply(
        validation_data$splits,
        nn_fit_and_evaluate,
        numeric(1),
        callbacknn = callbacknn,
        kernel_regularizernn = kernel_regularizernn
    )
    mean(fold_errors)
}


We can now compute the mean cross-validated RMSE for the baseline model, and then with early stopping, L1 regularization and L2 regularization.

In [6]:
# Compare several training configurations by their mean cross-validated error.
# Arguments are spelled out in full (`callbacknn`, `kernel_regularizernn`):
# the abbreviated names only worked through R's partial argument matching.

# simple
print("Simple model")
cross_validation_error(validation_data)

# and now using early stopping
print("callback")
cross_validation_error(
    validation_data,
    callbacknn = callback_early_stopping(monitor = "val_loss", patience = 10)
) # the results are clearly better with early stopping

# now try with kernel regularizer l1
print("regularizer l1")
cross_validation_error(validation_data, kernel_regularizernn = "l1")

# and with kernel regularizer l2
print("regularizer l2")
cross_validation_error(validation_data, kernel_regularizernn = regularizer_l2(l = .1))

# callback and regularizer
print("callback and regularizer l1")
cross_validation_error(
    validation_data,
    callbacknn = callback_early_stopping(monitor = "val_loss", patience = 10),
    kernel_regularizernn = "l1"
)

[1] "Simple model"


Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)



[1] "callback"


Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)



[1] "regularizer l1"


Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)



[1] "regularizer l2"


Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)



[1] "callback and regularizer l1"


Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)

Set session seed to 24 (disabled GPU, CPU parallelism)



We then built the final model on the whole data set, using the regularization and callback that were selected by cross-validation:

In [7]:
set.seed(24)
use_session_with_seed(24)

# Take the whole data set to have more information.
# The response is removed with a logical mask: `-which(...)` would select
# zero columns if the name did not match.
datax <- as.matrix(data[, !(names(data) %in% "VALENCE.PLEASANTNESS"), drop = FALSE])
datay <- data$VALENCE.PLEASANTNESS

# Same layer layout as in `fitnn()`, with an L1 kernel regularizer.
# NOTE(review): the output layer is not regularized here, whereas `fitnn()`
# passes the regularizer to every dense layer -- confirm this is intended.
model_tot <- keras_model_sequential() %>%
    layer_dense(units = 64, activation = "relu", input_shape = ncol(datax),
                kernel_regularizer = "l1") %>%
    layer_dense(units = 64, activation = "relu", kernel_regularizer = "l1") %>%
    layer_dropout(rate = 0.6) %>%
    layer_dense(units = 32, activation = "relu", kernel_regularizer = "l1") %>%
    layer_dropout(rate = 0.3) %>%    # dropout layers limit overfitting
    layer_dense(units = 1, activation = "linear")

model_tot %>% compile(
    loss = "mse",
    optimizer = "adam"
)

model_tot %>% summary()
# NOTE(review): `validation_split = 0.2` holds part of the data out for
# monitoring `val_loss`, and `epochs` is 30 here versus 100 in `fitnn()`.
history <- model_tot %>% fit(
    datax,
    datay,
    epochs = 30,
    # we obtained good results with early stopping; the argument is spelled
    # `callbacks` (the abbreviation only worked through partial matching)
    callbacks = list(callback_early_stopping(monitor = "val_loss", patience = 10)),
    verbose = 0,
    batch_size = 100,
    validation_split = 0.2
)
scores <- model_tot %>% evaluate(datax, datay, verbose = 0)


Set session seed to 24 (disabled GPU, CPU parallelism)



________________________________________________________________________________
Layer (type)                        Output Shape                    Param #     
dense_3 (Dense)                     (None, 64)                      193856      
________________________________________________________________________________
dense_2 (Dense)                     (None, 64)                      4160        
________________________________________________________________________________
dropout_1 (Dropout)                 (None, 64)                      0           
________________________________________________________________________________
dense_1 (Dense)                     (None, 32)                      2080        
________________________________________________________________________________
dropout (Dropout)                   (None, 32)                      0           
________________________________________________________________________________
dense (Dense)               

We achieved relatively good results, but the Kaggle submission was not satisfactory, so we decided to further investigate tree methods.