In [None]:
library(data.table)
library(plyr)
library(rpart)
library(rpart.plot)
library(keras)

In [None]:
# Basic cleaning of the raw claims data.
#
# Args:
#   dat: data frame with columns VehGas, ClaimNb and Exposure.
# Returns:
#   `dat` with VehGas converted to a factor, a constant column n = 1 added,
#   ClaimNb capped at 4 and Exposure capped at 1.
fonction_clean_data_1 <- function(dat) {
  dat[["VehGas"]] <- factor(dat[["VehGas"]])
  # Unit weight per row.
  dat[["n"]] <- 1
  # Cap the claim count at 4 and the exposure at 1.
  dat[["ClaimNb"]] <- pmin(dat[["ClaimNb"]], 4)
  dat[["Exposure"]] <- pmin(dat[["Exposure"]], 1)
  dat
}

# Average Poisson deviance, scaled by 200 / length(pred).
#
# Args:
#   pred: numeric vector of predicted claim counts.
#   obs:  numeric vector of observed claim counts, same length as pred.
# Returns:
#   A single numeric value.
Poisson.Deviance <- function(pred, obs) {
  n_pred <- length(pred)
  # Written as log((obs / pred)^obs) so that obs == 0 contributes log(1) = 0.
  log_part <- sum(log((obs / pred)^(obs)))
  total <- sum(pred) - sum(obs) + log_part
  200 * total / n_pred
}

#########  feature pre-processing for GLM
# Build the GLM-style and tree-style versions of the rating factors.
#
# Args:
#   dat2: data frame with columns Area, VehPower, VehAge, DrivAge, BonusMalus,
#         Density, Region and VehGas.
# Returns:
#   `dat2` with the derived columns AreaGLM, VehPowerGLM, VehAgeGLM,
#   DrivAgeGLM, BonusMalusGLM, DensityGLM, AreaRT and VehGasRT appended, and
#   Region re-levelled so that "R24" is the reference level.
fonction_clean_data_2 <- function(dat2) {
  # Area as its integer code; reused below for the tree feature.
  area_code <- as.integer(dat2$Area)
  dat2$AreaGLM <- area_code

  # Vehicle power: every value above 9 is merged into the "9" class.
  dat2$VehPowerGLM <- as.factor(pmin(dat2$VehPower, 9))

  # Vehicle age classes: age 0 -> 1, ages 1-10 -> 2, ages 11-110 -> 3.
  # Row i of the lookup corresponds to age i - 1.
  veh_age_lookup <- cbind(0:110, rep(1:3, times = c(1, 10, 100)))
  veh_age_class <- as.factor(veh_age_lookup[dat2$VehAge + 1, 2])
  dat2$VehAgeGLM <- relevel(veh_age_class, ref = "2")

  # Driver age classes: 18-20, 21-25, 26-30, 31-40, 41-50, 51-70, 71-100.
  # Row i of the lookup corresponds to age i + 17.
  driv_age_lookup <- cbind(18:100, rep(1:7, times = c(3, 5, 5, 10, 10, 20, 30)))
  driv_age_class <- as.factor(driv_age_lookup[dat2$DrivAge - 17, 2])
  dat2$DrivAgeGLM <- relevel(driv_age_class, ref = "5")

  # Bonus-malus capped at 150; density on the log scale.
  dat2$BonusMalusGLM <- as.integer(pmin(dat2$BonusMalus, 150))
  dat2$DensityGLM <- as.numeric(log(dat2$Density))

  dat2[, "Region"] <- relevel(dat2[, "Region"], ref = "R24")

  # Integer-coded versions of the categorical features.
  dat2$AreaRT <- area_code
  dat2$VehGasRT <- as.integer(dat2$VehGas)
  dat2
}

# min-max-scaler:
# Min-max scale one column to [-1, 1] and store it as "<var1>X".
#
# The source column is read and the scaled column written by name, so other
# columns (including ones that happen to be called "V1" or "X") are left
# untouched.
#
# Args:
#   var1: name (character scalar) of the column to scale.
#   dat2: data frame containing that column; it is converted with as.numeric().
# Returns:
#   `dat2` with the numeric column paste0(var1, "X") added. A constant column
#   is mapped to 0 (the midpoint of the target range) instead of NaN.
PreProcess.Continuous <- function(var1, dat2) {
  stopifnot(is.character(var1), length(var1) == 1, var1 %in% names(dat2))

  x <- as.numeric(dat2[[var1]])
  lo <- min(x)
  hi <- max(x)
  scaled <- 2 * (x - lo) / (hi - lo) - 1

  # A zero range would give 0 / 0 = NaN for every row.
  if (isTRUE(hi == lo)) {
    scaled[] <- 0
  }

  dat2[[paste0(var1, "X")]] <- scaled
  dat2
}

# pre-processing function:
# Prepare the network inputs from the raw rating factors.
#
# Args:
#   dat2: data frame with columns Area, VehPower, VehAge, DrivAge, BonusMalus,
#         VehBrand, VehGas, Density and Region.
# Returns:
#   `dat2` with the scaled / integer-coded columns AreaX, VehPowerX, VehAgeX,
#   DrivAgeX, BonusMalusX, VehBrandX, VehGasX, DensityX and RegionX added.
#   Note that VehAge, DrivAge and BonusMalus are overwritten with their capped
#   values and Density with its rounded logarithm.
Features.PreProcess <- function(dat2) {
  dat2 <- PreProcess.Continuous("Area", dat2)
  dat2 <- PreProcess.Continuous("VehPower", dat2)

  # Cap the long tails before scaling.
  dat2$VehAge <- pmin(dat2$VehAge, 20)
  dat2 <- PreProcess.Continuous("VehAge", dat2)
  dat2$DrivAge <- pmin(dat2$DrivAge, 90)
  dat2 <- PreProcess.Continuous("DrivAge", dat2)
  dat2$BonusMalus <- pmin(dat2$BonusMalus, 150)
  dat2 <- PreProcess.Continuous("BonusMalus", dat2)

  # Zero-based integer code for the brand.
  dat2$VehBrandX <- as.integer(dat2$VehBrand) - 1
  # Integer code shifted by 1.5, i.e. -0.5 / +0.5 for a two-level VehGas.
  dat2$VehGasX <- as.integer(dat2$VehGas) - 1.5

  dat2$Density <- round(log(dat2$Density), 2)
  dat2 <- PreProcess.Continuous("Density", dat2)

  dat2$RegionX <- as.integer(dat2$Region) - 1  # char R11,,R94 to number 0,,21

  # Return the data frame; printing it here would dump every row to the
  # console on each call.
  dat2
}

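# NOTE(review): leftover list of quoted numbers (apparently file identifiers
# like the ones looped over below); commented out because a bare
# comma-separated list is not valid R at top level.
# '81', '37', '71', '28', '70', '54', '59', '17', '79',
#             '30', '10', '97', '45', '39', '30', '91', '87', '56', '50',
#             '98', '44', '78', '23', '50', '16', '61', '7', '28', '89'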

In [None]:
# Results table, one row per fitted model. The first row is a placeholder
# that is dropped once the loop has finished.
data_frame <- data.frame(deviance_train = NA_real_,
                         deviance_test = NA_real_,
                         diff_train = NA_real_,
                         diff_test = NA_real_,
                         model = NA_character_,
                         stringsAsFactors = FALSE)

# Columns kept from the raw data, and the continuous inputs of the network.
feature_cols <- c("ClaimNb", "Exposure", "Area", "VehPower", "VehAge",
                  "DrivAge", "BonusMalus", "VehBrand", "VehGas", "Density",
                  "Region")
design_cols <- c("AreaX", "VehPowerX", "VehAgeX", "DrivAgeX", "BonusMalusX",
                 "VehGasX", "DensityX")

for (p in c('62', '81', '37', '71', '28', '70', '54', '59', '17', '79')) {
  print(p)

  # Read the two files for this identifier and pool them.
  result_learn <- paste0('../input/cvalidation/data_learn_', p, '.csv')
  learn <- read.csv(result_learn, header = TRUE, stringsAsFactors = TRUE)

  result_test <- paste0('../input/cvalidation1/data_test_', p, '.csv')
  test <- read.csv(result_test, header = TRUE, stringsAsFactors = TRUE)

  new <- rbind(learn, test)

  ## set the seed to make your partition reproducible
  set.seed(62)
  train_ind <- sample(seq_len(nrow(new)), size = floor(0.90 * nrow(new)))
  train <- new[train_ind, ]
  test <- new[-train_ind, ]

  # Second split: learn / valid are taken from `train`. The indices are row
  # positions within `train`, so they must not be applied to `new` (that
  # would let learning rows coincide with held-out test rows).
  set.seed(62)
  learn_ind <- sample(seq_len(nrow(train)), size = floor(0.90 * nrow(train)))
  learn <- train[learn_ind, ]
  valid <- train[-learn_ind, ]

  learn <- fonction_clean_data_2(learn)
  test <- fonction_clean_data_2(test)

  # NOTE(review): learn and test are min-max scaled separately, each with its
  # own min / max -- confirm this is intended.
  learn <- Features.PreProcess(learn[, feature_cols])
  test <- Features.PreProcess(test[, feature_cols])

  q0 <- length(design_cols)

  # learning data
  Xlearn <- as.matrix(learn[, design_cols])  # design matrix learning sample
  Brlearn <- as.matrix(learn$VehBrandX)
  Relearn <- as.matrix(learn$RegionX)
  Ylearn <- as.matrix(learn$ClaimNb)

  # testing data
  Xtest <- as.matrix(test[, design_cols])    # design matrix test sample
  Brtest <- as.matrix(test$VehBrandX)
  Retest <- as.matrix(test$RegionX)
  Ytest <- as.matrix(test$ClaimNb)

  # log-exposure, added to the network output as an offset
  Vlearn <- as.matrix(log(learn$Exposure))
  Vtest <- as.matrix(log(test$Exposure))

  # homogeneous claim frequency, used to initialise the output bias
  lambda.hom <- sum(learn$ClaimNb) / sum(learn$Exposure)

  # hyperparameters of the neural network architecture
  q1 <- 20 # Number of neurons in hidden layer 1
  q2 <- 15
  q3 <- 10
  d <- 2   # dimensions embedding layers for categorical features

  # Embedding sizes: the codes are zero-based, so the size is the largest
  # code seen in either sample plus one. Counting unique values in the
  # learning sample alone is too small if a category is missing there.
  BrLabel <- as.integer(max(learn$VehBrandX, test$VehBrandX) + 1)
  ReLabel <- as.integer(max(learn$RegionX, test$RegionX) + 1)

  # define the network architecture
  Design   <- layer_input(shape = c(q0),  dtype = 'float32', name = 'Design')
  VehBrand <- layer_input(shape = c(1),   dtype = 'int32', name = 'VehBrand')
  Region   <- layer_input(shape = c(1),   dtype = 'int32', name = 'Region')
  LogVol   <- layer_input(shape = c(1),   dtype = 'float32', name = 'LogVol')

  BrandEmb <- VehBrand %>%
    layer_embedding(input_dim = BrLabel, output_dim = d, input_length = 1, name = 'BrandEmb') %>%
    layer_flatten(name = 'Brand_flat')

  RegionEmb <- Region %>%
    layer_embedding(input_dim = ReLabel, output_dim = d, input_length = 1, name = 'RegionEmb') %>%
    layer_flatten(name = 'Region_flat')

  # The last dense layer starts with zero weights and bias log(lambda.hom),
  # so the initial network output is the homogeneous log-frequency.
  Network <- list(Design, BrandEmb, RegionEmb) %>% layer_concatenate(name = 'concate') %>%
    layer_dense(units = q1, activation = 'tanh', name = 'hidden1') %>%
    layer_dense(units = q2, activation = 'tanh', name = 'hidden2') %>%
    layer_dense(units = q3, activation = 'tanh', name = 'hidden3') %>%
    layer_dense(units = 1, activation = 'linear', name = 'Network',
                weights = list(array(0, dim = c(q3, 1)), array(log(lambda.hom), dim = c(1))))

  # Non-trainable layer with weight 1 and bias 0: exp(Network + LogVol).
  Response <- list(Network, LogVol) %>% layer_add(name = 'Add') %>%
    layer_dense(units = 1, activation = k_exp, name = 'Response', trainable = FALSE,
                weights = list(array(1, dim = c(1, 1)), array(0, dim = c(1))))

  model <- keras_model(inputs = c(Design, VehBrand, Region, LogVol), outputs = c(Response))
  model %>% compile(optimizer = optimizer_nadam(), loss = 'poisson')

  #summary(model)

  # NOTE(review): set.seed() seeds R's own RNG; presumably it does not fix the
  # TensorFlow weight initialisation -- confirm if exact reproducibility
  # of the fit is required.
  set.seed(42)
  fit <- model %>% fit(list(Xlearn, Brlearn, Relearn, Vlearn), Ylearn, epochs = 450,
                       batch_size = 10000, verbose = 1, validation_split = 0)

  # calculating the predictions
  learn$fitNNemb <- as.vector(model %>% predict(list(Xlearn, Brlearn, Relearn, Vlearn)))
  test$fitNNemb <- as.vector(model %>% predict(list(Xtest, Brtest, Retest, Vtest)))

  # One typed result row: deviances and relative bias (in percent) of the
  # predicted total claim count. A data frame is used instead of c(...) so
  # the numeric metrics are not coerced to character by the model label.
  new_row <- data.frame(
    deviance_train = Poisson.Deviance(learn$fitNNemb, learn$ClaimNb),
    deviance_test = Poisson.Deviance(test$fitNNemb, test$ClaimNb),
    diff_train = ((sum(learn$fitNNemb) - sum(learn$ClaimNb)) / sum(learn$ClaimNb)) * 100,
    diff_test = ((sum(test$fitNNemb) - sum(test$ClaimNb)) / sum(test$ClaimNb)) * 100,
    model = 'NNemb',
    stringsAsFactors = FALSE
  )

  # Appended per iteration so partial results survive an interrupted run.
  data_frame <- rbind(data_frame, new_row)
}


In [None]:
# Remove the first row of the results table, then display what is left.
data_frame <- data_frame[-1, ]
data_frame

In [None]:
# Show the test deviances first, then the training deviances.
print(data_frame[["deviance_test"]])
print(data_frame[["deviance_train"]])