# Packages 🗃

In [73]:
install_packages <- function() {
  # Install the CRAN packages this notebook relies on.
  # Packages are installed one call at a time, in this order:
  # caret, rpart, dplyr.
  for (pkg in c("caret", "rpart", "dplyr")) {
    install.packages(pkg)
  }
}
install_packages()

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



In [74]:
global_imports <- function() {
  # Attach the packages used throughout the notebook: caret, rpart, dplyr.
  #
  # library() is used rather than require(): require() merely returns FALSE
  # (with a warning) when a package is not installed, so execution would
  # continue and fail later with an unrelated "could not find function"
  # error. library() stops immediately and names the missing package.
  #
  # Returns
  # -------
  # TRUE, invisibly, once every package has been attached.
  for (pkg in c("caret", "rpart", "dplyr")) {
    library(pkg, character.only = TRUE)
  }
  invisible(TRUE)
}
global_imports()

Loading required package: rpart



# Loading the data 📲

In [75]:
# Load the training contracts. The file is comma-separated with "." as the
# decimal mark, which is exactly what read.csv() assumes by default.
db <- read.csv("training.csv")

In [76]:
# Features: every column of db except the target, in their original order.
Xdata <- db[setdiff(names(db), "claim_amount")]
# Target: kept as a one-column data frame named claim_amount.
ydata <- db["claim_amount"]

In [77]:
preprocess_X_data <- function (x_raw){
  # Data preprocessing function: given x_raw, clean the data for training
  # or prediction.
  #
  # Parameters
  # ----------
  # x_raw : Dataframe, with the columns described in the data dictionary.
  #   Each row is a different contract. This data has not been processed.
  #
  # Returns
  # -------
  # A cleaned / preprocessed version of the dataset: unused columns dropped,
  # missing values median-imputed by caret, categorical columns as factors.

  # Feature selection ------------------------------------------------------
  # Author's note: drv_age1 is correlated with drv_age_lic1, and vh_weight
  # with vh_value, so at most one of each pair should be used.
  X <- dplyr::select(
    x_raw,
    -c("id_policy", "year", "drv_sex2", "drv_age2", "drv_age_lic2",
       "vh_make_model", "vh_weight", "drv_age_lic1", "vh_value",
       "pol_no_claims_discount", "pol_pay_freq", "vh_type", "vh_fuel")
  )

  # Missing data -----------------------------------------------------------
  # The imputer is estimated on the very data frame it is then applied to.
  median_imputer <- preProcess(X, method = "medianImpute")
  X_clean <- predict(median_imputer, X)

  # Categorical variables --------------------------------------------------
  # "AllTrips" usage is folded into "Professional"; the remaining columns are
  # only converted to factors.
  X_clean <- mutate(
    X_clean,
    pol_coverage = factor(pol_coverage),
    pol_usage = as.factor(
      ifelse(pol_usage == "AllTrips", "Professional", as.character(pol_usage))
    ),
    drv_sex1 = factor(drv_sex1),
    drv_drv2 = factor(drv_drv2),
    pol_payd = factor(pol_payd)
  )

  # Disabled experiment: quartile binning of vehicle age and driver age.
  #cuts_vh_age = quantile(X_clean$vh_age,seq(0, 1, 0.25))
  #X_clean$vh_age_factor = cut(X_clean$vh_age, breaks=cuts_vh_age)
  #cuts_drv_age = quantile(X_clean$drv_age1,seq(0, 1, 0.25))
  #X_clean$drv_age_factor = cut(X_clean$drv_age1, breaks=cuts_drv_age)
  #X_clean <- dplyr::select(X_clean,-drv_age1)

  # Drop the intermediates before handing back the cleaned table.
  rm(X, median_imputer)
  X_clean
}

In [78]:
res<-preprocess_X_data(Xdata)

In [79]:
# Hold out 20% of the contracts for evaluation.
#
# The cleaned features are taken from `res` (the result of
# preprocess_X_data(Xdata)) and the target from `ydata`. X_clean / y_clean are
# defined here explicitly because nothing assigns them at top level, so the
# cell would otherwise only work with leftovers from an earlier session.
X_clean <- res
y_clean <- ydata[["claim_amount"]]

set.seed(432)  # fixed seed so the split is reproducible
trainIndex <- createDataPartition(seq_len(nrow(X_clean)), p = 0.8, list = FALSE)
df <- data.frame(y_clean, X_clean)
X_train <- X_clean[trainIndex, ]
y_train <- y_clean[trainIndex]
X_test <- X_clean[-trainIndex, ]
y_test <- y_clean[-trainIndex]

## Define the training logic

In [80]:
fit_model <- function (x_raw, y_raw){
  # Model training function: given training data (x_raw, y_raw), fit the
  # pricing model as two parts, claim occurrence and mean claim severity.
  #
  # Parameters
  # ----------
  # x_raw : Dataframe, with the columns described in the data dictionary.
  #   Each row is a different contract. This data has not been processed.
  # y_raw : the claim values, in the same order as the contracts in x_raw.
  #   The formulas below refer to a column called claim_amount, so y_raw
  #   must contribute a column with that name when combined with the
  #   features (e.g. a one-column data frame named claim_amount).
  #
  # Returns
  # -------
  # A list with two fitted models:
  #   occurence : binomial glm for the event claim_amount > 0
  #   cost      : intercept-only lm of log(claim_amount), positive claims only

  # Preprocess the features, then put the target in front of them.
  X_clean <- preprocess_X_data(x_raw)
  df <- data.frame(y_raw, X_clean)
  rm(X_clean)  # presumably dropped to keep the function environment small

  # Frequency model (i.e. how often do claims occur): every remaining column
  # is a predictor, with log(pol_duration) as offset.
  occurrence_fit <- glm(
    I(claim_amount > 0) ~ .,
    data = df,
    family = binomial,
    offset = log(pol_duration)
  )

  # Mean severity model (i.e. what is the average claim cost): fitted on the
  # log scale, using only the contracts that actually had a claim.
  severity_fit <- lm(
    log(claim_amount) ~ 1,
    data = df,
    subset = claim_amount > 0,
    offset = log(pol_duration)
  )
  rm(df)

  # The returned list is what gets saved in the next section.
  list(occurence = occurrence_fit, cost = severity_fit)
}

In [81]:
model = fit_model(Xdata, ydata)

## Saving your model

In [82]:
save_model <- function(model){
  # Persist the trained model so it can be reloaded for prediction later.
  #
  # The object is written under the name `model` to 'trained_model.RData' in
  # the current working directory.
  #
  # Leave this alone unless you need a different saving format (some packages
  # require one); if you do change it, keep load_model compatible.
  # For h2o models see this discussion:
  # https://discourse.aicrowd.com/t/any-tips-for-successfully-submitting-an-h2o-model/4194/

  save(list = "model", file = "trained_model.RData")
}

In [83]:
load_model <- function(){
  # Load a saved trained model from the file `trained_model.RData`.
  # This is called by the server to evaluate your submission on hidden data.
  #
  # Only modify this *if* you modified save_model.
  #
  # The file is loaded into its own empty environment and `model` is looked
  # up there only. This way a file that lacks a `model` object raises an
  # error instead of silently returning some unrelated `model` variable that
  # happens to exist in the calling session.
  #
  # Returns
  # -------
  # The object stored under the name `model` in 'trained_model.RData'.

  loaded <- new.env(parent = emptyenv())
  load("trained_model.RData", envir = loaded)
  if (!exists("model", envir = loaded, inherits = FALSE)) {
    stop("'trained_model.RData' does not contain an object named 'model'",
         call. = FALSE)
  }
  get("model", envir = loaded, inherits = FALSE)
}

In [85]:
#model = load_model(MODEL_OUTPUT_PATH)

# Predicting the claims 💵

The second function, `predict_expected_claim`, takes your trained model and a dataframe of contracts, and outputs a prediction for the (expected) claim incurred by each contract. This expected claim can be seen as the probability of an accident multiplied by the cost of that accident.

This is the function used to compute the _RMSE_ leaderboard, where the model best able to predict claims wins.

In [86]:
predict_expected_claim <- function(model, x_raw){
  # Model prediction function: predicts the average claim based on the
  # pricing model.
  #
  # The expected claim of each contract in x_raw is computed as the predicted
  # probability of having a claim multiplied by the predicted cost of a claim
  # if it occurs.
  #
  # This is the function used in the RMSE leaderboard, and hence the output
  # should be as close as possible to the expected cost of a contract.
  #
  # Parameters
  # ----------
  # model : the list returned by fit_model (elements `occurence` and `cost`).
  # x_raw : Dataframe, with the columns described in the data dictionary.
  #   Each row is a different contract. This data has not been processed.
  #
  # Returns
  # -------
  # avg_claims: a one-dimensional array of the same length as x_raw, with one
  #     average claim per contract (in same order). These average claims must
  #     be POSITIVE (>0).

  # Apply the same preprocessing as at training time.
  X_clean <- preprocess_X_data(x_raw)

  claim_probability <- predict(model$occurence, newdata = X_clean,
                               type = "response")
  # The cost model was fitted on log(claim_amount); exp() maps it back.
  log_claim_cost <- predict(model$cost, newdata = X_clean, type = "response")
  rm(X_clean)

  claim_probability * exp(log_claim_cost)
}

In [87]:
claims <- predict_expected_claim(model, Xdata)

# Pricing contracts 💰

The third and final function, `predict_premium`, takes your trained model and a dataframe of contracts, and outputs a _price_ for each of these contracts. **You are free to set these prices however you want!** These prices will then be used in competition with other models: contracts will choose the model offering the lowest price, and this model will have to pay the cost if an accident occurs.

This is the function used to compute the _profit_ leaderboard: your model will participate in many markets of size 10, populated by other participants' models, and we compute the average profit of your model over all the markets it participated in.

In [88]:
predict_premium <- function(model, x_raw){
  # Model prediction function: predicts premiums based on the pricing model.
  #
  # Outputs the prices that will be offered to the contracts in x_raw. The
  # price is the expected claim from predict_expected_claim with a flat
  # loading applied on top.
  #
  # This is the function used in the average profit leaderboard. Prices
  # output here will be used in competition with other models.
  #
  # Parameters
  # ----------
  # model : the trained model, as returned by fit_model.
  # x_raw : Dataframe, with the columns described in the data dictionary.
  #   Each row is a different contract. This data has not been processed.
  #
  # Returns
  # -------
  # prices: a one-dimensional array of the same length as x_raw, with one
  #     price per contract (in same order). These prices must be POSITIVE (>0).

  expected_claims <- predict_expected_claim(model, x_raw)
  expected_claims * 1.5  # boost prices by 50%
}


In [89]:
prices <- predict_premium(model, Xdata)
as.matrix(head(prices))

0,1
1,64.030718
2,77.55613
3,8.474777
4,87.795035
5,10.669359
6,91.077175


#### Profit on training data

In order for your model to be considered in the profit competition, it needs to make nonnegative profit over its training set. You can check that your model satisfies this condition below:

In [90]:
# Total premium collected vs. total claims paid on the training data.
# Missing prices are ignored when summing the income.
print(paste("Income:", sum(prices, na.rm = TRUE)))
print(paste("Losses:", sum(ydata)))

# Market rule 1: income must cover the losses on the training set.
if (sum(prices, na.rm = TRUE) >= sum(ydata)) {
    print("Your model passes the non-negative training profit test!")
} else {
    print("Your model loses money on the training data! It does not satisfy market rule 1: Non-negative training profit.")
    print("This model will be disqualified from the weekly profit leaderboard, but can be submitted for educational purposes to the RMSE leaderboard.")
}

[1] "Income: 29767749.8917614"
[1] "Losses: 26057988.08"
[1] "Your model passes the non-negative training profit test!"
