In [7]:
# devtools::install_url("https://github.com/catboost/catboost/releases/download/v0.20/catboost-R-Windows-0.20.tgz",
#     INSTALL_opts = c("--no-multiarch", "--no-test-load"))

In [8]:
library(dplyr)
# https://medium.com/ampersand-academy/how-to-create-regression-model-using-catboost-package-in-r-programming-6cce3805a5e1
library(mlbench)
library(catboost)
library(caret)


Attaching package: 'dplyr'


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union




## import data

In [9]:
# Load the modelling table from the CSV file in the working directory.
ml_data <- read.csv(file = "ml_predict_data.csv")

In [10]:
# Use the `X` column as the row names of ml_data (the original note describes
# it as the county name), then remove it so it is not carried along as a
# regular column.
row.names(ml_data) <- ml_data$X
# Columns to discard.
drops <- c("X")
# drop = FALSE keeps the result a data.frame even if only a single column
# survives the filter; without it `[` would collapse to a bare vector.
ml_data <- ml_data[, !(names(ml_data) %in% drops), drop = FALSE]

## split to test and train

In [11]:
# Split the data into a training part and a held-out part.
# The seed is fixed so the partition is reproducible.
set.seed(1)
# Despite its name, validation_index holds the rows selected by the 80%
# partition on avg_perc_change, i.e. the rows used for training below.
validation_index <- createDataPartition(
    y = ml_data$avg_perc_change,
    p = 0.80,
    list = FALSE
)
# Rows chosen by the partition (80%) are used to fit the model ...
training <- ml_data[validation_index, ]
# ... and the remaining rows (20%) are kept aside for evaluation.
testing <- ml_data[-validation_index, ]

In [12]:
# Separate the target column (avg_perc_change) from the remaining columns for
# each split. unlist() flattens the one-column data frame into a vector.
y_train <- unlist(training["avg_perc_change"])
X_train <- select(training, -avg_perc_change)

y_test <- unlist(testing["avg_perc_change"])
X_test <- select(testing, -avg_perc_change)

## convert to catboost-specific format

In [13]:
# Bundle features and labels into CatBoost pool objects, one per split.
train_pool <- catboost.load_pool(X_train, label = y_train)
test_pool <- catboost.load_pool(X_test, label = y_test)

## hyperparameters

In [14]:
# Training configuration passed to catboost.train() via its `params` argument.
params <- list(
    iterations = 500,
    learning_rate = 0.01,
    depth = 10,
    loss_function = "RMSE",
    eval_metric = "RMSE",
    random_seed = 0,
    od_type = "Iter",
    # The recorded training log prints one line every 50 iterations.
    metric_period = 50,
    od_wait = 20,
    # NOTE(review): the recorded run shows CatBoost switching this to false
    # because the training call supplies no test set.
    use_best_model = TRUE
)

## train the model

In [15]:
# Fit the CatBoost model on the training pool using the settings in `params`.
# NOTE(review): no test_pool is passed here, and the recorded run reports
# "use_best_model parameter has been switched to false value". If best-iteration
# selection is wanted, supply a validation pool through `test_pool` (ideally one
# that is separate from the data used for the final evaluation).
model <- catboost.train(
    train_pool,
    params = params
)

You should provide test set for use best model. use_best_model parameter has been switched to false value.
0:	learn: 0.1690426	total: 185ms	remaining: 1m 32s
50:	learn: 0.1538577	total: 2.69s	remaining: 23.7s
100:	learn: 0.1411900	total: 5.01s	remaining: 19.8s
150:	learn: 0.1302376	total: 7.41s	remaining: 17.1s
200:	learn: 0.1206829	total: 9.94s	remaining: 14.8s
250:	learn: 0.1124546	total: 12.6s	remaining: 12.5s
300:	learn: 0.1051598	total: 15s	remaining: 9.93s
350:	learn: 0.0985117	total: 17.4s	remaining: 7.38s
400:	learn: 0.0925348	total: 19.7s	remaining: 4.87s
450:	learn: 0.0873162	total: 22.2s	remaining: 2.41s
499:	learn: 0.0825775	total: 24.5s	remaining: 0us


## make predictions

In [16]:
# Score the held-out pool with the fitted model.
y_pred <- catboost.predict(model, test_pool)

In [20]:
# Summarise prediction quality against the observed held-out target values.
postResample(pred = y_pred, obs = testing$avg_perc_change)

## naive_roc_auc_score 

In [17]:
#' Naive pairwise ranking score (ROC-AUC style, for a continuous target)
#'
#' Looks at every ordered pair (a, b) with `y_true[a] > y_true[b]` and returns
#' the share of those pairs for which the predictions are ordered the same way
#' (`y_pred[a] > y_pred[b]`). Pairs whose predictions are exactly tied count
#' as one half.
#'
#' @param y_true Vector of observed values; must not contain missing values.
#' @param y_pred Vector of predictions, the same length as `y_true`; must not
#'   contain missing values.
#' @return A single number between 0 and 1, or `NaN` when there is no pair with
#'   distinct `y_true` values (which includes empty input).
naive_roc_auc_score <- function(y_true, y_pred) {
    # Fail early with a clear message instead of an obscure error inside `if`
    # (missing values) or a silently truncated comparison (length mismatch).
    stopifnot(
        length(y_true) == length(y_pred),
        !anyNA(y_true),
        !anyNA(y_pred)
    )

    num_same_sign <- 0
    num_pairs <- 0

    # seq_along() iterates zero times for empty input, unlike 1:length(x).
    for (a in seq_along(y_true)) {
        # All observations whose true value is strictly below observation `a`.
        lower <- y_true[a] > y_true
        num_pairs <- num_pairs + sum(lower)

        # Correctly ordered pairs score 1, exact prediction ties score 0.5.
        num_same_sign <- num_same_sign +
            sum(y_pred[a] > y_pred[lower]) +
            0.5 * sum(y_pred[a] == y_pred[lower])
    }

    # 0 / 0 yields NaN when no comparable pair exists.
    num_same_sign / num_pairs
}

In [19]:
# Pairwise ranking score of the predictions on the held-out data.
naive_roc_auc_score(y_true = y_test, y_pred = y_pred)