# <center> XGBoost using SparklyR with OHE in R </center>


## Importing required functions & libraries

In [6]:
library(sparklyr)
library(dplyr)
library(DBI) #To use sequel commands
library(stringr)
library(xgboost)
library(data.table)
library(pROC)

## Set up and configure the Spark connection

In [7]:
# Start from sparklyr's default configuration, then override the driver and
# executor resource settings listed below.
config <- spark_config()
# config$sparklyr.cores.local <- 4
resource_settings <- list(
  spark.driver.cores = 1,
  spark.driver.memory = "2G",
  spark.executor.cores = 3,
  spark.executor.memory = "6G",
  spark.dynamicAllocation.maxExecutors = 200
)
for (setting in names(resource_settings)) {
  config[[setting]] <- resource_settings[[setting]]
}

In [8]:
# Open a Spark connection in local mode using the `config` object.
sc <- spark_connect(config = config, master = "local")

In [9]:
# Echo the core counts stored on the connection's configuration.
for (role in c("Driver", "Executor")) {
  key <- paste0("spark.", tolower(role), ".cores")
  print(paste(role, "cores =", sc$config[[key]]))
}

[1] "Driver cores = 1"
[1] "Executor cores = 3"


## Import Required Dataset

In [10]:
# Timestamp marking the start of the run (used later for the timing report).
t_import <- Sys.time()
# Read the CSV into a local R data frame.
df <- read.csv("Delay_20k.csv", header = TRUE, sep = ",")
# copy_to() copies the local data frame into Spark as the table "df_sc".
# overwrite = TRUE lets this cell be re-run in the same Spark session without
# failing because the table already exists.
df_tbl <- copy_to(sc, df, "df_sc", overwrite = TRUE)
# Notes kept for reference:
# iris_preview <- dbGetQuery(sc, "SELECT * FROM iris LIMIT 10")
# spark_read_csv - reads a CSV file directly into Spark (e.g. from HDFS)
# flights_tbl %>% filter(dep_delay == 2) - dplyr verbs can be used on tables in the cluster

In [11]:
# Preview the Spark table; printing it shows only the first rows.
df_tbl

# Source:   table<df_sc> [?? x 30]
# Database: spark_connection
        X  Year Month DayofMonth DayOfWeek DepTime CRSDepTime ArrTime CRSArrTime
    <int> <int> <int>      <int>     <int>   <int>      <int>   <int>      <int>
 1 1.33e6  2008     3         29         6    1435       1425    1550       1530
 2 5.34e6  2008     9         27         6    1343       1320    1641       1600
 3 5.60e6  2008    10         31         5    1205       1159    1419       1425
 4 5.94e6  2008    10         12         7    1716       1702    2031       2030
 5 2.38e6  2008     4         12         6    1516       1440    1801       1730
 6 4.57e6  2008     8         15         5    1656       1441    1936       1720
 7 3.71e6  2008     7         27         7    2238       2017    2351       2140
 8 4.12e6  2008     7          1         2    1115       1005    1303       1205
 9 4.90e5  2008     1         16         3    1743       1440    1915       1610
10 4.53e6  2008     8         29         5   

In [12]:
# List the column names of the Spark table.
colnames(df_tbl)

## Data Preparation / Feature Engineering

### Remove the unwanted columns and create the dependent column

In [13]:
# Timestamp marking the start of data preparation.
t_Data_Prep <- Sys.time()
# Columns required to build the model
selected_columns <- c(
  "Origin", "Dest", "Distance", "Month", "DayOfWeek", "UniqueCarrier",
  "Dep_Hour", "DepDelay_flag"
)

# Derive the target and the departure-hour feature, then keep only the
# modelling columns.
#   DepDelay_flag: 1 when DepDelay >= 15 (presumably minutes - confirm against
#                  the dataset), otherwise 0.
#   Dep_Hour:      two-character hour taken from CRSDepTime: the first two
#                  characters of a 4-character value, "0" plus the first
#                  character of a 3-character value, otherwise "00".
df_tbl <- df_tbl %>%
  mutate(
    DepDelay_flag = ifelse(DepDelay >= 15, 1, 0),
    Dep_Hour = ifelse(
      nchar(CRSDepTime) == 4,
      substr(CRSDepTime, 1, 2),
      ifelse(
        nchar(CRSDepTime) == 3,
        paste("0", substr(CRSDepTime, 1, 1), sep = ""),
        "00"
      )
    )
  ) %>%
  # all_of() states explicitly that selected_columns is an external character
  # vector of names, not a column of the table, and fails loudly if any of the
  # names is missing.
  select(all_of(selected_columns))

### One hot encoding in R

In [14]:
# Bring the prepared Spark table into R as a local data frame.
all_data <- as.data.frame(df_tbl)
label <- "DepDelay_flag"

# One-hot encode every predictor (all columns except the label). The "+ 0"
# removes the intercept column from the design matrix.
new_data <- model.matrix(~ . + 0, data = all_data[, colnames(all_data) != label])

data_df <- as.data.frame(new_data)
# With the default na.action, model.matrix() drops rows that have an NA in any
# predictor. Attach the label by the surviving row names so the two stay
# aligned instead of assuming the row counts still match.
data_df[, label] <- all_data[rownames(new_data), label]

## Model Development

### Split the dataset into train & test

In [15]:
# Timestamp marking the start of the modelling phase.
t_model <- Sys.time()

# Randomly pick 70% of the row indices for training (no seed is set in this
# cell, so the split differs between runs); the remaining rows form the test set.
n_obs <- nrow(data_df)
dt <- sort(sample(n_obs, n_obs * .7))

train_df <- data_df[dt, ]
test_df <- data_df[-dt, ]

### Create matrix to implement xgboost

In [16]:
# Feature matrices: every column except the label.
train_features <- colnames(train_df) != label
test_features <- colnames(test_df) != label
new_train <- as.matrix(train_df[, train_features])
new_test <- as.matrix(test_df[, test_features])

# Wrap the features and their labels in xgb.DMatrix objects for xgboost.
dtrain <- xgb.DMatrix(label = train_df[[label]], data = new_train)
dtest <- xgb.DMatrix(label = test_df[[label]], data = new_test)

### Fit the model to the training data

In [17]:
# Booster settings for a tree-based binary classifier.
parameters <- list(
  booster = "gbtree",
  objective = "binary:logistic",
  nthread = 4,
  max.depth = 5,
  subsample = 0.8,
  eta = 0.2
)

# Train for 60 boosting rounds with console output switched off, recording AUC
# as the evaluation metric.
model_xgboost <- xgboost(
  params = parameters,
  data = dtrain,
  nrounds = 60,
  eval_metric = "auc",
  verbose = FALSE
)
# Highest training AUC recorded across the boosting rounds.
max(model_xgboost$evaluation_log$train_auc)

### Predict using test data & check AUC value

In [18]:
# Score the held-out rows with the fitted model.
pred <- predict(model_xgboost, dtest)
# Fix the class order (0 = control, 1 = case) and the comparison direction
# (controls score lower than cases). With pROC's automatic direction the
# comparison can be flipped to suit the data, which would overstate the test
# AUC of a model that ranks worse than chance.
roc_curve <- roc(
  test_df[, label],
  pred,
  levels = c(0, 1),
  direction = "<",
  auc = TRUE
)
Auc <- auc(roc_curve)
print(paste("Test Area Under ROC:", Auc))
# Timestamp marking the end of the modelling phase.
t_end <- Sys.time()

[1] "Test Area Under ROC: 0.592860310017562"


## Exporting results

In [19]:
# Elapsed time of each phase, derived from the timestamps taken along the way.
Overall_time <- t_end - t_import
Data_Extraction_time <- t_Data_Prep - t_import
Data_Preparation_time <- t_model - t_Data_Prep
Model_time <- t_end - t_model

# Express every duration in seconds so the exported numbers are comparable.
units(Overall_time) <- "secs"
units(Data_Extraction_time) <- "secs"
units(Data_Preparation_time) <- "secs"
units(Model_time) <- "secs"

# NOTE: despite its name, this column holds the test AUC, not classification
# accuracy; the name is kept so the exported file layout does not change.
Accuracy <- Auc

# Single-row summary: one column per timing plus the model score.
result_df <- cbind(
  Overall_time = Overall_time,
  Data_Extraction_time = Data_Extraction_time,
  Data_Preparation_time = Data_Preparation_time,
  Model_time = Model_time,
  Accuracy = Accuracy
)

write.csv(result_df, file = "Results_XGB_R_20k_v1.csv")