In [None]:
library(data.table) # to handle the data in a more convenient manner
library(tidyverse) # for a better work flow and more tools to wrangle and visualize the data
library(C50) # for C5.0 decision tree algorithm
library(gmodels) # for model evaluation
library(plotly) # for the plot_ly() histogram
library(tree) # for improved decision trees
library(listviewer) # for navigating nested/list objects
library(scales) # for formatting numbers
library(IRdisplay) # to help pretty print tables
options(warn = -1) # NOTE: a negative `warn` silences all warnings (not messages) for the rest of the session

In [None]:
# Load the loan data; `stringsAsFactors = TRUE` turns text columns into factors.
loan <- data.table::fread(
  "../data/csv/loan_final313.csv",
  stringsAsFactors = TRUE
)

In [None]:
# Print the compact structure of the loaded table.
str(loan)

In [None]:
# Print per-column summary statistics of the loaded table.
summary(loan)

In [None]:
# Working table for modelling, derived from the raw `loan` data.
loan.dt <- as.data.table(loan)

# Display the number of distinct values found in every column.
sapply(loan.dt, function(col) length(unique(col)))

# Lookup table `b`: one row per column (`name`) with its distinct-value `count`.
a <- as.data.table(sapply(loan.dt, function(col) length(unique(col))))
b <- cbind(names(loan.dt), a)
colnames(b) <- c("name", "count")

# Remove columns that hold a single distinct value.
constant_cols <- b[count == 1, name]
loan.dt[, (constant_cols) := NULL]

# Remove the `id` column.
loan.dt[, id := NULL]

# Remove the `*_cat` columns
# (presumably encodings of the matching categorical columns -- TODO confirm).
cat_cols <- c(
  "home_ownership_cat", "income_cat", "term_cat", "purpose_cat",
  "interest_payment_cat", "loan_condition_cat", "grade_cat"
)
loan.dt[, (cat_cols) := NULL]

# Remove the two `*_d` columns.
date_cols <- c("issue_d", "final_d")
loan.dt[, (date_cols) := NULL]

In [None]:
# Numeric columns of the raw data (`loan`, not the cleaned `loan.dt`).
loan_num <- loan %>% purrr::keep(is.numeric)

# Names of the six statistics reported by summary() for a numeric vector
# (Min., 1st Qu., Median, Mean, 3rd Qu., Max.).
stat_names <- names(summary(1))

# One row per numeric column, one column per statistic.
# The statistics are taken directly from summary() of each column instead of
# being parsed back out of the formatted text of summary(<data frame>): the
# text form is rounded for printing and gains an extra "NA's" row when a column
# has missing values, which would no longer match the six column names.
summaries <- loan_num %>%
  vapply(
    function(col) as.numeric(summary(col)[stat_names]),
    numeric(length(stat_names))
  ) %>%
  t() %>% # vapply() returns statistics x columns; flip to columns x statistics
  magrittr::set_colnames(stat_names)

summaries

In [None]:
# Histogram of the target variable `loan_condition`.
loan %>%
  plotly::plot_ly(
    x = ~loan_condition,
    type = "histogram"
  )

In [None]:
# Density plot of every numeric predictor, one facet per column.
# A density is only defined for continuous values, so non-numeric columns
# (e.g. factors created by `stringsAsFactors = TRUE`) are dropped before
# reshaping; mixing them in would coerce `value` to a non-numeric type.
loan.dt[, !"loan_condition"] %>% # all columns except the target
  purrr::keep(is.numeric) %>% # numeric columns only
  tidyr::pivot_longer(
    cols = dplyr::everything(),
    names_to = "key", # original column name
    values_to = "value" # observed value
  ) %>%
  ggplot(aes(value)) + # plot value
  facet_wrap(~ key, scales = "free") + # separate plot per column, own axes
  geom_density(fill = "green") # density curves

In [None]:
# Numeric columns of the cleaned table.
loan.dt1 <- purrr::keep(loan.dt, is.numeric)

In [None]:
# Pairwise correlations of the numeric columns, drawn as a mixed plot:
# ellipses above the diagonal, coefficients below it.
corr_matrix <- cor(loan.dt1)
corrplot::corrplot.mixed(
  corr_matrix,
  upper = "ellipse",
  lower = "number",
  tl.pos = "lt",
  number.cex = .5,
  lower.col = "blue",
  tl.cex = 0.7
)

In [None]:
# Fix the RNG state so the random split below is reproducible.
set.seed(1903)

# Row indices of a random three-quarter sample used for training.
train <- loan.dt[, sample(.I, size = .N * 3/4)]

# C5.0 tree on the training rows, fitted through the formula interface ...
fitc1 <- C50::C5.0(
  loan_condition ~ .,
  data = loan.dt[train]
)

# ... and through the predictors/outcome (x/y) interface.
fitc <- C50::C5.0(
  x = loan.dt[train, -"loan_condition"],
  y = loan.dt[train, loan_condition]
)
fitc

In [None]:
# Materialise the split: sampled rows for training, the remainder for testing.
loan_train <- loan.dt[train, ]
loan_test <- loan.dt[-train, ]

In [None]:
# Draw the C5.0 tree fitted through the formula interface.
plot(fitc1)

In [None]:
# Print the detailed C5.0 model summary for the formula-interface fit.
summary(fitc1)

In [None]:
# Same training rows, but with `rules = TRUE` so C5.0 produces a rule set.
fitc_rules <- C50::C5.0(
  loan_condition ~ .,
  data = loan.dt[train, ],
  rules = TRUE
)
summary(fitc_rules)

In [None]:
# Predicted classes for the training rows.
predc_train <- fitc %>%
  predict(newdata = loan.dt[train], type = "class")

In [None]:
# Cross-tabulate actual vs. predicted classes on the training rows.
# Row, column and chi-square proportions are switched off.
dt_ct6 <- gmodels::CrossTable(
  x = loan.dt[train, loan_condition],
  y = predc_train,
  prop.r = FALSE,
  prop.c = FALSE,
  prop.chisq = FALSE,
  dnn = c("actual default", "predicted default")
)

In [None]:
# Predicted classes for the held-out rows.
predc <- fitc %>%
  predict(newdata = loan.dt[-train], type = "class")

In [None]:
# Cross-tabulate actual vs. predicted classes on the held-out rows.
# Row, column and chi-square proportions are switched off.
dt_ct7 <- gmodels::CrossTable(
  x = loan.dt[-train, loan_condition],
  y = predc,
  prop.r = FALSE,
  prop.c = FALSE,
  prop.chisq = FALSE,
  dnn = c("actual default", "predicted default")
)

In [None]:
# Tree from the `tree` package on the training rows, split by deviance.
fit <- tree::tree(
  formula = loan_condition ~ .,
  data = loan.dt[train],
  split = "deviance"
)

# Draw the tree skeleton, then add the split labels on top of it.
plot(fit)
text(fit)

In [None]:
# Print the summary of the `tree` package model.
summary(fit)

In [None]:
# Predictions of the `tree` model for the held-out rows
# (one column per class; the winning column is picked in a later step).
pred <- fit %>%
  predict(newdata = loan.dt[-train])
pred

In [None]:
# For every row pick the column with the largest value (ties broken at random)
# and translate the column index into its class label.
best_col <- max.col(pred, ties.method = "random")
pred_class <- colnames(pred)[best_col]

In [None]:
# Cross-tabulate actual classes vs. `tree` predictions on the held-out rows.
# Row, column and chi-square proportions are switched off.
dt_ct8 <- gmodels::CrossTable(
  x = loan.dt[-train, loan_condition],
  y = pred_class,
  prop.r = FALSE,
  prop.c = FALSE,
  prop.chisq = FALSE,
  dnn = c("actual default", "predicted default")
)

In [None]:
# Boosted C5.0 model: 10 boosting trials on the training set (x/y interface).
loan_boost10 <- C5.0(
  x = loan_train[, -"loan_condition"],
  y = loan_train$loan_condition,
  trials = 10
)

In [None]:
# Print the boosted model object.
loan_boost10

In [None]:
# Print the detailed summary of the boosted model.
summary(loan_boost10)

In [None]:
# Boosted-model predictions for the training set.
loan_result_boost10 <- loan_boost10 %>%
  predict(newdata = loan_train)

In [None]:
# Cross-tabulate actual classes vs. boosted-model predictions on the training
# set. The returned list is kept so its components (e.g. `prop.tbl`) can be
# reused afterwards.
# `FALSE` is spelled out: `F` is an ordinary variable that can be reassigned,
# and the other CrossTable() calls in this notebook already use `FALSE`.
ct_dt3 <- gmodels::CrossTable(
  loan_train$loan_condition,
  loan_result_boost10,
  prop.chisq = FALSE,
  prop.c = FALSE,
  prop.r = FALSE,
  dnn = c("actual default", "predicted default")
)

ct_dt3

In [None]:
# Training accuracy: the diagonal of the proportion table holds the share of
# correctly classified rows; sum it and format as a percentage.
accuracy_train <- sum(diag(ct_dt3$prop.tbl))
scales::percent(accuracy_train, accuracy = 0.01)

In [None]:
# Boosted-model predictions for the test set.
loan_pred_boost10 <- loan_boost10 %>%
  predict(newdata = loan_test)

In [None]:
# Cross-tabulate actual classes vs. boosted-model predictions on the test set.
# The returned list is kept so its components (e.g. `t`, `prop.tbl`) can be
# reused afterwards.
# `FALSE` is spelled out: `F` is an ordinary variable that can be reassigned,
# and the other CrossTable() calls in this notebook already use `FALSE`.
ct_dt4 <- gmodels::CrossTable(
  loan_test$loan_condition,
  loan_pred_boost10,
  prop.chisq = FALSE,
  prop.c = FALSE,
  prop.r = FALSE,
  dnn = c("actual default", "predicted default")
)

In [None]:
# Print the stored CrossTable result for the test set.
ct_dt4

In [None]:
# Show the test-set count table as a rendered HTML table.
# Outside a knitr document kable() defaults to a Markdown pipe table, returned
# as one string per line, so HTML is requested explicitly and the result is
# collapsed into the single string that display_html() expects.
ct_dt4$t %>%
  knitr::kable(format = "html") %>%
  as.character() %>%
  paste(collapse = "\n") %>%
  IRdisplay::display_html()

In [None]:
# Test accuracy: the diagonal of the proportion table holds the share of
# correctly classified rows; sum it and format as a percentage.
accuracy_test <- sum(diag(ct_dt4$prop.tbl))
scales::percent(accuracy_test, accuracy = 0.01)