In [None]:
library(data.table) # to handle the data in a more convenient manner
library(tidyverse) # for a better work flow and more tools to wrangle and visualize the data
library(tm) # for text mining
library(SnowballC) # for word stemming
library(gridExtra) # for multiple plots
library(wordcloud) # visualize text data
library(RColorBrewer) # for beautifying visualizations with custom colors
library(e1071) # for naive bayes
library(gmodels) # model evaluation
library(knitr) # for better table printing
library(kableExtra) # for better table printing
library(scales) # for formatting numbers
library(magrittr) # tools for better handling data structures
library(purrr) # tools for better handling data structures
library(IRdisplay) # printing html tables from kable
options(warn = -1) # suppress warnings: a negative `warn` makes R ignore all warnings
library(plotly)
library(data.table) # to handle the data in a more convenient manner
library(tidyverse) # for a better work flow and more tools to wrangle and visualize the data
library(BBmisc) # for easy normalization of data
library(class) # for kNN classification algorithm 
#library(knnGarden) # knn classification algorithm
library(gmodels) # for model evaluation
library(plotly) # for interactive visualization
library(rebmix) # to load necessary data
library(corrplot) # for correlation plots
library(reshape2) # to melt data for boxplots
options(warn=-1) # suppress warnings: a negative `warn` makes R ignore all warnings

In [None]:
# Import the loan data set as a data.table; character columns become factors.
loan.dt <- data.table::fread(
    "../data/csv/loan_final313.csv",
    stringsAsFactors = TRUE
)

In [None]:
# Inspect the structure of the imported table: column names, types and sample values.
str(loan.dt)

In [None]:
# Work on a copy so the raw import in loan.dt is never modified by reference.
loan <- copy(loan.dt)

# Number of distinct values per column (displayed as the cell output).
# vapply() is used instead of sapply() so the result is always an integer
# vector, whatever the input looks like.
vapply(loan, function(col) length(unique(col)), integer(1L))

# Same counts as a one-column data.table ...
a <- as.data.table(vapply(loan, function(col) length(unique(col)), integer(1L)))

# ... paired with the column names, giving a lookup table (name, count).
b <- cbind(names(loan), a)
colnames(b) <- c("name", "count")

In [None]:
# Remove, by reference, every column that holds a single distinct value.
loan[, (b[count == 1, name]) := NULL]

# Remove the row identifier.
loan[, "id" := NULL]

# Remove the "_cat" columns.
# NOTE(review): presumably numeric encodings of the factor columns kept here - confirm.
loan[, c(
    "home_ownership_cat", "income_cat", "term_cat", "purpose_cat",
    "interest_payment_cat", "loan_condition_cat", "grade_cat"
) := NULL]

# Remove the two date columns.
loan[, c("issue_d", "final_d") := NULL]

In [None]:
# Bar chart of level frequencies for every factor column, one facet per column.
# The factor columns are selected, reshaped to long format (columns "key" and
# "value") and then plotted with free scales so each facet gets its own axes.
loan_factors <- ggplot(
    data = tidyr::gather(purrr::keep(loan, is.factor)),
    mapping = aes(x = value)
) +
    geom_bar() +
    facet_wrap(~ key, scales = "free")

# Render the static plot as an interactive plotly widget.
plotly::ggplotly(p = loan_factors)

In [None]:
# Density plot for every numeric column, one facet per column.
# The numeric columns are selected and reshaped to long format (columns "key"
# and "value") before plotting.
loan %>% purrr::keep(is.numeric) %>%
    tidyr::gather() %>%
    ggplot(aes(value)) +
        # `scales` is spelled out in full; the original `scale =` only worked
        # through partial argument matching.
        facet_wrap(~ key, scales = "free") +
        geom_density(fill = "green")

In [None]:
# Fix the RNG so the train/test split is reproducible.
set.seed(2018)

# Predictors only: every column except the target.
loan1 <- loan[, !"loan_condition"]

# Row indices of a random sample of three quarters of the rows (training set).
train <- sample(nrow(loan1), nrow(loan1) * 3 / 4)

# Target vector with its levels relabelled to "bad" / "good".
# NOTE(review): assumes loan_condition has exactly two levels, the "bad" one first - confirm.
labels <- factor(loan[["loan_condition"]], labels = c("bad", "good"))

In [None]:
# Fit a naive Bayes classifier on the training rows: the response is the
# training slice of `labels`, and `.` uses every column of loan1 as a predictor.
fit1 <- e1071::naiveBayes(formula = labels[train] ~ ., data = loan1[train])

In [None]:
# Display the fitted naive Bayes model.
fit1

In [None]:
# Posterior class probabilities for the rows that were not used for training.
pred_probs <- predict(object = fit1, newdata = loan1[-train], type = "raw")

# Same matrix, column by column, formatted as percentage strings with two decimals.
pred_percent <- apply(pred_probs, MARGIN = 2, FUN = scales::percent, accuracy = 0.01)

pred_percent

In [None]:
# Label each test row with the class that has the highest posterior probability.
# ties.method = "first" keeps the result deterministic. The default ("random")
# treats near-equal probabilities as ties and breaks them at random, so the
# labels could change between runs and disagree with predict(type = "class").
labs <- colnames(pred_percent)[max.col(pred_probs, ties.method = "first")]

# Show the formatted probabilities with the winning class as the row name.
pred_percent %>%
    magrittr::set_rownames(labs)

In [None]:
# Predicted class (rather than probabilities) for every hold-out row.
pred <- predict(
    object = fit1,
    newdata = loan1[-train],
    type = "class"
)

pred

In [None]:
# Check that the labels derived from the probability matrix match the classes
# returned by predict(type = "class").
identical(labs, as.character(pred))

In [None]:
# Confusion matrix and summary statistics: predictions versus true hold-out labels.
result <- caret::confusionMatrix(
    data = pred,
    reference = labels[-train]
)

result

In [None]:
# Inspect the components stored in the confusion-matrix object.
str(result)

In [None]:
# First entry of the overall statistics.
# NOTE(review): presumably the accuracy, per caret's ordering of `overall` - confirm.
result$overall[1]

In [None]:
# Correlation matrix of the numeric predictors, drawn as a mixed plot:
# ellipses in the upper triangle, numeric coefficients in the lower one.
cor(purrr::keep(loan1, is.numeric)) %>%
    corrplot::corrplot.mixed(
        lower = "number",
        upper = "ellipse",
        lower.col = "black",
        number.cex = .5,
        tl.pos = "lt",
        tl.cex = 0.7
    )

In [None]:
# Second naive Bayes model: same procedure as the first, with loan_amount
# removed from the predictors.
# NOTE(review): presumably dropped because of the correlations shown in the
# correlation plot - confirm.
set.seed(2018)
loan2 <- loan[,-c("loan_condition","loan_amount")]

# Row indices of a random sample of three quarters of the rows (training set).
train2 <- loan2[,sample(.I, .N*3/4)]

# Target vector with its levels relabelled to "bad" / "good".
labels <- factor(loan$loan_condition, labels = c("bad", "good"))

# `formula =` stays the first supplied argument so the formula method is used.
fit2 <- e1071::naiveBayes(formula = labels[train2] ~ ., data = loan2[train2])

# Posterior probabilities for the hold-out rows, raw and as percentage strings.
pred_probs2 <- predict(fit2, loan2[-train2], type = "raw")
pred_percent2 <- pred_probs2 %>% apply(2, scales::percent, accuracy = 0.01)

# Winning class per row. ties.method = "first" keeps the result deterministic;
# the default ("random") breaks near-equal probabilities at random, so the
# labels could change between runs and disagree with predict(type = "class").
labs2 <- colnames(pred_percent2)[max.col(pred_probs2, ties.method = "first")]
pred_percent2 %>%
    magrittr::set_rownames(labs2)

In [None]:
# Predicted class for every hold-out row of the second model.
pred2 <- predict(object = fit2, newdata = loan2[-train2], type = "class")

# Check that the labels derived from the probabilities match the predicted classes.
identical(labs2, as.character(pred2))

# Confusion matrix and summary statistics for the second model.
result2 <- caret::confusionMatrix(
    data = pred2,
    reference = labels[-train2]
)
result2

In [None]:
# Distribution of the target among the hold-out rows.
plot_ly(
    data = loan[-train2],
    type = "histogram",
    x = ~loan_condition
)

# NOTE(review): 207 and 1043 look like class counts read off the histogram
# above (share of the larger class, i.e. a majority-class baseline). They are
# hard-coded, so they go stale if the data or the seed changes - confirm.
1 - 207 / (1043 + 207)

In [None]:
# Fresh copy of the raw import for the kNN section.
loan_ <- copy(loan.dt)

# Distinct-value count per column. vapply() is used instead of sapply() so the
# result is always an integer vector.
a <- as.data.table(vapply(loan_, function(col) length(unique(col)), integer(1L)))
b <- cbind(names(loan_), a)
colnames(b) <- c("name", "count")

# Remove, by reference, columns with a single distinct value and the row identifier.
loan_[, (b[count == 1, name]) := NULL]
loan_[, id := NULL]

# Remove the listed columns; unlike the naive Bayes section, the remaining
# "_cat" columns are kept here.
# NOTE(review): presumably so that only numeric encodings remain for the
# distance computation - confirm.
loan_[, c("home_ownership", "income_category", "term", "purpose", "interest_payments",
          "loan_condition_cat", "grade", "region") := NULL]

# Remove the two date columns.
loan_[, c("issue_d", "final_d") := NULL]
                          

In [None]:

# Rescale the selected columns with BBmisc's "range" method.
# NOTE(review): `.SDcols = -1` leaves the first column of loan_ out of the
# result - confirm that this is intended.
loan_n <- loan_[, BBmisc::normalize(.SD, method = "range"), .SDcols = -1]

loan_n


In [None]:
# Fix the RNG so the train/test split is reproducible.
set.seed(20)

# Predictors only: every column except the target.
loan_n1 <- loan_n[, !"loan_condition"]

# Row indices of a random sample of three quarters of the rows (training set).
train <- sample(nrow(loan_n), nrow(loan_n) * 3 / 4)

# Predictor tables for training and testing.
loan_train <- loan_n1[train]
loan_test <- loan_n1[-train]

# Matching target vectors.
loan_train_labels <- loan_n[["loan_condition"]][train]
loan_test_labels <- loan_n[["loan_condition"]][-train]

In [None]:
# Classify every test row by a vote among its 2 nearest training rows.
loan_test_pred <- class::knn(
    train = loan_train,
    test = loan_test,
    cl = loan_train_labels,
    k = 2
)

In [None]:
# Cross-tabulate true labels (rows) against kNN predictions (columns).
# FALSE is spelled out because `F` is an ordinary variable that can be reassigned.
ct1 <- gmodels::CrossTable(x = loan_test_labels,
                           y = loan_test_pred,
                           prop.chisq = FALSE)
ct1

In [None]:
# using base-r notation. to interpret, read from inside out (much harder)
sum(diag(ct1$prop.tbl))

# using the tidyverse piped notation. to interpret read from left to right (much easier)
ct1$prop.tbl %>% diag() %>% sum()

# we can further split the first column subset step using the functional syntax for operators:
ct1 %>% "[["("prop.tbl") %>% diag() %>% sum()

In [None]:
# Run kNN for a single k and report the test-set misclassification rate.
#
# Args:
#   kval: number of neighbours passed to class::knn (default 2).
#
# Returns:
#   Numeric vector c(kval, error_rate), where error_rate is the share of test
#   rows whose predicted label differs from loan_test_labels.
#
# Reads loan_train, loan_test, loan_train_labels and loan_test_labels from the
# calling environment.
k_batch <- function(kval = 2)
{
    # run prediction model
    loan_test_pred1 <- class::knn(train = loan_train,
                                  test = loan_test,
                                  cl = loan_train_labels,
                                  k = kval)

    # count unequal pairs and divide by the number of TEST rows; the previous
    # version divided by length(train), the training-set size, which
    # understated the error rate
    error_rate <- sum(loan_test_labels != loan_test_pred1) / length(loan_test_labels)

    # report findings
    c(kval, error_rate)
}

# Evaluate k = 1 to 88; each call returns c(k, error), stacked as one row per k.
report <- do.call(rbind, lapply(seq_len(88), k_batch))

# Name the two columns (rows stay unnamed).
dimnames(report) <- list(NULL, c("k value", "Error rate"))

# Display the resulting matrix.
report

In [None]:
# object should a data frame, not a matrix
df1 <- as.data.frame(report)

# create ggplot object with line and point geoms, point color and sizes and tooltip text
# note the vectorized "ifelse" function to create vectors of colors and sizes
gp <- ggplot2::ggplot(df1, aes(x = `k value`, y = `Error rate` )) +
geom_line(linetype = "dashed") +
geom_point(color = ifelse(df1[[2]] == min(df1[[2]]), "red", "blue"),
        size = ifelse(df1[[2]] == min(df1[[2]]), 6, 2),
        mapping = aes(text = paste("k value: ", df1[[1]], "\n", "incorrect: ", df1[[2]]))) +
        labs(x = "k value", y = "total incorrect")

# Convert to plotly object for interactive tooltip
plotly::ggplotly(gp, tooltip = c("text"))

In [None]:
# Report the row with the lowest value in the "Error rate" column and that value.
sprintf(
    "So, when the k value is %s, count of incorrect is at a minimum of %s",
    which.min(report[, "Error rate"]),
    min(report[, "Error rate"])
)

In [None]:
# Re-run kNN with the k that gave the lowest error in `report`
# (row index equals k because rows were generated for k = 1, 2, ...).
loan_test_pred2 <- class::knn(train = loan_train,
                              test = loan_test,
                              cl = loan_train_labels,
                              k = which.min(report[,2]))

# Cross-tabulate true labels (rows) against the new predictions (columns).
# FALSE is spelled out because `F` is an ordinary variable that can be reassigned.
ct2 <- gmodels::CrossTable(x = loan_test_labels,
                           y = loan_test_pred2,
                           prop.chisq = FALSE)
ct2

In [None]:
# Sum of the diagonal of the proportion table for the tuned model
# (cells where the row label and the column label coincide).
ct2$prop.tbl %>% diag() %>% sum()