## SMAVA

Test exercise - Question 1

Data inspection

In [1]:
## weaves
## smava

getwd()

## load in packages
library(Hmisc)

library(ranger)
library(MASS)
library(tidyverse)
library(e1071)

library(rpart)
library(rpart.plot)
library(ipred)
library(mlbench)
library(pROC)
library(gbm)
library(dplyr)
library(caret)

library(doMC)

registerDoMC(cores = detectCores(all.tests = FALSE, logical = TRUE))

options(useFancyQuotes = TRUE)

Loading required package: lattice

Loading required package: survival

Loading required package: Formula

Loading required package: ggplot2


Attaching package: 'Hmisc'


The following objects are masked from 'package:base':

    format.pval, units


-- [1mAttaching packages[22m --------------------------------------- tidyverse 1.3.0 --

[32mv[39m [34mtibble [39m 2.1.3     [32mv[39m [34mdplyr  [39m 0.8.3
[32mv[39m [34mtidyr  [39m 1.0.2     [32mv[39m [34mstringr[39m 1.4.0
[32mv[39m [34mreadr  [39m 1.3.1     [32mv[39m [34mforcats[39m 0.5.0
[32mv[39m [34mpurrr  [39m 0.3.3     

-- [1mConflicts[22m ------------------------------------------ tidyverse_conflicts() --
[31mx[39m [34mdplyr[39m::[32mfilter()[39m    masks [34mstats[39m::filter()
[31mx[39m [34mdplyr[39m::[32mlag()[39m       masks [34mstats[39m::lag()
[31mx[39m [34mdplyr[39m::[32mselect()[39m    masks [34mMASS[39m::select()
[31mx[39m [34mdplyr[39m::[32msrc()[39m       ma

In [2]:
## Data sets

load("bak/in/train.rdata")
load("bak/in/test.rdata")

train0 <- data.frame(train) # a local copy.

# Carry some configuration data
smava0 <- list()

# See some summaries.
sapply(train, class)
sapply(train, summary)

## Order to highlight NA pattern.

## The outcome variable accepted is balanced. So need to upsample.

## customerNumber has 15000 unique.

## NA's on interestRate, and x2. x2 is specific to customer.

## interestRate is for regression analysis later

fnas <- function(x) sum(as.integer(is.na(train[[x]])))
fnas("x2")
fnas("interestRate")

## interestRate is only given for accepted
train[(train$accepted == "NO") & !is.na(train$interestRate), c("accepted", "interestRate")]

$customerNumber
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      2    7551   15105   15054   22518   29997 

$bank
  B1  B10  B11  B12  B13  B14  B15  B16  B17  B18  B19   B2  B20  B21  B22  B23 
4603 4495 4424 4509 4479 4492 4418 4547 4565 4402 4410 4541 4444 4496 4504 4522 
 B24  B25  B26  B27  B28  B29   B3  B30   B4   B5   B6   B7   B8   B9 
4518 4513 4569 4473 4565 4444 4480 4463 4424 4508 4479 4542 4588 4552 

$x1
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
 0.02433  0.49992  1.00460  1.65895  1.95799 75.79947 

$x2
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.01    0.97    2.71    8.17    7.36  910.95   40520 

$x3
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  13.35  166.00  199.11  199.34  232.74  382.41 

$x4
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-13.5489  -0.7509   1.9963   1.9581   4.6776  18.8855 

$x5
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-13.6414  -0.7914   1.9706   1.9340   4.6317  18.2838 

$x6
  A

accepted,interestRate
<fct>,<dbl>


In [17]:
## No obvious rule
## sapply(train[is.na(train$x2),], summary)
## sapply(train[!is.na(train$x2),], summary)

## x2 is blank for a type of customer.

train1 <- train[order(train$customerNumber, train$bank),]
smava0$accepted <- train1$accepted

## As a numeric for correlation
train1$outcomes <- (train1$accepted == "YES") * 1

idx <- as.vector(sapply(train1, class, USE.NAMES=FALSE) == "factor")
smava0$cs <- colnames(train1)[idx]

for(c in smava0$cs) {
  train1[[c]] <- as.numeric(train1[[c]])
}

train2 <- train1 %>% dplyr::select(customerNumber, x2) %>% 
group_by(customerNumber) %>% summarise(n = n(), na0 = sum(is.na(x2))) 

In [18]:
## And is all NA at all banks for a set of customers.

### Sanity check

## This is an empty set. For all those records where n is not equal to the number of nas in x2 for
## that customer.
train2[ (train2$n != train2$na0) && (train2$na0 > 0),]
## TODO
## I should check the bank hits too.
## NOTE
## It isn't easy to impute with this.

## capture those customers who have x2 at NA.

smava0$"null-customer" <- train2[train2$na0 > 0, "customerNumber"][["customerNumber"]]
length(smava0$"null-customer")

customerNumber,n,na0
<int>,<int>,<int>


In [23]:
## store some colnames sets.

col0 <- colnames(train1)
smava0$results <- train1[c("accepted", "interestRate")]
smava0$ctl <- train1[c("customerNumber", "bank")]
smava0$nullcols <- c("x2")
## outcomes comes in as a feature.
smava0$ft0 <- setdiff(col0, union(colnames(smava0$results), colnames(smava0$ctl)))
smava0$ft0 <- c(smava0$ft0, c("accepted", "bank"))

In [20]:
## Correlations dataset.
## The same record may fail or succeed depending on the bank.
## I haven't encoded the bank, because you would hope banks follow similar policies.
## So we may have records that are identical but fall once and succeed at another bank.
## This just for correlations.

## Let's add a boolean for when x2 is null and assign the mean of x2 as the value.
## Hmisc::aregimpute() doesn't converge for x2.

train1$x2na <- 0
train1[ is.na(train1$x2), "x2na" ] <- 1
smava0$x2impute <- as.vector(summary(train1$x2)['Mean'])
train1[ is.na(train1$x2), "x2" ] <- smava0$x2impute

In [7]:
## Pair-plot
train1p <- train1[, c(smava0$ft0, "x2na") ]

In [8]:
nm0.fspec <- paste("smava0", "pp", "-%03d.jpeg", sep ="")

jpeg(width=1024, height=768, filename= nm0.fspec)
plot(train1p)
dev.off()

In [9]:
## correlations

smava0$cor <- cor(train1p)

jpeg(filename=paste("smava0", "cc", "-%03d.jpeg", sep=""),
     width=1024, height=768)

corrplot::corrplot(smava0$cor, method="number", order="hclust")

dev.off()

ihcor <- findCorrelation(smava0$cor, cutoff = .75, verbose = FALSE)
smava0$hcor <- colnames(train1p)[ihcor]

## This is encouraging. x3 is -0.98 with x10 and x10 is -0.21 with outcomes.
## Let's discard x3 because it duplicates x10

## Near Zero Variables

nzv0 <- nearZeroVar(train1p, saveMetrics = TRUE, allowParallel=TRUE, freqCut =95/5, uniqueCut=10)
all(!nzv0$nzv)
all(!nzv0$zeroVar)

## So good distributions.

## Training Output (I)

In [15]:
head(train1)

customerNumber,bank,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,accepted,interestRate,outcomes,x2na
<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
2,6,0.9954692,3.688772,179.8099,2.332774,3.033258,1,803,5.516418,0.5990008,0.406155,2,3.31531,1,0
2,9,0.9954692,3.688772,179.8099,2.332774,3.033258,1,803,5.412266,0.5949548,0.406155,2,5.746954,1,0
2,15,0.9954692,3.688772,179.8099,2.332774,3.033258,1,803,5.325315,0.6043728,0.406155,1,,0,0
2,21,0.9954692,3.688772,179.8099,2.332774,3.033258,1,803,5.212415,0.5872142,0.406155,1,,0,0
2,22,0.9954692,3.688772,179.8099,2.332774,3.033258,1,803,5.022601,0.633176,0.406155,2,4.248454,1,0
2,24,0.9954692,3.688772,179.8099,2.332774,3.033258,1,803,5.172827,0.6448435,0.406155,2,3.54202,1,0


In [24]:
# Put the outcomes back 
train1["accepted"]<- smava0["accepted"]
save(train1, outcomes, smava0, file = "smava00.dat")

In [14]:
test1 <- data.frame(test)
test1$x2na <- 0
test1[ is.na(test1$x2), "x2na" ] <- 1
test1[ is.na(test1$x2), "x2" ] <- smava0$x2impute

test1n <- test1[, smava0$ft1]

In [14]:
## pre-process
df0 <- predict(smava0$pp, test1n)

testPred <- predict(smava0$gbm, df0)

predictions <- data.frame(test)
predictions$predictionAccepted <- testPred

save(predictions, file="predictions.rdata")

ERROR: Error in UseMethod("predict"): no applicable method for 'predict' applied to an object of class "NULL"


In [None]:
head(predictions)

## More transformations

spatialSign

In [None]:
spatialSign(rnorm(5))

spatialSign(matrix(rnorm(12), ncol = 3))

# should fail since the fifth column is a factor
try(spatialSign(iris), silent = TRUE)

trellis.par.set(caretTheme())

featurePlot(iris[,-5], iris[,5], "pairs")
featurePlot(spatialSign(scale(iris[,-5])), iris[,5], "pairs")

In [28]:
## Models

library(randomForest)
method1 <- "parRF"

In [29]:
mi <- getModelInfo(model = method, regex = FALSE)[[1]]
p0 <- data.frame(mi$parameters)
p0

parameter,class,label
<fct>,<fct>,<fct>
sigma,numeric,Sigma
C,numeric,Cost
