# Data Reading

In [None]:
# Load the modelling data: each split comes as a base file plus a `_bin`
# counterpart.

# Training split
training <- read.csv(file = "insurance_t.csv")
trainingBin <- read.csv(file = "insurance_t_bin.csv")

# Validation split
validation <- read.csv(file = "insurance_v.csv")
validationBin <- read.csv(file = "insurance_v_bin.csv")

# Fixing separations and NAs: turn every column into a factor in which missing
# values form their own "M" level. Columns pass through character first so the
# "M" label can be inserted before the factor is built.
trainingBin <- trainingBin %>%
  mutate(across(
    everything(),
    ~ as.factor(replace_na(as.character(.x), "M"))
  ))

validationBin <- validationBin %>%
  mutate(across(
    everything(),
    ~ as.factor(replace_na(as.character(.x), "M"))
  ))

# Old Logistic Regression Model

In [None]:
# Logistic regression fitted on the binned training data: main effects plus a
# DDA-by-IRA interaction term.
finalModel <- glm(
  INS ~ NSF + MTG + INV + ILSBAL_BIN + IRA + DDA + TELLER_BIN + CC +
    ATMAMT_BIN + CHECKS_BIN + MMBAL_BIN + CDBAL_BIN + DDABAL_BIN +
    SAVBAL_BIN + DDA:IRA,
  family = binomial(link = "logit"),
  data = trainingBin
)

# Decision Tree Models

In [None]:
# Grow two deliberately large classification trees that are pruned afterwards.
# The control values (minsplit, cp, maxdepth) were settled on by experimenting
# with various models.
# NOTE(review): the xerror column reported by printcp() comes from rpart's
# cross-validation, which assigns folds at random -- confirm a seed is set
# upstream if the CP tables need to be reproducible.

# Tree limited to the un-binned versions of the logistic regression predictors
lrTree <- rpart(
  INS ~ NSF + MTG + INV + ILSBAL + IRA + DDA + TELLER + CC + ATMAMT +
    CHECKS + MMBAL + CDBAL + DDABAL + SAVBAL,
  data = training,
  method = "class",
  parms = list(split = "gini"),
  control = rpart.control(minsplit = 30, cp = 0.001, maxdepth = 6)
)

# Tree that may split on any column of the training data
bigTree <- rpart(
  INS ~ .,
  data = training,
  method = "class",
  parms = list(split = "gini"),
  control = rpart.control(minsplit = 30, cp = 0.001, maxdepth = 6)
)

## Pruning

### Subset Variable Model

In [None]:
# Display the complexity-parameter (CP) table of the subset-variable tree
printcp(x = lrTree)

## 
## Classification tree:
## rpart(formula = INS ~ NSF + MTG + INV + ILSBAL + IRA + DDA + 
##     TELLER + CC + ATMAMT + CHECKS + MMBAL + CDBAL + DDABAL + 
##     SAVBAL, data = training, method = "class", parms = list(split = "gini"), 
##     control = rpart.control(minsplit = 30, cp = 0.001, maxdepth = 6))
## 
## Variables actually used in tree construction:
##  [1] ATMAMT CDBAL  CHECKS DDA    DDABAL INV    IRA    MMBAL  MTG    SAVBAL TELLER
## 
## Root node error: 2918/8495 = 0.3435
## 
## n= 8495 
## 
##           CP nsplit rel error  xerror     xstd
## 1  0.1329678      0   1.00000 1.00000 0.014999
## 2  0.0277587      1   0.86703 0.87320 0.014474
## 3  0.0099383      2   0.83927 0.83790 0.014300
## 4  0.0056546      5   0.80946 0.82762 0.014248
## 5  0.0054832     10   0.78033 0.82351 0.014226
## 6  0.0049692     11   0.77485 0.81905 0.014203
## 7  0.0035984     13   0.76491 0.81254 0.014168
## 8  0.0032557     15   0.75771 0.80672 0.014137
## 9  0.0027416     17   0.75120 0.80158 0.014109
## 10 0.0023989     18   0.74846 0.80363 0.014120
## 11 0.0020562     19   0.74606 0.80329 0.014118
## 12 0.0017135     23   0.73783 0.79952 0.014098
## 13 0.0010281     24   0.73612 0.79644 0.014081
## 14 0.0010000     28   0.73201 0.79507 0.014073

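Keep only the first 6 rows of the CP table (the 11-split tree). Note that a strict one-standard-error rule on the table above (minimum xerror 0.79507 + xstd 0.014073 = 0.80914) would select row 8 (15 splits) instead.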

In [None]:
# Prune back to row 6 of the CP table printed above (the 11-split tree).
# The threshold is read from the fitted object rather than retyped from the
# printout: printcp() shows rounded CP values, and prune() only collapses nodes
# whose complexity is <= cp, so a value rounded the wrong way would silently
# select a different tree.
lrTree <- prune(lrTree, cp = lrTree$cptable[6, "CP"])

### Full Variable Model Tree

In [None]:
# Display the complexity-parameter (CP) table of the full-variable tree
printcp(x = bigTree)

## 
## Classification tree:
## rpart(formula = INS ~ ., data = training, method = "class", parms = list(split = "gini"), 
##     control = rpart.control(minsplit = 30, cp = 0.001, maxdepth = 6))
## 
## Variables actually used in tree construction:
##  [1] ACCTAGE ATMAMT  BRANCH  CDBAL   CHECKS  CRSCORE DDA     DDABAL  DEP     MM      SAVBAL 
## 
## Root node error: 2918/8495 = 0.3435
## 
## n= 8495 
## 
##           CP nsplit rel error  xerror     xstd
## 1  0.1329678      0   1.00000 1.00000 0.014999
## 2  0.0277587      1   0.86703 0.87320 0.014474
## 3  0.0119945      2   0.83927 0.84030 0.014313
## 4  0.0111378      3   0.82728 0.82419 0.014230
## 5  0.0090816      5   0.80500 0.81905 0.014203
## 6  0.0065798      7   0.78684 0.81357 0.014174
## 7  0.0065113     12   0.75394 0.80295 0.014117
## 8  0.0034270     13   0.74743 0.79609 0.014079
## 9  0.0030843     14   0.74400 0.79232 0.014058
## 10 0.0020562     15   0.74092 0.79130 0.014052
## 11 0.0017135     17   0.73681 0.80055 0.014103
## 12 0.0015422     19   0.73338 0.79986 0.014100
## 13 0.0013708     22   0.72824 0.80329 0.014118
## 14 0.0010281     24   0.72550 0.80363 0.014120
## 15 0.0010000     25   0.72447 0.80295 0.014117

Keep only the first 10 rows of the CP table (the 15-split tree); row 10 has the lowest cross-validated error (xerror).

In [None]:
# Prune back to row 10 of the CP table printed above (the 15-split tree, which
# has the lowest xerror in that table).
# The exact CP is taken from the fitted object. The printed 0.0020562 is a
# rounded display value; from the table the stored CP works out to
# 6 / 2918 ~= 0.00205620, slightly above it. Because prune() only collapses
# nodes whose complexity is <= cp, passing the rounded number leaves those
# nodes in place and returns the 17-split tree (row 11) instead.
bigTree <- prune(bigTree, cp = bigTree$cptable[10, "CP"])

# Visualizing

## Subset Variable Model

![](HW2Final_files/figure-markdown_strict/unnamed-chunk-9-1.png)

## Full Variable Model

![](HW2Final_files/figure-markdown_strict/unnamed-chunk-10-1.png)

# Accuracy scores

## Predictions and Fitted Values

In [None]:
# Subset-variable tree: validation probabilities and classes, plus the
# probabilities fitted on the training data
probLRTree <- predict(object = lrTree, newdata = validation, type = "prob")
predLRTree <- predict(object = lrTree, newdata = validation, type = "class")
fittedLRTree <- predict(object = lrTree, newdata = training, type = "prob")

# Full-variable tree: the same three sets of predictions
probBigTree <- predict(object = bigTree, newdata = validation, type = "prob")
predBigTree <- predict(object = bigTree, newdata = validation, type = "class")
fittedBigTree <- predict(object = bigTree, newdata = training, type = "prob")

## Subset Model

In [None]:
# Share of validation rows whose predicted class equals the observed INS value
lrAccuracy <- sum(predLRTree == validation$INS, na.rm = TRUE) /
  nrow(validation)

lrAccuracy

## [1] 0.7123352

## Full Variable Model

In [None]:
# Share of validation rows whose predicted class equals the observed INS value
bigAccuracy <- sum(predBigTree == validation$INS, na.rm = TRUE) /
  nrow(validation)

bigAccuracy

## [1] 0.7306968

## Logistic Regression Accuracy

In [None]:
# Probability cutoff taken from the logistic regression ROC curve
cutoff <- 0.2970672

# Predicted probabilities on the binned validation data, turned into 0/1
# classes at the cutoff (names dropped so `pred` is a plain numeric vector)
pred <- predict(finalModel, newdata = validationBin, type = "response")
pred <- if_else(unname(pred) > cutoff, 1, 0)

# Flag every validation row as classified correctly (1) or not (0)
accDF <- data.frame(pred = pred, observed = validation$INS)
accDF$accuracy <- if_else(accDF$pred == accDF$observed, 1, 0)

# Overall accuracy, rounded to four decimals
accuracy <- round(mean(accDF$accuracy), 4)
accuracy

## [1] 0.702

The full variable tree outperforms both the logistic regression
and the tree built on the smaller set of variables. The
larger tree does not add too much complication