<div >
<img src = "../banner.jpg" />
</div>

<a target="_blank" href="https://colab.research.google.com/github/ignaciomsarmiento/BDML_202302/blob/main/Lecture08/Notebook_SS08_ROC.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>


# Misclassification

To work through the steps of probability-based classification, we’ll use a real dataset on loans and credit from a set of local lenders in Germany (taken from the UC Irvine Machine Learning Repository and cleaned for our purposes). 

Credit scoring is a classic problem of classification, and it remains one of the big application domains for ML: use previous loan results (default versus payment) to train a model that can predict the performance of potential new loans.

\begin{align}
Default=f(x) + u
\end{align}

where $Default=I(Default=1)$



In [None]:
# Load libraries.
# library() stops with an error when pacman is not installed, whereas
# require() only warns and returns FALSE, which would defer the failure
# to the p_load() call below.
library("pacman")
p_load("tidyverse")


In [None]:
# Read the data: download the serialized credit data set
credit <- readRDS(
  url("https://github.com/ignaciomsarmiento/datasets/blob/main/credit_class.rds?raw=true")
)

# Recode the categorical columns as labelled factors.
# For Default, level 1 ("Si") is listed first, so it is the first factor level.
credit <- mutate(
  credit,
  Default = factor(Default, levels = c(1, 0), labels = c("Si", "No")),
  history = factor(
    history,
    levels = c("good", "poor", "terrible"),
    labels = c("buena", "mala", "terrible")
  ),
  foreign = factor(
    foreign,
    levels = c("foreign", "german"),
    labels = c("extranjero", "aleman")
  ),
  purpose = factor(
    purpose,
    levels = c("newcar", "usedcar", "goods/repair", "edu", "biz"),
    labels = c("auto_nuevo", "auto_usado", "bienes", "educacion", "negocios")
  )
)

# Preview the first rows
head(credit)

In [None]:
# Relative frequency of each Default class ("Si" / "No") in the full data set
prop.table(table(credit$Default))

## Preparar la base
### División de la muestra

- El objetivo es predecir bien fuera de muestra

- No queremos sobreajustar a la muestra
  
- Vamos a definir 2 bases

  - Muestra de entrenamiento: vamos a estimar los modelos, buscar parámetros, etc.
  
  - Muestra de prueba: solo la vamos a usar para evaluar los modelos


In [None]:
## Split the sample into a training set and a test set
set.seed(1011)  # fix the RNG state before drawing the partition
p_load("caret")

# Row indices that go into the training set
inTrain <- createDataPartition(
  y = credit$Default,  # the dependent (target) variable
  p = 0.7,             # 70% of the data goes to the training set
  list = FALSE
)

train <- credit[inTrain, ]   # rows selected for training
test <- credit[-inTrain, ]   # every other row, kept for evaluation

## Accuracy

In [None]:
# Resampling setup: 5-fold cross-validation, with class probabilities
# computed and the hold-out predictions saved.
ctrl <- trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  # TRUE instead of T: T is an ordinary variable that can be reassigned
  savePredictions = TRUE
)


In [None]:
set.seed(123)  # fix the RNG state so the run is repeatable

# Random forest (ranger) whose tuning parameters are chosen by Accuracy.
# The grid crosses every mtry value with every min.node.size value.
class_ranger <- train(
  Default ~ duration + amount + installment + age + history + purpose +
    foreign + rent,
  data = train,
  method = "ranger",
  metric = "Accuracy",
  trControl = ctrl,
  tuneGrid = expand.grid(
    mtry = c(1, 2, 3, 4, 5, 6, 7, 8),
    splitrule = "gini",
    min.node.size = c(25, 50, 150, 200, 250)
  )
)


In [None]:
# Print the fitted caret object for the Accuracy-tuned model
class_ranger

In [None]:
# In-sample predictions: observed label next to the predicted class label
predictSample <- train %>%
  select(Default) %>%
  mutate(
    hat_default = predict(class_ranger, newdata = train, type = "raw")
  )

head(predictSample)

In [None]:
# Training-sample confusion matrix: predicted labels (data) against observed labels (reference)
confusionMatrix(data = predictSample$hat_default, reference = predictSample$Default)

In [None]:
# Accuracy: share of training observations whose predicted label
# matches the observed label
with(predictSample, mean(Default == hat_default))

In [None]:
# Test-set results: start from the observed labels, then attach the
# class labels predicted by the Accuracy-tuned model
predictTest <- data.frame(Default = test$Default)
predictTest$hat_default <- predict(class_ranger, newdata = test, type = "raw")


In [None]:
# Test-sample confusion matrix: predicted labels (data) against observed labels (reference)
confusionMatrix(data = predictTest$hat_default, reference = predictTest$Default)

## True Positive Rate


Podemos tratar de maximizar el TPR


\begin{align}
     True\,Positive\,Rate=Sensitivity&=\frac{True \,\,Positives}{Positives} 
  \end{align}
  
  \begin{align}
        Specificity&=\frac{True\,\,Negatives}{Negatives} \nonumber 
  \end{align}
  
\begin{align}
     False\,Positive\,Rate= 1-Specificity&=\frac{False\,\,Positives}{Negatives} \nonumber 
  \end{align}




In [None]:

# Resampling setup for the Sens- and ROC-tuned models: 5-fold
# cross-validation summarised with twoClassSummary, with class
# probabilities computed and the hold-out predictions saved.
ctrl2 <- trainControl(
  method = "cv",
  number = 5,
  summaryFunction = twoClassSummary,
  classProbs = TRUE,
  # TRUE instead of T: T is an ordinary variable that can be reassigned
  savePredictions = TRUE
)


In [None]:
set.seed(123)  # fix the RNG state so the run is repeatable

# Same random forest specification, but the tuning parameters are
# chosen by the "Sens" metric.
class_ranger_sens <- train(
  Default ~ duration + amount + installment + age + history + purpose +
    foreign + rent,
  data = train,
  method = "ranger",
  metric = "Sens",
  trControl = ctrl2,
  tuneGrid = expand.grid(
    mtry = c(1, 2, 3, 4, 5, 6, 7, 8),
    splitrule = "gini",
    min.node.size = c(25, 50, 150, 200, 250)
  )
)

In [None]:
# Print the fitted caret object for the Sens-tuned model
class_ranger_sens

In [None]:
# Rebuild predictTest from the test set: observed label plus the class
# label predicted by the Sens-tuned model (any previous columns are dropped)
predictTest <- test %>%
  select(Default) %>%
  mutate(
    hat_default_sens = predict(class_ranger_sens, newdata = test, type = "raw")
  )


In [None]:
# Test-sample confusion matrix for the Sens-tuned model.
# predictTest now holds only Default and hat_default_sens, so the
# prediction column is referenced by its full name: `$hat_default`
# resolves only through partial matching on a data.frame and is NULL on a tibble.
confusionMatrix(data = predictTest$hat_default_sens, reference = predictTest$Default)


\begin{align}
     True\,Positive\,Rate=Sensitivity&=\frac{True \,\,Positives}{Positives} 
  \end{align}
  
  \begin{align}
        Specificity&=\frac{True\,\,Negatives}{Negatives} \nonumber 
  \end{align}
  
\begin{align}
     False\,Positive\,Rate= 1-Specificity&=\frac{False\,\,Positives}{Negatives} \nonumber 
  \end{align}

## ROC

In [None]:
set.seed(123)  # fix the RNG state so the run is repeatable

# Same random forest specification, but the tuning parameters are
# chosen by the "ROC" metric.
class_ranger_ROC <- train(
  Default ~ duration + amount + installment + age + history + purpose +
    foreign + rent,
  data = train,
  method = "ranger",
  metric = "ROC",
  trControl = ctrl2,
  tuneGrid = expand.grid(
    mtry = c(1, 2, 3, 4, 5, 6, 7, 8),
    splitrule = "gini",
    min.node.size = c(25, 50, 150, 200, 250)
  )
)

In [None]:
# Print the fitted caret object for the ROC-tuned model
class_ranger_ROC

In [None]:
# Add the ROC-tuned model's output to the test-set results
predictTest <- predictTest %>%
  mutate(
    # predicted class labels
    class_ROC = predict(class_ranger_ROC, newdata = test, type = "raw"),
    # predicted probability of the "Si" class
    p_hat_ROC = predict(class_ranger_ROC, newdata = test, type = "prob")$Si,
    # numeric outcome: 0 for "No", 1 otherwise
    Default_num = as.numeric(Default != "No")
  )

head(predictTest)

In [None]:
# Test-sample confusion matrix for the class labels of the ROC-tuned model
confusionMatrix(data = predictTest$class_ROC, reference = predictTest$Default)

In [None]:
p_load("pROC")

# ROC curve on the test set from the predicted "Si" probabilities.
# The factor levels are reversed so that "No" comes first and "Si" second.
rfROC <- roc(
  response = predictTest$Default,
  predictor = predictTest$p_hat_ROC,
  levels = rev(levels(predictTest$Default))
)
rfROC

In [None]:
# Draw the test-set ROC curve
plot(rfROC)

In [None]:
# Share of observed "No" cases that the model also labels "No".
# NOTE(review): despite the variable name this is the specificity
# (true negative rate), i.e. 1 - FPR. pROC appears to draw its x-axis in
# specificity units, so this is presumably the coordinate the later
# points() calls need -- confirm before renaming or changing it.
FPR <- with(predictTest, mean(class_ROC[Default == "No"] == "No"))

# Share of observed "Si" cases that the model labels "Si"
TPR <- with(predictTest, mean(class_ROC[Default == "Si"] == "Si"))

In [None]:
# ROC curve with a red dot at the (FPR, TPR) pair of the predicted class labels
plot(rfROC)
points(x = FPR, y = TPR, cex = 4, pch = 20, col = "red")

In [None]:
# Alternative rule: label an observation "Si" whenever its predicted
# probability of "Si" is above 0.2
predictTest <- predictTest %>%
  mutate(
    class_ROC_2 = factor(
      ifelse(p_hat_ROC > 0.2, "Si", "No"),
      levels = c("Si", "No")
    )
  )

In [None]:
# Test-sample confusion matrix for the labels produced by the 0.2 cutoff
confusionMatrix(data = predictTest$class_ROC_2, reference = predictTest$Default)

In [None]:
# Share of observed "No" cases labelled "No" under the 0.2 cutoff.
# NOTE(review): as computed this is the specificity (1 - FPR), not the
# false positive rate the name suggests -- confirm intent before renaming.
FPR_2 <- with(predictTest, mean(class_ROC_2[Default == "No"] == "No"))

# Share of observed "Si" cases labelled "Si" under the 0.2 cutoff
TPR_2 <- with(predictTest, mean(class_ROC_2[Default == "Si"] == "Si"))


In [None]:
# ROC curve with the AUC printed and legacy axis labelling
plot(rfROC, print.auc = TRUE, legacy.axes = TRUE)

# Red dot: the 0.5 cutoff rule
points(x = FPR, y = TPR, cex = 4, pch = 20, col = "red")

# Blue dot: the 0.2 cutoff rule
points(x = FPR_2, y = TPR_2, cex = 4, pch = 20, col = "blue")

legend(
  "bottomright",
  title = "cutoff",
  legend = c("p=1/2", "p=1/5"),
  fill = c("red", "blue"),
  bty = "n"
)

### Closest to top left cutoff

Otro enfoque es encontrar el punto en la curva ROC que está más cerca (es decir, la distancia más corta) al modelo perfecto (con 100\% de sensibilidad y 100\% de especificidad), que está asociado con la esquina superior izquierda de la gráfica.

In [None]:
# Coordinates of the "best" point on the ROC curve under the
# closest-to-top-left criterion.
# transpose = FALSE is set explicitly so the result is a data frame with a
# `threshold` column, which the later `rfThresh$threshold` lookup relies on,
# regardless of the installed pROC version's default.
rfThresh <- coords(
  rfROC,
  x = "best",
  best.method = "closest.topleft",
  transpose = FALSE
)
rfThresh

In [None]:
# Label an observation "Si" whenever its predicted probability of "Si"
# is above the threshold stored in rfThresh
predictTest <- predictTest %>%
  mutate(
    class_ROC_Thresh = factor(
      ifelse(p_hat_ROC > rfThresh$threshold, "Si", "No"),
      levels = c("Si", "No")
    )
  )

In [None]:
# Test-sample confusion matrix for the labels produced by the rfThresh cutoff
confusionMatrix(data = predictTest$class_ROC_Thresh, reference = predictTest$Default)

In [None]:
# Share of observed "No" cases labelled "No" under the rfThresh cutoff.
# NOTE(review): as computed this is the specificity (1 - FPR), not the
# false positive rate the name suggests -- confirm intent before renaming.
FPR_3 <- with(predictTest, mean(class_ROC_Thresh[Default == "No"] == "No"))

# Share of observed "Si" cases labelled "Si" under the rfThresh cutoff
TPR_3 <- with(predictTest, mean(class_ROC_Thresh[Default == "Si"] == "Si"))


In [None]:
# ROC curve with the AUC printed and legacy axis labelling
plot(rfROC, print.auc = TRUE, legacy.axes = TRUE)

# Red dot: the 0.5 cutoff rule
points(x = FPR, y = TPR, cex = 4, pch = 20, col = "red")

# Green dot: the closest-to-top-left threshold
points(x = FPR_3, y = TPR_3, cex = 4, pch = 20, col = "green")

legend(
  "bottomright",
  title = "cutoff",
  legend = c("p=1/2", "p=Top Left"),
  fill = c("red", "green"),
  bty = "n"
)