<a href="https://colab.research.google.com/github/ellicarvalho/MDC_unicamp/blob/main/T03_615.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Packages

In [None]:
# Install the packages used by this notebook.
# NOTE(review): `glue` is loaded below but is not installed here — presumably
# it is already available in the runtime; confirm.
install.packages("caret", dependencies = TRUE)
install.packages("reshape2", dependencies = TRUE)
install.packages("ggplot2")
install.packages("rpart", dependencies = TRUE)
install.packages("rpart.plot")
install.packages("randomForest", dependencies = TRUE)
install.packages("ramify")


# Load the packages.
library(ramify)
library(caret)
library(reshape2)
library(ggplot2)
library(rpart)
library(rpart.plot)
library(randomForest)
library(glue)

# Fix the random seed so the sampling steps that follow are repeatable
# when the notebook is run top to bottom.
SEED = 42
set.seed(SEED)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘listenv’, ‘parallelly’, ‘future’, ‘globals’, ‘R.methodsS3’, ‘R.oo’, ‘R.utils’, ‘bitops’, ‘shape’, ‘future.apply’, ‘numDeriv’, ‘progressr’, ‘SQUAREM’, ‘httpuv’, ‘xtable’, ‘sourcetools’, ‘later’, ‘promises’, ‘R.cache’, ‘caTools’, ‘TH.data’, ‘profileModel’, ‘minqa’, ‘nloptr’, ‘RcppEigen’, ‘lazyeval’, ‘plotrix’, ‘diagram’, ‘lava’, ‘shiny’, ‘miniUI’, ‘styler’, ‘classInt’, ‘labelled’, ‘gplots’, ‘libcoin’, ‘matrixStats’, ‘multcomp’, ‘iterators’, ‘Rcpp’, ‘clock’, ‘gower’, ‘hardhat’, ‘timeDate’, ‘brglm’, ‘gtools’, ‘lme4’, ‘qvcalc’, ‘rex’, ‘Formula’, ‘plotmo’, ‘prodlim’, ‘combinat’, ‘questionr’, ‘ROCR’, ‘mvtnorm’, ‘modeltools’, ‘strucchange’, ‘coin’, ‘zoo’, ‘sandwich’, ‘ISwR’, ‘corpcor’, ‘ROSE’, ‘e1071’, ‘foreach’, ‘ModelMetrics’, ‘plyr’, ‘pROC’, ‘recipes’, ‘reshape2’, ‘BradleyTerry2’, ‘covr’, ‘Cubist’, ‘earth’, ‘ellipse’, ‘fastICA’, ‘gam’, ‘ipred’, ‘kernlab’, ‘klaR’, ‘mda’, ‘mlbe

# Funções Úteis

In [None]:
#' Row-normalised ("relative") confusion matrix.
#'
#' Transposes `cm$table` and divides every row of the transposed table by
#' that row's total, rounding to two decimals. Works for any number of
#' classes (the previous version only normalised rows 1 and 2).
#'
#' For a caret `confusionMatrix` result the table has predictions in rows
#' and references in columns (per the caret documentation), so after the
#' transpose each row describes one reference class and the diagonal holds
#' the per-class recall.
#'
#' @param cm An object with a `table` element holding the absolute counts
#'   (e.g. the value returned by `caret::confusionMatrix`).
#' @return A table with the same dimensions and dimnames as `t(cm$table)`
#'   whose rows sum to (approximately) 1. A row with no observations
#'   yields `NaN` entries, as before.
calculaMatrizConfusaoRelativa <- function(cm) {
    cm_absolute <- t(cm$table)
    # Dividing a matrix by a vector recycles down the columns, so element
    # [i, j] is divided by the total of row i.
    cm_relative <- round(cm_absolute / rowSums(cm_absolute), digits = 2)
    return(cm_relative)
}

#' Build the model formula `Resultado ~ f1 + f2 + ...`.
#'
#' @param feature_names Vector with the names of the predictor columns; it
#'   must contain at least one non-missing name. Names are parsed as
#'   formula terms, so they must be syntactically valid R names (or already
#'   backquoted).
#' @return A formula with `Resultado` as the response and the given
#'   features as additive terms.
getHypothesis <- function(feature_names) {
    feature_names <- as.character(feature_names)
    # An empty or NA feature list used to produce a meaningless formula
    # silently; fail early instead.
    stopifnot(length(feature_names) > 0, !anyNA(feature_names))

    # reformulate() assembles the formula directly, replacing the previous
    # string concatenation followed by eval(parse(text = ...)).
    hypothesis <- reformulate(feature_names, response = "Resultado")
    return(hypothesis)
}

# Leitura e inspeção dos dados

In [None]:
# Load the train/validation pool and the separate test file; text columns
# are read as factors.
data <- read.csv(
    "/content/covid_analysis_train_val_sets.csv",
    header = TRUE,
    stringsAsFactors = TRUE
)
test <- read.csv(
    "/content/covid_analysis_test_set.csv",
    header = TRUE,
    stringsAsFactors = TRUE
)

# Quick inspection: size of the pool and whether it has any missing value.
print("Dimensão dados:")
dim(data)
print("Dados na?")
any(is.na(data))

# Drop duplicated rows before splitting, so the same example cannot land in
# both the training and the validation partitions.
data <- unique(data)
print("Dimensão dados sem duplicados:")
dim(data)

# Random 80% / 20% split into training and validation.
randomTrainIndexes <- sample(seq_len(nrow(data)), size = 0.8 * nrow(data))
dataVal <- data[-randomTrainIndexes, ]
dataTrain <- data[randomTrainIndexes, ]

print("Dimensão dados de treino:")
dim(dataTrain)
print("Dimensão dados validação:")
dim(dataVal)

# Class distribution of the training partition.
print("Quantidade de exemplos por classe:")
table(dataTrain$Resultado)

# Balanceamento
> ### Por undersampling: inicialmente existem aproximadamente 4 exemplos negativos para cada positivo; após o balanceamento por undersampling, o conjunto de treino possui 1.4 exemplos negativos para cada positivo.

In [None]:
# Split the training partition by class.
dataTrainPOSITIVO <- dataTrain[dataTrain$Resultado == "POSITIVO",]
dataTrainNEGATIVO <- dataTrain[dataTrain$Resultado == "NEGATIVO",]

dim(dataTrainPOSITIVO)
dim(dataTrainNEGATIVO)

# Undersample the negative class down to 1.4 negatives per positive.
# The target is capped at the number of negatives actually available:
# sample() without replacement raises an error when asked for more items
# than exist, which would happen whenever the negatives are fewer than
# 1.4x the positives. When enough negatives exist the draw is unchanged.
nNEGATIVOKeep <- min(nrow(dataTrainNEGATIVO), 1.4*nrow(dataTrainPOSITIVO))
randomNEGATIVOIdx <- sample(seq_len(nrow(dataTrainNEGATIVO)), size=nNEGATIVOKeep)
subsamplingNEGATIVO <- dataTrainNEGATIVO[randomNEGATIVOIdx,]

# The balanced training set replaces the original one from here on.
dataTrain <- rbind(dataTrainPOSITIVO, subsamplingNEGATIVO)
table(dataTrain$Resultado)

# Baseline

In [None]:
# Baseline model: a decision tree using every column except the last one as
# a predictor, grown without a depth limit (cp = 0, minsplit = 2) and
# without internal cross-validation (xval = 0).
feature_names <- colnames(dataTrain)[1:(ncol(dataTrain) - 1)]
hypothesis <- getHypothesis(feature_names)

treeModel <- rpart(
    formula = hypothesis,
    data = dataTrain,
    method = "class",
    parms = list(split = "information"),
    control = rpart.control(minsplit = 2, cp = 0.0, xval = 0)
)

################## TRAIN
train_pred <- predict(treeModel, newdata = dataTrain, type = "class")
cm <- confusionMatrix(data = as.factor(train_pred),
                      reference = as.factor(dataTrain$Resultado),
                      positive = 'POSITIVO')

# Balanced accuracy = average of the diagonal of the row-normalised matrix.
cm_relative <- calculaMatrizConfusaoRelativa(cm)
diag_first <- cm_relative[1, 1]
diag_second <- cm_relative[2, 2]
acc_bal <- (diag_first + diag_second) / 2

print(glue('\n\n', 'BASELINE:'))
print(glue('>>> TRAIN: Acc balanceada: {acc_bal}', '\n\n'))
print(glue('>>> TRAIN: Matriz de Confusão:', '\n\n'))
print(format(cm_relative))


################## VALIDATION
val_pred <- predict(treeModel, newdata = dataVal, type = "class")
cm <- confusionMatrix(data = as.factor(val_pred),
                      reference = as.factor(dataVal$Resultado),
                      positive = 'POSITIVO')

cm_relative <- calculaMatrizConfusaoRelativa(cm)
diag_first <- cm_relative[1, 1]
diag_second <- cm_relative[2, 2]
acc_bal <- (diag_first + diag_second) / 2

print(glue('\n\n', 'BASELINE:'))
print(glue('>>> VAL: Acc balanceada: {acc_bal}', '\n\n'))
print(glue('>>> VAL: Matriz de Confusão:', '\n\n'))
print(format(cm_relative))


################## TEST
test_pred <- predict(treeModel, newdata = test, type = "class")
cm <- confusionMatrix(data = as.factor(test_pred),
                      reference = as.factor(test$Resultado),
                      positive = 'POSITIVO')

cm_relative <- calculaMatrizConfusaoRelativa(cm)
diag_first <- cm_relative[1, 1]
diag_second <- cm_relative[2, 2]
acc_bal <- (diag_first + diag_second) / 2

print(glue('\n\n', 'BASELINE:'))
print(glue('>>> TEST: Acc balanceada: {acc_bal}', '\n\n'))
print(glue('>>> TEST: Matriz de Confusão:', '\n\n'))
print(format(cm_relative))

# Árvores com variação de profundidade e plot

In [None]:
########## ACC Vs Depth
# One row per maximum depth: the depth itself plus the balanced accuracy on
# the training and validation partitions.
number_of_depths <- 20
accPerDepth <- data.frame(
    depth = numeric(number_of_depths),
    accTrain = numeric(number_of_depths),
    accVal = numeric(number_of_depths)
)

for (maxDepth in 1:number_of_depths) {
    # Same configuration as the baseline tree, but capped at maxDepth levels.
    treeModel <- rpart(
        formula = hypothesis,
        data = dataTrain,
        method = "class",
        parms = list(split = "information"),
        control = rpart.control(minsplit = 2, cp = 0.0, maxdepth = maxDepth, xval = 0)
    )

    # Balanced accuracy on the training partition.
    train_pred <- predict(treeModel, newdata = dataTrain, type = "class")
    cm_train <- confusionMatrix(data = as.factor(train_pred),
                                reference = as.factor(dataTrain$Resultado),
                                positive = 'POSITIVO')
    cm_relative_train <- calculaMatrizConfusaoRelativa(cm_train)
    acc_bal_train <- (cm_relative_train[1, 1] + cm_relative_train[2, 2]) / 2

    # Balanced accuracy on the validation partition.
    val_pred <- predict(treeModel, newdata = dataVal, type = "class")
    cm_val <- confusionMatrix(data = as.factor(val_pred),
                              reference = as.factor(dataVal$Resultado),
                              positive = 'POSITIVO')
    cm_relative_val <- calculaMatrizConfusaoRelativa(cm_val)
    acc_bal_val <- (cm_relative_val[1, 1] + cm_relative_val[2, 2]) / 2

    accPerDepth[maxDepth, ] <- c(maxDepth, acc_bal_train, acc_bal_val)
}


# PLOT: balanced accuracy versus tree depth.
options(repr.plot.width = 12, repr.plot.height = 5)

# Long format (one row per depth/series pair) so both curves share one
# colour aesthetic.
accPerDepth <- melt(accPerDepth, id = "depth")

p <- ggplot(data = accPerDepth, aes(x = depth, y = value, colour = variable)) +
    geom_line() +
    geom_point()
p <- p +
    ggtitle("ACC Balanceada X Profundidade da Árvore") +
    ylab("ACC Balanceada") +
    scale_x_discrete(name = "Profundidade", limits = as.character(1:number_of_depths))
p + theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = c(0.5, 0.5),
    text = element_text(size = 20),
    title = element_text(color = "grey20", size = 18),
    panel.background = element_blank(),
    panel.grid.minor.y = element_line(size = 1),
    panel.grid.major = element_line(colour = "gray")
)

# Solução com tamanho ótimo

In [None]:
# Tree restricted to the depth chosen from the accuracy-versus-depth plot.
feature_names <- colnames(dataTrain)[1:(ncol(dataTrain)-1)]
hypothesis <- getHypothesis(feature_names)

treeModel <- rpart(
    formula=hypothesis,
    data=dataTrain, method="class",
    control=rpart.control(
        maxdepth=3, # optimal depth region
        minsplit=2,
        cp=0.0,
        xval=0
        ),
    parms=list(split="information")
    )

################## TRAIN
train_pred <- predict(treeModel, dataTrain, type="class")
cm <- confusionMatrix(
    data=as.factor(train_pred),
    reference = as.factor(dataTrain$Resultado),
    positive='POSITIVO'
)

# Balanced accuracy = average of the diagonal of the row-normalised matrix.
cm_relative <- calculaMatrizConfusaoRelativa(cm)
acc_bal <- (cm_relative[1,1] + cm_relative[2,2])/2

print(glue('\n\n','TAMANHO ÓTIMO:'))
print(glue('>>> TRAIN: Acc balanceada: {acc_bal}', '\n\n'))
print(glue('>>> TRAIN: Matriz de Confusão:','\n\n'))
print(format(cm_relative))


################## VALIDATION
val_pred <- predict(treeModel, dataVal, type="class")
cm <- confusionMatrix(
    data=as.factor(val_pred),
    reference = as.factor(dataVal$Resultado),
    positive='POSITIVO'
)

cm_relative <- calculaMatrizConfusaoRelativa(cm)
acc_bal <- (cm_relative[1,1] + cm_relative[2,2])/2

print(glue('\n\n','TAMANHO ÓTIMO:'))
print(glue('>>> VAL: Acc balanceada: {acc_bal}', '\n\n'))
print(glue('>>> VAL: Matriz de Confusão:','\n\n'))
print(format(cm_relative))


################## TEST
test_pred <- predict(treeModel, test, type="class")
cm <- confusionMatrix(
    data=as.factor(test_pred),
    reference = as.factor(test$Resultado),
    positive='POSITIVO'
)

cm_relative <- calculaMatrizConfusaoRelativa(cm)
acc_bal <- (cm_relative[1,1] + cm_relative[2,2])/2

print(glue('\n\n','TAMANHO ÓTIMO:'))
print(glue('>>> TEST: Acc balanceada: {acc_bal}', '\n\n'))
print(glue('>>> TEST: Matriz de Confusão:','\n\n'))
# The closing parentheses were missing here, which made the cell unparseable.
print(format(cm_relative))

# Feature Selection
> #### Explorem pelo menos 2 possíveis subconjuntos de features (feature selection) para treinar duas (ou mais) árvores de decisão. Tomem o melhor modelo baseado na acurácia balanceada no conjunto de validação, e reportem a matriz de confusão relativa e a acurácia balanceada no conjunto de teste.

In [None]:
# Refit the unrestricted tree on all features to read its variable
# importances.
treeModel <- rpart(
    formula = hypothesis,
    data = dataTrain,
    method = "class",
    parms = list(split = "information"),
    control = rpart.control(minsplit = 2, cp = 0.0, xval = 0)
)

# Express each importance as a share of the total, so the values sum to 1.
importance_per_feature <- treeModel[["variable.importance"]]
importance_total <- sum(importance_per_feature)
relative_importance <- importance_per_feature / importance_total
print(relative_importance)

In [None]:
# Feature subsets compared in the feature-selection experiments.
# NOTE(review): presumably copied by hand from the importance ranking
# printed by the previous cell — confirm after any re-run.
top_10_features <- c(
    'Dimeros.D..quantitativo',
    'DHL',
    'Hemoglobina',
    'Ureia',
    'Leucocitos',
    'Neutrofilos',
    'Hematocrito',
    'Linfocitos....',
    'Concentracao.de.Hemoglobina.Corpuscular',
    'ctO2.arterial'
)
least_10_features <- c(
    'Fracao.Imatura.de.Plaquetas',
    'Volume.plaquetario.medio',
    'Hormonio.Tiroestimulante',
    'Sodio..sangue',
    'Lactato..sangue',
    'T4.Livre',
    'p50.arterial',
    'Globulinas',
    'Cloro..sangue',
    'Potassio..sangue.total'
)

# Union of both lists, top features first.
top10_least10 <- c(top_10_features, least_10_features)

## Árvore com top10 feature importance

In [None]:
# Decision tree trained only on the ten features in top_10_features.
hipotese_top10feat <- getHypothesis(top_10_features)

treeModel_top10 <- rpart(
    formula=hipotese_top10feat,
    data=dataTrain, method="class",
    control=rpart.control(
        minsplit=2,
        maxdepth=3, # optimal depth region
        cp=0.0,
        # Was xval=10. Every other tree in this notebook uses xval=0; the
        # cross-validation results were never read, and running it draws
        # random numbers, which shifts the RNG state for the cells below.
        xval=0
    ),
    parms=list(split="information")
)

################## TRAIN
train_pred <- predict(treeModel_top10, dataTrain, type="class")
cm <- confusionMatrix(
    data=as.factor(train_pred),
    reference = as.factor(dataTrain$Resultado),
    positive='POSITIVO'
)

# Balanced accuracy = average of the diagonal of the row-normalised matrix.
cm_relative <- calculaMatrizConfusaoRelativa(cm)
acc_bal <- (cm_relative[1,1] + cm_relative[2,2])/2

print(glue('\n\n','TOP10 Features (TAMANHO ÓTIMO):'))
print(glue('>>> TRAIN: Acc balanceada: {acc_bal}', '\n\n'))
print(glue('>>> TRAIN: Matriz de Confusão:','\n\n'))
print(format(cm_relative))


################## VALIDATION
val_pred <- predict(treeModel_top10, dataVal, type="class")
cm <- confusionMatrix(
    data=as.factor(val_pred),
    reference = as.factor(dataVal$Resultado),
    positive='POSITIVO'
)

cm_relative <- calculaMatrizConfusaoRelativa(cm)
acc_bal <- (cm_relative[1,1] + cm_relative[2,2])/2

print(glue('\n\n','TOP10 Features (TAMANHO ÓTIMO):'))
print(glue('>>> VAL: Acc balanceada: {acc_bal}', '\n\n'))
print(glue('>>> VAL: Matriz de Confusão:','\n\n'))
print(format(cm_relative))

################## TEST
test_pred <- predict(treeModel_top10, test, type="class")
cm <- confusionMatrix(
    data=as.factor(test_pred),
    reference = as.factor(test$Resultado),
    positive='POSITIVO'
)

cm_relative <- calculaMatrizConfusaoRelativa(cm)
acc_bal <- (cm_relative[1,1] + cm_relative[2,2])/2

print(glue('\n\n','TOP10 Features (TAMANHO ÓTIMO):'))
print(glue('>>> TEST: Acc balanceada: {acc_bal}', '\n\n'))
print(glue('>>> TEST: Matriz de Confusão:','\n\n'))
print(format(cm_relative))

## Árvore com as 10 features menos importantes

In [None]:
# Decision tree trained only on the ten features in least_10_features.
hipotese_least10feat <- getHypothesis(least_10_features)

treeModel_least10 <- rpart(
    formula = hipotese_least10feat,
    data = dataTrain,
    method = "class",
    parms = list(split = "information"),
    control = rpart.control(
        minsplit = 2,
        maxdepth = 2, # optimal depth region
        cp = 0.0,
        xval = 0
    )
)

################## TRAIN
train_pred <- predict(treeModel_least10, newdata = dataTrain, type = "class")
cm <- confusionMatrix(data = as.factor(train_pred),
                      reference = as.factor(dataTrain$Resultado),
                      positive = 'POSITIVO')

# Balanced accuracy = average of the diagonal of the row-normalised matrix.
cm_relative <- calculaMatrizConfusaoRelativa(cm)
diag_first <- cm_relative[1, 1]
diag_second <- cm_relative[2, 2]
acc_bal <- (diag_first + diag_second) / 2

print(glue('\n\n', 'LEAST10 Features (TAMANHO ÓTIMO):'))
print(glue('>>> TRAIN: Acc balanceada: {acc_bal}', '\n\n'))
print(glue('>>> TRAIN: Matriz de Confusão:', '\n\n'))
print(format(cm_relative))


################## VALIDATION
val_pred <- predict(treeModel_least10, newdata = dataVal, type = "class")
cm <- confusionMatrix(data = as.factor(val_pred),
                      reference = as.factor(dataVal$Resultado),
                      positive = 'POSITIVO')

cm_relative <- calculaMatrizConfusaoRelativa(cm)
diag_first <- cm_relative[1, 1]
diag_second <- cm_relative[2, 2]
acc_bal <- (diag_first + diag_second) / 2

print(glue('\n\n', 'LEAST10 Features (TAMANHO ÓTIMO):'))
print(glue('>>> VAL: Acc balanceada: {acc_bal}', '\n\n'))
print(glue('>>> VAL: Matriz de Confusão:', '\n\n'))
print(format(cm_relative))

################## TEST
test_pred <- predict(treeModel_least10, newdata = test, type = "class")
cm <- confusionMatrix(data = as.factor(test_pred),
                      reference = as.factor(test$Resultado),
                      positive = 'POSITIVO')

cm_relative <- calculaMatrizConfusaoRelativa(cm)
diag_first <- cm_relative[1, 1]
diag_second <- cm_relative[2, 2]
acc_bal <- (diag_first + diag_second) / 2

print(glue('\n\n', 'LEAST10 Features (TAMANHO ÓTIMO):'))
print(glue('>>> TEST: Acc balanceada: {acc_bal}', '\n\n'))
print(glue('>>> TEST: Matriz de Confusão:', '\n\n'))
print(format(cm_relative))

## Árvore com as 10 mais e menos importantes

In [None]:
# Decision tree trained on the union of top_10_features and
# least_10_features (top10_least10).
hipotese_top_least_10_feat <- getHypothesis(top10_least10)

treeModel_top_least_10 <- rpart(
    formula=hipotese_top_least_10_feat,
    data=dataTrain, method="class",
    control=rpart.control(
        minsplit=2,
        maxdepth=2, # optimal depth region
        cp=0.0,
        xval=0
    ),
    parms=list(split="information")
)

################## TRAIN
train_pred <- predict(treeModel_top_least_10, dataTrain, type="class")
cm <- confusionMatrix(
    data=as.factor(train_pred),
    reference = as.factor(dataTrain$Resultado),
    positive='POSITIVO'
)

# Balanced accuracy = average of the diagonal of the row-normalised matrix.
cm_relative <- calculaMatrizConfusaoRelativa(cm)
acc_bal <- (cm_relative[1,1] + cm_relative[2,2])/2

print(glue('\n\n','TOP10_com_LEAST10 Features (TAMANHO ÓTIMO):'))
print(glue('>>> TRAIN: Acc balanceada: {acc_bal}', '\n\n'))
print(glue('>>> TRAIN: Matriz de Confusão:','\n\n'))
print(format(cm_relative))


################## VALIDATION
val_pred <- predict(treeModel_top_least_10, dataVal, type="class")
cm <- confusionMatrix(
    data=as.factor(val_pred),
    reference = as.factor(dataVal$Resultado),
    positive='POSITIVO'
)

cm_relative <- calculaMatrizConfusaoRelativa(cm)
acc_bal <- (cm_relative[1,1] + cm_relative[2,2])/2

print(glue('\n\n','TOP10_com_LEAST10 Features (TAMANHO ÓTIMO):'))
print(glue('>>> VAL: Acc balanceada: {acc_bal}', '\n\n'))
print(glue('>>> VAL: Matriz de Confusão:','\n\n'))
print(format(cm_relative))

################## TEST
test_pred <- predict(treeModel_top_least_10, test, type="class")
cm <- confusionMatrix(
    data=as.factor(test_pred),
    reference = as.factor(test$Resultado),
    positive='POSITIVO'
)

cm_relative <- calculaMatrizConfusaoRelativa(cm)
acc_bal <- (cm_relative[1,1] + cm_relative[2,2])/2

# The header below used to read 'LEAST10 Features', a copy-paste leftover
# that mislabelled this model's test results.
print(glue('\n\n','TOP10_com_LEAST10 Features (TAMANHO ÓTIMO):'))
print(glue('>>> TEST: Acc balanceada: {acc_bal}', '\n\n'))
print(glue('>>> TEST: Matriz de Confusão:','\n\n'))
print(format(cm_relative))

## Conclusão do Feature Selection
### melhor resultado top10_features

# Random Forest
> #### Treinem várias florestas aleatórias variando o número de árvores. Plotem a acurácia balanceada no conjunto de treinamento e validação variando o número de árvores geradas. Identifiquem as regiões de underfitting, ponto ótimo e overfitting. Reportem também a matriz de confusão relativa e a acurácia balanceada no teste para a floresta com o melhor número de árvores.

In [None]:
# Train a random forest with 100 trees, trying 7 candidate features per split.
rfModel <- randomForest(
    formula=hypothesis,
    data=dataTrain,
    ntree=100,
    mtry=7
    )

# Plot the error-rate curves stored in the model (one per column of
# rfModel$err.rate) on the left and their legend in a narrow right panel.
# `widths` is spelled out in full (the call previously relied on partial
# argument matching via `width=`).
layout(matrix(c(1,2),nrow=1), widths=c(4,1))
par(mar=c(5,4,4,0)) # no margin on the right side
plot(rfModel, log="y")
par(mar=c(5,0,4,2)) # no margin on the left side
plot(c(0,1),type="n", axes=FALSE, xlab="", ylab="")
legend("top", colnames(rfModel$err.rate),col=1:4,cex=0.8,fill=1:4)

# Score the forest itself on the validation partition. This used to predict
# with treeModel_top_least_10, so the numbers printed under "Random Forest"
# actually belonged to a decision tree.
val_pred <- predict(rfModel, dataVal, type="class")
cm <- confusionMatrix(
    data=as.factor(val_pred),
    reference=as.factor(dataVal$Resultado),
    positive='POSITIVO'
)

# Balanced accuracy = average of the diagonal of the row-normalised matrix.
cm_relative <- calculaMatrizConfusaoRelativa(cm)
acc_bal <- (cm_relative[1,1] + cm_relative[2,2])/2

print(glue('\n\n','Random Forest:'))
print(glue('>>> Acc balanceada: {acc_bal}', '\n\n'))
print(glue('>>> Matriz de Confusão:','\n\n'))
print(format(cm_relative))

In [None]:
# Measure how training and validation balanced accuracy change with the
# number of trees in the forest.
nTreeList <- c(1, 5, 10, 25, 50, 75, 100, 150, 200, 250, 300, 350, 400, 450, 500)
accPerNTrees <- data.frame(
    ntree = numeric(length(nTreeList)),
    accTrain = numeric(length(nTreeList)),
    accVal = numeric(length(nTreeList))
)


for (i in seq_along(nTreeList)) {
    rfModel <- randomForest(
        formula = hypothesis,
        data = dataTrain,
        ntree = nTreeList[i],
        mtry = 7
    )

    # Balanced accuracy on the training partition.
    train_pred <- predict(rfModel, newdata = dataTrain, type = "class")
    cm_train <- confusionMatrix(
        data = as.factor(train_pred),
        reference = as.factor(dataTrain$Resultado),
        positive = 'POSITIVO'
    )
    cm_relative_train <- calculaMatrizConfusaoRelativa(cm_train)
    acc_bal_train <- (cm_relative_train[1, 1] + cm_relative_train[2, 2]) / 2

    # Balanced accuracy on the validation partition.
    val_pred <- predict(rfModel, newdata = dataVal, type = "class")
    cm_val <- confusionMatrix(
        data = as.factor(val_pred),
        reference = as.factor(dataVal$Resultado),
        positive = 'POSITIVO'
    )
    cm_relative_val <- calculaMatrizConfusaoRelativa(cm_val)
    acc_bal_val <- (cm_relative_val[1, 1] + cm_relative_val[2, 2]) / 2

    accPerNTrees[i, ] <- c(nTreeList[i], acc_bal_train, acc_bal_val)
}

# PLOT
options(repr.plot.width = 12, repr.plot.height = 5)

# Long format (one row per ntree/series pair) so both curves share one
# colour aesthetic.
accPerNTrees <- melt(accPerNTrees, id = "ntree")
p <- ggplot(data = accPerNTrees, aes(x = ntree, y = value, colour = variable)) +
    geom_line() +
    geom_point()
p <- p +
    ggtitle("Random Forest: ACC por número de Árvores") +
    ylab("ACC Balanceada") +
    xlab("Número de Árvores")
p + theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = c(0.5, 0.5),
    text = element_text(size = 20),
    title = element_text(color = "grey20", size = 18),
    panel.background = element_blank(),
    panel.grid.minor.y = element_line(size = 1),
    panel.grid.major = element_line(colour = "gray")
)

# Avaliando RF no conjunto de teste

In [None]:
# BEST MODEL
# Refit the forest with the number of trees selected from the sweep and
# report its results on the test set.
rfModel_best <- randomForest(
    formula = hypothesis,
    data = dataTrain,
    ntree = 150,
    mtry = 7
)


################## TEST
test_pred <- predict(rfModel_best, newdata = test, type = "class")
cm <- confusionMatrix(data = as.factor(test_pred),
                      reference = as.factor(test$Resultado),
                      positive = 'POSITIVO')

# Balanced accuracy = average of the diagonal of the row-normalised matrix.
cm_relative <- calculaMatrizConfusaoRelativa(cm)
diag_first <- cm_relative[1, 1]
diag_second <- cm_relative[2, 2]
acc_bal <- (diag_first + diag_second) / 2

print(glue('\n\n', 'RF:'))
print(glue('>>> TEST: Acc balanceada: {acc_bal}', '\n\n'))
print(glue('>>> TEST: Matriz de Confusão:', '\n\n'))
print(format(cm_relative))

# RF balanceada
## Lendo novamente os dados de treino_val

In [None]:
# Reload the train/validation pool from disk and drop duplicated rows, which
# discards the undersampled dataTrain built earlier.
data <- read.csv(
    "/content/covid_analysis_train_val_sets.csv",
    header = TRUE,
    stringsAsFactors = TRUE
)

data <- unique(data)

# Random 80% / 20% split into training and validation.
# NOTE(review): the seed is not reset in this cell, so this draw comes from
# the current RNG state and is presumably a different partition from the
# first split — confirm that this is intended.
randomTrainIndexes <- sample(seq_len(nrow(data)), size = 0.8 * nrow(data))
dataVal <- data[-randomTrainIndexes, ]
dataTrain <- data[randomTrainIndexes, ]

# Random Forest modificada considerando balanceamento por arvore

In [None]:
#' Train a hand-rolled, class-balanced random forest and evaluate it.
#'
#' Each of the `ntree` trees is an unpruned rpart tree fitted on a bootstrap
#' sample containing the same number of NEGATIVO and POSITIVO rows and on
#' `m` randomly chosen feature columns. The trees vote on every row of
#' `valSet`; a row is labelled POSITIVO when at least half of the trees
#' predict POSITIVO. The relative confusion matrix and the balanced accuracy
#' are printed.
#'
#' The target column is expected to be named `Resultado`, to be the LAST
#' column of `trainSet`, and to be a factor with the classes "NEGATIVO" and
#' "POSITIVO".
#'
#' @param ntree Number of trees in the ensemble (>= 1).
#' @param m Number of feature columns sampled for each tree; must be between
#'   1 and `ncol(trainSet) - 1`.
#' @param trainSet Data frame used to fit the trees.
#' @param valSet Data frame the ensemble is evaluated on.
#' @return The balanced accuracy on `valSet`. The previous version returned
#'   `ntree - 1`, which was unrelated to the model quality.
getRandomForestResults <- function(ntree, m, trainSet, valSet){

    n_features <- ncol(trainSet) - 1
    # Fail with a clear message instead of an obscure sample() error.
    stopifnot(ntree >= 1, m >= 1, m <= n_features)

    # Rows of the negative and positive classes.
    dataNeg <- trainSet[trainSet$Resultado == "NEGATIVO",]
    dataPos <- trainSet[trainSet$Resultado == "POSITIVO",]

    lowest_samples <- min(nrow(dataNeg), nrow(dataPos))

    # N x ntree matrix (N = rows of valSet); column i receives the 0/1 votes
    # of tree i (1 = POSITIVO).
    valPredictedClasses <- matrix(0, nrow = nrow(valSet), ncol = ntree)

    for(i in seq_len(ntree)){
        # Per-class sample size: 85% to 100% of the smaller class.
        nsamples <- round(runif(1, min=0.85, max=1.0)*lowest_samples)

        # Bootstrap (sampling with replacement) the same number of rows from
        # each class, so every tree sees a balanced training set.
        NoIdx <- sample(seq_len(nrow(dataNeg)), nsamples, replace = TRUE)
        YesIdx <- sample(seq_len(nrow(dataPos)), nsamples, replace = TRUE)

        # Random subset of m feature columns (the target is excluded from
        # the draw and appended afterwards so the tree can be trained).
        featuresIdx <- sample(seq_len(n_features), m, replace = FALSE)
        featuresIdx <- c(featuresIdx, ncol(trainSet))

        subsetDataTrain <- rbind(dataNeg[NoIdx,featuresIdx],
                                dataPos[YesIdx,featuresIdx])

        treeModel <- rpart(formula=Resultado ~ .,
                        data=subsetDataTrain, method="class",
                        control=rpart.control(minsplit=2, cp=0.0, xval = 0),
                        parms= list(split="information"))

        # Matrix of class probabilities, one column per class.
        valPreds <- predict(treeModel, valSet)

        # Convert each row to a 0/1 vote by looking up the NAME of the most
        # probable column instead of assuming POSITIVO is the second one.
        predictedLabels <- colnames(valPreds)[argmax(valPreds)]
        valPredictedClasses[,i] <- as.integer(predictedLabels == "POSITIVO")

    }

    # Majority vote. Example with 5 trees voting 1 0 0 1 0: the positive
    # share is 2/5 = 0.4 < 0.5, so the row is labelled NEGATIVO.
    # ifelse() keeps the comparison numeric; the previous in-place
    # replacement turned the vector into character after the first
    # assignment, so the second threshold test compared strings.
    positiveShare <- rowSums(valPredictedClasses)/ntree
    votes <- ifelse(positiveShare >= 0.5, 'POSITIVO', 'NEGATIVO')

    # Give the predictions the same levels as the reference so the
    # confusion matrix is well defined even when every vote is one class.
    reference <- as.factor(valSet$Resultado)
    cm <- confusionMatrix(data=factor(votes, levels = levels(reference)),
                        reference = reference,
                        positive='POSITIVO')


    # Balanced accuracy = average of the diagonal of the row-normalised matrix.
    cm_relative <- calculaMatrizConfusaoRelativa(cm)
    acc_bal <- (cm_relative[1,1] + cm_relative[2,2])/2

    print(glue('\n\n', 'Resultado para {m} atributos:', '\n\n'))
    print(glue('Acc balanceada: {acc_bal}', '\n\n'))
    print(glue('Matriz de Confusão:'))
    print(format(cm_relative))

    accValRandomForest <- acc_bal
    return(accValRandomForest)
}

# Ensemble sizes to try.
number_of_trees <- c(10, 20, 50, 100, 150, 200)

# For every ensemble size, try 10, 20, ..., 100 sampled features per tree.
# NOTE(review): the upper bound of 100 assumes the data has 100 feature
# columns — confirm against ncol(dataTrain) - 1.
for (n in number_of_trees) {
    print(glue('\n\n', 'Número de Árvores:', n))
    for (i in seq(10, 100, by = 10)) {
        accValRandomForest <- getRandomForestResults(
            ntree = n,
            m = i,
            trainSet = dataTrain,
            valSet = dataVal
        )
    }
}

# Avaliação no TestSet

In [None]:
# Evaluate the chosen configuration (60 trees, 70 features per tree) with the
# test set passed as the evaluation data.
accValRandomForest <- getRandomForestResults(
    ntree = 60,
    m = 70,
    trainSet = dataTrain,
    valSet = test
)