In [1]:
# Carrega Bibliotecas

library(scales)
library(dplyr)
library(ggplot2)
library(daltoolbox)
library(RColorBrewer)
library(GGally)
library(reshape)
library(corrplot)
library(WVPlots)
library(aplpack)
library(gridExtra)
library(tidyr)
library(factoextra)
library(dbscan)
library(fpc)

# Configuração de Cores
colors <- brewer.pal(4, 'Set1')

# Configuração da Fonte
font <- theme(text = element_text(size=16))

# Carrega Dataset
load("/home/data/malaria/malaria.RData")


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 


Attaching package: ‘daltoolbox’


The following object is masked from ‘package:base’:

    transform


Registered S3 method overwritten by 'GGally':
  method from   
  +.gg   ggplot2


Attaching package: ‘reshape’


The following object is masked from ‘package:dplyr’:

    rename


corrplot 0.92 loaded

Loading required package: wrapr


Attaching package: ‘wrapr’


The following object is masked from ‘package:dplyr’:

    coalesce


“no DISPLAY variable so Tk is not available”

Attaching package: ‘gridExtra’


The following object is masked from ‘package:dplyr’:

    combine



Attaching package: ‘tidyr’


The following objects are masked from ‘package:wrapr’:

    pack, unpack


The follow

In [2]:
# explorando os dados
# todos os atributos são categóricos

glimpse(data)
print(t(sapply(data, class)))

Rows: 22,923,977
Columns: 40
$ infection.county      [3m[90m<fct>[39m[23m 110001, 110001, 110001, 110001, 110001, 110001, …
$ home.county           [3m[90m<fct>[39m[23m 110001, 110001, 110001, 110001, 110001, 110001, …
$ notification.county   [3m[90m<fct>[39m[23m 110001, 110001, 110001, 110001, 110001, 110001, …
$ qty.parasites         [3m[90m<dbl>[39m[23m 501100, NA, NA, NA, 3, 305500, NA, NA, NA, 50110…
$ scheme                [3m[90m<fct>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 99, 1, 1, 1,…
$ notification.hr       [3m[90m<fct>[39m[23m ZONA DA MATA, ZONA DA MATA, ZONA DA MATA, ZONA D…
$ home.hr               [3m[90m<fct>[39m[23m ZONA DA MATA, ZONA DA MATA, ZONA DA MATA, ZONA D…
$ infection.hr          [3m[90m<fct>[39m[23m ZONA DA MATA, ZONA DA MATA, ZONA DA MATA, ZONA D…
$ exam.type             [3m[90m<fct>[39m[23m thick and thin blood smears, thick and thin bloo…
$ exam.result           [3m[90m<fct>[39m[23m vivax, vivax, vivax, vivax, viv

In [4]:
# verificar se existe alguma relacao para os resultados vivax e falciparum em relação aos dados demográficos

# pré-processamento
# selecionando os resultados vivax e falciparum e os atributos demográficos

ds1 <- data %>% filter(exam.result=="vivax" | exam.result == "falciparum")
ds1 <- ds1 %>% select(exam.result,gender,age,race,occupation,education.level,pregnancy,autochthonous.case)

# preparando dataset
# separando amostras de treino e teste

slevels <- levels(ds1$exam.result)
set.seed(1)
sr <- sample_random()
sr <- train_test(sr, ds1)
ds1_train <- sr$train
ds1_test <- sr$test

# juntando dados em uma tabela para melhor vizualização da distribuição

tbl <- rbind(table(ds1[,"exam.result"]), 
             table(ds1_train[,"exam.result"]), 
             table(ds1_test[,"exam.result"]))
rownames(tbl) <- c("dataset", "training", "test")
head(tbl)

Unnamed: 0,negative,falciparum,F+FG,vivax,F+V,V+FG,FG,malariae,F+M,ovale,non falciparum
dataset,0,262440,0,2365980,0,0,0,0,0,0,0
training,0,210093,0,1892643,0,0,0,0,0,0,0
test,0,52347,0,473337,0,0,0,0,0,0,0


In [5]:
# Modelo de Treino
model <- cla_dtree("exam.result", slevels)
model <- fit(model, ds1_train)
train_prediction <- predict(model, ds1_train)

In [6]:
# Modelo de Avaliação
ds1_train_predictand <- adjust_class_label(ds1_train[,"exam.result"])
train_eval <- evaluate(model, ds1_train_predictand, train_prediction)
print(train_eval$metrics)

   accuracy        f1 sensitivity specificity precision    recall
1 0.9818338 0.9000859   0.9000859   0.9900086 0.9000859 0.9000859


In [7]:
# Realização de Testes
test_prediction <- predict(model, ds1_test)
  
ds1_test_predictand <- adjust_class_label(ds1_test[,"exam.result"])
test_eval <- evaluate(model, ds1_test_predictand, test_prediction)
print(test_eval$metrics)

   accuracy        f1 sensitivity specificity precision    recall
1 0.9818948 0.9004212   0.9004212   0.9900421 0.9004212 0.9004212
