This notebook contains code designed for data inspection and filtering, aimed at creating classification datasets suitable for machine learning predictive tasks. For more comprehensive details about the datasets, including information about data collection methods, sources, and a dictionary of variables, please visit the project repository on GitHub: \url{https://github.com/gabivaleriano/HealthDataBR}.

In [1]:
library(tidyverse)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.2     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.2     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.1     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors


In [2]:
# Load the raw notification records from the CSV in the working directory.
data <- read.csv(file = 'CHIKBR22.csv')

### Filter 1: remove duplicates

In [3]:
# Show the table size before and after de-duplication, then keep a single
# copy of every fully identical row.

dim(data)
dim(unique(data))
data <- unique(data)

In [4]:
# Number of records for each state code (SG_UF_NOT).

count(data, SG_UF_NOT)

SG_UF_NOT,n
<int>,<int>
11,502
12,207
13,470
14,427
15,1575
16,164
17,9139
21,3422
22,12280
23,80228


In [5]:
# Preview the first rows and list every column name.

head(data)
colnames(data)

Unnamed: 0_level_0,TP_NOT,ID_AGRAVO,DT_NOTIFIC,SEM_NOT,NU_ANO,SG_UF_NOT,ID_MUNICIP,ID_REGIONA,ID_UNIDADE,DT_SIN_PRI,⋯,PLAQ_MENOR,CON_FHD,COMPLICA,NU_LOTE_I,TP_SISTEMA,NDUPLIC_N,DT_DIGITA,CS_FLXRET,FLXRECEBI,MIGRADO_W
Unnamed: 0_level_1,<int>,<chr>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<chr>,⋯,<lgl>,<lgl>,<lgl>,<int>,<int>,<int>,<chr>,<lgl>,<lgl>,<lgl>
1,2,A92.0,2022-01-28,202204,2022,12,120033,1941,2000083,2022-01-23,⋯,,,,,2,,2022-03-18,,,
2,2,A92.0,2022-03-14,202211,2022,12,120040,1938,3689921,2022-03-01,⋯,,,,,2,,2022-03-21,,,
3,2,A92.0,2022-01-08,202201,2022,12,120033,1941,9542566,2022-01-07,⋯,,,,,2,,2022-03-16,,,
4,2,A92.0,2022-02-23,202208,2022,12,120033,1941,6568629,2022-02-16,⋯,,,,,2,,2022-03-18,,,
5,2,A92.0,2022-01-06,202201,2022,12,120040,1938,6119697,2022-01-05,⋯,,,,,2,,2022-01-11,,,
6,2,A92.0,2022-01-11,202202,2022,12,120040,1938,6119697,2022-01-10,⋯,,,,,2,,2022-01-19,,,


In [6]:
# Value counts of the three candidate target columns, then the table size.

count(data, EVOLUCAO)
count(data, HOSPITALIZ)
count(data, CLASSI_FIN)

dim(data)

EVOLUCAO,n
<int>,<int>
1.0,215573
2.0,102
3.0,278
4.0,33
9.0,8244
,45593


HOSPITALIZ,n
<int>,<int>
1.0,6534
2.0,150163
9.0,3858
,109268


CLASSI_FIN,n
<int>,<int>
1.0,3
2.0,1
5.0,95120
8.0,1
13.0,142693
,32005


In [7]:
# Count the distinct values held by each location-related identifier
# (regions, states, cities, ...). Each call returns the dimensions of the
# one-column table of distinct values, i.e. c(<number of distinct values>, 1).

dim(unique(select(data, ID_REGIONA)))
dim(unique(select(data, ID_MUNICIP)))
dim(unique(select(data, SG_UF_NOT)))
dim(unique(select(data, SG_UF)))

dim(unique(select(data, ID_MN_RESI)))
dim(unique(select(data, ID_PAIS)))
dim(unique(select(data, ID_RG_RESI)))

dim(unique(select(data, ID_UNIDADE)))

dim(unique(select(data, UF)))
dim(unique(select(data, MUNICIPIO)))


### Delete columns

In [8]:
# Drop redundant location identifiers. The intent is to keep one macro-level
# id (the state) and one micro-level id (the health unit).

data <- select(
  data,
  -c(ID_REGIONA, ID_MUNICIP, SG_UF, ID_PAIS, ID_RG_RESI, UF, MUNICIPIO)
)


# Inspect columns that are mostly (or entirely) NA, or that hold a single
# constant value, before removing them.

count(data, TPAUTOCTO)
count(data, COPAISINF)
count(data, COMUNINF)

count(data, NU_LOTE_I)
count(data, TP_SISTEMA)
count(data, CS_FLXRET)
count(data, FLXRECEBI)

count(data, MIGRADO_W)
count(data, NDUPLIC_N)
count(data, COMPLICA)
count(data, CON_FHD)

count(data, SOROTIPO)
count(data, HISTOPA_N)
count(data, IMUNOH_N)
count(data, DOENCA_TRA)
count(data, ALRM_HIPOT)

count(data, TP_NOT)
count(data, ID_AGRAVO)
count(data, NU_ANO)

# Remove every column inspected above, plus COUFINF and DT_DIGITA, which are
# dropped here without a preceding count.
data <- select(
  data,
  -c(
    TPAUTOCTO, COPAISINF, COUFINF, COMUNINF,
    NU_LOTE_I, TP_SISTEMA, CS_FLXRET, FLXRECEBI, DT_DIGITA,
    MIGRADO_W, NDUPLIC_N, COMPLICA, CON_FHD,
    SOROTIPO, HISTOPA_N, IMUNOH_N, DOENCA_TRA, ALRM_HIPOT,
    TP_NOT, ID_AGRAVO, NU_ANO
  )
)

# Summarise three contiguous column ranges, then drop the whole span from
# ALRM_PLAQ to PLAQ_MENOR.
summary(select(data, ALRM_PLAQ:ALRM_LIQ))
summary(select(data, GRAV_PULSO:GRAV_ORGAO))
summary(select(data, MANI_HEMOR:PLAQ_MENOR))

data <- select(data, -c(ALRM_PLAQ:PLAQ_MENOR))

# NU_IDADE_N stores the age in a different format; it is not needed.

data <- select(data, -c(NU_IDADE_N))

# These dates are usually identical and refer to the date the record was
# entered into the system. CRITERIO is removed in the same step.

data <- select(data, -c(DT_NOTIFIC, DT_ENCERRA, DT_INVEST, CRITERIO))

# The final classification is what defines a positive case, so the test
# results and test dates (DT_CHIK_S1 through RESUL_PCR_) are removed.

data <- select(data, -c(DT_CHIK_S1:RESUL_PCR_))

dim(data)
count(data, EVOLUCAO)
count(data, HOSPITALIZ)
colnames(data)

TPAUTOCTO,n
<lgl>,<int>
,269823


COPAISINF,n
<lgl>,<int>
,269823


COMUNINF,n
<lgl>,<int>
,269823


NU_LOTE_I,n
<int>,<int>
0.0,6
,269817


TP_SISTEMA,n
<int>,<int>
2.0,265294
,4529


CS_FLXRET,n
<lgl>,<int>
,269823


FLXRECEBI,n
<lgl>,<int>
,269823


MIGRADO_W,n
<lgl>,<int>
,269823


NDUPLIC_N,n
<int>,<int>
1.0,22
,269801


COMPLICA,n
<lgl>,<int>
,269823


CON_FHD,n
<lgl>,<int>
,269823


SOROTIPO,n
<lgl>,<int>
,269823


HISTOPA_N,n
<int>,<int>
4.0,198
,269625


IMUNOH_N,n
<int>,<int>
3.0,1
4.0,197
,269625


DOENCA_TRA,n
<int>,<int>
2.0,3
,269820


ALRM_HIPOT,n
<lgl>,<int>
,269823


TP_NOT,n
<int>,<int>
2,269823


ID_AGRAVO,n
<chr>,<int>
A92.0,269823


NU_ANO,n
<int>,<int>
2022,269823


 ALRM_PLAQ      ALRM_VOM       ALRM_SANG      ALRM_HEMAT     ALRM_ABDOM    
 Mode:logical   Mode:logical   Mode:logical   Mode:logical   Mode:logical  
 NA's:269823    NA's:269823    NA's:269823    NA's:269823    NA's:269823   
 ALRM_LETAR     ALRM_HEPAT     ALRM_LIQ      
 Mode:logical   Mode:logical   Mode:logical  
 NA's:269823    NA's:269823    NA's:269823   

 GRAV_PULSO     GRAV_CONV      GRAV_ENCH      GRAV_INSUF     GRAV_TAQUI    
 Mode:logical   Mode:logical   Mode:logical   Mode:logical   Mode:logical  
 NA's:269823    NA's:269823    NA's:269823    NA's:269823    NA's:269823   
 GRAV_EXTRE     GRAV_HIPOT     GRAV_HEMAT     GRAV_MELEN     GRAV_METRO    
 Mode:logical   Mode:logical   Mode:logical   Mode:logical   Mode:logical  
 NA's:269823    NA's:269823    NA's:269823    NA's:269823    NA's:269823   
 GRAV_SANG      GRAV_AST       GRAV_MIOC      GRAV_CONSC     GRAV_ORGAO    
 Mode:logical   Mode:logical   Mode:logical   Mode:logical   Mode:logical  
 NA's:269823    NA's:269823    NA's:269823    NA's:269823    NA's:269823   

 MANI_HEMOR     EPISTAXE       GENGIVO         METRO         PETEQUIAS     
 Mode:logical   Mode:logical   Mode:logical   Mode:logical   Mode:logical  
 NA's:269823    NA's:269823    NA's:269823    NA's:269823    NA's:269823   
 HEMATURA       SANGRAM         LACO_N        PLASMATICO     EVIDENCIA     
 Mode:logical   Mode:logical   Mode:logical   Mode:logical   Mode:logical  
 NA's:269823    NA's:269823    NA's:269823    NA's:269823    NA's:269823   
 PLAQ_MENOR    
 Mode:logical  
 NA's:269823   

EVOLUCAO,n
<int>,<int>
1.0,215573
2.0,102
3.0,278
4.0,33
9.0,8244
,45593


HOSPITALIZ,n
<int>,<int>
1.0,6534
2.0,150163
9.0,3858
,109268


### Filter 2: remove records with a missing (NA) state or health unit

In [9]:
# Keep only records where both the state (SG_UF_NOT) and the health unit
# (ID_UNIDADE) are present.
data <- filter(data, !is.na(SG_UF_NOT), !is.na(ID_UNIDADE))

dim(data)

count(data, EVOLUCAO)
count(data, HOSPITALIZ)

EVOLUCAO,n
<int>,<int>
1.0,215524
2.0,102
3.0,278
4.0,33
9.0,8244
,45587


HOSPITALIZ,n
<int>,<int>
1.0,6533
2.0,150155
9.0,3834
,109246


### Filter 3: filter only confirmed positive and negative cases

5-Descartado (negative)
13-Chikungunya (positive)

In [10]:
# Distribution of the final classification before filtering.
count(data, CLASSI_FIN)

# Keep only discarded (5) and chikungunya (13) cases; any other code and
# missing classifications are removed.
data <- filter(data, CLASSI_FIN %in% c(5, 13))

dim(data)

count(data, EVOLUCAO)
count(data, HOSPITALIZ)

CLASSI_FIN,n
<int>,<int>
1.0,3
2.0,1
5.0,95117
8.0,1
13.0,142647
,31999


EVOLUCAO,n
<int>,<int>
1.0,215448
2.0,102
3.0,276
4.0,26
9.0,8203
,13709


HOSPITALIZ,n
<int>,<int>
1.0,5873
2.0,143227
9.0,3732
,84932


### Filter 4: remove patients without year of birth

In [11]:
# Drop records that have no year of birth (ANO_NASC).
data <- filter(data, !is.na(ANO_NASC))

dim(data)

count(data, EVOLUCAO)
count(data, HOSPITALIZ)

EVOLUCAO,n
<int>,<int>
1.0,213696
2.0,100
3.0,275
4.0,26
9.0,8150
,13598


HOSPITALIZ,n
<int>,<int>
1.0,5840
2.0,142193
9.0,3712
,84100


In [12]:
# Derive the age as the difference between 2022 and the year of birth, then
# show the youngest and oldest ends of the age distribution.

data <- mutate(data, age = 2022 - ANO_NASC)
slice(count(data, age), 1:15)
tail(count(data, age), 12)

age,n
<dbl>,<int>
0,1903
1,1144
2,1071
3,1417
4,1717
5,1771
6,1814
7,2016
8,2063
9,2286


Unnamed: 0_level_0,age,n
Unnamed: 0_level_1,<dbl>,<int>
105,104,3
106,105,5
107,106,2
108,107,4
109,108,2
110,109,2
111,110,1
112,111,1
113,114,1
114,115,1


### Remove columns

In [13]:
# ID_OCUPA_N (area of activity) does not seem to be filled in consistently:
# children should all carry the same "not applicable" value, which the counts
# for patients under 5 let us check.
# SEM_NOT is the epidemiological week of notification; the epidemiological
# week of the first symptoms is used instead.

slice(count(filter(data, age < 5), ID_OCUPA_N), 1:15)
data <- select(data, -c(ID_OCUPA_N, SEM_NOT))

ID_OCUPA_N,n
<chr>,<int>
,5975
0.0,23
21210.0,1
123305.0,1
141405.0,2
142105.0,1
142115.0,1
142325.0,1
221105.0,2
223115.0,2


### Filter 5: remove patients older than 110 years and younger than 12

In [14]:
# Keep patients aged 12 to 110 inclusive: ages of 111 or more and ages of 11
# or less are removed.

data <- filter(data, age < 111, age > 11)

# The year of birth is no longer needed once the age has been derived.

data <- select(data, -ANO_NASC)

dim(data)
count(data, EVOLUCAO)
count(data, HOSPITALIZ)

EVOLUCAO,n
<int>,<int>
1.0,194237
2.0,93
3.0,234
4.0,18
9.0,7113
,12006


HOSPITALIZ,n
<int>,<int>
1.0,4380
2.0,129535
9.0,3408
,76378


### Filter 6: remove patients without sex information

In [15]:
# Keep only patients whose sex is recorded as 'F' or 'M'; the counts are shown
# before and after the filter.

count(data, CS_SEXO)
data <- filter(data, CS_SEXO %in% c('F', 'M'))
count(data, CS_SEXO)
dim(data)

# Encode sex numerically: 0 for 'M', 1 otherwise (i.e. 'F' after the filter).

data <- mutate(data, sex = if_else(CS_SEXO == 'M', 0, 1))
count(data, sex)

# The original text column is replaced by the numeric one.
data <- select(data, -CS_SEXO)

dim(data)
count(data, EVOLUCAO)
count(data, HOSPITALIZ)

CS_SEXO,n
<chr>,<int>
,1
F,132627
I,217
M,80856


CS_SEXO,n
<chr>,<int>
F,132627
M,80856


sex,n
<dbl>,<int>
0,80856
1,132627


EVOLUCAO,n
<int>,<int>
1.0,194052
2.0,93
3.0,234
4.0,18
9.0,7107
,11979


HOSPITALIZ,n
<int>,<int>
1.0,4378
2.0,129474
9.0,3403
,76228


In [16]:
# Inspect the pregnancy column (CS_GESTANT) overall and separately for each
# sex (sex == 1 is female, sex == 0 is male).

count(data, CS_GESTANT)

count(filter(data, sex == 1), CS_GESTANT)

count(filter(data, sex == 0), CS_GESTANT)

# Missing values are recoded to 0, which is treated as "not pregnant".

data <- mutate(data, CS_GESTANT = if_else(is.na(CS_GESTANT), 0, CS_GESTANT))

count(data, CS_GESTANT)

CS_GESTANT,n
<int>,<int>
1.0,883
2.0,1239
3.0,1142
4.0,422
5.0,92868
6.0,95860
9.0,21012
,57


CS_GESTANT,n
<int>,<int>
1.0,883
2.0,1239
3.0,1142
4.0,422
5.0,92868
6.0,15004
9.0,21012
,57


CS_GESTANT,n
<int>,<int>
6,80856


CS_GESTANT,n
<dbl>,<int>
0,57
1,883
2,1239
3,1142
4,422
5,92868
6,95860
9,21012


In [17]:
# Collapse CS_GESTANT into a binary flag. Codes 1 to 4 refer to different
# stages of pregnancy and map to 1; every other code maps to 0.

data <- mutate(
  data,
  pregnancy = as.numeric(
    CS_GESTANT == 1 |
      CS_GESTANT == 2 |
      CS_GESTANT == 3 |
      CS_GESTANT == 4
  )
)

count(data, pregnancy)

# The original multi-valued column is no longer needed.
data <- select(data, -CS_GESTANT)

pregnancy,n
<dbl>,<int>
0,209797
1,3686


### Filter 7: remove patients without race information

In [18]:
# Distribution of race codes before filtering.

count(data, CS_RACA)

# Drop missing race values and codes of 9 or above (9 = unknown).

data <- filter(data, !is.na(CS_RACA), CS_RACA < 9)

count(data, CS_RACA)

dim(data)

count(data, EVOLUCAO)
count(data, HOSPITALIZ)

CS_RACA,n
<int>,<int>
1.0,27226
2.0,7019
3.0,2514
4.0,146213
5.0,825
9.0,29685
,1


CS_RACA,n
<int>,<int>
1,27226
2,7019
3,2514
4,146213
5,825


EVOLUCAO,n
<int>,<int>
1.0,168492
2.0,88
3.0,221
4.0,16
9.0,5057
,9923


HOSPITALIZ,n
<int>,<int>
1.0,3925
2.0,115665
9.0,1993
,62214


In [19]:
# Schooling (CS_ESCOL_N) uses 9 for "unknown"; show the counts, turn every 9
# into NA, and show the counts again.

count(data, CS_ESCOL_N)

is.na(data$CS_ESCOL_N) <- which(data$CS_ESCOL_N == 9)

count(data, CS_ESCOL_N)

CS_ESCOL_N,n
<int>,<int>
0.0,2329
1.0,7373
2.0,4304
3.0,10739
4.0,7783
5.0,10039
6.0,31831
7.0,3095
8.0,8623
9.0,48505


CS_ESCOL_N,n
<int>,<int>
0.0,2329
1.0,7373
2.0,4304
3.0,10739
4.0,7783
5.0,10039
6.0,31831
7.0,3095
8.0,8623
,97681


In [20]:
# Impute missing schooling (CS_ESCOL_N) with the rounded mean of comparable
# patients, in two passes of decreasing specificity.

# Number of groups defined by age, race, city (ID_MN_RESI) and sex.
n_groups(group_by(data, age, CS_RACA, ID_MN_RESI, sex))

# Pass 1: within each age / race / city / sex group, replace a missing value
# with the rounded group mean. Groups with no known value stay missing.
data <- data %>%
  group_by(age, CS_RACA, ID_MN_RESI, sex) %>%
  mutate(average = mean(CS_ESCOL_N, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(CS_ESCOL_N = if_else(is.na(CS_ESCOL_N), round(average), CS_ESCOL_N)) %>%
  select(-average)
count(data, CS_ESCOL_N)

# Pass 2: for values that are still missing, repeat the imputation with the
# broader age / race / sex groups (city is ignored).
data <- data %>%
  group_by(age, CS_RACA, sex) %>%
  mutate(average = mean(CS_ESCOL_N, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(CS_ESCOL_N = if_else(is.na(CS_ESCOL_N), round(average), CS_ESCOL_N)) %>%
  select(-average)
count(data, CS_ESCOL_N)

CS_ESCOL_N,n
<dbl>,<int>
0.0,2766
1.0,8640
2.0,6154
3.0,14592
4.0,17446
5.0,26715
6.0,49718
7.0,5311
8.0,10337
,42118


CS_ESCOL_N,n
<dbl>,<int>
0.0,2806
1.0,8802
2.0,8101
3.0,20021
4.0,26273
5.0,42772
6.0,59223
7.0,5351
8.0,10344
,104


### Filter 8: remove patients without schooling information (even after imputation)

In [21]:
# Records whose schooling is still missing after both imputation passes are
# removed.

count(data, CS_ESCOL_N)

data <- filter(data, !is.na(CS_ESCOL_N))

count(data, EVOLUCAO)
count(data, HOSPITALIZ)

dim(data)

CS_ESCOL_N,n
<dbl>,<int>
0.0,2806
1.0,8802
2.0,8101
3.0,20021
4.0,26273
5.0,42772
6.0,59223
7.0,5351
8.0,10344
,104


EVOLUCAO,n
<int>,<int>
1.0,168402
2.0,87
3.0,219
4.0,16
9.0,5052
,9917


HOSPITALIZ,n
<int>,<int>
1.0,3923
2.0,115611
9.0,1991
,62168


### Delete columns

In [22]:
# ID_MN_RESI is correlated with the state (UF), so it is removed.

data <- select(data, -ID_MN_RESI)

In [23]:
# Total number of NA cells across the symptom and comorbidity columns
# (FEBRE through AUTO_IMUNE).

sum(is.na(select(data, FEBRE:AUTO_IMUNE)))

In [24]:
# In the symptom/comorbidity columns and in HOSPITALIZ the value 2 means
# "did not occur"; recode it to 0 so these columns read as 1 = yes, 0 = no.
# Other values (e.g. 9) and NA are left untouched.

# Split the table: the columns to recode, and everything else.
symptoms <- select(data, FEBRE:AUTO_IMUNE, HOSPITALIZ)
others <- data %>% select(!(FEBRE:AUTO_IMUNE)) %>% select(-c(HOSPITALIZ))

head(symptoms)

# across() replaces the superseded mutate_all(); the recoding is unchanged.
symptoms <- symptoms %>%
  mutate(across(everything(), ~ ifelse(.x == 2, 0, .x)))

head(symptoms)

# Re-assemble the table; the recoded columns now come first.
data <- cbind(symptoms, others)

# Recode the outcome: 1 becomes 0 and 2 becomes 1. The order of these two
# statements matters: 1 must be mapped to 0 before 2 is mapped to 1,
# otherwise both codes would end up as 0.
data$EVOLUCAO[data$EVOLUCAO == 1] <- 0
data$EVOLUCAO[data$EVOLUCAO == 2] <- 1

count(data, EVOLUCAO)
count(data, HOSPITALIZ)

dim(data)

FEBRE,MIALGIA,CEFALEIA,EXANTEMA,VOMITO,NAUSEA,DOR_COSTAS,CONJUNTVIT,ARTRITE,ARTRALGIA,⋯,LACO,DOR_RETRO,DIABETES,HEMATOLOG,HEPATOPAT,RENAL,HIPERTENSA,ACIDO_PEPT,AUTO_IMUNE,HOSPITALIZ
<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,2,1,2,2,2,1,2,2,2,⋯,2,2,2,2,2,2,2,2,2,
1,2,1,2,1,1,1,2,2,2,⋯,2,2,2,2,2,2,2,2,2,
1,2,1,2,2,1,1,2,2,2,⋯,2,2,2,2,2,2,2,2,2,
1,1,1,2,2,2,2,2,2,2,⋯,2,2,2,2,2,2,2,2,2,2.0
1,1,1,2,2,2,2,2,2,2,⋯,2,2,2,2,2,2,2,2,2,2.0
1,2,1,2,2,2,2,2,2,2,⋯,2,2,2,2,2,2,2,2,2,2.0


FEBRE,MIALGIA,CEFALEIA,EXANTEMA,VOMITO,NAUSEA,DOR_COSTAS,CONJUNTVIT,ARTRITE,ARTRALGIA,⋯,LACO,DOR_RETRO,DIABETES,HEMATOLOG,HEPATOPAT,RENAL,HIPERTENSA,ACIDO_PEPT,AUTO_IMUNE,HOSPITALIZ
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,0,1,0,0,0,1,0,0,0,⋯,0,0,0,0,0,0,0,0,0,
1,0,1,0,1,1,1,0,0,0,⋯,0,0,0,0,0,0,0,0,0,
1,0,1,0,0,1,1,0,0,0,⋯,0,0,0,0,0,0,0,0,0,
1,1,1,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0.0
1,1,1,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0.0
1,0,1,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0.0


EVOLUCAO,n
<dbl>,<int>
0.0,168402
1.0,87
3.0,219
4.0,16
9.0,5052
,9917


HOSPITALIZ,n
<dbl>,<int>
0.0,115611
1.0,3923
9.0,1991
,62168


### Filter 9: keep only patients in the acute stage of the disease

In [25]:
# Keep only patients in the acute stage (CLINC_CHIK == 1), then drop the
# column since it is constant afterwards.

data <- data %>%
  filter(CLINC_CHIK == 1) %>%
  select(-CLINC_CHIK)

count(data, EVOLUCAO)
count(data, HOSPITALIZ)

dim(data)

EVOLUCAO,n
<dbl>,<int>
0.0,133202
1.0,84
3.0,102
4.0,11
9.0,2202
,3123


HOSPITALIZ,n
<dbl>,<int>
0.0,92380
1.0,2548
9.0,1224
,42572


In [26]:
# Date of the first symptoms: convert the column to Date.

data <- mutate(data, DT_SIN_PRI = as.Date(DT_SIN_PRI))

# Earliest and latest first-symptom dates in the table.

slice(arrange(select(data, DT_SIN_PRI), DT_SIN_PRI), 1)
slice(arrange(select(data, DT_SIN_PRI), desc(DT_SIN_PRI)), 1)

# Subtract 202200 from the week of first symptoms (SEM_PRI).
# NOTE(review): presumably SEM_PRI is coded as YYYYWW, so this leaves the week
# number within 2022 -- confirm against the data dictionary.

data <- mutate(data, SEM_PRI = SEM_PRI - 202200)

colnames(data)

DT_SIN_PRI
<date>
2022-01-02


DT_SIN_PRI
<date>
2022-12-31


In [27]:
# English column names, assigned by POSITION: the order below must match the
# current column order of `data` exactly.
clean_names <- c("fever", 'myalgia','headache',
                  'exanthema', 'vomiting','nausea',
                  'back_pain','conjunctivitis', 'arthritis', 
                  'arthralgia', 'petechiae', 'leukopenia', 
                  'lasso_prove', 'retro_orbital_pain', 'diabetes', 
                  'hematological_diseases', 'hepatopathies', 'chronic_kidney_disease', 
                  'arterial_hypertension', 'acid_peptic_disease', 'autoimmune_diseases',
                  'hospitalization','id_state','id_place',
                  'dt_first_symptoms', 'epidemiological_week', 'race', 
                  'schooling_years', 'dt_hospitalization', 'chikungunya', 
                  'death', 'dt_death', 'age', 
                  'sex', 'pregnancy')

# Fail loudly if the table does not have exactly one column per name; a
# shorter name vector would otherwise leave NA column names behind.
stopifnot(length(clean_names) == ncol(data))

colnames(data) <- clean_names
                  
                  

In [28]:
# Persist the cleaned table; the three datasets below are built from it.
write_csv(data, 'clean_chikungunya_22.csv')

# Dataset 1 - predicting hospitalization

### Filter 1.1: keep only patients with the disease confirmed

In [29]:
# Reload the cleaned table from disk.
data <- read.csv(file = 'clean_chikungunya_22.csv')

# Keep only confirmed chikungunya cases (classification code 13).
data <- filter(data, chikungunya == 13)

count(data, hospitalization)

dim(data)

hospitalization,n
<int>,<int>
0.0,70101
1.0,1804
9.0,1065
,34355


### Filter 1.2: keep only patients with information about hospitalization

In [30]:
# Keep only patients whose hospitalization status is known (0 or 1); the
# "not known" code 9 and missing values are removed.

count(data, hospitalization)

data <- filter(data, hospitalization %in% c(0, 1))

dim(data)

hospitalization,n
<int>,<int>
0.0,70101
1.0,1804
9.0,1065
,34355


In [31]:
# Add `days`: the number of days between the first symptoms and the
# hospitalization, as a plain number. It is NA when either date is missing.

data$dt_first_symptoms <- as.Date(data$dt_first_symptoms)
data$dt_hospitalization <- as.Date(data$dt_hospitalization)

data$days <- as.numeric(
  difftime(data$dt_hospitalization, data$dt_first_symptoms, units = 'days')
)

### Filter 1.3: remove patients hospitalized more than 15 days after the first symptoms

In [32]:
# Keep rows with no computed interval (NA), plus rows hospitalized between 0
# and 15 days after the first symptoms. Negative intervals and intervals of
# 16 days or more are removed.

data <- filter(data, is.na(days) | (days >= 0 & days < 16))

count(data, days)

count(data, hospitalization)
dim(data)
colnames(data)

days,n
<dbl>,<int>
0.0,195
1.0,221
2.0,159
3.0,120
4.0,94
5.0,105
6.0,93
7.0,77
8.0,44
9.0,27


hospitalization,n
<int>,<int>
0,70101
1,1681


### Filter 1.4: remove non-hospitalized patients that died

In [33]:
# Outcome distribution among the non-hospitalized patients.

count(filter(data, hospitalization == 0), death)

# Among the non-hospitalized, keep only patients with death == 0; every other
# outcome code and missing outcomes are removed. All hospitalized patients
# are kept and appended after them.
# The two subsets are bound directly instead of going through a temporary
# named `death`, which held the hospitalized rows and shadowed the `death`
# column used in the filter.
data <- rbind(
  filter(data, hospitalization == 0, death == 0),
  filter(data, hospitalization == 1)
)

dim(data)

count(data, hospitalization)

death,n
<int>,<int>
0.0,68005
1.0,23
3.0,6
4.0,1
9.0,1164
,902


hospitalization,n
<int>,<int>
0,68005
1,1681


### Remove columns: not relevant for this dataset

In [34]:
# Drop the date columns, the outcome and the derived interval, which are not
# used as predictors of hospitalization. (A stray trailing comma inside c()
# was removed.)
data <- select(
  data,
  -c(dt_first_symptoms, death, dt_hospitalization, dt_death, days)
)

dim(data)

# Column totals; for the 0/1 symptom columns this is the number of patients
# presenting each symptom.

colSums(data)

In [35]:
# Column totals restricted to the hospitalized patients.

hosp <- filter(data, hospitalization == 1)
colSums(hosp)

### Delete columns: delete symptoms and comorbidities with low frequency in the hospitalized class

In [36]:
# Keep only the columns whose total among hospitalized patients exceeds 50.
selected_cols <- colnames(hosp)[colSums(hosp) > 50]

selected_cols

# drop = FALSE keeps `data` a data frame even if a single column is selected.
data <- data[, selected_cols, drop = FALSE]

In [37]:
# Columns that survived the frequency filter.
colnames(data)

In [38]:
# Size of the hospitalization dataset.
dim(data)

In [39]:
# Order the columns (symptoms and comorbidities, then id_state through
# pregnancy, then the target last) and write the hospitalization dataset.
write_csv(
  select(data, fever:arterial_hypertension, id_state:pregnancy, hospitalization),
  'chikungunya_hospitalization_22.csv'
)

# Dataset 2 - predicting the diagnosis

In [40]:
# Reload the cleaned table from disk.
data <- read.csv(file = 'clean_chikungunya_22.csv')

# Class balance of the final classification (5 = discarded, 13 = chikungunya).
# Both classes are kept for this dataset.
count(data, chikungunya)

chikungunya,n
<int>,<int>
5,31399
13,107325


In [41]:
# Binary target: classification code 13 becomes 1, code 5 becomes 0.
data$chikungunya <- replace(data$chikungunya, data$chikungunya == 13, 1)
data$chikungunya <- replace(data$chikungunya, data$chikungunya == 5, 0)

In [42]:
# Columns available before the removal step.
colnames(data)

### Remove columns: not relevant for this dataset or comorbidities with low presence

In [43]:
# Drop the dates, the outcome and hospitalization columns, and the
# low-presence comorbidities.
data <- select(
  data,
  -c(
    dt_first_symptoms, death, dt_hospitalization, dt_death,
    hospitalization, hematological_diseases, hepatopathies,
    chronic_kidney_disease, acid_peptic_disease, autoimmune_diseases
  )
)

In [44]:
# Size of the diagnosis dataset.
dim(data)

In [45]:
# Order the columns (fever through schooling_years, age through pregnancy,
# target last) and write the diagnosis dataset.
write_csv(
  select(data, fever:schooling_years, age:pregnancy, chikungunya),
  'chikungunya_diagnose_22.csv'
)

# Dataset 3: predicting death (among hospitalized patients)

In [46]:
# Reload the cleaned table from disk.
data <- read.csv(file = 'clean_chikungunya_22.csv')

### Filter 3.1: only confirmed cases of the disease

In [47]:
# Keep only confirmed chikungunya cases (code 13); the classification column
# is constant afterwards, so it is dropped.
data <- data %>%
  filter(chikungunya == 13) %>%
  select(-chikungunya)

count(data, death)

dim(data)

death,n
<int>,<int>
0.0,103093
1.0,83
3.0,22
4.0,11
9.0,1626
,2490


### Filter 3.2: only patients cured or dead with the disease 

1- cura (cured), 
2- óbito pelo agravo (death by chikungunya)
3- óbito por outras causas (death by other reasons)
4- óbito em investigação (death under investigation)
9- ignorado (ignored)

In [48]:
# Keep only patients with death == 0 (cured) or death == 1 (death by the
# disease); other outcome codes and missing outcomes are removed.

count(data, death)
data <- filter(data, death %in% c(0, 1))

dim(data)

count(data, death)

death,n
<int>,<int>
0.0,103093
1.0,83
3.0,22
4.0,11
9.0,1626
,2490


death,n
<int>,<int>
0,103093
1,83


### Filter 3.3: keep only hospitalized patients

In [49]:
# Restrict the table to hospitalized patients and show the outcome balance.

data <- filter(data, hospitalization == 1)
count(data, death)

death,n
<int>,<int>
0,1567
1,52


In [50]:
# Add `days`: the number of days between hospitalization and death, as a
# plain number. It is NA when either date is missing.

data$dt_hospitalization <- as.Date(data$dt_hospitalization)
data$dt_death <- as.Date(data$dt_death)

data$days <- as.numeric(
  difftime(data$dt_death, data$dt_hospitalization, units = 'days')
)

count(data, days)

days,n
<dbl>,<int>
0.0,2
1.0,5
2.0,7
3.0,1
4.0,3
5.0,5
7.0,2
9.0,1
10.0,1
11.0,1


### Filter 3.4: remove patients who died more than 30 days after hospital admission

In [51]:
# Remove patients who died more than 30 days after hospital admission. Rows
# with no computed interval (NA) are kept. Unlike the hospitalization
# dataset, no lower bound is applied to `days` here.

data <- filter(data, is.na(days) | days < 31)

count(data, death)
dim(data)
colnames(data)

death,n
<int>,<int>
0,1567
1,44


### Remove columns: not relevant for this dataset

In [52]:
# Drop the date columns, the hospitalization flag and the derived interval.
data <- select(
  data,
  -c(dt_first_symptoms, hospitalization, dt_hospitalization, dt_death, days)
)
dim(data)

# Column totals; for the 0/1 symptom columns this is the number of patients
# presenting each symptom.

colSums(data)

In [53]:
# Column totals restricted to the patients who died.

death <- filter(data, death == 1)
colSums(death)

### Remove columns: remove symptoms present in fewer than 10% of the hospitalized patients who died

In [54]:
# Keep only the columns whose total among the patients who died exceeds 4.
selected_cols <- colnames(death)[colSums(death) > 4]

selected_cols

# drop = FALSE keeps `data` a data frame even if a single column is selected.
data <- data[, selected_cols, drop = FALSE]

colnames(data)

dim(data)

count(data, death)

death,n
<int>,<int>
0,1567
1,44


In [55]:
# Reorder the columns so that the target (death) comes last.

data <- select(data, fever:schooling_years, age, sex, death)

# Write the death-prediction dataset.

write_csv(data, 'chikungunya_death_22.csv')