This notebook contains the code for the steps performed for data inspection and filtering, with the goal of creating classification datasets suitable for machine learning predictive tasks. Specifically, we replicate the steps used for the 2022 database, with the filter numbering following it. This enables us to use the resulting datasets as a validation set for models trained with data from the main dataset (2022). For comprehensive details about these datasets, including information about data collection methods, sources, and a dictionary of variables, please refer to the project repository on GitHub: https://github.com/gabivaleriano/HealthDataBR

In [1]:
library(tidyverse)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.2     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.2     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.1     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors


In [2]:
# Read the source CSV file into the data frame `data`
data <- read.csv("CHIKBR20.csv")

### Filter 1: remove duplicates

In [3]:
# report the table size, drop fully duplicated rows, then report the size again
dim(data)
data <- unique(data)
dim(data)

In [4]:
# number of records per state code (SG_UF_NOT)
count(data, SG_UF_NOT)

SG_UF_NOT,n
<int>,<int>
11,1079
12,275
13,124
14,155
15,1968
16,44
17,427
21,250
22,295
23,3757


In [5]:
# preview the first rows and list every column name
head(data)
colnames(data)

Unnamed: 0_level_0,TP_NOT,ID_AGRAVO,DT_NOTIFIC,SEM_NOT,NU_ANO,SG_UF_NOT,ID_MUNICIP,ID_REGIONA,ID_UNIDADE,DT_SIN_PRI,⋯,HEMATURA,SANGRAM,LACO_N,PLASMATICO,EVIDENCIA,PLAQ_MENOR,CON_FHD,COMPLICA,TP_SISTEMA,NDUPLIC_N
Unnamed: 0_level_1,<int>,<chr>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<chr>,⋯,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<int>,<int>
1,2,A92.0,2020-06-04,202023,2020,12,120040,1938,2000490,2020-05-27,⋯,,,,,,,,,2,
2,2,A92.0,2020-06-01,202023,2020,12,120040,1938,2001578,2020-05-20,⋯,,,,,,,,,2,
3,2,A92.0,2020-09-24,202039,2020,12,120020,1941,6788637,2020-09-01,⋯,,,,,,,,,2,
4,2,A92.0,2020-06-02,202023,2020,12,120040,1938,2000075,2020-05-25,⋯,,,,,,,,,2,
5,2,A92.0,2020-02-05,202006,2020,12,120010,1937,2001500,2020-02-03,⋯,,,,,,,,,2,
6,2,A92.0,2020-01-29,202005,2020,12,120020,1941,6801099,2020-01-20,⋯,,,,,,,,,2,


In [6]:
# ano-nasc is not present in this database, so age is computed from NU_IDADE_N
head(count(data, NU_IDADE_N))

Unnamed: 0_level_0,NU_IDADE_N,n
Unnamed: 0_level_1,<dbl>,<int>
1,0,1
2,3,1
3,9,1
4,10,1
5,19,1
6,27,2


In [7]:
# NU_IDADE_N must contain at least 4 digits; the leading digits give the unit:
# 10 - hours, 20 - days, 30 - months, 40 - years

# keep only codes above 1000 (shorter codes are dropped), then preview the
# smallest remaining codes
data <- data %>% filter(NU_IDADE_N > 1000)
data %>% count(NU_IDADE_N) %>% head()



Unnamed: 0_level_0,NU_IDADE_N,n
Unnamed: 0_level_1,<dbl>,<int>
1,1078,1
2,2001,868
3,2002,20
4,2003,16
5,2004,9
6,2005,17


In [8]:
# drop incoherent codes: anything strictly between 2030 and 3000
data <- data %>% filter(NU_IDADE_N <= 2030 | NU_IDADE_N >= 3000)

# preview the codes around the cut-off to confirm the gap is gone
data %>%
  filter(NU_IDADE_N > 2025) %>%
  count(NU_IDADE_N) %>%
  head()


Unnamed: 0_level_0,NU_IDADE_N,n
Unnamed: 0_level_1,<dbl>,<int>
1,2026,5
2,2027,4
3,2028,6
4,2029,9
5,2030,4
6,3001,100


In [9]:
# drop incoherent codes: anything strictly between 3012 and 4000
data <- data %>% filter(NU_IDADE_N <= 3012 | NU_IDADE_N >= 4000)

# preview the codes around the cut-off to confirm the gap is gone
data %>%
  filter(NU_IDADE_N > 3010) %>%
  count(NU_IDADE_N) %>%
  head()


Unnamed: 0_level_0,NU_IDADE_N,n
Unnamed: 0_level_1,<dbl>,<int>
1,3011,53
2,4001,415
3,4002,430
4,4003,501
5,4004,535
6,4005,625


In [10]:
# transform the coded age into whole years

# codes below 4000 mean the patient has not completed one year, so age is 0;
# otherwise the number of years is the code minus the 4000 prefix
data <- data %>%
  mutate(age = ifelse(NU_IDADE_N < 4000, 0, NU_IDADE_N - 4000))

# preview the stored column itself, so the display always matches what is kept
# (a formula based on round(NU_IDADE_N, -2) would round e.g. 4051 up to 4100
# and show a negative age)
data %>% select(age, NU_IDADE_N) %>% head()

Unnamed: 0_level_0,age,NU_IDADE_N
Unnamed: 0_level_1,<dbl>,<dbl>
1,48,4048
2,7,4007
3,26,4026
4,44,4044
5,8,4008
6,11,4011


In [11]:
# candidate target features: distribution of EVOLUCAO and HOSPITALIZ codes
count(data, EVOLUCAO)
count(data, HOSPITALIZ)

dim(data)

EVOLUCAO,n
<int>,<int>
1.0,62702
2.0,30
3.0,166
4.0,21
9.0,5286
,34142


HOSPITALIZ,n
<int>,<int>
1.0,3217
2.0,49738
9.0,1806
,47586


In [12]:
# how many distinct regions, states and cities appear in the database
# (the first element of each dim() result is the number of distinct values)
data %>% distinct(ID_REGIONA) %>% dim()
data %>% distinct(ID_MUNICIP) %>% dim()
data %>% distinct(SG_UF_NOT) %>% dim()
data %>% distinct(SG_UF) %>% dim()

data %>% distinct(ID_MN_RESI) %>% dim()
data %>% distinct(ID_PAIS) %>% dim()
data %>% distinct(ID_RG_RESI) %>% dim()

data %>% distinct(ID_UNIDADE) %>% dim()

data %>% distinct(UF) %>% dim()
data %>% distinct(MUNICIPIO) %>% dim()

### Delete columns

In [13]:
# drop the redundant location identifiers
# (SG_UF_NOT, ID_MN_RESI and ID_UNIDADE are not in this list and remain)
data <- data %>%
  select(-c(ID_REGIONA, ID_MUNICIP, SG_UF, ID_PAIS, ID_RG_RESI, UF, MUNICIPIO))

# inspect candidate columns for removal (mostly NA values)
count(data, TPAUTOCTO)
count(data, COPAISINF)
count(data, COMUNINF)
count(data, TP_SISTEMA)

count(data, NDUPLIC_N)
count(data, COMPLICA)
count(data, CON_FHD)

count(data, SOROTIPO)
count(data, HISTOPA_N)
count(data, IMUNOH_N)
count(data, DOENCA_TRA)
count(data, ALRM_HIPOT)

count(data, TP_NOT)
count(data, ID_AGRAVO)
count(data, NU_ANO)

# remove every column inspected above, plus COUFINF and the raw age code
# (NU_IDADE_N), in a single step
data <- data %>%
  select(-c(
    TP_NOT, ID_AGRAVO, NU_ANO,
    TPAUTOCTO, COPAISINF, COUFINF, COMUNINF,
    TP_SISTEMA,
    NDUPLIC_N, COMPLICA, CON_FHD,
    SOROTIPO, HISTOPA_N, IMUNOH_N, DOENCA_TRA, ALRM_HIPOT,
    NU_IDADE_N
  ))

# these dates are usually the same and refer to the date of digitalization
data <- data %>%
  select(-c(DT_NOTIFIC, DT_ENCERRA, DT_INVEST, CRITERIO))

# summarise the alarm / severity / hemorrhagic column ranges before dropping them
data %>% select(ALRM_PLAQ:ALRM_LIQ) %>% summary()
data %>% select(GRAV_PULSO:GRAV_ORGAO) %>% summary()
data %>% select(MANI_HEMOR:PLAQ_MENOR) %>% summary()

data <- data %>% select(-c(ALRM_PLAQ:PLAQ_MENOR))

dim(data)

colnames(data)

TPAUTOCTO,n
<int>,<int>
1.0,25699
2.0,1598
3.0,2630
,72420


COPAISINF,n
<int>,<int>
1.0,26068
77.0,1
126.0,1
,76277


COMUNINF,n
<int>,<int>
110002,13
110004,19
110005,4
110009,1
110011,2
110012,1
110015,2
110020,12
110025,2
110029,1


TP_SISTEMA,n
<int>,<int>
2.0,100675
,1672


NDUPLIC_N,n
<int>,<int>
1.0,19
,102328


COMPLICA,n
<lgl>,<int>
,102347


CON_FHD,n
<lgl>,<int>
,102347


SOROTIPO,n
<lgl>,<int>
,102347


HISTOPA_N,n
<int>,<int>
4.0,23
,102324


IMUNOH_N,n
<int>,<int>
4.0,23
,102324


DOENCA_TRA,n
<lgl>,<int>
,102347


ALRM_HIPOT,n
<lgl>,<int>
,102347


TP_NOT,n
<int>,<int>
2,102347


ID_AGRAVO,n
<chr>,<int>
A92.0,102347


NU_ANO,n
<int>,<int>
2019,369
2020,101927
2021,51


 ALRM_PLAQ      ALRM_VOM       ALRM_SANG      ALRM_HEMAT     ALRM_ABDOM    
 Mode:logical   Mode:logical   Mode:logical   Mode:logical   Mode:logical  
 NA's:102347    NA's:102347    NA's:102347    NA's:102347    NA's:102347   
 ALRM_LETAR     ALRM_HEPAT     ALRM_LIQ      
 Mode:logical   Mode:logical   Mode:logical  
 NA's:102347    NA's:102347    NA's:102347   

 GRAV_PULSO     GRAV_CONV      GRAV_ENCH      GRAV_INSUF     GRAV_TAQUI    
 Mode:logical   Mode:logical   Mode:logical   Mode:logical   Mode:logical  
 NA's:102347    NA's:102347    NA's:102347    NA's:102347    NA's:102347   
 GRAV_EXTRE     GRAV_HIPOT     GRAV_HEMAT     GRAV_MELEN     GRAV_METRO    
 Mode:logical   Mode:logical   Mode:logical   Mode:logical   Mode:logical  
 NA's:102347    NA's:102347    NA's:102347    NA's:102347    NA's:102347   
 GRAV_SANG      GRAV_AST       GRAV_MIOC      GRAV_CONSC     GRAV_ORGAO    
 Mode:logical   Mode:logical   Mode:logical   Mode:logical   Mode:logical  
 NA's:102347    NA's:102347    NA's:102347    NA's:102347    NA's:102347   

 MANI_HEMOR     EPISTAXE       GENGIVO         METRO         PETEQUIAS     
 Mode:logical   Mode:logical   Mode:logical   Mode:logical   Mode:logical  
 NA's:102347    NA's:102347    NA's:102347    NA's:102347    NA's:102347   
 HEMATURA       SANGRAM         LACO_N        PLASMATICO     EVIDENCIA     
 Mode:logical   Mode:logical   Mode:logical   Mode:logical   Mode:logical  
 NA's:102347    NA's:102347    NA's:102347    NA's:102347    NA's:102347   
 PLAQ_MENOR    
 Mode:logical  
 NA's:102347   

### Filter 2: Remove if is na for state or health unit

In [14]:
# keep only rows whose state code (SG_UF_NOT) is present
# NOTE(review): only SG_UF_NOT is checked here; no health-unit column is filtered
data <- data %>% filter(!is.na(SG_UF_NOT))

dim(data)

count(data, EVOLUCAO)
count(data, HOSPITALIZ)

EVOLUCAO,n
<int>,<int>
1.0,62702
2.0,30
3.0,166
4.0,21
9.0,5286
,34142


HOSPITALIZ,n
<int>,<int>
1.0,3217
2.0,49738
9.0,1806
,47586


### Remove columns 

In [15]:
# ID_OCUPA_N does not seem to be filled consistently: it refers to the area of
# activity and should hold the same value (not applicable) for children
# SEM_NOT is the epidemiological week of notification, SEM_PRI is the
# epidemiological week of the first symptoms
data %>%
  filter(age < 5) %>%
  count(ID_OCUPA_N) %>%
  slice(1:15)

data <- data %>% select(-c(ID_OCUPA_N, SEM_NOT))

ID_OCUPA_N,n
<chr>,<int>
,3186
000000,4
021110,1
141405,1
141410,1
141705,1
223104,1
223115,1
2231F9,1
223208,2


### Filter 4: remove patients older than 110 years and younger than 12

In [16]:
# keep ages 12 to 110: drop patients older than 110 years old and patients
# younger than 12 years old
data <- data %>% filter(age < 111, age > 11)

dim(data)

count(data, EVOLUCAO)
count(data, HOSPITALIZ)

EVOLUCAO,n
<int>,<int>
1.0,57185
2.0,27
3.0,120
4.0,19
9.0,4776
,30925


HOSPITALIZ,n
<int>,<int>
1.0,2387
2.0,45771
9.0,1661
,43233


### Filter 5: remove patients without sex information

In [17]:
# keep only patients whose sex is recorded as F or M
count(data, CS_SEXO)
data <- data %>% filter(CS_SEXO %in% c("F", "M"))
count(data, CS_SEXO)
dim(data)

# numeric encoding of sex: M -> 0, F -> 1
data <- data %>% mutate(sex = if_else(CS_SEXO == "M", 0, 1))
count(data, sex)

# the original character column is no longer needed
data <- data %>% select(-CS_SEXO)

dim(data)

count(data, EVOLUCAO)
count(data, HOSPITALIZ)

CS_SEXO,n
<chr>,<int>
F,56520
I,120
M,36412


CS_SEXO,n
<chr>,<int>
F,56520
M,36412


sex,n
<dbl>,<int>
0,36412
1,56520


EVOLUCAO,n
<int>,<int>
1.0,57149
2.0,27
3.0,120
4.0,19
9.0,4756
,30861


HOSPITALIZ,n
<int>,<int>
1.0,2383
2.0,45731
9.0,1658
,43160


In [18]:
# inspect the pregnancy code overall and within each sex
count(data, CS_GESTANT)

data %>% filter(sex == 1) %>% count(CS_GESTANT)

data %>% filter(sex == 0) %>% count(CS_GESTANT)

# NA values in the pregnancy column become 0 (not pregnant)
data <- data %>%
  mutate(CS_GESTANT = if_else(is.na(CS_GESTANT), 0, CS_GESTANT))

count(data, CS_GESTANT)

CS_GESTANT,n
<int>,<int>
1.0,508
2.0,658
3.0,634
4.0,188
5.0,35188
6.0,42809
9.0,12926
,21


CS_GESTANT,n
<int>,<int>
1.0,508
2.0,658
3.0,634
4.0,188
5.0,35188
6.0,6397
9.0,12926
,21


CS_GESTANT,n
<int>,<int>
6,36412


CS_GESTANT,n
<dbl>,<int>
0,21
1,508
2,658
3,634
4,188
5,35188
6,42809
9,12926


In [19]:
# turn the pregnancy column into a binary flag; originally the different values
# refer to different stages of pregnancy, and codes 1 to 4 are mapped to 1
data <- data %>%
  mutate(
    pregnant = if_else(
      CS_GESTANT == 1 | CS_GESTANT == 2 | CS_GESTANT == 3 | CS_GESTANT == 4,
      1,
      0
    )
  )

count(data, pregnant)

# the coded column is replaced by the binary flag
data <- data %>% select(-CS_GESTANT)

pregnant,n
<dbl>,<int>
0,90944
1,1988


### Filter 6: remove patients without race information

In [20]:
# drop rows whose race is NA or coded 9 (unknown)
count(data, CS_RACA)

data <- data %>% filter(!is.na(CS_RACA), CS_RACA < 9)

count(data, CS_RACA)

dim(data)

count(data, EVOLUCAO)
count(data, HOSPITALIZ)

CS_RACA,n
<int>,<int>
1,14265
2,7481
3,870
4,46287
5,258
9,23771


CS_RACA,n
<int>,<int>
1,14265
2,7481
3,870
4,46287
5,258


EVOLUCAO,n
<int>,<int>
1.0,44013
2.0,22
3.0,101
4.0,16
9.0,2916
,22093


HOSPITALIZ,n
<int>,<int>
1.0,2066
2.0,36032
9.0,1159
,29904


In [21]:
# years of schooling; code 9 = unknown
count(data, CS_ESCOL_N)

# recode the unknown marker as a missing value
data$CS_ESCOL_N <- replace(data$CS_ESCOL_N, which(data$CS_ESCOL_N == 9), NA)

count(data, CS_ESCOL_N)

CS_ESCOL_N,n
<int>,<int>
0.0,850
1.0,3101
2.0,1818
3.0,4609
4.0,2903
5.0,4144
6.0,12810
7.0,1481
8.0,4037
9.0,21657


CS_ESCOL_N,n
<int>,<int>
0.0,850
1.0,3101
2.0,1818
3.0,4609
4.0,2903
5.0,4144
6.0,12810
7.0,1481
8.0,4037
,33408


In [22]:
# number of distinct (age, race, city, sex) groups
data %>% group_by(age, CS_RACA, ID_MN_RESI, sex) %>% n_groups()

# fill missing schooling with the rounded mean of the patient's group;
# `average` is a temporary helper column and is removed at the end
# (a group with no known value yields no usable mean, so its rows stay missing)
data <- data %>%
  group_by(age, CS_RACA, ID_MN_RESI, sex) %>%
  mutate(average = mean(CS_ESCOL_N, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(CS_ESCOL_N = if_else(is.na(CS_ESCOL_N), round(average), CS_ESCOL_N)) %>%
  select(-average)

count(data, CS_ESCOL_N)

CS_ESCOL_N,n
<dbl>,<int>
0.0,973
1.0,3651
2.0,2416
3.0,5964
4.0,5244
5.0,8095
6.0,18216
7.0,2180
8.0,4708
,17714


In [23]:
# second pass: fill the schooling values that are still missing with the
# rounded mean of the coarser (age, race, sex) group; `average` is a temporary
# helper column and is removed at the end
data <- data %>%
  group_by(age, CS_RACA, sex) %>%
  mutate(average = mean(CS_ESCOL_N, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(CS_ESCOL_N = if_else(is.na(CS_ESCOL_N), round(average), CS_ESCOL_N)) %>%
  select(-average)

count(data, CS_ESCOL_N)

CS_ESCOL_N,n
<dbl>,<int>
0.0,1022
1.0,3820
2.0,3018
3.0,7748
4.0,8689
5.0,15329
6.0,22424
7.0,2221
8.0,4719
,171


### Filter 7: remove patients without schooling information (after imputation by group)

In [24]:
# rows whose schooling is still NA at this point are dropped
count(data, CS_ESCOL_N)

data <- data %>% filter(!is.na(CS_ESCOL_N))

count(data, EVOLUCAO)
count(data, HOSPITALIZ)

dim(data)

CS_ESCOL_N,n
<dbl>,<int>
0.0,1022
1.0,3820
2.0,3018
3.0,7748
4.0,8689
5.0,15329
6.0,22424
7.0,2221
8.0,4719
,171


EVOLUCAO,n
<int>,<int>
1.0,43911
2.0,22
3.0,101
4.0,15
9.0,2912
,22029


HOSPITALIZ,n
<int>,<int>
1.0,2059
2.0,35956
9.0,1151
,29824


### Delete columns 

In [25]:
# ID_MN_RESI is correlated with uf, so it is removed
data <- select(data, -ID_MN_RESI)

In [26]:
# total number of NA cells across the symptom and comorbidity columns
data %>%
  select(FEBRE:AUTO_IMUNE) %>%
  is.na() %>%
  sum()

In [27]:
# check whether the NA's sit in the same rows: after keeping only rows with
# FEBRE present, count the NA cells left in the symptom/comorbidity columns
teste <- data %>% filter(!is.na(FEBRE))
teste %>%
  select(FEBRE:AUTO_IMUNE) %>%
  is.na() %>%
  sum()

### Filter: rows with NA for the symptoms

In [28]:
# drop rows with NA for the symptoms (FEBRE is used as the marker column)
data <- data %>% filter(!is.na(FEBRE))

dim(data)
count(data, EVOLUCAO)
count(data, HOSPITALIZ)

EVOLUCAO,n
<int>,<int>
1.0,43909
2.0,22
3.0,101
4.0,15
9.0,2912
,17467


HOSPITALIZ,n
<int>,<int>
1.0,2059
2.0,35956
9.0,1151
,25260


In [29]:
# the value 2 means the non-occurrence of a symptom; change it to 0

# split the table: flag columns (plus HOSPITALIZ) on one side, the rest on the other
symptoms <- data %>% select(FEBRE:AUTO_IMUNE, HOSPITALIZ)
others <- data %>% select(-c(FEBRE:AUTO_IMUNE, HOSPITALIZ))

head(symptoms)

# every 2 in the flag columns becomes 0; all other values are left as they are
symptoms <- symptoms %>%
  mutate(across(everything(), ~ ifelse(.x == 2, 0, .x)))

head(symptoms)

# reassemble with the flag columns first, followed by the remaining columns
data <- cbind(symptoms, others)

# outcome recoding, applied in this order: 1 -> 0, then 2 -> 1
data$EVOLUCAO <- replace(data$EVOLUCAO, which(data$EVOLUCAO == 1), 0)
data$EVOLUCAO <- replace(data$EVOLUCAO, which(data$EVOLUCAO == 2), 1)

FEBRE,MIALGIA,CEFALEIA,EXANTEMA,VOMITO,NAUSEA,DOR_COSTAS,CONJUNTVIT,ARTRITE,ARTRALGIA,⋯,LACO,DOR_RETRO,DIABETES,HEMATOLOG,HEPATOPAT,RENAL,HIPERTENSA,ACIDO_PEPT,AUTO_IMUNE,HOSPITALIZ
<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,1,1,2,2,2,1,2,2,2,⋯,1,2,2,2,2,2,2,2,2,2.0
2,2,2,2,2,2,2,2,2,2,⋯,2,2,2,2,2,2,2,2,2,2.0
1,1,1,1,1,2,2,2,2,2,⋯,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,⋯,2,2,2,2,2,2,2,2,2,2.0
2,2,2,2,2,2,2,2,2,2,⋯,2,2,2,2,2,2,2,2,2,2.0
1,1,1,2,2,2,1,2,2,2,⋯,2,2,2,2,2,2,2,2,2,2.0


FEBRE,MIALGIA,CEFALEIA,EXANTEMA,VOMITO,NAUSEA,DOR_COSTAS,CONJUNTVIT,ARTRITE,ARTRALGIA,⋯,LACO,DOR_RETRO,DIABETES,HEMATOLOG,HEPATOPAT,RENAL,HIPERTENSA,ACIDO_PEPT,AUTO_IMUNE,HOSPITALIZ
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,1,1,0,0,0,1,0,0,0,⋯,1,0,0,0,0,0,0,0,0,0.0
0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0.0
1,1,1,1,1,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0.0
0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0.0
1,1,1,0,0,0,1,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0.0


### Filter 8: keep only patients in the acute stage of the disease

In [30]:
# keep only patients in the acute stage (CLINC_CHIK == 1); the column is
# constant afterwards, so it is dropped
data <- data %>%
  filter(CLINC_CHIK == 1) %>%
  select(-CLINC_CHIK)

In [31]:
# date of the first symptoms: convert the column to Date
data$DT_SIN_PRI <- as.Date(data$DT_SIN_PRI)

# earliest and latest date present
data %>% arrange(DT_SIN_PRI) %>% select(DT_SIN_PRI) %>% slice(1)
data %>% arrange(desc(DT_SIN_PRI)) %>% select(DT_SIN_PRI) %>% slice(1)



DT_SIN_PRI
<date>
1968-12-30


DT_SIN_PRI
<date>
2021-01-02


### Filter: remove notifications from previous years

In [32]:
# keep only cases whose first-symptom epidemiological week code (SEM_PRI) is
# above 202000, i.e. drop notifications from previous years
data <- data %>% filter(SEM_PRI > 202000)

# change the week format: strip the year prefix so only the week number is left.
# This file holds 2020 records, so the offset must be 202000; subtracting the
# 2022 offset (202200) would turn every week into a negative number.
# NOTE(review): assumes SEM_PRI is coded as YYYYWW - confirm against the
# data dictionary
data <- data %>% mutate(SEM_PRI = SEM_PRI - 202000)

dim(data)
colnames(data)

In [33]:
# drop the listed DT_* / RES* columns
data <- data %>%
  select(-c(
    DT_CHIK_S1, DT_CHIK_S2, DT_PRNT,
    RES_CHIKS1, RES_CHIKS2, RESUL_PRNT,
    DT_SORO, RESUL_SORO, DT_NS1, RESUL_NS1, DT_VIRAL,
    RESUL_VI_N, DT_PCR, RESUL_PCR_
  ))

In [34]:
# replace all column names, by position, with English names
# (the vector must list one name per column, in the current column order)
names(data) <- c(
  "fever", "myalgia", "headache",
  "exanthema", "vomiting", "nausea",
  "back_pain", "conjunctivitis", "arthritis",
  "arthralgia", "petechiae", "leukopenia",
  "lasso_prove", "retro_orbital_pain", "diabetes",
  "hematological_diseases", "hepatopathies", "chronic_kidney_disease",
  "arterial_hypertension", "acid_peptic_disease", "autoimmune_diseases",
  "hospitalization", "id_state", "id_place",
  "dt_first_symptoms", "epidemiological_week",
  "race", "schooling_years", "dt_hospitalization",
  "chikungunya", "death", "dt_death",
  "age", "sex", "pregnancy"
)

### Filter 9: only patients cured or dead with the disease 

1- cura (cured), 2- óbito pelo agravo (death by chikungunya) 3- óbito por outras causas (death by other reasons) 4- óbito em investigação (death under investigation) 9- ignorado (ignored)

In [35]:
# keep only patients cured (death == 0) or dead by the disease (death == 1)
count(data, death)
data <- data %>% filter(death %in% c(0, 1))

dim(data)

count(data, death)

death,n
<dbl>,<int>
0.0,32824
1.0,18
3.0,36
4.0,10
9.0,1611
,1260


death,n
<dbl>,<int>
0,32824
1,18


In [36]:
# keep only hospitalized patients (hospitalization == 1)
data <- data %>% filter(hospitalization == 1)
count(data, death)

death,n
<dbl>,<int>
0,780
1,15


In [37]:
# create a column with the number of days from dt_hospitalization to dt_death
# NOTE(review): the interval starts at the hospitalization date, not at the
# first-symptom date - confirm this is the intended reference point
data <- data %>%
  mutate(
    dt_hospitalization = as.Date(dt_hospitalization),
    dt_death = as.Date(dt_death),
    days = as.numeric(difftime(dt_death, dt_hospitalization, units = "days"))
  )

count(data, days)

days,n
<dbl>,<int>
0.0,2
1.0,2
2.0,1
3.0,1
5.0,1
6.0,1
16.0,1
28.0,2
31.0,1
,783


### Filter 10: remove patients that died with more than 30 days after first symptoms

In [38]:
# remove patients whose `days` value is above 30; rows without a value
# (NA days) are kept
data <- data %>% filter(is.na(days) | days < 31)

count(data, death)

death,n
<dbl>,<int>
0,780
1,14


In [39]:
# keep only the columns used in the final dataset
data <- data %>%
  select(
    fever, myalgia, headache, exanthema, vomiting, nausea, back_pain, arthritis,
    arthralgia, diabetes, chronic_kidney_disease, arterial_hypertension,
    id_state, id_place, epidemiological_week, race, schooling_years,
    death, age, sex
  )

In [40]:
# final inspection of columns, size and class balance
colnames(data)

dim(data)

count(data, death)

# reorder so that the target column (death) comes last, then write the CSV
data <- data %>% select(fever:schooling_years, age, sex, death)
write_csv(data, "chikungunya_death_dataset_2020.csv")

death,n
<dbl>,<int>
0,780
1,14
