This notebook contains the steps performed for data inspection and filtering, with the goal of creating a classification dataset suitable for machine learning predictive models. The objective of the classification task is to predict death from the information available at the moment of the disease notification. For more comprehensive details about the datasets, including information about data collection methods, sources, and a dictionary of variables, please visit the project repository on GitHub: https://github.com/gabivaleriano/HealthDataBR.

In [1]:
library(tidyverse)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.2     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.2     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.1     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors


In [2]:
# Load the raw notification records from the CSV file in the working directory.
data <- read.csv(file = "CHIKBR22.csv")

### Filter 1: remove duplicates

In [5]:
# Drop exact duplicate rows and report the table size before and after.
# De-duplication is run a single time and its result is reused for both the
# size report and the assignment, instead of scanning the full table twice.

dim(data)
data <- unique(data)
dim(data)

In [6]:
# Number of records per notifying state code (SG_UF_NOT).

count(data, SG_UF_NOT)

SG_UF_NOT,n
<int>,<int>
11,502
12,207
13,470
14,427
15,1575
16,164
17,9139
21,3422
22,12280
23,80228


In [7]:
# Preview the first rows and list every column name.

head(data)
colnames(data)

Unnamed: 0_level_0,TP_NOT,ID_AGRAVO,DT_NOTIFIC,SEM_NOT,NU_ANO,SG_UF_NOT,ID_MUNICIP,ID_REGIONA,ID_UNIDADE,DT_SIN_PRI,⋯,PLAQ_MENOR,CON_FHD,COMPLICA,NU_LOTE_I,TP_SISTEMA,NDUPLIC_N,DT_DIGITA,CS_FLXRET,FLXRECEBI,MIGRADO_W
Unnamed: 0_level_1,<int>,<chr>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<chr>,⋯,<lgl>,<lgl>,<lgl>,<int>,<int>,<int>,<chr>,<lgl>,<lgl>,<lgl>
1,2,A92.0,2022-01-28,202204,2022,12,120033,1941,2000083,2022-01-23,⋯,,,,,2,,2022-03-18,,,
2,2,A92.0,2022-03-14,202211,2022,12,120040,1938,3689921,2022-03-01,⋯,,,,,2,,2022-03-21,,,
3,2,A92.0,2022-01-08,202201,2022,12,120033,1941,9542566,2022-01-07,⋯,,,,,2,,2022-03-16,,,
4,2,A92.0,2022-02-23,202208,2022,12,120033,1941,6568629,2022-02-16,⋯,,,,,2,,2022-03-18,,,
5,2,A92.0,2022-01-06,202201,2022,12,120040,1938,6119697,2022-01-05,⋯,,,,,2,,2022-01-11,,,
6,2,A92.0,2022-01-11,202202,2022,12,120040,1938,6119697,2022-01-10,⋯,,,,,2,,2022-01-19,,,


In [8]:
# Distribution of the candidate target columns (outcome and hospitalization),
# followed by the current table size.

count(data, EVOLUCAO)
count(data, HOSPITALIZ)

dim(data)

EVOLUCAO,n
<int>,<int>
1.0,215573
2.0,102
3.0,278
4.0,33
9.0,8244
,45593


HOSPITALIZ,n
<int>,<int>
1.0,6534
2.0,150163
9.0,3858
,109268


In [9]:
# For each location-related column, show how many distinct values it holds
# (the first element of each printed dim is the number of distinct values).

dim(unique(data["ID_REGIONA"]))
dim(unique(data["ID_MUNICIP"]))
dim(unique(data["SG_UF_NOT"]))
dim(unique(data["SG_UF"]))

dim(unique(data["ID_MN_RESI"]))
dim(unique(data["ID_PAIS"]))
dim(unique(data["ID_RG_RESI"]))

dim(unique(data["ID_UNIDADE"]))

dim(unique(data["UF"]))
dim(unique(data["MUNICIPIO"]))


### Delete columns

In [10]:
# Of the location columns inspected above, the notifying state (SG_UF_NOT) and
# the municipality of residence (ID_MN_RESI) are kept; the rest are dropped.

data <- data %>%
  select(-c(ID_REGIONA, ID_MUNICIP, SG_UF, ID_PAIS, ID_RG_RESI, UF, MUNICIPIO, ID_UNIDADE))


# Inspect columns that are mostly NA or hold a single constant value.

count(data, TPAUTOCTO)
count(data, COPAISINF)
count(data, COMUNINF)

count(data, NU_LOTE_I)
count(data, TP_SISTEMA)
count(data, CS_FLXRET)
count(data, FLXRECEBI)

count(data, MIGRADO_W)
count(data, NDUPLIC_N)
count(data, COMPLICA)
count(data, CON_FHD)

count(data, SOROTIPO)
count(data, HISTOPA_N)
count(data, IMUNOH_N)
count(data, DOENCA_TRA)
count(data, ALRM_HIPOT)

count(data, TP_NOT)
count(data, ID_AGRAVO)
count(data, NU_ANO)

# Drop the columns inspected above (plus COUFINF and DT_DIGITA) in one step.

data <- data %>%
  select(-c(
    TPAUTOCTO, COPAISINF, COUFINF, COMUNINF,
    NU_LOTE_I, TP_SISTEMA, CS_FLXRET, FLXRECEBI, DT_DIGITA,
    MIGRADO_W, NDUPLIC_N, COMPLICA, CON_FHD,
    SOROTIPO, HISTOPA_N, IMUNOH_N, DOENCA_TRA, ALRM_HIPOT,
    TP_NOT, ID_AGRAVO, NU_ANO
  ))

# Summarise the three column ranges below, then drop the whole span from
# ALRM_PLAQ to PLAQ_MENOR.

summary(select(data, ALRM_PLAQ:ALRM_LIQ))
summary(select(data, GRAV_PULSO:GRAV_ORGAO))
summary(select(data, MANI_HEMOR:PLAQ_MENOR))

data <- data %>% select(-c(ALRM_PLAQ:PLAQ_MENOR))

# Remaining removals:
# - NU_IDADE_N refers to age in a different format
# - these dates are usually the same and refer to the date of digitalization
#   (CRITERIO is dropped together with them)
# - the final classification is adopted to define a positive case, so test
#   results and test dates are removed

data <- data %>%
  select(
    -NU_IDADE_N,
    -c(DT_NOTIFIC, DT_ENCERRA, DT_INVEST, CRITERIO),
    -c(DT_CHIK_S1:RESUL_PCR_)
  )

dim(data)
count(data, EVOLUCAO)
count(data, HOSPITALIZ)
colnames(data)

TPAUTOCTO,n
<lgl>,<int>
,269823


COPAISINF,n
<lgl>,<int>
,269823


COMUNINF,n
<lgl>,<int>
,269823


NU_LOTE_I,n
<int>,<int>
0.0,6
,269817


TP_SISTEMA,n
<int>,<int>
2.0,265294
,4529


CS_FLXRET,n
<lgl>,<int>
,269823


FLXRECEBI,n
<lgl>,<int>
,269823


MIGRADO_W,n
<lgl>,<int>
,269823


NDUPLIC_N,n
<int>,<int>
1.0,22
,269801


COMPLICA,n
<lgl>,<int>
,269823


CON_FHD,n
<lgl>,<int>
,269823


SOROTIPO,n
<lgl>,<int>
,269823


HISTOPA_N,n
<int>,<int>
4.0,198
,269625


IMUNOH_N,n
<int>,<int>
3.0,1
4.0,197
,269625


DOENCA_TRA,n
<int>,<int>
2.0,3
,269820


ALRM_HIPOT,n
<lgl>,<int>
,269823


TP_NOT,n
<int>,<int>
2,269823


ID_AGRAVO,n
<chr>,<int>
A92.0,269823


NU_ANO,n
<int>,<int>
2022,269823


 ALRM_PLAQ      ALRM_VOM       ALRM_SANG      ALRM_HEMAT     ALRM_ABDOM    
 Mode:logical   Mode:logical   Mode:logical   Mode:logical   Mode:logical  
 NA's:269823    NA's:269823    NA's:269823    NA's:269823    NA's:269823   
 ALRM_LETAR     ALRM_HEPAT     ALRM_LIQ      
 Mode:logical   Mode:logical   Mode:logical  
 NA's:269823    NA's:269823    NA's:269823   

 GRAV_PULSO     GRAV_CONV      GRAV_ENCH      GRAV_INSUF     GRAV_TAQUI    
 Mode:logical   Mode:logical   Mode:logical   Mode:logical   Mode:logical  
 NA's:269823    NA's:269823    NA's:269823    NA's:269823    NA's:269823   
 GRAV_EXTRE     GRAV_HIPOT     GRAV_HEMAT     GRAV_MELEN     GRAV_METRO    
 Mode:logical   Mode:logical   Mode:logical   Mode:logical   Mode:logical  
 NA's:269823    NA's:269823    NA's:269823    NA's:269823    NA's:269823   
 GRAV_SANG      GRAV_AST       GRAV_MIOC      GRAV_CONSC     GRAV_ORGAO    
 Mode:logical   Mode:logical   Mode:logical   Mode:logical   Mode:logical  
 NA's:269823    NA's:269823    NA's:269823    NA's:269823    NA's:269823   

 MANI_HEMOR     EPISTAXE       GENGIVO         METRO         PETEQUIAS     
 Mode:logical   Mode:logical   Mode:logical   Mode:logical   Mode:logical  
 NA's:269823    NA's:269823    NA's:269823    NA's:269823    NA's:269823   
 HEMATURA       SANGRAM         LACO_N        PLASMATICO     EVIDENCIA     
 Mode:logical   Mode:logical   Mode:logical   Mode:logical   Mode:logical  
 NA's:269823    NA's:269823    NA's:269823    NA's:269823    NA's:269823   
 PLAQ_MENOR    
 Mode:logical  
 NA's:269823   

EVOLUCAO,n
<int>,<int>
1.0,215573
2.0,102
3.0,278
4.0,33
9.0,8244
,45593


HOSPITALIZ,n
<int>,<int>
1.0,6534
2.0,150163
9.0,3858
,109268


### Filter 2: remove patients without information of state 

In [11]:
# Keep only records that carry a notifying state code, then report the
# table size and the target distributions.
data <- filter(data, !is.na(SG_UF_NOT))

dim(data)

count(data, EVOLUCAO)
count(data, HOSPITALIZ)

EVOLUCAO,n
<int>,<int>
1.0,215573
2.0,102
3.0,278
4.0,33
9.0,8244
,45593


HOSPITALIZ,n
<int>,<int>
1.0,6534
2.0,150163
9.0,3858
,109268


### Filter 3: remove patients without year of birth

In [12]:
# Keep only records with a year of birth, then report the table size and the
# target distributions.
data <- filter(data, !is.na(ANO_NASC))

dim(data)

count(data, EVOLUCAO)
count(data, HOSPITALIZ)

EVOLUCAO,n
<int>,<int>
1.0,213821
2.0,100
3.0,277
4.0,33
9.0,8191
,45154


HOSPITALIZ,n
<int>,<int>
1.0,6499
2.0,149066
9.0,3837
,108174


In [13]:
# Derive age as 2022 minus the year of birth, then look at both ends of the
# age distribution (youngest 15 and oldest 12 distinct ages).

data <- data %>% mutate(age = 2022 - ANO_NASC)

data %>% count(age) %>% slice(1:15)
data %>% count(age) %>% tail(12)

age,n
<dbl>,<int>
0,2289
1,1384
2,1271
3,1673
4,2007
5,2056
6,2110
7,2320
8,2371
9,2646


Unnamed: 0_level_0,age,n
Unnamed: 0_level_1,<dbl>,<int>
105,104,5
106,105,5
107,106,2
108,107,4
109,108,2
110,109,2
111,110,1
112,111,3
113,114,1
114,115,1


### Remove columns

In [14]:
# ID_OCUPA_N (area of activity) does not seem to be filled consistently: it
# should hold the same value (not applicable) for children, which the counts
# for patients under 5 let us check.
# SEM_NOT is the epidemiological week of notification; the epidemiological
# week of the first symptoms is used instead, so SEM_NOT is dropped as well.

data %>%
  filter(age < 5) %>%
  count(ID_OCUPA_N) %>%
  head(15)

data <- data %>% select(-c(ID_OCUPA_N, SEM_NOT))

ID_OCUPA_N,n
<chr>,<int>
,7085
0.0,27
21210.0,1
123305.0,1
141405.0,2
142105.0,1
142115.0,1
142325.0,1
221105.0,2
223115.0,2


### Filter 4: remove patients older than 110 years and younger than 12

In [15]:
# Keep patients aged 12 to 110: drop anyone older than 110 years
# (age < 111 is kept) and anyone younger than 12 years (age > 11 is kept).

data <- data %>% filter(age < 111, age > 11)

# The year of birth is no longer needed once age is available.

data <- data %>% select(-ANO_NASC)

dim(data)
count(data, EVOLUCAO)
count(data, HOSPITALIZ)

EVOLUCAO,n
<int>,<int>
1.0,194348
2.0,93
3.0,235
4.0,24
9.0,7146
,39926


HOSPITALIZ,n
<int>,<int>
1.0,4853
2.0,135653
9.0,3522
,97744


### Filter 5: remove patients without sex information

In [16]:
# Keep only records whose sex is recorded as 'F' or 'M'; show the
# distribution before and after the filter.

count(data, CS_SEXO)
data <- data %>% filter(CS_SEXO %in% c("F", "M"))
count(data, CS_SEXO)
dim(data)

# Encode sex numerically: 0 for 'M', 1 otherwise (only 'F' remains here).

data <- data %>% mutate(sex = if_else(CS_SEXO == "M", 0, 1))
count(data, sex)

# The original character column is replaced by the numeric one.
data <- data %>% select(-CS_SEXO)

dim(data)
count(data, EVOLUCAO)
count(data, HOSPITALIZ)

CS_SEXO,n
<chr>,<int>
,1
F,149729
I,257
M,91785


CS_SEXO,n
<chr>,<int>
F,149729
M,91785


sex,n
<dbl>,<int>
0,91785
1,149729


EVOLUCAO,n
<int>,<int>
1.0,194162
2.0,93
3.0,235
4.0,24
9.0,7140
,39860


HOSPITALIZ,n
<int>,<int>
1.0,4850
2.0,135588
9.0,3517
,97559


In [17]:
# Inspect the pregnancy column overall and separately for each sex
# (sex == 1 is 'F', sex == 0 is 'M').

count(data, CS_GESTANT)

data %>% filter(sex == 1) %>% count(CS_GESTANT)

data %>% filter(sex == 0) %>% count(CS_GESTANT)

# Missing pregnancy values are treated as 0 (not pregnant).

data <- data %>% mutate(CS_GESTANT = coalesce(CS_GESTANT, 0))

count(data, CS_GESTANT)

CS_GESTANT,n
<int>,<int>
1.0,1023
2.0,1431
3.0,1335
4.0,488
5.0,104437
6.0,108815
9.0,23925
,60


CS_GESTANT,n
<int>,<int>
1.0,1023
2.0,1431
3.0,1335
4.0,488
5.0,104437
6.0,17030
9.0,23925
,60


CS_GESTANT,n
<int>,<int>
6,91785


CS_GESTANT,n
<dbl>,<int>
0,60
1,1023
2,1431
3,1335
4,488
5,104437
6,108815
9,23925


In [18]:
# Collapse the pregnancy column to a binary flag. The original codes 1 to 4
# refer to different stages of pregnancy and all map to 1; every other code
# maps to 0.

data <- data %>%
  mutate(
    pregnancy = if_else(
      CS_GESTANT == 1 | CS_GESTANT == 2 | CS_GESTANT == 3 | CS_GESTANT == 4,
      1,
      0
    )
  )

count(data, pregnancy)

data <- data %>% select(-CS_GESTANT)

pregnancy,n
<dbl>,<int>
0,237237
1,4277


### Filter 6: remove patients without race information

In [19]:
# Drop records whose race is missing or coded 9 (unknown); show the
# distribution before and after.

count(data, CS_RACA)

data <- data %>% filter(!is.na(CS_RACA), CS_RACA < 9)

count(data, CS_RACA)

dim(data)

count(data, EVOLUCAO)
count(data, HOSPITALIZ)

CS_RACA,n
<int>,<int>
1.0,32058
2.0,8255
3.0,2796
4.0,163708
5.0,952
9.0,33742
,3


CS_RACA,n
<int>,<int>
1,32058
2,8255
3,2796
4,163708
5,952


EVOLUCAO,n
<int>,<int>
1.0,168584
2.0,88
3.0,222
4.0,22
9.0,5080
,33773


HOSPITALIZ,n
<int>,<int>
1.0,4352
2.0,121217
9.0,2080
,80120


In [20]:
# Schooling code 9 means unknown: turn it into NA so that it is handled
# together with the missing values. Counts are shown before and after.

count(data, CS_ESCOL_N)

data <- data %>% mutate(CS_ESCOL_N = na_if(CS_ESCOL_N, 9))

count(data, CS_ESCOL_N)

CS_ESCOL_N,n
<int>,<int>
0.0,2616
1.0,8215
2.0,4735
3.0,12054
4.0,8795
5.0,11195
6.0,35078
7.0,3412
8.0,9688
9.0,56393


CS_ESCOL_N,n
<int>,<int>
0.0,2616
1.0,8215
2.0,4735
3.0,12054
4.0,8795
5.0,11195
6.0,35078
7.0,3412
8.0,9688
,111981


In [21]:
# Fill missing schooling codes (CS_ESCOL_N) with the rounded mean of the
# non-missing codes observed in the same group of patients.
#
# df  : data frame holding CS_ESCOL_N and the grouping columns
# ... : unquoted grouping columns, forwarded to group_by()
#
# Returns `df` ungrouped, with the same columns. A group in which every value
# is missing has no mean to borrow, so its rows remain missing.
impute_schooling_by_group <- function(df, ...) {
  df %>%
    group_by(...) %>%
    mutate(
      CS_ESCOL_N = if_else(
        is.na(CS_ESCOL_N),
        round(mean(CS_ESCOL_N, na.rm = TRUE)),
        CS_ESCOL_N
      )
    ) %>%
    ungroup()
}

# how many groups by age, race, city and sex
data %>% group_by(age, CS_RACA, ID_MN_RESI, sex) %>% n_groups()

# First pass: borrow the average from patients with the same age, race,
# city of residence and sex (code 9 was already turned into NA).
data <- impute_schooling_by_group(data, age, CS_RACA, ID_MN_RESI, sex)
count(data, CS_ESCOL_N)

# Second pass: for values that are still missing, fall back to the coarser
# groups of age, race and sex.
data <- impute_schooling_by_group(data, age, CS_RACA, sex)
count(data, CS_ESCOL_N)

CS_ESCOL_N,n
<dbl>,<int>
0.0,3120
1.0,9722
2.0,6822
3.0,16407
4.0,19371
5.0,28754
6.0,55185
7.0,6026
8.0,11724
,50638


CS_ESCOL_N,n
<dbl>,<int>
0.0,3177
1.0,9892
2.0,9307
3.0,22640
4.0,29918
5.0,48598
6.0,66329
7.0,6073
8.0,11733
,102


### Filter 7: remove patients without schooling information (after imputation by group)

In [22]:
# Records whose schooling is still missing after the group imputation are
# dropped.

count(data, CS_ESCOL_N)

data <- data %>% filter(!is.na(CS_ESCOL_N))

count(data, EVOLUCAO)
count(data, HOSPITALIZ)

dim(data)

CS_ESCOL_N,n
<dbl>,<int>
0.0,3177
1.0,9892
2.0,9307
3.0,22640
4.0,29918
5.0,48598
6.0,66329
7.0,6073
8.0,11733
,102


EVOLUCAO,n
<int>,<int>
1.0,168505
2.0,87
3.0,221
4.0,22
9.0,5075
,33757


HOSPITALIZ,n
<int>,<int>
1.0,4349
2.0,121169
9.0,2078
,80071


### Delete columns

In [23]:
# The city of residence is correlated with the state (uf), so it is dropped.

data <- data %>% select(-ID_MN_RESI)

In [24]:
# Total number of missing values across the symptom and comorbidity columns.

data %>%
  select(FEBRE:AUTO_IMUNE) %>%
  is.na() %>%
  sum()

In [25]:
# The value 2 means the non-occurrence of a symptom; change it to 0 in every
# symptom/comorbidity column and in HOSPITALIZ.
# The table is split so that the recoded columns come first, followed by all
# remaining columns in their original order.

symptoms <- data %>% select(FEBRE:AUTO_IMUNE, HOSPITALIZ)
others <- data %>% select(-c(FEBRE:AUTO_IMUNE, HOSPITALIZ))

head(symptoms)

# across() replaces the superseded mutate_all(); values other than 2
# (including NA) are left untouched.
symptoms <- symptoms %>%
  mutate(across(everything(), ~ ifelse(.x == 2, 0, .x)))

head(symptoms)

data <- cbind(symptoms, others)

# Recode the outcome: 1 becomes 0 and 2 becomes 1.
# The order matters: 1 -> 0 must run first, otherwise the rows just set to 1
# would be turned into 0 as well. Other codes and NA are left as they are.
data$EVOLUCAO[data$EVOLUCAO == 1] <- 0
data$EVOLUCAO[data$EVOLUCAO == 2] <- 1

count(data, EVOLUCAO)
count(data, HOSPITALIZ)

dim(data)

FEBRE,MIALGIA,CEFALEIA,EXANTEMA,VOMITO,NAUSEA,DOR_COSTAS,CONJUNTVIT,ARTRITE,ARTRALGIA,⋯,LACO,DOR_RETRO,DIABETES,HEMATOLOG,HEPATOPAT,RENAL,HIPERTENSA,ACIDO_PEPT,AUTO_IMUNE,HOSPITALIZ
<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,2,1,2,2,2,1,2,2,2,⋯,2,2,2,2,2,2,2,2,2,
1,2,1,2,1,1,1,2,2,2,⋯,2,2,2,2,2,2,2,2,2,
1,2,1,2,2,1,1,2,2,2,⋯,2,2,2,2,2,2,2,2,2,
1,1,1,2,2,2,2,2,2,2,⋯,2,2,2,2,2,2,2,2,2,2.0
1,1,1,2,2,2,2,2,2,2,⋯,2,2,2,2,2,2,2,2,2,2.0
1,2,1,2,2,2,2,2,2,2,⋯,2,2,2,2,2,2,2,2,2,2.0


FEBRE,MIALGIA,CEFALEIA,EXANTEMA,VOMITO,NAUSEA,DOR_COSTAS,CONJUNTVIT,ARTRITE,ARTRALGIA,⋯,LACO,DOR_RETRO,DIABETES,HEMATOLOG,HEPATOPAT,RENAL,HIPERTENSA,ACIDO_PEPT,AUTO_IMUNE,HOSPITALIZ
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,0,1,0,0,0,1,0,0,0,⋯,0,0,0,0,0,0,0,0,0,
1,0,1,0,1,1,1,0,0,0,⋯,0,0,0,0,0,0,0,0,0,
1,0,1,0,0,1,1,0,0,0,⋯,0,0,0,0,0,0,0,0,0,
1,1,1,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0.0
1,1,1,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0.0
1,0,1,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0.0


EVOLUCAO,n
<dbl>,<int>
0.0,168505
1.0,87
3.0,221
4.0,22
9.0,5075
,33757


HOSPITALIZ,n
<dbl>,<int>
0.0,121169
1.0,4349
9.0,2078
,80071


### Filter 8: keep only patients in the acute stage of the disease

In [26]:
# Keep only patients in the acute stage (CLINC_CHIK == 1); the column is
# constant afterwards, so it is removed in the same step.

data <- data %>%
  filter(CLINC_CHIK == 1) %>%
  select(-CLINC_CHIK)

count(data, EVOLUCAO)
count(data, HOSPITALIZ)

dim(data)

EVOLUCAO,n
<dbl>,<int>
0.0,133267
1.0,84
3.0,102
4.0,12
9.0,2206
,3190


HOSPITALIZ,n
<dbl>,<int>
0.0,92458
1.0,2556
9.0,1240
,42607


In [27]:
# Date of the first symptoms: convert the column to Date.

data <- data %>% mutate(DT_SIN_PRI = as.Date(DT_SIN_PRI))

# Earliest and latest date of first symptoms.

data %>% select(DT_SIN_PRI) %>% arrange(DT_SIN_PRI) %>% head(1)
data %>% select(DT_SIN_PRI) %>% arrange(desc(DT_SIN_PRI)) %>% head(1)

# Subtract 202200 from the week code of the first symptoms (SEM_PRI), which
# leaves only the week number for 2022 codes.

data <- data %>% mutate(SEM_PRI = SEM_PRI - 202200)

colnames(data)

DT_SIN_PRI
<date>
2022-01-02


DT_SIN_PRI
<date>
2022-12-31


In [28]:
# The final classification column is dropped here.
data <- select(data, -CLASSI_FIN)

In [30]:
# Rename every column by position to an English name. The order below must
# match the current column order of `data`: the recoded symptom/comorbidity
# columns and the hospitalization flag come first, then the other columns.
new_names <- c("fever", 'myalgia','headache',
               'exanthema', 'vomiting','nausea',
               'back_pain','conjunctivitis', 'arthritis',
               'arthralgia', 'petechiae', 'leukopenia',
               'lasso_prove', 'retro_orbital_pain', 'diabetes',
               'hematological_diseases', 'hepatopathies', 'chronic_kidney_disease',
               'arterial_hypertension', 'acid_peptic_disease', 'autoimmune_diseases',
               'hospitalization','id_state',
               'dt_first_symptoms', 'epidemiological_week', 'race',
               'schooling_years', 'dt_hospitalization',
               'death', 'dt_death', 'age',
               'sex', 'pregnancy')

# Positional renaming would silently mislabel or leave unnamed columns if the
# number of columns ever differs from the number of names, so fail loudly.
stopifnot(length(new_names) == ncol(data))

colnames(data) <- new_names
                  
                  

### Filter 9: only patients cured or dead with the disease 

1- cura (cured), 
2- óbito pelo agravo (death by chikungunya)
3- óbito por outras causas (death by other reasons)
4- óbito em investigação (death under investigation)
9- ignorado (unknown)

In [31]:
# Keep only patients that were cured (death == 0) or died from the disease
# (death == 1); every other outcome code and missing outcomes are dropped.

count(data, death)
data <- data %>% filter(death %in% c(0, 1))

dim(data)

count(data, death)

death,n
<dbl>,<int>
0.0,133267
1.0,84
3.0,102
4.0,12
9.0,2206
,3190


death,n
<dbl>,<int>
0,133267
1,84


In [32]:
# Create a column with the number of days between the first symptoms and
# death. Both date columns are converted to Date first; the difference is
# stored as a plain number of days.

data <- data %>%
  mutate(
    dt_first_symptoms = as.Date(dt_first_symptoms),
    dt_death = as.Date(dt_death),
    days = as.numeric(difftime(dt_death, dt_first_symptoms, units = "days"))
  )

count(data, days)

days,n
<dbl>,<int>
0.0,2
1.0,3
2.0,5
3.0,2
4.0,1
5.0,6
6.0,5
7.0,4
8.0,6
9.0,3


### Filter 10: remove patients who died more than 30 days after the first symptoms

In [33]:
# Remove patients who died more than 30 days after the first symptoms.
# Rows with a missing `days` value are kept.

data <- data %>% filter(is.na(days) | days < 31)

count(data, death)
dim(data)
colnames(data)

death,n
<dbl>,<int>
0,133267
1,67


### Delete columns: not relevant for this dataset

In [34]:
# Drop the date columns, the hospitalization flag and the helper `days`
# column, which are not used in this dataset.
data <- data %>%
  select(-c(dt_first_symptoms, hospitalization, dt_hospitalization, dt_death, days))
dim(data)

# Column totals: for the 0/1 symptom columns this is the number of patients
# presenting each symptom.

colSums(data)

In [35]:
# Subset the death cases and show the column totals, i.e. how often each
# symptom is present among them.

death <- data %>% filter(death == 1)
colSums(death)

### Delete columns: remove symptoms present in less than 10% of the death cases

In [36]:
# Keep only the columns whose total over the death cases reaches 10% of the
# number of death cases. For the 0/1 symptom columns the total is the number
# of death cases presenting the symptom.
# The cut is derived from nrow(death) rather than hard-coded, so it stays
# correct if upstream filters change the number of death cases. The
# comparison is written as 10 * sum >= n to stay in exact whole-number
# arithmetic.
is_frequent <- 10 * colSums(death) >= nrow(death)
selected_cols <- colnames(death)[is_frequent]

selected_cols

data <- data[, selected_cols]

colnames(data)

dim(data)

count(data, death)

death,n
<dbl>,<int>
0,133267
1,67


In [37]:
# Put the predictors first (fever through schooling_years, then age and sex)
# and the target `death` last.

data <- data %>% select(fever:schooling_years, age, sex, death)

# Save the final dataset as CSV.

write_csv(data, "chikungunya_death_dataset_2022.csv")