In [1]:
library(tidyverse)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.2     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.2     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.1     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors


In [2]:
data <- read.csv("CHIKBR20.csv")

In [3]:
# check and remove duplicates

data %>% dim
data %>% unique %>% dim
data %>% unique -> data

In [4]:
# visualize data

data %>% head
data %>% colnames

Unnamed: 0_level_0,TP_NOT,ID_AGRAVO,DT_NOTIFIC,SEM_NOT,NU_ANO,SG_UF_NOT,ID_MUNICIP,ID_REGIONA,ID_UNIDADE,DT_SIN_PRI,⋯,HEMATURA,SANGRAM,LACO_N,PLASMATICO,EVIDENCIA,PLAQ_MENOR,CON_FHD,COMPLICA,TP_SISTEMA,NDUPLIC_N
Unnamed: 0_level_1,<int>,<chr>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<chr>,⋯,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<int>,<int>
1,2,A92.0,2020-06-04,202023,2020,12,120040,1938,2000490,2020-05-27,⋯,,,,,,,,,2,
2,2,A92.0,2020-06-01,202023,2020,12,120040,1938,2001578,2020-05-20,⋯,,,,,,,,,2,
3,2,A92.0,2020-09-24,202039,2020,12,120020,1941,6788637,2020-09-01,⋯,,,,,,,,,2,
4,2,A92.0,2020-06-02,202023,2020,12,120040,1938,2000075,2020-05-25,⋯,,,,,,,,,2,
5,2,A92.0,2020-02-05,202006,2020,12,120010,1937,2001500,2020-02-03,⋯,,,,,,,,,2,
6,2,A92.0,2020-01-29,202005,2020,12,120020,1941,6801099,2020-01-20,⋯,,,,,,,,,2,


In [5]:
# since ano-nasc is not present is this database, age will be computed from nu_idade_n

data %>% count(NU_IDADE_N) %>% head

Unnamed: 0_level_0,NU_IDADE_N,n
Unnamed: 0_level_1,<dbl>,<int>
1,0,1
2,3,1
3,9,1
4,10,1
5,19,1
6,27,2


In [6]:
# the information must contain at least 4 digits
# when it starts with 10 - hours, 20 - days, 30 - months, 40 - years 

# remove information with less than 4 digits

data %>% filter(NU_IDADE_N > 1000) %>% count(NU_IDADE_N) %>% head

data %>% filter(NU_IDADE_N > 1000) -> data



Unnamed: 0_level_0,NU_IDADE_N,n
Unnamed: 0_level_1,<dbl>,<int>
1,1078,1
2,2001,868
3,2002,20
4,2003,16
5,2004,9
6,2005,17


In [7]:
# remove information that is not coherent

data %>% filter(!(NU_IDADE_N > 2030 & NU_IDADE_N < 3000)) %>% filter(NU_IDADE_N> 2025) %>% 
    count(NU_IDADE_N) %>% head

data %>% filter(!(NU_IDADE_N > 2030 & NU_IDADE_N < 3000)) -> data


Unnamed: 0_level_0,NU_IDADE_N,n
Unnamed: 0_level_1,<dbl>,<int>
1,2026,5
2,2027,4
3,2028,6
4,2029,9
5,2030,4
6,3001,100


In [8]:
# remove information that is not coherent

data %>% filter(!(NU_IDADE_N > 3012 & NU_IDADE_N < 4000)) %>% filter(NU_IDADE_N> 3010) %>% 
    count(NU_IDADE_N) %>% head

data %>% filter(!(NU_IDADE_N > 3012 & NU_IDADE_N < 4000)) ->data


Unnamed: 0_level_0,NU_IDADE_N,n
Unnamed: 0_level_1,<dbl>,<int>
1,3011,53
2,4001,415
3,4002,430
4,4003,501
5,4004,535
6,4005,625


In [9]:
# transform in years 

# if is less than 4000 the patient has not complete one year
data %>% mutate(age = ifelse(NU_IDADE_N < 4000, 0, (NU_IDADE_N - round(NU_IDADE_N, -2)))) %>% 
    select(age, NU_IDADE_N) %>% head

data %>% mutate(age = ifelse(NU_IDADE_N < 4000, 0, (NU_IDADE_N - 4000))) -> data

Unnamed: 0_level_0,age,NU_IDADE_N
Unnamed: 0_level_1,<dbl>,<dbl>
1,48,4048
2,7,4007
3,26,4026
4,44,4044
5,8,4008
6,11,4011


In [10]:
data %>% count(EVOLUCAO)
data %>% count(HOSPITALIZ)
data %>% count(CLASSI_FIN)

EVOLUCAO,n
<int>,<int>
1.0,62702
2.0,30
3.0,166
4.0,21
9.0,5286
,34142


HOSPITALIZ,n
<int>,<int>
1.0,3217
2.0,49738
9.0,1806
,47586


CLASSI_FIN,n
<int>,<int>
5.0,28478
13.0,43939
,29930


In [11]:
# check the content of some features

data %>% count(TP_NOT)
data %>% count(ID_AGRAVO)
data %>% count(NU_ANO)

# removed since they does not contain relevant information 

data %>% select(-c(TP_NOT, ID_AGRAVO, NU_ANO)) -> data

TP_NOT,n
<int>,<int>
2,102347


ID_AGRAVO,n
<chr>,<int>
A92.0,102347


NU_ANO,n
<int>,<int>
2019,369
2020,101927
2021,51


In [12]:
# check how many regions, states and cities are present in the database

data %>% select(ID_REGIONA) %>% unique %>% dim
data %>% select(ID_MUNICIP) %>% unique %>% dim
data %>% select(SG_UF_NOT) %>% unique %>% dim
data %>% select(SG_UF) %>% unique %>% dim

data %>% select(ID_MN_RESI) %>% unique %>% dim
data %>% select(ID_PAIS) %>% unique %>% dim
data %>% select(ID_RG_RESI) %>% unique %>% dim

data %>% select(ID_UNIDADE) %>% unique %>% dim

In [13]:
# refer to the place of hospitalization
data %>% select(UF) %>% unique %>% dim
data %>% select(MUNICIPIO) %>% unique %>% dim

# SG_UF refers to the state of residence while SG_UF_NOT refers to the sate where the notification was realised
# check in how many cases they are different 
data %>% filter(SG_UF != SG_UF_NOT) %>% dim

#keep only id related with the city
data %>% select(-c(ID_REGIONA, ID_MUNICIP, SG_UF, ID_PAIS, ID_RG_RESI, UF, MUNICIPIO)) -> data

data %>% dim

In [14]:
# remove patients without id_unidade or city
data %>% filter(!is.na(ID_UNIDADE)) -> data
data %>% filter(!is.na(ID_MN_RESI)) -> data
data %>% filter(!is.na(SG_UF_NOT)) -> data

data %>% dim

In [15]:
# refer to place of infection
data %>% select(TPAUTOCTO, COPAISINF, COMUNINF) %>% head

data %>% select(-c(TPAUTOCTO, COPAISINF, COUFINF, COMUNINF)) -> data

Unnamed: 0_level_0,TPAUTOCTO,COPAISINF,COMUNINF
Unnamed: 0_level_1,<int>,<int>,<int>
1,,,
2,,,
3,,,
4,,,
5,,,
6,,,


In [16]:
# check and remove non-informative features

data %>% count(TP_SISTEMA)

data %>% select(-c(TP_SISTEMA)) -> data

TP_SISTEMA,n
<int>,<int>
2.0,100638
,1668


In [17]:
# check and remove non-informative features

data %>% count(NDUPLIC_N)

data %>% count(COMPLICA)

data %>% count(CON_FHD)

data %>% select(-c(NDUPLIC_N, COMPLICA, CON_FHD)) -> data

NDUPLIC_N,n
<int>,<int>
1.0,19
,102287


COMPLICA,n
<lgl>,<int>
,102306


CON_FHD,n
<lgl>,<int>
,102306


In [18]:
# check and remove non-informative features

data %>% count(SOROTIPO)

data %>% count(HISTOPA_N)

data %>% count(IMUNOH_N)

data %>% count(DOENCA_TRA)

data %>% count(ALRM_HIPOT)

data %>% select(-c(SOROTIPO, HISTOPA_N, IMUNOH_N, DOENCA_TRA, ALRM_HIPOT)) -> data

SOROTIPO,n
<lgl>,<int>
,102306


HISTOPA_N,n
<int>,<int>
4.0,23
,102283


IMUNOH_N,n
<int>,<int>
4.0,23
,102283


DOENCA_TRA,n
<lgl>,<int>
,102306


ALRM_HIPOT,n
<lgl>,<int>
,102306


In [19]:
# check and remove non-informative features

data %>% select(ALRM_PLAQ:ALRM_LIQ) %>% summary
data %>% select(GRAV_PULSO:GRAV_ORGAO) %>% summary
data %>% select(MANI_HEMOR:PLAQ_MENOR) %>% summary

data %>% select(-c(ALRM_PLAQ:PLAQ_MENOR)) -> data 

data %>% dim

data %>% colnames

 ALRM_PLAQ      ALRM_VOM       ALRM_SANG      ALRM_HEMAT     ALRM_ABDOM    
 Mode:logical   Mode:logical   Mode:logical   Mode:logical   Mode:logical  
 NA's:102306    NA's:102306    NA's:102306    NA's:102306    NA's:102306   
 ALRM_LETAR     ALRM_HEPAT     ALRM_LIQ      
 Mode:logical   Mode:logical   Mode:logical  
 NA's:102306    NA's:102306    NA's:102306   

 GRAV_PULSO     GRAV_CONV      GRAV_ENCH      GRAV_INSUF     GRAV_TAQUI    
 Mode:logical   Mode:logical   Mode:logical   Mode:logical   Mode:logical  
 NA's:102306    NA's:102306    NA's:102306    NA's:102306    NA's:102306   
 GRAV_EXTRE     GRAV_HIPOT     GRAV_HEMAT     GRAV_MELEN     GRAV_METRO    
 Mode:logical   Mode:logical   Mode:logical   Mode:logical   Mode:logical  
 NA's:102306    NA's:102306    NA's:102306    NA's:102306    NA's:102306   
 GRAV_SANG      GRAV_AST       GRAV_MIOC      GRAV_CONSC     GRAV_ORGAO    
 Mode:logical   Mode:logical   Mode:logical   Mode:logical   Mode:logical  
 NA's:102306    NA's:102306    NA's:102306    NA's:102306    NA's:102306   

 MANI_HEMOR     EPISTAXE       GENGIVO         METRO         PETEQUIAS     
 Mode:logical   Mode:logical   Mode:logical   Mode:logical   Mode:logical  
 NA's:102306    NA's:102306    NA's:102306    NA's:102306    NA's:102306   
 HEMATURA       SANGRAM         LACO_N        PLASMATICO     EVIDENCIA     
 Mode:logical   Mode:logical   Mode:logical   Mode:logical   Mode:logical  
 NA's:102306    NA's:102306    NA's:102306    NA's:102306    NA's:102306   
 PLAQ_MENOR    
 Mode:logical  
 NA's:102306   

In [20]:
# remove features - SEM_PRI is the epidemiologic week of the first symptoms
# NU_IDADE_N refers to age in a diferent format

data %>% select(-c(NU_IDADE_N)) -> data

In [21]:
# this dates usually are the same and refer to the date of digitalization
data %>% select(-c(DT_NOTIFIC, DT_ENCERRA, DT_INVEST, CRITERIO)) -> data

In [22]:
# patient diagnosed with chikungunya
data %>% count(CLASSI_FIN)

data %>% filter(CLASSI_FIN == 5 | CLASSI_FIN == 13) -> data

data %>% select(-c(DT_CHIK_S1:RESUL_PCR_)) -> data

data %>% dim

CLASSI_FIN,n
<int>,<int>
5.0,28473
13.0,43905
,29928


In [23]:
data %>% count(EVOLUCAO)
data %>% count(HOSPITALIZ)

EVOLUCAO,n
<int>,<int>
1.0,62575
2.0,29
3.0,164
4.0,16
9.0,5215
,4379


HOSPITALIZ,n
<int>,<int>
1.0,2717
2.0,42889
9.0,1533
,25239


In [24]:
data %>% colnames

In [25]:
# ID_OCUPA_N seems to not be filled consistently
# it refers to area of activity and should be filled with the same value (not aplicable) to children 
# sem_not is the epidemiologic week of notification, sem_pri is the epidemiologic week of the first symptoms
data %>% filter(age< 5) %>% count(ID_OCUPA_N) %>% slice(1:15)
data %>% select(-c(ID_OCUPA_N, SEM_NOT)) -> data

ID_OCUPA_N,n
<chr>,<int>
,2161
000000,2
141405,1
141410,1
223104,1
223115,1
2231F9,1
223208,1
223410,1
223505,4


In [26]:
# remove patients older than 110 years old 
data %>% filter(age < 111) -> data

# remove patients younger than 16 years old
data %>% filter(age > 11) -> data

data %>% dim

In [27]:
data %>% dim

data %>% count(EVOLUCAO)
data %>% count(HOSPITALIZ)

EVOLUCAO,n
<int>,<int>
1.0,57068
2.0,26
3.0,119
4.0,15
9.0,4711
,3992


HOSPITALIZ,n
<int>,<int>
1.0,1993
2.0,39536
9.0,1410
,22992


In [28]:
# remove patients without sex information

data %>% count(CS_SEXO)
data %>% filter(CS_SEXO == 'F' | CS_SEXO == 'M') %>% count(CS_SEXO)
data %>% filter(CS_SEXO == 'F' | CS_SEXO == 'M') -> data
data %>% dim

# create a new column with numeric values

data %>% mutate(sex = if_else(CS_SEXO == 'M', 0, 1)) %>% count(sex)
data %>% mutate(sex = if_else(CS_SEXO == 'M', 0, 1)) -> data

# remove the former column
data %>% select(-CS_SEXO) -> data

data %>% dim

CS_SEXO,n
<chr>,<int>
F,40530
I,65
M,25336


CS_SEXO,n
<chr>,<int>
F,40530
M,25336


sex,n
<dbl>,<int>
0,25336
1,40530


In [29]:
data %>% count(EVOLUCAO)
data %>% count(HOSPITALIZ)

data %>% dim

EVOLUCAO,n
<int>,<int>
1.0,57032
2.0,26
3.0,119
4.0,15
9.0,4691
,3983


HOSPITALIZ,n
<int>,<int>
1.0,1990
2.0,39512
9.0,1407
,22957


In [30]:
# check the column pregnancy for each sex
# transform na values in the column pregnancy in 0 (not pregnant)

data %>% count(CS_GESTANT)

data %>% filter(sex == 1) %>% count(CS_GESTANT)

data %>% filter(sex == 0) %>% count(CS_GESTANT)

data %>% mutate(CS_GESTANT = if_else(is.na(CS_GESTANT), 0, CS_GESTANT)) %>% count(CS_GESTANT)

data %>% mutate(CS_GESTANT = if_else(is.na(CS_GESTANT), 0, CS_GESTANT)) -> data

CS_GESTANT,n
<int>,<int>
1.0,388
2.0,513
3.0,500
4.0,139
5.0,26241
6.0,29921
9.0,8152
,12


CS_GESTANT,n
<int>,<int>
1.0,388
2.0,513
3.0,500
4.0,139
5.0,26241
6.0,4585
9.0,8152
,12


CS_GESTANT,n
<int>,<int>
6,25336


CS_GESTANT,n
<dbl>,<int>
0,12
1,388
2,513
3,500
4,139
5,26241
6,29921
9,8152


In [31]:
# transform the column pregnant in binary, orignally diferent values refer to difernt stages on pregnancy

data %>% mutate(pregnant = if_else((CS_GESTANT == 1 | 
                               CS_GESTANT == 2 | 
                               CS_GESTANT == 3 | 
                               CS_GESTANT == 4), 1, 0)) %>% count(pregnant)


data %>% mutate(pregnant = if_else((CS_GESTANT == 1 | 
                               CS_GESTANT == 2 | 
                               CS_GESTANT == 3 | 
                               CS_GESTANT == 4), 1, 0)) -> data

data %>% select(-CS_GESTANT) -> data

pregnant,n
<dbl>,<int>
0,64326
1,1540


In [32]:
# remove unknown race, or na values

data %>% count(CS_RACA)

data %>% filter(!is.na(CS_RACA)) %>% filter(CS_RACA < 9) %>% count(CS_RACA)

data %>% filter(!is.na(CS_RACA)) %>% filter(CS_RACA < 9) -> data

data %>% dim

CS_RACA,n
<int>,<int>
1,11391
2,4773
3,721
4,32831
5,183
9,15967


CS_RACA,n
<int>,<int>
1,11391
2,4773
3,721
4,32831
5,183


In [33]:
data %>% count(EVOLUCAO)
data %>% count(HOSPITALIZ)

data %>% dim

EVOLUCAO,n
<int>,<int>
1.0,43916
2.0,22
3.0,100
4.0,13
9.0,2885
,2963


HOSPITALIZ,n
<int>,<int>
1.0,1787
2.0,31993
9.0,976
,15143


In [34]:
# check year of schooling. 9 = unknown. 
data %>% count(CS_ESCOL_N)

data$CS_ESCOL_N[data$CS_ESCOL_N == 9] <- NA

data %>% count(CS_ESCOL_N)

CS_ESCOL_N,n
<int>,<int>
0.0,645
1.0,2303
2.0,1373
3.0,3532
4.0,2214
5.0,3154
6.0,9787
7.0,1218
8.0,3277
9.0,14750


CS_ESCOL_N,n
<int>,<int>
0.0,645
1.0,2303
2.0,1373
3.0,3532
4.0,2214
5.0,3154
6.0,9787
7.0,1218
8.0,3277
,22396


In [35]:
# how many groups by age, race, city and sex
data %>% group_by(age, CS_RACA, ID_MN_RESI, sex) %>% n_groups

# create a new column with the average
data %>% group_by(age, CS_RACA, ID_MN_RESI, sex) %>% mutate(average = mean(CS_ESCOL_N, na.rm=TRUE)) %>% 
    ungroup -> data

# when CS_ESCOL_N is na change by the average
data %>% mutate(CS_ESCOL_N = if_else((is.na(CS_ESCOL_N)), round(average), CS_ESCOL_N)) %>% count(CS_ESCOL_N)
data %>% mutate(CS_ESCOL_N = if_else((is.na(CS_ESCOL_N)), round(average), CS_ESCOL_N)) -> data
data %>% select(-average) -> data

CS_ESCOL_N,n
<dbl>,<int>
0.0,730
1.0,2653
2.0,1802
3.0,4427
4.0,3783
5.0,5445
6.0,13291
7.0,1775
8.0,3774
,12219


In [36]:
# create a new column with the average
data %>% group_by(age, CS_RACA, sex) %>% mutate(average = mean(CS_ESCOL_N, na.rm=TRUE)) %>% 
    ungroup -> data

# when CS_ESCOL_N is na change by the average
data %>% mutate(CS_ESCOL_N = if_else((is.na(CS_ESCOL_N)), round(average), CS_ESCOL_N)) %>% count(CS_ESCOL_N)
data %>% mutate(CS_ESCOL_N = if_else((is.na(CS_ESCOL_N)), round(average), CS_ESCOL_N)) -> data
data %>% select(-average) -> data

CS_ESCOL_N,n
<dbl>,<int>
0.0,765
1.0,2764
2.0,2183
3.0,5749
4.0,6031
5.0,10184
6.0,16486
7.0,1810
8.0,3790
,137


In [37]:
data %>% select(-ID_MN_RESI) -> data

In [38]:
# if is still na, drop it

data %>% count(CS_ESCOL_N)

data %>% filter(!(is.na(CS_ESCOL_N))) -> data


CS_ESCOL_N,n
<dbl>,<int>
0.0,765
1.0,2764
2.0,2183
3.0,5749
4.0,6031
5.0,10184
6.0,16486
7.0,1810
8.0,3790
,137


In [39]:
data %>% count(EVOLUCAO)
data %>% count(HOSPITALIZ)

data %>% dim

EVOLUCAO,n
<int>,<int>
1.0,43797
2.0,22
3.0,100
4.0,12
9.0,2880
,2951


HOSPITALIZ,n
<int>,<int>
1.0,1785
2.0,31913
9.0,969
,15095


In [40]:
# check the presence of na's inside the columns of symptoms and comorbities
sum(is.na(data %>% select(FEBRE:AUTO_IMUNE)))

In [41]:
data %>% filter(!is.na(FEBRE)) -> teste
sum(is.na(teste %>% select(FEBRE:AUTO_IMUNE)))

In [42]:
data %>% filter(!is.na(FEBRE)) -> data

data %>% dim
data %>% count(EVOLUCAO)
data %>% count(HOSPITALIZ)

EVOLUCAO,n
<int>,<int>
1.0,43795
2.0,22
3.0,100
4.0,12
9.0,2880
,2951


HOSPITALIZ,n
<int>,<int>
1.0,1785
2.0,31913
9.0,969
,15093


In [43]:
# the value 2 means the non-occurrence of a symptom, change this to 0

data %>% select(FEBRE:AUTO_IMUNE, HOSPITALIZ) -> symptoms
data %>% select(!(FEBRE:AUTO_IMUNE)) %>% select(-c(HOSPITALIZ)) -> others

symptoms %>% head

symptoms <- symptoms %>%
  mutate_all(~ ifelse(. == 2, 0, .))

symptoms %>% head

symptoms %>% cbind(others) -> data

FEBRE,MIALGIA,CEFALEIA,EXANTEMA,VOMITO,NAUSEA,DOR_COSTAS,CONJUNTVIT,ARTRITE,ARTRALGIA,⋯,LACO,DOR_RETRO,DIABETES,HEMATOLOG,HEPATOPAT,RENAL,HIPERTENSA,ACIDO_PEPT,AUTO_IMUNE,HOSPITALIZ
<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,1,1,2,2,2,1,2,2,2,⋯,1,2,2,2,2,2,2,2,2,2.0
2,2,2,2,2,2,2,2,2,2,⋯,2,2,2,2,2,2,2,2,2,2.0
1,1,1,1,1,2,2,2,2,2,⋯,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,⋯,2,2,2,2,2,2,2,2,2,2.0
2,2,2,2,2,2,2,2,2,2,⋯,2,2,2,2,2,2,2,2,2,2.0
1,1,1,2,2,2,1,2,2,2,⋯,2,2,2,2,2,2,2,2,2,2.0


FEBRE,MIALGIA,CEFALEIA,EXANTEMA,VOMITO,NAUSEA,DOR_COSTAS,CONJUNTVIT,ARTRITE,ARTRALGIA,⋯,LACO,DOR_RETRO,DIABETES,HEMATOLOG,HEPATOPAT,RENAL,HIPERTENSA,ACIDO_PEPT,AUTO_IMUNE,HOSPITALIZ
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,1,1,0,0,0,1,0,0,0,⋯,1,0,0,0,0,0,0,0,0,0.0
0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0.0
1,1,1,1,1,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0.0
0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0.0
1,1,1,0,0,0,1,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0.0


In [44]:
data$EVOLUCAO[data$EVOLUCAO == 1] <- 0
data$EVOLUCAO[data$EVOLUCAO == 2] <- 1

In [45]:
# only patients in the acute stage 

data %>% filter(CLINC_CHIK == 1) -> data
data %>% select(-CLINC_CHIK) -> data

In [46]:
# date of the first symptoms 

# change format
data$DT_SIN_PRI <- as.Date(data$DT_SIN_PRI)

# first and last date
data %>% select(DT_SIN_PRI) %>% arrange(DT_SIN_PRI) %>% slice(1)
data %>% select(DT_SIN_PRI) %>% arrange(desc(DT_SIN_PRI)) %>% slice(1)

DT_SIN_PRI
<date>
1968-12-30


DT_SIN_PRI
<date>
2021-01-02


In [47]:
data %>% count(SEM_PRI)

SEM_PRI,n
<int>,<int>
196901,1
197201,1
199248,1
200552,1
201307,1
201518,1
201524,1
201532,1
201548,1
201602,2


In [48]:
data %>% dim

In [49]:
data %>% filter(SEM_PRI > 202000) -> data

In [50]:
data %>% mutate(SEM_PRI = SEM_PRI - 202000) %>% count(SEM_PRI)

SEM_PRI,n
<dbl>,<int>
1,545
2,680
3,758
4,804
5,878
6,830
7,949
8,803
9,749
10,1007


In [51]:
data %>% colnames
data %>% dim

In [52]:
colnames(data) <- c("fever", 'myalgia','headache',
                  'exanthema', 'vomiting','nausea',
                  'back_pain','conjunctivitis', 'arthritis', 
                  'arthralgia', 'petechiae', 'leukopenia', 
                  'lasso_prove', 'retro_orbital_pain', 'diabetes', 
                  'hematological_diseases', 'hepatopathies', 'chronic_kidney_disease', 
                  'arterial_hypertension', 'acid_peptic_disease', 'autoimmune_diseases',
                  'hospitalization','id_state','id_place',
                  'dt_first_symptoms', 'epidemiological_week', 
                  'race', 'schooling_years', 'dt_hospitalization', 
                  'chikungunya', 'death', 'dt_death', 
                  'age', 'sex', 'pregnancy')

In [53]:
data %>% write_csv('clean_chikungunya_20.csv')

# Dataset1 - predicting hospitalization

In [64]:
data <- read.csv('clean_chikungunya_20.csv')

# only confirmed cases
data %>% filter(chikungunya == 13) -> data
data %>% select(-chikungunya) -> data

In [65]:
data %>% colnames

In [66]:
data %>% count(hospitalization)

data %>% dim

hospitalization,n
<int>,<int>
0.0,18837
1.0,640
9.0,569
,8469


In [67]:
# remove patients without information of hospitalization

data %>% count(hospitalization)

data %>% filter(hospitalization == 1 | hospitalization == 0) -> data

data %>% dim

hospitalization,n
<int>,<int>
0.0,18837
1.0,640
9.0,569
,8469


In [68]:
# create a column with the number of days between the disease investigation and death

data$dt_hospitalization <- as.Date(data$dt_hospitalization)
data$dt_hospitalization <- as.Date(data$dt_hospitalization)

data %>% mutate(days = difftime(dt_hospitalization, dt_first_symptoms, units = 'days')) -> data

data$days <- as.numeric(data$days)

# remove patients hospitalized more than 15 days after first sympthoms

data %>% filter((days >= 0 & days < 16)| is.na(days)) -> data

data %>% count(days)

data %>% select(-days) -> data

days,n
<dbl>,<int>
0.0,33
0.04166667,55
1.0,29
1.04166667,43
2.0,24
2.04166667,32
3.0,14
3.04166667,32
4.0,14
4.04166667,19


In [69]:
# remove non-hospitalized patients that died
data %>% filter(hospitalization == 0) %>% count(death)

death,n
<int>,<int>
0.0,17391
1.0,2
3.0,1
4.0,1
9.0,929
,513


In [70]:
data %>% filter(hospitalization == 1) -> death
data %>% filter(hospitalization == 0) %>% filter(death == 0) -> data

In [71]:
data %>% rbind(death) -> data

data %>% dim 

data %>% count(hospitalization)

hospitalization,n
<int>,<int>
0,17391
1,573


In [72]:
data %>% select('fever','myalgia','headache','exanthema','vomiting','nausea','back_pain','conjunctivitis',
    'arthritis','arthralgia','petechiae','leukopenia','retro_orbital_pain','diabetes','arterial_hypertension',
    'id_state','id_place','epidemiological_week','race','schooling_years','age','sex','pregnancy',
                'hospitalization') -> data

In [73]:
data %>% dim

In [74]:
data %>% write_csv('chikungunya_hospitalization_20.csv')

# Dataset2 - predicting the diagnose

In [75]:
data <- read.csv('clean_chikungunya_20.csv')

# only confirmed cases
data %>% count(chikungunya)

chikungunya,n
<int>,<int>
5,7031
13,28515


In [76]:
data$chikungunya[data$chikungunya == 13] <- 1
data$chikungunya[data$chikungunya == 5] <- 0

In [77]:
data %>% colnames

In [78]:
data %>% select('fever','myalgia','headache','exanthema','vomiting','nausea','back_pain','conjunctivitis',
    'arthritis','arthralgia','petechiae','leukopenia','lasso_prove','retro_orbital_pain','diabetes',
    'arterial_hypertension','id_state','id_place','epidemiological_week','race', 'schooling_years','chikungunya', 
    'age','sex','pregnancy') -> data

In [79]:
data %>% dim

In [80]:
data %>% colnames %>% as.list

In [81]:
data %>% select(fever:schooling_years,age:pregnancy,chikungunya) %>%  write_csv('chikungunya_diagnose_20.csv')

# Dataset3 - predicting death 

In [83]:
data <- read.csv('clean_chikungunya_20.csv')

# only confirmed cases
data %>% filter(chikungunya == 13) -> data
data %>% select(-chikungunya) -> data

In [84]:
data %>% count(death)

data %>% dim

death,n
<int>,<int>
0.0,26219
1.0,18
3.0,11
4.0,9
9.0,1286
,972


In [85]:
# only cured or dead by the disease

data %>% count(death)
data %>% filter(death == 1 | death == 0) -> data

data %>% dim

data %>% count(death)

death,n
<int>,<int>
0.0,26219
1.0,18
3.0,11
4.0,9
9.0,1286
,972


death,n
<int>,<int>
0,26219
1,18


In [86]:
# filter hospitalized patients

data %>% filter(hospitalization == 1) %>% count(death)
data %>% filter(hospitalization == 1) -> data

death,n
<int>,<int>
0,535
1,15


In [87]:
# create a column with the number of days between the disease investigation and death

data$dt_hospitalization <- as.Date(data$dt_hospitalization)
data$dt_death <- as.Date(data$dt_death)

data %>% mutate(days = difftime(dt_death, dt_hospitalization, units = 'days')) -> data

data$days <- as.numeric(data$days)

data %>% count(days)

# remove patients that died after 30 days of the first sympthoms

data %>% filter(days < 31| is.na(days)) -> data

data %>% count(death)

data %>% select(-days) -> data

days,n
<dbl>,<int>
0.0,2
1.0,2
2.0,1
3.0,1
5.0,1
6.0,1
16.0,1
28.0,2
31.0,1
,538


death,n
<int>,<int>
0,535
1,14


In [89]:
data %>% select(c('fever','myalgia','headache','exanthema','vomiting','nausea','back_pain','arthritis',
                  'arthralgia','diabetes','chronic_kidney_disease','arterial_hypertension','id_state',
                  'id_place','epidemiological_week','race','schooling_years','death','age','sex')) -> data

In [90]:
data %>% colnames

data %>% dim

data %>% count(death)

data %>% select(fever:schooling_years, age, sex, death) -> data
data %>% write_csv('chikungunya_hospitalized_20.csv')

death,n
<int>,<int>
0,535
1,14
