This notebook documents the data inspection and filtering steps used to build a classification dataset suitable for machine learning predictive models. The classification task is to predict hospitalization from the information available at the time of disease notification. For more details about the datasets, including data collection methods, sources, and a dictionary of variables, please visit the project repository on GitHub: https://github.com/gabivaleriano/HealthDataBR.

In [1]:
library(tidyverse)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.2     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.2     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.1     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors


In [2]:
# load the source table from the CSV file (relative path, resolved against
# the current working directory)
data <- read.csv(file = "CHIKBR22.csv")

“cannot open file 'CHIKBR22.csv': No such file or directory”


ERROR: Error in file(file, "rt"): cannot open the connection


### Filter 1: remove duplicates

In [None]:
# show the table dimensions before and after dropping exact duplicate rows,
# then keep only the de-duplicated rows

dim(data)
dim(unique(data))
data <- unique(data)

In [None]:
# number of records per value of SG_UF_NOT (state of notification)

count(data, SG_UF_NOT)

In [None]:
# preview the first rows and list the column names

head(data)
colnames(data)

In [None]:
# value counts of the two candidate target columns, plus the table size

count(data, EVOLUCAO)
count(data, HOSPITALIZ)

dim(data)

In [None]:
# for each location-related identifier, show how many distinct values it has
# (the first number of each dim() output is the distinct count)

# notification location
dim(unique(select(data, ID_REGIONA)))
dim(unique(select(data, ID_MUNICIP)))
dim(unique(select(data, SG_UF_NOT)))
dim(unique(select(data, SG_UF)))

# residence location
dim(unique(select(data, ID_MN_RESI)))
dim(unique(select(data, ID_PAIS)))
dim(unique(select(data, ID_RG_RESI)))

# health unit
dim(unique(select(data, ID_UNIDADE)))

dim(unique(select(data, UF)))
dim(unique(select(data, MUNICIPIO)))


### Delete columns

In [None]:
# drop the redundant location identifiers; SG_UF_NOT (state) and ID_MN_RESI
# are not in this list and therefore stay in the table

data <- data %>%
  select(-c(ID_REGIONA, ID_MUNICIP, SG_UF, ID_PAIS, ID_RG_RESI, UF, MUNICIPIO, ID_UNIDADE))


# mostly NA values: display the value counts of each candidate column
# before removing it

count(data, TPAUTOCTO)
count(data, COPAISINF)
count(data, COMUNINF)

count(data, NU_LOTE_I)
count(data, TP_SISTEMA)
count(data, CS_FLXRET)
count(data, FLXRECEBI)

count(data, MIGRADO_W)
count(data, NDUPLIC_N)
count(data, COMPLICA)
count(data, CON_FHD)

count(data, SOROTIPO)
count(data, HISTOPA_N)
count(data, IMUNOH_N)
count(data, DOENCA_TRA)
count(data, ALRM_HIPOT)

count(data, TP_NOT)
count(data, ID_AGRAVO)
count(data, NU_ANO)

# remove the inspected columns (COUFINF and DT_DIGITA are removed with them)
data <- data %>%
  select(-c(
    TPAUTOCTO, COPAISINF, COUFINF, COMUNINF,
    NU_LOTE_I, TP_SISTEMA, CS_FLXRET, FLXRECEBI, DT_DIGITA,
    MIGRADO_W, NDUPLIC_N, COMPLICA, CON_FHD,
    SOROTIPO, HISTOPA_N, IMUNOH_N, DOENCA_TRA, ALRM_HIPOT,
    TP_NOT, ID_AGRAVO, NU_ANO
  ))

# summarise three column ranges, then drop every column positioned from
# ALRM_PLAQ through PLAQ_MENOR
summary(select(data, ALRM_PLAQ:ALRM_LIQ))
summary(select(data, GRAV_PULSO:GRAV_ORGAO))
summary(select(data, MANI_HEMOR:PLAQ_MENOR))

data <- data %>% select(-c(ALRM_PLAQ:PLAQ_MENOR))

# NU_IDADE_N refers to age in a different format

data <- data %>% select(-c(NU_IDADE_N))

# these dates are usually the same and refer to the date of digitalization

data <- data %>% select(-c(DT_NOTIFIC, DT_ENCERRA, DT_INVEST, CRITERIO))

# we adopted the final classification to consider a positive case,
# so the test results and test dates are removed

data <- data %>% select(-c(DT_CHIK_S1:RESUL_PCR_))

# table size, target distributions and remaining columns after the removals
dim(data)
count(data, EVOLUCAO)
count(data, HOSPITALIZ)
colnames(data)

### Filter 2: remove patients without information of state 

In [None]:
# keep only the records whose SG_UF_NOT (state) is not missing
data <- filter(data, !is.na(SG_UF_NOT))

dim(data)

count(data, EVOLUCAO)
count(data, HOSPITALIZ)

### Filter 3: remove patients without year of birth

In [None]:
# keep only the records whose ANO_NASC (year of birth) is not missing
data <- filter(data, !is.na(ANO_NASC))

dim(data)

count(data, EVOLUCAO)
count(data, HOSPITALIZ)

In [None]:
# create a new column with age, computed as 2022 minus the year of birth

data <- data %>% mutate(age = 2022 - ANO_NASC)

# inspect both ends of the age distribution: first 15 and last 12 rows
# of the per-age counts
age_counts <- data %>% count(age)
age_counts %>% slice(1:15)
age_counts %>% tail(12)

### Remove columns

In [None]:
# ID_OCUPA_N does not seem to be filled in consistently: it refers to the area
# of activity and should hold the same value (not applicable) for children.
# The counts below show its values for patients younger than 5.
# SEM_NOT is the epidemiological week of notification; the epidemiological
# week of the first symptoms is adopted instead.

data %>%
  filter(age < 5) %>%
  count(ID_OCUPA_N) %>%
  slice(1:15)

data <- data %>% select(-c(ID_OCUPA_N, SEM_NOT))

### Filter 4: remove patients older than 110 years and younger than 12

In [None]:
# remove patients older than 110 years old and patients younger than
# 12 years old (the previous comment said 16, which contradicted both the
# section heading and the `age > 11` condition actually applied)

data <- data %>% filter(age < 111, age > 11)

# remove the column year of birth, now that age has been derived from it

data <- data %>% select(-ANO_NASC)

# table size and target distributions after the filter
data %>% dim
data %>% count(EVOLUCAO)
data %>% count(HOSPITALIZ)

### Filter 5: remove patients without sex information

In [None]:
# show the values present in CS_SEXO, then keep only the records coded
# 'F' or 'M' (anything else, including missing values, is dropped)

count(data, CS_SEXO)
data <- data %>% filter(CS_SEXO %in% c('F', 'M'))
count(data, CS_SEXO)
dim(data)

# numeric encoding of sex: 'M' becomes 0, 'F' becomes 1

data <- data %>% mutate(sex = if_else(CS_SEXO == 'M', 0, 1))
count(data, sex)

# the original text column is no longer needed
data <- data %>% select(-CS_SEXO)

dim(data)
count(data, EVOLUCAO)
count(data, HOSPITALIZ)

In [None]:
# inspect the pregnancy column overall and separately for each sex
# (sex == 1 is female, sex == 0 is male)

count(data, CS_GESTANT)

data %>% filter(sex == 1) %>% count(CS_GESTANT)

data %>% filter(sex == 0) %>% count(CS_GESTANT)

# missing pregnancy values are replaced by 0 (treated as not pregnant)

data <- data %>% mutate(CS_GESTANT = if_else(is.na(CS_GESTANT), 0, CS_GESTANT))

count(data, CS_GESTANT)

In [None]:
# make pregnancy binary: the original codes 1 to 4 refer to different stages
# of pregnancy and all map to 1; every other code maps to 0

data <- data %>%
  mutate(
    pregnancy = if_else(
      CS_GESTANT == 1 | CS_GESTANT == 2 | CS_GESTANT == 3 | CS_GESTANT == 4,
      1,
      0
    )
  )

count(data, pregnancy)

# the multi-level column is replaced by the binary one
data <- data %>% select(-CS_GESTANT)

### Filter 6: remove patients without race information

In [None]:
# show the race codes, then keep only records with a non-missing code
# below 9 (9 and above are treated as unknown)

count(data, CS_RACA)

data <- data %>% filter(!is.na(CS_RACA), CS_RACA < 9)

count(data, CS_RACA)

dim(data)

count(data, EVOLUCAO)
count(data, HOSPITALIZ)

In [None]:
# check years of schooling: code 9 means unknown, so it is turned into NA

count(data, CS_ESCOL_N)

data$CS_ESCOL_N <- replace(data$CS_ESCOL_N, which(data$CS_ESCOL_N == 9), NA)

count(data, CS_ESCOL_N)

In [None]:
# Fill missing CS_ESCOL_N values with the rounded mean of the non-missing
# values in the same group. `...` are the grouping columns. Groups with no
# known value keep a missing result. The temporary `average` column is
# removed before returning and the result is ungrouped.
impute_schooling_by_group <- function(df, ...) {
  df %>%
    group_by(...) %>%
    mutate(average = mean(CS_ESCOL_N, na.rm = TRUE)) %>%
    ungroup() %>%
    mutate(CS_ESCOL_N = if_else(is.na(CS_ESCOL_N), round(average), CS_ESCOL_N)) %>%
    select(-average)
}

# how many groups by age, race, city and sex
data %>% group_by(age, CS_RACA, ID_MN_RESI, sex) %>% n_groups

# first pass: impute within groups of age, race, city and sex
data <- impute_schooling_by_group(data, age, CS_RACA, ID_MN_RESI, sex)
count(data, CS_ESCOL_N)

# second pass: values still missing are imputed within the coarser groups
# of age, race and sex
data <- impute_schooling_by_group(data, age, CS_RACA, sex)
count(data, CS_ESCOL_N)

### Filter 7: remove patients without schooling information (after imputation by group)

In [None]:
# records whose schooling is still missing after both imputation passes
# are dropped

count(data, CS_ESCOL_N)

data <- filter(data, !is.na(CS_ESCOL_N))

count(data, EVOLUCAO)
count(data, HOSPITALIZ)

dim(data)

### Delete columns

In [None]:
# the city of residence is correlated with the state (uf), so it is removed

data <- select(data, -ID_MN_RESI)

In [None]:
# total number of missing values across the symptom and comorbidity columns
# (every column positioned from FEBRE through AUTO_IMUNE)

data %>%
  select(FEBRE:AUTO_IMUNE) %>%
  is.na() %>%
  sum()

In [None]:
# the value 2 means the non-occurrence of a symptom, change it to 0
# (HOSPITALIZ is recoded the same way, together with the symptom columns)

symptoms <- data %>% select(FEBRE:AUTO_IMUNE, HOSPITALIZ)
others <- data %>% select(!(FEBRE:AUTO_IMUNE)) %>% select(-c(HOSPITALIZ))

symptoms %>% head

# across(everything(), ...) replaces the superseded mutate_all();
# values other than 2 are left untouched
symptoms <- symptoms %>%
  mutate(across(everything(), ~ ifelse(.x == 2, 0, .x)))

symptoms %>% head

# reassemble the table: symptom columns and HOSPITALIZ first, then the
# remaining columns. The positional renaming done later depends on this order.
data <- cbind(symptoms, others)

# recode EVOLUCAO: 1 becomes 0 and 2 becomes 1; any other code is kept as is
# NOTE(review): presumably 1 = cure and 2 = death in the data dictionary - confirm
data$EVOLUCAO[data$EVOLUCAO == 1] <- 0
data$EVOLUCAO[data$EVOLUCAO == 2] <- 1

data %>% count(EVOLUCAO)
data %>% count(HOSPITALIZ)

data %>% dim

### Filter 8: keep only patients in the acute stage of the disease

In [None]:
# keep only the records with CLINC_CHIK == 1 (acute stage); the column is
# constant afterwards and is removed

data <- data %>%
  filter(CLINC_CHIK == 1) %>%
  select(-CLINC_CHIK)

count(data, EVOLUCAO)
count(data, HOSPITALIZ)

dim(data)

In [None]:
# date of the first symptoms

# convert the column to Date

data <- data %>% mutate(DT_SIN_PRI = as.Date(DT_SIN_PRI))

# earliest and latest date of first symptoms

data %>% arrange(DT_SIN_PRI) %>% select(DT_SIN_PRI) %>% slice(1)
data %>% arrange(desc(DT_SIN_PRI)) %>% select(DT_SIN_PRI) %>% slice(1)

# turn the week code of the first symptoms into a week number by
# subtracting 202200
# NOTE(review): presumably SEM_PRI is encoded as YYYYWW; a week that belongs
# to a year other than 2022 would not map to 1-53 with this offset - confirm

data <- data %>% mutate(SEM_PRI = SEM_PRI - 202200)

colnames(data)

In [None]:
# drop the CLASSI_FIN column
data <- select(data, -CLASSI_FIN)

In [None]:
# English column names, assigned by POSITION: the order must match the
# current column order of `data` (symptoms and comorbidities, then
# hospitalization, then the remaining columns)
new_names <- c("fever", "myalgia", "headache",
               "exanthema", "vomiting", "nausea",
               "back_pain", "conjunctivitis", "arthritis",
               "arthralgia", "petechiae", "leukopenia",
               "lasso_prove", "retro_orbital_pain", "diabetes",
               "hematological_diseases", "hepatopathies", "chronic_kidney_disease",
               "arterial_hypertension", "acid_peptic_disease", "autoimmune_diseases",
               "hospitalization", "id_state",
               "dt_first_symptoms", "epidemiological_week", "race",
               "schooling_years", "dt_hospitalization",
               "death", "dt_death", "age",
               "sex", "pregnancy")

# fail fast when the table does not have exactly one column per name:
# a shorter name vector would otherwise leave NA column names behind
stopifnot(length(new_names) == ncol(data))

colnames(data) <- new_names
                  
                  

In [None]:
# distribution of the target column and current table size
count(data, hospitalization)

dim(data)

### Filter 9: keep only patients with information about hospitalization

In [None]:
# keep only records whose hospitalization value is 0 or 1; unknown (9) and
# missing values are removed

count(data, hospitalization)

data <- data %>% filter(hospitalization %in% c(0, 1))

dim(data)

In [None]:
# create a column with the number of days between the first symptoms and the
# hospitalization (the previous comment described "investigation and death",
# which is not what is computed here)

data$dt_first_symptoms <- as.Date(data$dt_first_symptoms)
data$dt_hospitalization <- as.Date(data$dt_hospitalization)

# days is a plain numeric column; it is NA when either date is missing
data <- data %>%
  mutate(days = as.numeric(difftime(dt_hospitalization, dt_first_symptoms, units = 'days')))

### Filter 10: remove patients hospitalized more than 15 days after the first symptoms

In [None]:
# keep records with no hospitalization delay (days is NA) or with a delay of
# 0 to 15 days; longer delays and negative delays are removed

data <- data %>% filter(is.na(days) | (days >= 0 & days < 16))

count(data, days)

count(data, hospitalization)
dim(data)
colnames(data)

### Filter 11: remove non-hospitalized patients that died

In [None]:
# outcome distribution among the non-hospitalized patients

data %>% filter(hospitalization == 0) %>% count(death)

# hospitalized patients are all kept; non-hospitalized patients are kept only
# when death == 0 (any other or missing death value is removed)
hospitalized <- data %>% filter(hospitalization == 1)
non_hospitalized_alive <- data %>% filter(hospitalization == 0, death == 0)

# row order: non-hospitalized first, hospitalized after
data <- rbind(non_hospitalized_alive, hospitalized)

dim(data)

count(data, hospitalization)

### Delete columns: not relevant for this dataset

In [None]:
# drop the date, outcome and delay columns, which are not used as features
# (the stray trailing comma that left an empty argument inside c() is removed)
data <- data %>%
  select(-c(dt_first_symptoms, death, dt_hospitalization, dt_death, days))

data %>% dim

# check how many patients present each symptom (column totals)

colSums(data)

In [None]:
# column totals restricted to the hospitalized cases; `hosp` is reused by
# the frequency-based column selection that follows

hosp <- filter(data, hospitalization == 1)
colSums(hosp)

### Delete columns: delete symptoms and comorbidities with low frequency in the hospitalized class

In [None]:
# keep only the columns whose total among hospitalized patients exceeds 50.
# na.rm = TRUE and which() ensure that a column containing missing values
# cannot produce an NA column name (which would make the subsetting fail).
hosp_totals <- colSums(hosp, na.rm = TRUE)
selected_cols <- colnames(hosp)[which(hosp_totals > 50)]

selected_cols

# drop = FALSE keeps a data frame even if a single column is selected
data <- data[, selected_cols, drop = FALSE]

In [None]:
# remaining columns and final table size
colnames(data)

dim(data)

In [None]:
# write the final dataset, with hospitalization placed as the last column.
# NOTE(review): the two ranges require arterial_hypertension, id_state and
# pregnancy to have survived the frequency filter, and any column positioned
# between arterial_hypertension and id_state (other than hospitalization) is
# left out of the file - confirm this is intended.
write_csv(
  select(data, fever:arterial_hypertension, id_state:pregnancy, hospitalization),
  file = 'chikungunya_hospitalization_22.csv'
)