In [1]:
from datetime import datetime
from pathlib import Path 
from variables import *
from tqdm import tqdm
import pandas as pd 
import argparse
import os 

In [2]:
# Location of the Excel export that gets loaded into `base_covid`.
base_covid_fp = "datasets/banco_completo_REGISTRO_COVID 28-08.xlsx"

In [3]:
# Read the whole registry, then keep an independent snapshot so the raw rows
# stay available after `base_covid` is narrowed by the patient filters.
base_covid = pd.read_excel(io=base_covid_fp)
base_covid_bak = base_covid.copy(deep=True)

# __Patient filters__

## __Filtering age >= 18__

Exclude 'idade' < 18

In [4]:
# Keep adults only. The comparison is False when `idade` is missing, so rows
# without a recorded age are dropped together with the under-18s.
n1 = len(base_covid)
is_adult = base_covid["idade"] >= 18
base_covid = base_covid.loc[is_adult].copy()
n2 = len(base_covid)

print(f"Number of patients before: {n1}\nNumber of patients after: {n2}")

Number of patients before: 21558
Number of patients after: 21284


In [5]:
# Rows removed by the age filter. Computed from the counters that cell sets,
# not from figures copied out of its printed output, so the value stays
# correct when the input data changes.
n1 - n2

274

## __Filtering pregnant patients__

Exclude 'gestante' = 1

In [6]:
# Drop pregnant patients (`gestante` coded 1). A missing `gestante` compares
# as "not 1", so those rows are kept.
n1 = len(base_covid)
not_pregnant = base_covid["gestante"].ne(1)
base_covid = base_covid.loc[not_pregnant].copy()
n2 = len(base_covid)

print(f"Number of patients before: {n1}\nNumber of patients after: {n2}")

Number of patients before: 21284
Number of patients after: 20678


In [7]:
# Rows removed by the pregnancy filter. Computed from the counters that cell
# sets, not from figures copied out of its printed output.
n1 - n2

606

## __Filtering palliative patients__

Exclude 'paliativos' = 1 

Exclude 'paliativos_adm' = 1

In [8]:
# Drop a patient when either palliative flag is coded 1. Missing flags do not
# equal 1, so rows with missing values in both columns are kept.
n1 = len(base_covid)
is_palliative = base_covid["paliativos"].eq(1) | base_covid["paliativos_adm"].eq(1)
base_covid = base_covid.loc[~is_palliative].copy()
n2 = len(base_covid)

print(f"Number of patients before: {n1}\nNumber of patients after: {n2}")

Number of patients before: 20678
Number of patients after: 19247


In [9]:
# Rows removed by the palliative filter. Computed from the counters that cell
# sets, not from figures copied out of its printed output.
n1 - n2

1431

## __Filtering patients who developed COVID during hospitalization__

Exclude 'manifest_ih' = 1

Exclude 'motivo_adm' = 2

In [10]:
# Drop patients flagged as having developed COVID during hospitalization:
# `manifest_ih` coded 1 or `motivo_adm` coded 2. Missing codes match neither
# value, so those rows are kept.
n1 = len(base_covid)
acquired_in_hospital = base_covid["manifest_ih"].eq(1) | base_covid["motivo_adm"].eq(2)
base_covid = base_covid.loc[~acquired_in_hospital].copy()
n2 = len(base_covid)

print(f"Number of patients before: {n1}\nNumber of patients after: {n2}")

Number of patients before: 19247
Number of patients after: 17516


In [11]:
# Rows removed by the in-hospital COVID filter. Computed from the counters
# that cell sets, not from figures copied out of its printed output.
n1 - n2

1731

## __Filtering transfers to hospitals not participating in the study__

Exclude ('transf_inst' = 1 OR 2) AND 'idhosp_pcte_transf' = empty

Exclude 'transf' = 1 AND 'idhosp_pcte_transf' = empty

In [12]:
# Drop transferred patients whose `idhosp_pcte_transf` is empty. A row counts
# as transferred when `transf_inst` is 1 or 2, or when `transf` is 1.
# NOTE(review): `idhosp_pcte_transf` is presumably the id of the receiving
# hospital within the study; confirm against the data dictionary.
n1 = len(base_covid)
missing_transfer_hospital = base_covid["idhosp_pcte_transf"].isna()
was_transferred = base_covid["transf_inst"].isin([1, 2]) | base_covid["transf"].eq(1)
base_covid = base_covid.loc[~(was_transferred & missing_transfer_hospital)].copy()
n2 = len(base_covid)

print(f"Number of patients before: {n1}\nNumber of patients after: {n2}")

Number of patients before: 17516
Number of patients after: 16957


In [13]:
# Rows removed by the transfer filter. Computed from the counters that cell
# sets, not from figures copied out of its printed output.
n1 - n2

559

# __Filter columns__

In [14]:
# Restrict the filtered cohort to the columns listed in VARIABLES.
# NOTE(review): VARIABLES is presumably supplied by `from variables import *`; confirm.
base_covid_sociodemographic = base_covid.loc[:, VARIABLES].copy()

In [15]:
# Number of columns selected for the analysis.
len(base_covid_sociodemographic.columns)

71

In [16]:
# Keep only the columns whose share of missing values is at most 50%, then
# report how many columns were dropped and which ones.
n1 = len(base_covid_sociodemographic.columns)
columns_before = base_covid_sociodemographic.columns

# Fraction (0..1) of missing values per column.
missing_share = base_covid_sociodemographic.isna().mean(axis=0)
kept_labels = missing_share.index[(missing_share <= 0.5).to_numpy()]
base_covid_sociodemographic = base_covid_sociodemographic.loc[:, kept_labels].copy()

n2 = len(base_covid_sociodemographic.columns)
columns_after = base_covid_sociodemographic.columns

# Original column order is preserved so the report reads like the input schema.
colunasRetiradas = columns_before[~columns_before.isin(columns_after)].tolist()

print(f"Number of columns before: {n1}\nNumber of columns after: {n2}")
print(f"Columns removed: {colunasRetiradas}")

Number of columns before: 71
Number of columns after: 67
Columns removed: ['bb_total_adm_final', 'ferritina_adm_final', 'troponina_adm_final_prop', 'vacina_doses']


# __Create outcome variable__

In [17]:
# Composite outcome: True when any of the listed `intercorrencia___*` flags is
# coded 1, False when none is. Rows where all five flags are missing get a
# missing outcome instead of False.
colsTotal = [
    'intercorrencia___13',
    'intercorrencia___3',
    'intercorrencia___16',
    'intercorrencia___6',
    'intercorrencia___5',
]

outcome_flags = base_covid_sociodemographic[colsTotal]
any_flag_set = outcome_flags.eq(1).any(axis=1)
any_flag_recorded = outcome_flags.notna().any(axis=1)

base_covid_sociodemographic["intercorrencia_3_5_6_13_16"] = any_flag_set.where(any_flag_recorded)


# __Save dataset with sociodemographic variables__

In [18]:
# Write the processed cohort, sociodemographic columns included, to parquet.
sociodemographic_parquet_fp = "datasets/banco_completo_REGISTRO_COVID_28_08_processado_cardiopatia_sociodemographic.parquet"
base_covid_sociodemographic.to_parquet(sociodemographic_parquet_fp)

# __Save dataset without sociodemographic variables__

In [19]:
# Second export: the same processed cohort with the columns listed in
# SOCIODEMOGRAPHIC_VARIABLES removed.
# `errors="ignore"` is deliberate: the >50%-missing column filter earlier in
# the notebook may already have removed one of these columns, and a strict
# drop would then raise KeyError. `drop` returns a new frame, so no extra
# `.copy()` is needed.
base_covid_no_sociodemographic = base_covid_sociodemographic.drop(
    columns=SOCIODEMOGRAPHIC_VARIABLES,
    errors="ignore",
)
base_covid_no_sociodemographic.to_parquet("datasets/banco_completo_REGISTRO_COVID_28_08_processado_cardiopatia_no_sociodemographic.parquet")