# 1 - Importação e limpeza preliminar dos dados

In [2]:
import numpy as np
import pandas as pd

## Importação

In [136]:
# Load the raw CSV, using its unnamed first column as the index and naming that index 'id'.
covid_raw = (
    pd.read_csv(r'../data/COVID.csv', index_col='Unnamed: 0')
    .rename_axis('id')
)

covid_raw.head()

Unnamed: 0_level_0,sex,patient_type,intubed,pneumonia,age,pregnancy,diabetes,copd,asthma,inmsupr,hypertension,other_disease,cardiovascular,obesity,renal_chronic,tobacco,contact_other_covid,covid_res,icu
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,0,1,,0.0,27,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,
1,0,1,,0.0,24,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,1,
2,1,0,0.0,0.0,54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,1,0.0
3,0,0,0.0,1.0,30,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,1,0.0
4,1,0,0.0,0.0,60,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,,1,0.0


## Limpeza preliminar

Vamos fazer uma limpeza preliminar, antes de explorar os dados mais a fundo.

Os comandos que executarmos aqui farão parte de uma função de limpeza dos dados, a ser executada no começo de cada *notebook* adicional que criarmos.

### Alteração de tipos

In [137]:
# Per-column summary of the raw frame (non-null count and dtype).
covid_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 499692 entries, 0 to 499691
Data columns (total 19 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   sex                  499692 non-null  int64  
 1   patient_type         499692 non-null  int64  
 2   intubed              107424 non-null  float64
 3   pneumonia            499681 non-null  float64
 4   age                  499692 non-null  int64  
 5   pregnancy            245258 non-null  float64
 6   diabetes             498051 non-null  float64
 7   copd                 498246 non-null  float64
 8   asthma               498250 non-null  float64
 9   inmsupr              498030 non-null  float64
 10  hypertension         498203 non-null  float64
 11  other_disease        497499 non-null  float64
 12  cardiovascular       498183 non-null  float64
 13  obesity              498222 non-null  float64
 14  renal_chronic        498216 non-null  float64
 15  tobacco          

Gastamos 76 MB de memória para armazenar todo o conjunto.

In [138]:
def acertar_tipos(df):
    """Return a new frame with compact column dtypes.

    Every column is first cast to the nullable ``Int8`` dtype; every column
    other than ``age`` is then converted to ``category``. The frame passed in
    is not modified.
    """
    # astype builds a new frame, so the caller's object is left as it was.
    as_int8 = df.astype(pd.Int8Dtype())

    categorical_cols = [col for col in as_int8.columns if col != 'age']
    return as_int8.astype({col: 'category' for col in categorical_cols})

# First cleaning stage: apply the dtype conversion to the raw frame.
covid_t1 = (covid_raw
    .pipe(acertar_tipos)
)

In [139]:
# Preview the first rows after the dtype conversion.
covid_t1.head()

Unnamed: 0_level_0,sex,patient_type,intubed,pneumonia,age,pregnancy,diabetes,copd,asthma,inmsupr,hypertension,other_disease,cardiovascular,obesity,renal_chronic,tobacco,contact_other_covid,covid_res,icu
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,0,1,,0,27,,0,0,0,0,0,0,0,0,0,0,0.0,1,
1,0,1,,0,24,,0,0,0,0,0,0,0,0,0,0,,1,
2,1,0,0.0,0,54,0.0,0,0,0,0,0,0,0,1,0,0,,1,0.0
3,0,0,0.0,1,30,,0,0,0,0,0,0,0,0,0,0,,1,0.0
4,1,0,0.0,0,60,0.0,1,0,0,0,1,0,1,0,0,0,,1,0.0


In [140]:
# Per-column summary after the dtype conversion, for comparison with the raw frame.
covid_t1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 499692 entries, 0 to 499691
Data columns (total 19 columns):
 #   Column               Non-Null Count   Dtype   
---  ------               --------------   -----   
 0   sex                  499692 non-null  category
 1   patient_type         499692 non-null  category
 2   intubed              107424 non-null  category
 3   pneumonia            499681 non-null  category
 4   age                  499692 non-null  Int8    
 5   pregnancy            245258 non-null  category
 6   diabetes             498051 non-null  category
 7   copd                 498246 non-null  category
 8   asthma               498250 non-null  category
 9   inmsupr              498030 non-null  category
 10  hypertension         498203 non-null  category
 11  other_disease        497499 non-null  category
 12  cardiovascular       498183 non-null  category
 13  obesity              498222 non-null  category
 14  renal_chronic        498216 non-null  category
 15  

Como convertemos os tipos de `float` (visto que há `NaN`s espalhados pelas colunas, o que força o `pandas` a usar o `float` como o tipo das colunas) para `category`, agora o conjunto requer 13 MB, uma redução de **82.5%**.

### Análise de *outliers* preliminar

Vamos ver se há algum *outlier*, ou seja, algum dado que claramente não esteja correto.

#### Idades

In [141]:
# Summary statistics; the rendered output only reports `age`, the single non-categorical column.
covid_t1.describe()

Unnamed: 0,age
count,499692.0
mean,42.545942
std,16.640391
min,0.0
25%,31.0
50%,41.0
75%,53.0
max,120.0


In [142]:
# Rows whose recorded age is above 100.
covid_t1[covid_t1.age > 100]

Unnamed: 0_level_0,sex,patient_type,intubed,pneumonia,age,pregnancy,diabetes,copd,asthma,inmsupr,hypertension,other_disease,cardiovascular,obesity,renal_chronic,tobacco,contact_other_covid,covid_res,icu
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
14671,0,1,,0,119,,0,0,0,0,0,0,0,1,0,0,0,1,
34008,1,0,0,1,101,0,0,0,0,0,1,0,0,0,0,0,0,1,0
43329,1,0,0,1,101,0,0,0,0,1,1,0,1,0,0,0,0,1,0
63447,0,1,,0,103,,0,0,0,0,1,0,0,0,0,0,,1,
66022,0,0,0,0,101,,0,0,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
485541,1,1,,0,103,0,0,0,0,0,0,0,0,0,0,0,1,0,
486363,0,1,,0,108,,0,0,0,0,0,0,0,0,0,0,0,0,
489588,0,0,1,1,102,,0,0,0,0,0,0,0,0,0,0,0,0,0
490613,0,1,,0,111,,0,0,0,0,0,0,0,0,0,0,1,0,


In [143]:
# Rows whose recorded age is above 110.
covid_t1[covid_t1.age > 110]

Unnamed: 0_level_0,sex,patient_type,intubed,pneumonia,age,pregnancy,diabetes,copd,asthma,inmsupr,hypertension,other_disease,cardiovascular,obesity,renal_chronic,tobacco,contact_other_covid,covid_res,icu
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
14671,0,1,,0,119,,0,0,0,0,0,0.0,0,1,0,0,0.0,1,
86819,1,1,,0,114,0.0,0,0,0,0,0,0.0,0,0,0,0,1.0,1,
105172,0,1,,0,120,,0,0,0,0,0,,0,1,0,0,0.0,1,
207152,0,1,,0,116,,1,0,0,0,1,0.0,0,0,0,0,0.0,1,
210908,0,1,,0,120,,0,0,0,0,0,0.0,0,0,0,0,0.0,1,
251406,0,0,0.0,1,118,,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0.0
253200,0,1,,0,120,,0,0,0,0,0,0.0,0,0,0,0,,0,
262617,0,1,,0,120,,0,0,0,0,0,0.0,0,0,0,0,0.0,0,
287439,1,1,,0,117,0.0,0,0,0,0,0,0.0,0,0,0,0,1.0,0,
293895,1,0,0.0,1,115,0.0,0,0,0,0,0,0.0,0,0,0,0,0.0,0,0.0


In [144]:
# Age threshold used for the advanced-age checks in this cell.
idade_avancada = 90
# NOTE(review): this selection is inclusive (>=) while the mask below is strict (>) — confirm which is intended.
covid_idosos = covid_t1[covid_t1.age >= idade_avancada]

# elderly patients without diseases
# NOTE(review): copd, inmsupr and obesity are not part of this filter — confirm that is intended.
covid_superidosos_mask = (
    (covid_t1.age > idade_avancada) & 
    (covid_t1.pneumonia == 0) & 
    (covid_t1.diabetes == 0) & 
    (covid_t1.asthma == 0) & 
    (covid_t1.hypertension == 0) & 
    (covid_t1.other_disease == 0) & 
    (covid_t1.cardiovascular == 0) & 
    (covid_t1.renal_chronic == 0)
)

covid_superidosos = covid_t1[covid_superidosos_mask]
covid_superidosos

Unnamed: 0_level_0,sex,patient_type,intubed,pneumonia,age,pregnancy,diabetes,copd,asthma,inmsupr,hypertension,other_disease,cardiovascular,obesity,renal_chronic,tobacco,contact_other_covid,covid_res,icu
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
4746,0,1,,0,93,,0,0,0,0,0,0,0,0,0,0,1,1,
7573,1,1,,0,93,0,0,0,0,0,0,0,0,0,0,0,1,1,
9662,1,1,,0,94,0,0,0,0,0,0,0,0,0,0,0,0,1,
10559,1,1,,0,91,0,0,0,0,0,0,0,0,0,0,0,1,1,
11214,1,1,,0,91,0,0,0,0,0,0,0,0,1,0,0,0,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494508,1,0,0,0,93,0,0,0,0,0,0,0,0,0,0,0,,0,0
496045,1,1,,0,91,0,0,1,0,0,0,0,0,0,0,0,1,0,
496157,0,1,,0,94,,0,0,0,0,0,0,0,0,0,0,1,0,
498203,1,1,,0,98,0,0,0,0,0,0,0,0,0,0,0,1,0,


In [145]:
covid_superidosos.describe()

Unnamed: 0,age
count,386.0
mean,95.738342
std,5.440247
min,91.0
25%,92.0
50%,94.0
75%,97.0
max,120.0


Não parece haver nenhum *outlier* óbvio a respeito de idades avançadas.

Vamos ver se há *outliers* óbvios a respeito de crianças:

In [146]:
# criança grávida?
crianca_gravida_mask = (covid_t1.age < 12) & (covid_t1.pregnancy == 1)
covid_t1[crianca_gravida_mask]

Unnamed: 0_level_0,sex,patient_type,intubed,pneumonia,age,pregnancy,diabetes,copd,asthma,inmsupr,hypertension,other_disease,cardiovascular,obesity,renal_chronic,tobacco,contact_other_covid,covid_res,icu
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
272278,1,1,,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
318994,1,1,,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,
330427,1,1,,0,8,1,0,0,0,0,0,0,0,0,0,0,0,0,
402232,1,1,,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
416157,1,1,,0,11,1,0,0,0,0,0,0,0,0,0,0,0,0,
480636,1,1,,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,


Vamos por fim verificar se há inconsistências:

In [147]:
# homem grávido?
homem_gravido_mask = (covid_t1.sex == 0) & (covid_t1.pregnancy == 1)
covid_t1[homem_gravido_mask]

Unnamed: 0_level_0,sex,patient_type,intubed,pneumonia,age,pregnancy,diabetes,copd,asthma,inmsupr,hypertension,other_disease,cardiovascular,obesity,renal_chronic,tobacco,contact_other_covid,covid_res,icu
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1


In [148]:
# paciente foi dispensado para casa mas foi intubado?
dispensado_intubado_mask = (covid_t1.patient_type == 1) & (covid_t1.intubed == 1)
covid_t1[dispensado_intubado_mask]

Unnamed: 0_level_0,sex,patient_type,intubed,pneumonia,age,pregnancy,diabetes,copd,asthma,inmsupr,hypertension,other_disease,cardiovascular,obesity,renal_chronic,tobacco,contact_other_covid,covid_res,icu
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1


In [149]:
def remover_outliers(df):
    """Return ``df`` without rows that are internally inconsistent.

    A row is dropped when any of the following holds:

    - ``age < 12`` and ``pregnancy == 1``;
    - ``sex == 0`` and ``pregnancy == 1``;
    - ``patient_type == 1`` and ``intubed == 1``.

    The mask is computed from ``df`` itself rather than from a notebook-level
    variable, so the function works on whatever frame is piped into it.
    The frame passed in is not modified.
    """
    outliers_mask = (
        ((df.age < 12) & (df.pregnancy == 1)) |  # pregnant child
        ((df.sex == 0) & (df.pregnancy == 1)) |  # pregnant man
        ((df.patient_type == 1) & (df.intubed == 1))  # patient sent home yet intubated
    )

    # drop() returns a new frame, so no defensive copy of the input is needed.
    return df.drop(index=df.index[outliers_mask])


# Second cleaning stage: drop the inconsistent rows from the typed frame.
covid_t2 = (covid_t1
    .pipe(remover_outliers)
)

In [150]:
# Display the frame after the inconsistent rows were removed.
covid_t2

Unnamed: 0_level_0,sex,patient_type,intubed,pneumonia,age,pregnancy,diabetes,copd,asthma,inmsupr,hypertension,other_disease,cardiovascular,obesity,renal_chronic,tobacco,contact_other_covid,covid_res,icu
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,0,1,,0,27,,0,0,0,0,0,0,0,0,0,0,0,1,
1,0,1,,0,24,,0,0,0,0,0,0,0,0,0,0,,1,
2,1,0,0,0,54,0,0,0,0,0,0,0,0,1,0,0,,1,0
3,0,0,0,1,30,,0,0,0,0,0,0,0,0,0,0,,1,0
4,1,0,0,0,60,0,1,0,0,0,1,0,1,0,0,0,,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499687,0,1,,1,77,,0,0,0,0,0,1,0,0,0,1,0,0,
499688,0,0,1,1,63,,0,0,0,0,1,0,0,1,0,1,0,0,0
499689,1,1,,0,25,0,0,0,0,0,0,0,0,0,0,0,0,0,
499690,1,1,,0,45,0,0,0,0,0,1,0,0,0,0,0,1,0,


### Análise de duplicatas

Vamos ver se há duplicatas nos dados.

Temos que atentar para o fato de que, se todas as colunas forem perfeitamente independentes, há uma chance de...

In [151]:
# Number of distinct non-null values observed in each column.
nunique_cols = covid_t1.nunique()
nunique_cols

sex                      2
patient_type             2
intubed                  2
pneumonia                2
age                    120
pregnancy                2
diabetes                 2
copd                     2
asthma                   2
inmsupr                  2
hypertension             2
other_disease            2
cardiovascular           2
obesity                  2
renal_chronic            2
tobacco                  2
contact_other_covid      2
covid_res                2
icu                      2
dtype: int64

In [152]:
# Size of the cartesian product of the distinct values observed per column.
# Missing values are not counted as a distinct value by nunique_cols.
total_prodcart = nunique_cols.prod()

# 1 / total_prodcart is the chance that two given records coincide, assuming
# every combination of values is equally likely.
# NOTE(review): n / total_prodcart is the expected number of records matching ONE
# given record; the expected number of coinciding pairs among n records would be
# n * (n - 1) / (2 * total_prodcart) — confirm which quantity is intended.
print(f'...{1/total_prodcart:.7%} de haver dois pacientes com os mesmos atributos.')
print('\n')
print(f'Em um conjunto com {covid_t2.shape[0]} registros, esperamos que haja {covid_t2.shape[0] / total_prodcart:.3f} entradas duplicadas.')

...0.0000032% de haver dois pacientes com os mesmos atributos.


Em um conjunto com 499686 registros, esperamos que haja 0.016 entradas duplicadas.


Ou seja, entradas duplicadas não são esperadas, mas também não são evidência de incorreção.

In [158]:
# Rows that exactly repeat an earlier row; the first occurrence of each is not listed.
covid_t2[covid_t2.duplicated(keep = 'first')]

Unnamed: 0_level_0,sex,patient_type,intubed,pneumonia,age,pregnancy,diabetes,copd,asthma,inmsupr,hypertension,other_disease,cardiovascular,obesity,renal_chronic,tobacco,contact_other_covid,covid_res,icu
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
22,1,1,,0,45,0,0,0,0,0,0,0,0,0,0,0,,1,
27,0,1,,0,40,,0,0,0,0,0,0,0,0,0,0,,1,
62,0,1,,0,40,,0,0,0,0,0,0,0,0,0,0,,1,
70,1,1,,0,33,0,0,0,0,0,0,0,0,0,0,0,,1,
76,1,1,,0,38,0,0,0,0,0,0,0,0,0,0,0,,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499685,1,1,,0,55,0,0,0,0,0,0,0,0,0,0,0,0,0,
499686,1,1,,0,15,0,0,0,0,0,0,0,0,0,0,0,1,0,
499689,1,1,,0,25,0,0,0,0,0,0,0,0,0,0,0,0,0,
499690,1,1,,0,45,0,0,0,0,0,1,0,0,0,0,0,1,0,


In [81]:
# Number each row within its group of identical records (1 = first occurrence).
# 'n_dup' is excluded from the grouping key so that re-running this cell gives
# the same counts instead of grouping on the counter left by a previous run.
orig_cols = covid_t2.columns.drop('n_dup', errors='ignore')
covid_t2['n_dup'] = covid_t2.groupby(list(orig_cols), dropna = False).cumcount() + 1

In [94]:
# FIXME(review): the indexing expression is empty, so this cell is a SyntaxError as written;
# the intended selection cannot be determined from here.
covid_t2[]

Unnamed: 0,sex,patient_type,intubed,pneumonia,age,pregnancy,diabetes,copd,asthma,inmsupr,hypertension,other_disease,cardiovascular,obesity,renal_chronic,tobacco,contact_other_covid,covid_res,icu,n_dup
236095,0,0,0,0,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,223816
236745,0,0,0,0,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,224450
245255,0,0,0,0,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,232744
249067,0,0,0,0,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,236493
254839,0,0,0,0,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,242178
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296720,1,1,,,22,0,0,0,0,0,0,0,0,0,0,0,1,0,,282467
446734,1,1,,,26,0,0,0,0,0,0,0,0,0,0,0,0,0,,427979
107211,1,1,,,37,0,0,0,0,0,0,0,0,0,0,0,,1,,101482
348721,1,1,,,46,0,0,0,0,0,0,0,0,0,0,0,0,0,,332894
