# Data Mining Project - Covid19 Data Analysis

Download the dataset from: https://coronavirus.es.gov.br/painel-covid-19-es

Unpack 'MICRODADOS.csv' and place the file in a new directory named 'raw_data'.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

# Load the raw export: semicolon-separated, Latin-1 encoded; rows the parser
# cannot handle are skipped instead of aborting the load.
df = pd.read_csv(
    "raw_data/MICRODADOS.csv",
    sep=";",
    encoding='latin-1',
    on_bad_lines='skip',
)

In [3]:
# Original column headers of the raw file, in file order.
col_pt = df.columns.tolist()

In [4]:
# English column names, listed in the same order as the raw file's columns
# (they are paired positionally with the original headers when renaming).
col_en = [
    # dates
    'Notification Date',
    'Date register',
    'Diagnosis Date',
    'DataCollection RT_PCR',
    'DataCollection QuickTest',
    'DataCollection Serology',
    'DataCollection SerologyIGG',
    'Closing Date',
    'Expiry Date',
    # case status
    'Classification',
    'Evolution',
    'Confirmation Criteria',
    'Status Notification',
    # patient
    'Municipality',
    'Neighborhood',
    'Age range',
    'AgeOnDateNotification',
    'Gender',
    'RaceColor',
    'Education Level',
    'Pregnancy',
    # symptoms
    'Fever',
    'Respiratory Difficulty',
    'Cough',
    'Coryza',
    'Sore throat',
    'Diarrhoea',
    'CEPHALGIA',
    # comorbidities
    'ComorbidityLung',
    'ComorbidityCardio',
    'Renal comorbidity',
    'comorbidity diabetes',
    'Comorbidity Smoking',
    'comorbidity obesity',
    # other flags
    'was hospitalized',
    'Travel Brazil',
    'International trip',
    'professional health',
    'has a disability',
    'Homeless',
    # test results
    'RT_PCR Result',
    'QuickTestResult',
    'Serology Result',
    'IGG Serology Result',
    'Rapid Test Type',
]

In [5]:
# Pair each original header with its English name (by position) and rename.
column_translation = dict(zip(col_pt, col_en))
df = df.rename(columns=column_translation)

In [6]:
# Build the Portuguese -> English translation tables for the categorical
# columns and merge them into one lookup dict `d`.
#
# The per-column tables are written as explicit dicts instead of zipping
# `Series.unique()` against a hand-ordered English list: `unique()` returns
# values in order of first appearance, so a refreshed download with a
# different row order would silently pair labels with the wrong translation.
# The unique values are still printed so new/unexpected labels can be spotted.

un_classification = list(df['Classification'].unique())
print(un_classification)
classification = {
    'Confirmados': 'Confirmed',
    'Descartados': 'Discarded',
    'Suspeito': 'Suspect',
}
un_classification_en = list(classification.values())
print(un_classification_en)

un_evolution = list(df['Evolution'].unique())
print(un_evolution)
evolution = {
    'Cura': 'Cure',
    'Ignorado': 'Ignored',
    '-': '-',
    'Óbito por outras causas': 'Death from other causes',
    'Óbito pelo COVID-19': 'Death from COVID-19',
}
un_evolution_en = list(evolution.values())
print(un_evolution_en)

un_confirmation = list(df['Confirmation Criteria'].unique())
print(un_confirmation)
confirmation = {
    'Laboratorial': 'Laboratorial',
    'Clinico': 'Clinic',
    '-': '-',
    'Clinico Epdemiologico': 'Epdemiological Clinic',
}
un_confirmation_en = list(confirmation.values())
print(un_confirmation_en)

un_status_notification = list(df['Status Notification'].unique())
print(un_status_notification)
status_notification = {
    'Encerrado': 'Closed',
    'Em Aberto': 'Open',
}
un_status_notification_en = list(status_notification.values())
print(un_status_notification_en)

un_age_range = list(df['Age range'].unique())
print(un_age_range)
age_range = {
    '60 a 69 anos': '60-69',
    '70 a 79 anos': '70-79',
    '40 a 49 anos': '40-49',
    '20 a 29 anos': '20-29',
    '50 a 59 anos': '50-59',
    '30 a 39 anos': '30-39',
    '0 a 4 anos': '0-4',
    '05 a 9 anos': '5-9',
    '80 a 89 anos': '80-89',
    '10 a 19 anos': '10-19',
    '90 anos ou mais': '90-100',
    '-': '-',
}
un_age_range_en = list(age_range.values())
print(un_age_range_en)

un_pregnancy = list(df['Pregnancy'].unique())
print(un_pregnancy)
# Both 'Não se aplica' and 'Não' are translated to 'No'.
pregnancy = {
    'Não se aplica': 'No',
    'Não': 'No',
    '1º trimeste': '1st trimester',
    '2º trimeste': '2nd trimester',
    '3º trimeste': '3rd trimester',
    'Ignorado': 'Ignored',
    'Idade gestacional ignorada': 'Gestational age ignored',
}
un_pregnancy_en = list(pregnancy.values())
print(un_pregnancy_en)

# The remaining columns share one label vocabulary (yes/no/unknown and test
# outcomes), so their unique values are pooled into a single table below.
un_international_trip = list(df['International trip'].unique())
print(un_international_trip)

un_professional_health = list(df['professional health'].unique())  # health-care professional
print(un_professional_health)

un_disability = list(df['has a disability'].unique())
print(un_disability)

un_homeless = list(df['Homeless'].unique())
print(un_homeless)

un_pcr_res = list(df['RT_PCR Result'].unique())
print(un_pcr_res)

un_quicktest_res = list(df['QuickTestResult'].unique())
print(un_quicktest_res)

un_serology_res = list(df['Serology Result'].unique())
print(un_serology_res)

un_serology_igg_res = list(df['IGG Serology Result'].unique())
print(un_serology_igg_res)

un_rapidtest_type = list(df['Rapid Test Type'].unique())
print(un_rapidtest_type)

# np.unique de-duplicates AND sorts, so this pairing does not depend on row
# order; the English list must follow the sorted order of the pooled labels.
# NOTE(review): zip() stops at the shorter sequence, so any pooled label that
# sorts after the 11th entry is left untranslated -- confirm against the
# printed list when the data set is refreshed.
un_data_list_pt = np.unique(un_international_trip + un_professional_health + un_disability +
                            un_homeless + un_pcr_res + un_quicktest_res + un_serology_res +
                            un_serology_igg_res + un_rapidtest_type)
print(list(un_data_list_pt))
un_data_list_en = ['Ignored', 'Inconclusive', 'Negative', 'No', 'Not Informed', 'Non-Reagent', 'Positive',
                   'Reagent', 'Yes', 'Antigen rapid test', 'IgM/IgG rapid test']
print(un_data_list_en)

data_list = dict(zip(un_data_list_pt, un_data_list_en))

# Single lookup used to translate every column; on duplicate keys the later
# table wins.
d = {**evolution, **classification, **confirmation, **status_notification, **age_range, **pregnancy, **data_list}

['Confirmados', 'Descartados', 'Suspeito']
['Confirmed', 'Discarded', 'Suspect']
['Cura', 'Ignorado', '-', 'Óbito por outras causas', 'Óbito pelo COVID-19']
['Cure', 'Ignored', '-', 'Death from other causes', 'Death from COVID-19']
['Laboratorial', 'Clinico', '-', 'Clinico Epdemiologico']
['Laboratorial', 'Clinic', '-', 'Epdemiological Clinic']
['Encerrado', 'Em Aberto']
['Closed', 'Open']
['60 a 69 anos', '70 a 79 anos', '40 a 49 anos', '20 a 29 anos', '50 a 59 anos', '30 a 39 anos', '0 a 4 anos', '05 a 9 anos', '80 a 89 anos', '10 a 19 anos', '90 anos ou mais', '-']
['60-69', '70-79', '40-49', '20-29', '50-59', '30-39', '0-4', '5-9', '80-89', '10-19', '90-100', '-']
['Não se aplica', 'Não', '1º trimeste', '2º trimeste', '3º trimeste', 'Ignorado', 'Idade gestacional ignorada']
['No', 'No', '1st trimester', '2nd trimester', '3rd trimester', 'Ignored', 'Gestational age ignored']
['Não Informado', 'Não', 'Ignorado', 'Sim']
['Não', 'Sim', 'Ignorado', 'Não Informado']
['Não', 'Sim', 'Não I

In [7]:
# Number of distinct source labels covered by the merged translation dict `d`.
print(len(d))

39


In [8]:
# Keep only the leading whitespace-separated token of each age string and
# store it as an integer.
df['AgeOnDateNotification'] = [
    int(age_text.split()[0]) for age_text in df['AgeOnDateNotification']
]

In [9]:
# Discard the two columns that are not used in the rest of the analysis.
df = df.drop(columns=['RaceColor', 'Travel Brazil'])

In [10]:
# Column names remaining after the drop, in DataFrame order.
col_def_en = df.columns.tolist()

In [11]:
# Translate the Portuguese labels in every column using the lookup dict `d`.
# The result is assigned back to the column: `df[col].replace(..., inplace=True)`
# mutates a temporary Series (chained assignment), which emits a warning on
# recent pandas and leaves `df` unchanged when Copy-on-Write is enabled.
for column_name in col_def_en:
    df[column_name] = df[column_name].replace(d)

In [12]:
# Columns removed before export: most date fields plus the serology and
# rapid-test-type columns.
columns_to_discard = [
    'Notification Date',
    'Date register',
    'DataCollection RT_PCR',
    'DataCollection QuickTest',
    'DataCollection Serology',
    'DataCollection SerologyIGG',
    'Expiry Date',
    'Serology Result',
    'IGG Serology Result',
    'Rapid Test Type',
]
df = df.drop(columns=columns_to_discard)

In [13]:
# Collapse equivalent labels: every "no information" marker becomes
# "Unknown" and both death outcomes become "Death".
label_merges = {"Not Informed": "Unknown",
                "Death from other causes": "Death",
                "Death from COVID-19": "Death",
                "-": "Unknown", "Ignored": "Unknown"}

# Assign the result back to the column rather than calling
# `df[col].replace(..., inplace=True)`: the latter works on a temporary Series
# and does not update `df` when pandas Copy-on-Write is enabled.
for column_name in list(df.columns.values):
    df[column_name] = df[column_name].replace(label_merges)

In [14]:
# Keep only rows with a usable neighbourhood: not missing and not the
# "Não Encontrado" placeholder.
neighborhood = df['Neighborhood']
df = df.loc[neighborhood.notna() & (neighborhood != "Não Encontrado")]

# Replace the municipality with a (municipality, neighbourhood) tuple, expose
# it as 'Position', and drop the now-redundant neighbourhood column.
df['Municipality'] = df[['Municipality', 'Neighborhood']].apply(tuple, axis=1)
df = df.rename(columns={'Municipality': 'Position'}).drop(columns=['Neighborhood'])

In [15]:
# English labels for 'Education Level', paired positionally with the values
# returned by `unique()`.
# NOTE(review): `unique()` yields values in order of first appearance, so this
# list is only correct for the row order of the file it was written against --
# re-check the pairing whenever the raw data is refreshed.
# NOTE(review): 'Illliterate' is misspelled; kept as-is because it is an
# output data value that downstream code may match on.
education_level_en = ['Complete elementary school',
        'Incomplete high school',
        'Complete higher education',
        'Complete high school',
        '4th grade of elementary school',
        'Incomplete 5th to 8th grade of elementary school',
        'Unknown', 'Unknown',
        'Incomplete 1st to 4th grade of elementary school',
        'Incomplete higher education', 'Illliterate']
education = dict(zip(df['Education Level'].unique(), education_level_en))

# Assign back instead of `replace(..., inplace=True)` on `df[...]`: the chained
# in-place call acts on a temporary Series and is a no-op under pandas
# Copy-on-Write.
df['Education Level'] = df['Education Level'].replace(education)

In [16]:
# Remove rows whose gender code is "I" and renumber the rows from 0 without
# keeping the old index as a column.
df = df[df["Gender"] != "I"].reset_index(drop=True)

In [17]:
# Remove rows with an unknown age range, renumber the rows from 0, and give
# the numeric age column a shorter name.
df = (
    df[df["Age range"] != "Unknown"]
    .reset_index(drop=True)
    .rename(columns={"AgeOnDateNotification": "Age"})
)

In [18]:
import os

# Write the cleaned table without the row index. The output directory is
# created first: to_csv fails when the target directory does not exist.
os.makedirs("cleaned_data", exist_ok=True)
df.to_csv("cleaned_data/brazilian_data_en.csv", index=False)

In [21]:
# Display the cleaned DataFrame as the cell output.
df

Unnamed: 0,Diagnosis Date,Closing Date,Classification,Evolution,Confirmation Criteria,Status Notification,Position,Age range,Age,Gender,Education Level,Pregnancy,Fever,Respiratory Difficulty,Cough,Coryza,Sore throat,Diarrhoea,CEPHALGIA,ComorbidityLung,ComorbidityCardio,Renal comorbidity,comorbidity diabetes,Comorbidity Smoking,comorbidity obesity,was hospitalized,International trip,professional health,has a disability,Homeless,RT_PCR Result,QuickTestResult
0,2021-04-23,2021-05-05,Confirmed,Cure,Laboratorial,Closed,"(SERRA, JARDIM ATLÂNTICO)",60-69,64,M,Complete elementary school,No,Yes,No,Yes,Yes,No,Yes,No,Yes,Yes,No,No,No,No,Unknown,Unknown,No,No,No,Unknown,Unknown
1,2021-12-24,2021-12-20,Discarded,Unknown,Laboratorial,Closed,"(PIUMA, JARDIM MAILY)",70-79,74,F,Incomplete high school,No,Yes,No,No,No,No,Yes,Yes,No,Yes,No,Yes,No,No,No,No,No,No,No,Negative,Unknown
2,2021-12-21,,Confirmed,Unknown,Laboratorial,Open,"(GUARAPARI, ITAPEBUSSU)",40-49,41,F,Complete higher education,No,Yes,No,Yes,Yes,Yes,Yes,Yes,No,No,No,No,No,No,Unknown,No,No,No,No,Unknown,Negative
3,2021-12-23,2020-12-19,Suspect,Unknown,Laboratorial,Closed,"(PIUMA, CENTRO)",20-29,22,M,Complete high school,No,No,No,No,No,No,Yes,Yes,No,No,No,No,No,No,No,No,No,No,No,Unknown,Unknown
4,2021-12-24,,Suspect,Unknown,Clinic,Open,"(VITORIA, BONFIM)",40-49,42,F,Complete high school,No,No,No,Yes,Yes,No,No,Yes,No,No,No,No,No,No,Unknown,Unknown,Yes,No,No,Unknown,Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2208172,2020-02-15,2020-04-16,Discarded,Unknown,Laboratorial,Closed,"(VILA VELHA, JOCKEY DE ITAPARICA)",30-39,39,F,Unknown,No,Yes,Yes,Yes,No,Yes,No,No,No,No,No,No,No,No,No,No,No,Unknown,Unknown,Negative,Unknown
2208173,2020-02-15,2020-04-16,Discarded,Unknown,Laboratorial,Closed,"(VILA VELHA, JOCKEY DE ITAPARICA)",0-4,4,F,Unknown,No,Yes,No,Yes,No,No,No,No,No,No,No,No,No,No,Unknown,No,No,Unknown,Unknown,Negative,Unknown
2208174,2020-02-25,2020-04-15,Discarded,Unknown,Laboratorial,Closed,"(ARACRUZ, JEQUITIBÁ)",20-29,25,M,Unknown,No,Yes,No,No,Yes,No,No,No,No,No,No,No,No,No,Unknown,Yes,No,Unknown,Unknown,Negative,Unknown
2208175,2020-02-13,2020-04-16,Discarded,Death,Laboratorial,Closed,"(SERRA, BAIRRO DE FÁTIMA)",50-59,54,M,Unknown,No,Yes,Yes,Yes,No,Yes,No,No,No,No,No,No,No,No,No,Yes,No,No,No,Negative,Unknown
