# Imports and helper functions

In [1]:
import pandas as pd
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
import seaborn as sns
import matplotlib.pyplot as plt
#import awswrangler as wr
import sweetviz as sv

def data_description(df):
    """
    Print a quick structural summary of a DataFrame to stdout.

    The summary contains, in order: the dtype of every column, the number of
    rows, the number of columns, and one line per column stating whether the
    column has any missing value followed by how many it has.

    Parameters:
    - df (DataFrame): Input DataFrame

    Returns:
    - None: the summary is only printed.
    """
    print('Variables:\n\n{}'.format(df.dtypes), end='\n\n')
    print('Number of rows {}'.format(df.shape[0]), end='\n\n')
    print('Number of columns {}'.format(df.shape[1]), end='\n\n')
    # `end` is a print() argument; it was previously handed to str.format(),
    # which silently ignores unused keyword arguments.
    print('NA analysis', end='\n')
    for column in df.columns:
        # Build the NA mask once and derive both statistics from it.
        na_mask = df[column].isna()
        print('column {}: {} {}'.format(column, na_mask.any(), na_mask.sum()))

# def consult_table_athena(database, table):
#     wr.config.aws_profile = 'default'
#     wr.config.region = 'us-east-1'

#     query = f"SELECT * FROM {database}.{table}"

#     df = wr.athena.read_sql_query(query, database=database)

#     return df


def unique_values_columns(df):
    """
    Collect the distinct values of every object-dtype column of a DataFrame.

    Parameters:
    - df (DataFrame): Input DataFrame

    Returns:
    - dict: Maps each object-dtype column name to the list of its unique
      values, in order of first appearance.
    """
    distinct_by_column = {}

    # Only columns stored with the `object` dtype are inspected.
    for name in df.select_dtypes(include=['object']).columns:
        distinct_by_column[name] = df[name].unique().tolist()

    return distinct_by_column

# Loading Data

In [7]:
# DATABASE = 'respiratory_db'
# TABLE = 'table_respiratory_traintrain_data'
# df = consult_table_athena(DATABASE, TABLE)

# 
# df.to_parquet('train.parquet', index=False)

# df = pd.read_csv('../data/raw/train.csv', low_memory=False)
# df.to_parquet('train.parquet', index=False)

In [2]:
# Load the raw training data from the local Parquet file. 'train.parquet' is the
# file the commented-out export lines above write (from Athena or the raw CSV).
# The path is relative to the notebook's working directory.
df_raw = pd.read_parquet('train.parquet')

## Data description

In [3]:
# Normalise column names to lowercase; every later cell refers to columns in
# lowercase (e.g. 'classi_fin', 'cs_sexo').
df_raw.columns = df_raw.columns.str.lower()

In [4]:
# Number of rows per `classi_fin` value, ordered by the class label.
df_raw['classi_fin'].value_counts().sort_index()

classi_fin
1     15628
2     31437
3      5935
4    380410
5    867570
Name: count, dtype: int64

In [4]:
# Balance the classes: keep every row of classes 1-3 and downsample the two
# largest classes (4 and 5) to 31437 rows each, which is the size of class 2
# in the value counts printed above.
MAJORITY_SAMPLE_SIZE = 31437
RANDOM_STATE = 42  # fixed seed so a fresh Restart & Run All draws the same rows

class1 = df_raw[df_raw['classi_fin'] == 1]
class2 = df_raw[df_raw['classi_fin'] == 2]
class3 = df_raw[df_raw['classi_fin'] == 3]
class4 = df_raw[df_raw['classi_fin'] == 4].sample(
    MAJORITY_SAMPLE_SIZE, random_state=RANDOM_STATE
)
class5 = df_raw[df_raw['classi_fin'] == 5].sample(
    MAJORITY_SAMPLE_SIZE, random_state=RANDOM_STATE
)
df = pd.concat([class1, class2, class3, class4, class5], ignore_index=True)

# Free the full (~1.3M-row) frame. The load cell must be re-run before this
# cell can be executed again.
del df_raw

In [13]:
# Print dtypes, row/column counts and per-column missing-value counts for the
# balanced frame.
data_description(df)

Variables:

sem_not         int64
sem_pri         int64
sg_uf_not      object
id_regiona     object
co_regiona    float64
id_municip     object
co_mun_not      int64
cs_sexo        object
nu_idade_n      int64
tp_idade        int64
cod_idade      object
cs_gestant      int64
cs_raca         int64
cs_escol_n    float64
sg_uf          object
cs_zona       float64
surto_sg      float64
nosocomial    float64
ave_suino     float64
febre         float64
tosse         float64
garganta      float64
dispneia      float64
desc_resp     float64
saturacao     float64
diarreia      float64
vomito        float64
outro_sin     float64
outro_des      object
puerpera      float64
fator_risc      int64
cardiopati    float64
hematologi    float64
sind_down     float64
hepatica      float64
asma          float64
diabetes      float64
neurologic    float64
pneumopati    float64
imunodepre    float64
renal         float64
obesidade     float64
obes_imc       object
out_morbi     float64
morb_desc      objec

### Descriptive Statistics

In [5]:
# Summary statistics of the numeric columns, one row per column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sem_not,115874.0,23.571284,15.369483,1.0,11.0,21.0,36.0,52.0
sem_pri,115874.0,23.991137,15.734857,1.0,11.0,21.0,37.0,52.0
co_regiona,102541.0,1552.980418,651.193515,1331.0,1342.0,1382.0,1519.0,6256.0
co_mun_not,115874.0,352918.193063,85952.620118,110002.0,311860.0,353870.0,411070.0,530010.0
nu_idade_n,115874.0,39.421907,31.383444,-9.0,5.0,42.0,68.0,136.0
tp_idade,115874.0,2.80887,0.422929,1.0,3.0,3.0,3.0,3.0
cs_gestant,115874.0,5.828909,0.797441,0.0,6.0,6.0,6.0,9.0
cs_raca,115874.0,3.543599,2.871792,1.0,1.0,4.0,4.0,9.0
cs_escol_n,74940.0,5.278596,3.384498,0.0,2.0,5.0,9.0,9.0
cs_zona,104910.0,1.214994,1.041631,1.0,1.0,1.0,1.0,9.0


### Data Cleaning / NA analysis / Outliers analysis

In [15]:
# cs_sexo holds M, F and I. The I rows are few (188 when this was written), so
# they are removed. The filtered frame has to be assigned back, otherwise no
# row is actually dropped.
df = df[df['cs_sexo'] != 'I']

# Negative ages are excluded.
# `~df['nu_idade_n'] <= 0` parses as `(~df['nu_idade_n']) <= 0` because bitwise
# NOT binds tighter than the comparison, and that condition keeps ages >= -1.
# The condition is therefore spelled out explicitly.
# NOTE(review): age 0 is kept, following the comment's "negative" wording;
# confirm that this is the intended rule.
df = df[df['nu_idade_n'] >= 0].reset_index(drop=True)

# Replace NaN values with the mode of each column (computed after the row
# filters above). Each column is listed once, in the original order.
MODE_IMPUTE_COLUMNS = [
    'cs_zona', 'nosocomial', 'ave_suino', 'tosse', 'garganta', 'dispneia',
    'desc_resp', 'saturacao', 'diarreia', 'vomito', 'outro_sin', 'puerpera',
    'fator_risc', 'cardiopati', 'hematologi', 'sind_down', 'hepatica', 'asma',
    'diabetes', 'neurologic', 'pneumopati', 'imunodepre', 'renal', 'obesidade',
    'out_morbi', 'vacina', 'mae_vac', 'm_amamenta', 'antiviral', 'tp_antivir',
    'hospital', 'uti', 'suport_ven', 'raiox_res', 'tp_amostra', 'dor_abd',
    'fadiga', 'perd_olft', 'perd_pala', 'tomo_res', 'vacina_cov', 'surto_sg',
    'febre', 'amostra', 'fnt_in_cov',
]

for column in MODE_IMPUTE_COLUMNS:
    modes = df[column].mode()
    if modes.empty:
        # Column is entirely NaN: there is no mode to impute with.
        continue
    df[column] = df[column].fillna(modes.iloc[0])

In [18]:
# Drop the demographic / location columns; they are not used further on.
df = df.drop(
    columns=[
        'sg_uf_not',
        'id_regiona',
        'co_regiona',
        'id_municip',
        'co_mun_not',
        'sg_uf',
        'cod_idade',
        'cs_escol_n',
    ]
)

# EDA

In [17]:
# Build the Sweetviz exploratory report for `df` and write it out as HTML
# (the cell output below reports the file name SWEETVIZ_REPORT.html).
my_report = sv.analyze(df)
my_report.show_html()

                                             |          | [  0%]   00:00 -> (? left)

Report SWEETVIZ_REPORT.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


# Mindmap hypotheses

1. People who were not vaccinated account for the highest number of COVID cases.
2. People who live in urban areas have a higher proportion of COVID cases.
3. People who work with birds and swine (`ave_suino`) have a higher proportion of respiratory diseases.
4. Older people have a higher tendency toward respiratory disease.
5. People who smoke did not have any respiratory disease.
6. Colder weeks have a higher proportion of respiratory disease.
7. People whose tomography suggested COVID really had COVID.
8. People who took antivirals had less severe symptoms.