# Imports and helper functions

In [77]:
import pandas as pd
from swifter import swifter
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
#import awswrangler as wr
import sweetviz as sv
from sklearn.compose import ColumnTransformer,make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,FunctionTransformer, MinMaxScaler
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder
from feature_engine.encoding import CountFrequencyEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import TransformerMixin
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.inspection import permutation_importance
from sklearn import model_selection, metrics
from xgboost import XGBClassifier


def data_description(df):
    """
    Print a quick structural summary of a DataFrame.

    Prints, in order: the dtype of every column, the number of rows, the
    number of columns, and one line per column stating whether the column has
    any missing value followed by how many it has.

    Parameters:
    - df (DataFrame): Input DataFrame

    Returns:
    - None: everything is written to stdout.
    """
    print('Variables:\n\n{}'.format(df.dtypes), end='\n\n')
    print('Number of rows {}'.format(df.shape[0]), end='\n\n')
    print('Number of columns {}'.format(df.shape[1]), end='\n\n')
    # `end` is an argument of print(); it used to be handed to str.format(),
    # which silently ignores unused keyword arguments.
    print('NA analysis', end='\n')
    for col in df.columns:
        # Build the NA mask once and reuse it for both the flag and the count.
        na_mask = df[col].isna()
        print('column {}: {} {}'.format(col, na_mask.any(), na_mask.sum()))

# def consult_table_athena(database, table):
#     wr.config.aws_profile = 'default'
#     wr.config.region = 'us-east-1'

#     query = f"SELECT * FROM {database}.{table}"

#     df = wr.athena.read_sql_query(query, database=database)

#     return df


def unique_values_columns(df):
    """
    Collect the distinct values of every object-dtype column.

    Parameters:
    - df (DataFrame): Input DataFrame

    Returns:
    - dict: Maps each object column name to the list of its unique values,
      as returned by `Series.unique()`.
    """
    uniques_by_column = {}

    # Only columns selected by the 'object' dtype filter are inspected.
    for name in df.select_dtypes(include=['object']).columns:
        uniques_by_column[name] = df[name].unique().tolist()

    return uniques_by_column

def reduce_mem_usage(df, verbose=True, allow_float16=True):
    """
    Downcast numeric columns to the smallest dtype whose range holds their values.

    Integer columns are tried against int8, int16, int32 and int64; float
    columns against float16 (optional), float32 and finally float64. The range
    checks are strict, so a value sitting exactly on a dtype's limit moves to
    the next wider dtype. Columns whose dtype is not in the numeric list are
    left untouched.

    Parameters:
    - df (DataFrame): Frame to shrink. It is modified in place and also returned.
    - verbose (bool): When True, print the final memory usage and the reduction.
    - allow_float16 (bool): When True (default, the historical behaviour) float
      columns may be stored as float16. float16 keeps only about three
      significant digits and overflows above 65504, so aggregates such as
      mean/std over a float16 column can come back as inf/NaN. Pass False to
      use float32 as the smallest float dtype.

    Returns:
    - DataFrame: the same `df` object with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # No fitting candidate (e.g. values on the int64 limits): keep the dtype.
            target = next(
                (t for t in int_candidates
                 if c_min > np.iinfo(t).min and c_max < np.iinfo(t).max),
                None,
            )
        else:
            # Comparisons against NaN/inf are False, so such columns fall back to float64.
            target = next(
                (t for t in float_candidates
                 if c_min > np.finfo(t).min and c_max < np.finfo(t).max),
                np.float64,
            )

        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Loading Data

In [2]:
# DATABASE = 'respiratory_db'
# TABLE = 'table_respiratory_train_data'
# df = consult_table_athena(DATABASE, TABLE)

# df.to_parquet('train.parquet', index=False)

# df = pd.read_csv('../data/raw/train.csv', low_memory=False)
# df.to_parquet('train.parquet', index=False)

In [40]:
# Load the training data from the local parquet file into df_raw.
df_raw = pd.read_parquet('../data/raw/train.parquet')

## Data description

In [41]:
# Lower-case every column name; the rest of the notebook refers to columns in lower case.
df_raw.columns = df_raw.columns.str.lower()

In [4]:
# Rows per target class, ordered by class label (sort_index is ascending by default).
df_raw['classi_fin'].value_counts().sort_index()

classi_fin
1     15628
2     31437
3      5935
4    380410
5    867570
Name: count, dtype: int64

In [42]:
# Balance the target by undersampling the two dominant classes (4 and 5).
# Classes 1-3 are kept whole; classes 4 and 5 are each reduced to 31437 rows,
# which is the size of class 2 in the class counts shown above.
MAJORITY_SAMPLE_SIZE = 31437
# Fixed seed so the undersampled frame is the same on every run
# (without it, every Restart & Run All produced a different dataset).
SAMPLE_SEED = 40

class1 = df_raw[df_raw['classi_fin'] == 1]
class2 = df_raw[df_raw['classi_fin'] == 2]
class3 = df_raw[df_raw['classi_fin'] == 3]
class4 = df_raw[df_raw['classi_fin'] == 4].sample(MAJORITY_SAMPLE_SIZE, random_state=SAMPLE_SEED)
class5 = df_raw[df_raw['classi_fin'] == 5].sample(MAJORITY_SAMPLE_SIZE, random_state=SAMPLE_SEED)
df = pd.concat([class1, class2, class3, class4, class5], ignore_index=True)

# The full raw frame is no longer needed; release it to free memory.
del df_raw

In [9]:
# Print dtypes, shape and per-column NA counts for the balanced frame.
data_description(df)

Variables:

sem_not         int64
sem_pri         int64
sg_uf_not      object
id_regiona     object
co_regiona    float64
id_municip     object
co_mun_not      int64
cs_sexo        object
nu_idade_n      int64
tp_idade        int64
cod_idade      object
cs_gestant      int64
cs_raca         int64
cs_escol_n    float64
sg_uf          object
cs_zona       float64
surto_sg      float64
nosocomial    float64
ave_suino     float64
febre         float64
tosse         float64
garganta      float64
dispneia      float64
desc_resp     float64
saturacao     float64
diarreia      float64
vomito        float64
outro_sin     float64
outro_des      object
puerpera      float64
fator_risc      int64
cardiopati    float64
hematologi    float64
sind_down     float64
hepatica      float64
asma          float64
diabetes      float64
neurologic    float64
pneumopati    float64
imunodepre    float64
renal         float64
obesidade     float64
obes_imc       object
out_morbi     float64
morb_desc      objec

In [43]:
# Downcast numeric columns to smaller dtypes; later cells select columns by the
# resulting 'float16' and 'int8' dtypes.
df = reduce_mem_usage(df)

Mem. usage decreased to 26.19 Mb (61.0% reduction)


In [12]:
# Summary statistics, one row per numeric column.
# NOTE(review): some mean/std cells in the output below are blank; this looks like
# float16 overflow after reduce_mem_usage — confirm before relying on these values.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sem_not,115874.0,23.585023,15.369219,1.0,11.0,22.0,36.0,52.0
sem_pri,115874.0,24.024622,15.726942,1.0,11.0,21.0,37.0,52.0
co_regiona,102583.0,,,1331.0,1342.0,1382.0,1519.0,6256.0
co_mun_not,115874.0,352287.508785,85783.835254,110001.0,311510.0,353800.0,410940.0,530010.0
nu_idade_n,115874.0,39.512721,31.342504,-9.0,5.0,42.0,68.0,117.0
tp_idade,115874.0,2.810225,0.421424,1.0,3.0,3.0,3.0,3.0
cs_gestant,115874.0,5.831196,0.805223,0.0,6.0,6.0,6.0,9.0
cs_raca,115874.0,3.560445,2.882355,1.0,1.0,4.0,4.0,9.0
cs_escol_n,75078.0,,0.0,0.0,2.0,5.0,9.0,9.0
cs_zona,115874.0,,0.0,1.0,1.0,1.0,1.0,9.0


### Data Cleaning / NA analysis / Outliers analysis

In [44]:
# cs_sexo takes the values M, F and I; rows with I are dropped because there are
# only 188 of them.
# Negative ages are excluded.
# The previous filter `~df['nu_idade_n'] <= 0` parsed as `(~age) <= 0`: `~` binds
# tighter than `<=`, and bitwise NOT of an integer is `-age - 1`, so every
# age >= -1 was kept and an age of -1 survived the filter.
df = (
    df.loc[(df['cs_sexo'] != 'I') & (df['nu_idade_n'] >= 0)]
    # assign() returns a new frame, so the category cast no longer writes into a
    # filtered slice (which triggered SettingWithCopyWarning).
    .assign(cs_sexo=lambda frame: frame['cs_sexo'].astype('category'))
)

#Replace nan values to mode

# df['cs_zona'] = df['cs_zona'].fillna(df['cs_zona'].mode()[0])
# df['nosocomial'] = df['nosocomial'].fillna(df['nosocomial'].mode()[0])
# df['ave_suino'] = df['ave_suino'].fillna(df['ave_suino'].mode()[0])
# df['tosse'] = df['tosse'].fillna(df['tosse'].mode()[0])
# df['garganta'] = df['garganta'].fillna(df['garganta'].mode()[0])
# df['dispneia'] = df['dispneia'].fillna(df['dispneia'].mode()[0])
# df['desc_resp'] = df['desc_resp'].fillna(df['desc_resp'].mode()[0])
# df['saturacao'] = df['saturacao'].fillna(df['saturacao'].mode()[0])
# df['diarreia'] = df['diarreia'].fillna(df['diarreia'].mode()[0])
# df['vomito'] = df['vomito'].fillna(df['vomito'].mode()[0])
# df['outro_sin'] = df['outro_sin'].fillna(df['outro_sin'].mode()[0])
# df['puerpera'] = df['puerpera'].fillna(df['puerpera'].mode()[0])
# df['fator_risc'] = df['fator_risc'].fillna(df['fator_risc'].mode()[0])
# df['cardiopati'] = df['cardiopati'].fillna(df['cardiopati'].mode()[0])
# df['hematologi'] = df['hematologi'].fillna(df['hematologi'].mode()[0])
# df['sind_down'] = df['sind_down'].fillna(df['sind_down'].mode()[0])
# df['hepatica'] = df['hepatica'].fillna(df['hepatica'].mode()[0])
# df['asma'] = df['asma'].fillna(df['asma'].mode()[0])
# df['diabetes'] = df['diabetes'].fillna(df['diabetes'].mode()[0])
# df['neurologic'] = df['neurologic'].fillna(df['neurologic'].mode()[0])
# df['pneumopati'] = df['pneumopati'].fillna(df['pneumopati'].mode()[0])
# df['imunodepre'] = df['imunodepre'].fillna(df['imunodepre'].mode()[0])
# df['renal'] = df['renal'].fillna(df['renal'].mode()[0])
# df['obesidade'] = df['obesidade'].fillna(df['obesidade'].mode()[0])
# df['out_morbi'] = df['out_morbi'].fillna(df['out_morbi'].mode()[0])
# df['vacina'] = df['vacina'].fillna(df['vacina'].mode()[0])
# df['mae_vac'] = df['mae_vac'].fillna(df['mae_vac'].mode()[0])
# df['m_amamenta'] = df['m_amamenta'].fillna(df['m_amamenta'].mode()[0])
# df['antiviral'] = df['antiviral'].fillna(df['antiviral'].mode()[0])
# df['tp_antivir'] = df['tp_antivir'].fillna(df['tp_antivir'].mode()[0])
# df['hospital'] = df['hospital'].fillna(df['hospital'].mode()[0])
# df['uti'] = df['uti'].fillna(df['uti'].mode()[0])
# df['suport_ven'] = df['suport_ven'].fillna(df['suport_ven'].mode()[0])
# df['raiox_res'] = df['raiox_res'].fillna(df['raiox_res'].mode()[0])
# df['tp_amostra'] = df['tp_amostra'].fillna(df['tp_amostra'].mode()[0])
# df['dor_abd'] = df['dor_abd'].fillna(df['dor_abd'].mode()[0])
# df['fadiga'] = df['fadiga'].fillna(df['fadiga'].mode()[0])
# df['perd_olft'] = df['perd_olft'].fillna(df['perd_olft'].mode()[0])
# df['perd_pala'] = df['perd_pala'].fillna(df['perd_pala'].mode()[0])
# df['tomo_res'] = df['tomo_res'].fillna(df['tomo_res'].mode()[0])
# df['vacina_cov'] = df['vacina_cov'].fillna(df['vacina_cov'].mode()[0])
# df['surto_sg'] = df['surto_sg'].fillna(df['surto_sg'].mode()[0])
# df['febre'] = df['febre'].fillna(df['febre'].mode()[0])
# df['outro_sin'] = df['outro_sin'].fillna(df['outro_sin'].mode()[0])
# df['pneumopati'] = df['pneumopati'].fillna(df['pneumopati'].mode()[0])
# df['imunodepre'] = df['imunodepre'].fillna(df['imunodepre'].mode()[0])
# df['amostra'] = df['amostra'].fillna(df['amostra'].mode()[0])
# df['fnt_in_cov'] = df['fnt_in_cov'].fillna(df['fnt_in_cov'].mode()[0])

In [45]:
# Remove the demographic columns listed below so they are not part of the
# frame used from here on.
df = df.drop(
    columns=[
        'sg_uf_not',
        'id_regiona',
        'co_regiona',
        'id_municip',
        'co_mun_not',
        'sg_uf',
        'cod_idade',
        'cs_escol_n',
    ]
)

In [78]:
# Columns currently stored as float16 (the dtype reduce_mem_usage assigns to
# small-range float columns).
# NOTE(review): once this cell has run, no float16 columns remain, so running it
# again selects nothing (see the "0it" progress output below).
float_cols = df.select_dtypes('float16').columns
# Fill each column's missing values with that column's first mode.
# NOTE(review): mode() is empty for an all-NaN column, so [0] would raise there —
# confirm no such column exists.
df[float_cols] = df[float_cols].swifter.apply(
    lambda x: x.fillna(x.mode()[0])
)

# NOTE(review): assumes every float16 column holds whole numbers that fit in
# int8 (-128..127) — TODO confirm; values outside that range would not survive the cast.
df[float_cols] = df[float_cols].astype('int8')

# Every int8 column except nu_idade_n is turned into a categorical column.
# NOTE(review): if classi_fin is int8 at this point it becomes categorical too —
# confirm that is intended for the target.
int_cols = df.select_dtypes('int8').drop('nu_idade_n', axis=1).columns
df[int_cols] = df[int_cols].astype('category')

Pandas Apply: 0it [00:00, ?it/s]

### Feature selection to remove unimportant features

In [110]:
# Predictors: every column except the target, with the text (object) columns removed.
predictors = (
    df.drop(columns='classi_fin')
    .pipe(lambda frame: frame.drop(columns=frame.select_dtypes('object').columns))
)
response = df['classi_fin']

# Stratified split on the target; test_size is left at the scikit-learn default.
X_train, X_val, y_train, y_val = train_test_split(
    predictors,
    response,
    stratify=response,
    random_state=40,
)

In [114]:
numerical_cols = predictors.select_dtypes(['int8']).columns.tolist()
categorical_cols = predictors.select_dtypes(['category']).columns.tolist()

# Step 1: fill missing categorical values with the most frequent value; every
# other column is passed through unchanged.
categorical_imputer = ColumnTransformer(
    [('cat_imputer', SimpleImputer(strategy='most_frequent'), categorical_cols)],
    remainder='passthrough'
)

# ColumnTransformer emits the columns of its listed transformers first and
# appends passthrough columns on the right, so its output is NOT in
# predictors.columns order. Labelling the array with predictors.columns (as was
# done before) attached the wrong names to the data whenever a non-categorical
# column sits between categorical ones.
passthrough_cols = [col for col in predictors.columns if col not in categorical_cols]
imputed_cols = categorical_cols + passthrough_cols
original_order = predictors.columns.tolist()

# Label with the real output order, then restore the original column order.
X_train = pd.DataFrame(
    categorical_imputer.fit_transform(X_train),
    columns=imputed_cols
)[original_order]
X_train[categorical_cols] = X_train[categorical_cols].astype('category')

# The validation split only reuses what was fitted on the training split.
X_val = pd.DataFrame(
    categorical_imputer.transform(X_val),
    columns=imputed_cols
)[original_order]
X_val[categorical_cols] = X_val[categorical_cols].astype('category')

# Step 2: frequency-encode the categorical columns, and mean-impute and scale the
# numerical ones.
categorical_transformer = Pipeline(
    steps=[
        ('encoder', CountFrequencyEncoder(encoding_method='frequency'))
    ]
)

numerical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('encoder', MinMaxScaler())
    ]
)

preprocessor = ColumnTransformer(
    [
        ('cat', categorical_transformer, categorical_cols),
        ('num', numerical_transformer, numerical_cols)
    ]
)

# This transformer drops any column that is neither categorical nor numerical,
# and its output order is categorical columns followed by numerical columns.
transformed_cols = categorical_cols + numerical_cols

X_train_transformed = pd.DataFrame(
    preprocessor.fit_transform(X_train),
    columns=transformed_cols
)

X_test_transformed = pd.DataFrame(
    preprocessor.transform(X_val),
    columns=transformed_cols
)



# EDA

In [17]:
# Build a sweetviz report for df and write it out as HTML
# (the output below reports SWEETVIZ_REPORT.html).
my_report = sv.analyze(df)
my_report.show_html()

                                             |          | [  0%]   00:00 -> (? left)

Report SWEETVIZ_REPORT.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


# Mindmap hypothesis

1. People who were not vaccinated account for the highest number of COVID cases.
2. People who live in urban areas have a higher proportion of COVID cases.
3. People who work with birds and swine have a higher proportion of respiratory diseases.
4. Older people have a higher tendency toward respiratory disease.
5. People who smoke did not have any respiratory disease.
6. Colder weeks have a higher proportion of respiratory disease.
7. People whose tomography suggested COVID really had COVID.
8. People who took antivirals had less severe symptoms.