# Imports and helper functions

In [17]:
import pandas as pd
from swifter import swifter
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
#import awswrangler as wr
import sweetviz as sv
from sklearn.compose import ColumnTransformer,make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,FunctionTransformer, MinMaxScaler
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder
from feature_engine.encoding import CountFrequencyEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import TransformerMixin
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.inspection import permutation_importance
from sklearn import model_selection, metrics
from xgboost import XGBClassifier
import lightgbm as lgbm


def data_description(df):
    """
    Print a quick structural overview of a DataFrame to stdout.

    The report contains the dtype of every column, the number of rows, the
    number of columns and, for each column, whether it holds any missing
    value followed by the number of missing values.

    Parameters:
    - df (DataFrame): Input DataFrame

    Returns:
    - None
    """
    print('Variables:\n\n{}'.format(df.dtypes), end='\n\n')
    print('Number of rows {}'.format(df.shape[0]), end='\n\n')
    print('Number of columns {}'.format(df.shape[1]), end='\n\n')
    # `end` is an argument of print(); it used to be handed to str.format(),
    # which silently ignores unused keyword arguments.
    print('NA analysis', end='\n')
    for col in df.columns:
        # Build the missing-value mask once and reuse it for both figures.
        missing = df[col].isna()
        print('column {}: {} {}'.format(col, missing.any(), missing.sum()))

# def consult_table_athena(database, table):
#     wr.config.aws_profile = 'default'
#     wr.config.region = 'us-east-1'

#     query = f"SELECT * FROM {database}.{table}"

#     df = wr.athena.read_sql_query(query, database=database)

#     return df


def unique_values_columns(df):
    """
    Collect the distinct values of every object-dtype column.

    Parameters:
    - df (DataFrame): Input DataFrame

    Returns:
    - dict: Maps each object-dtype column name to the list of its unique
      values, in order of first appearance. Columns of any other dtype are
      skipped, so the dict is empty when there are no object columns.
    """
    distinct_by_column = {}

    # Only object-dtype columns are inspected.
    for name in df.select_dtypes(include=['object']).columns:
        distinct_by_column[name] = df[name].unique().tolist()

    return distinct_by_column

def reduce_mem_usage(df, verbose=True, allow_float16=True):
    """
    Downcast numeric columns to the narrowest dtype that holds their range.

    The frame is modified IN PLACE (columns are reassigned) and the same
    object is returned for convenience.

    Parameters:
    - df (DataFrame): Frame whose numeric columns are downcast.
    - verbose (bool): When True, print the final memory footprint and the
      percentage saved.
    - allow_float16 (bool): When True (default, the historical behaviour)
      float columns may be stored as float16. float16 has an 11-bit
      significand, so integers above 2048 are not exactly representable and
      aggregations such as mean() can overflow to NaN/inf. Pass False to use
      float32 as the smallest float type.

    Returns:
    - DataFrame: The same ``df`` object, with downcast columns.

    Notes:
    - Only columns whose dtype is int16/32/64 or float16/32/64 are touched.
    - Range checks are strict, so a column whose min or max sits exactly on
      a type's limit is given the next wider type.
    - A float column whose range matches no candidate type (e.g. all NaN)
      is stored as float64.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32) if allow_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # No fallback: an integer column that fits no candidate is left as is.
            target = next(
                (t for t in int_targets
                 if c_min > np.iinfo(t).min and c_max < np.iinfo(t).max),
                None,
            )
        else:
            target = next(
                (t for t in float_targets
                 if c_min > np.finfo(t).min and c_max < np.finfo(t).max),
                np.float64,
            )
        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-byte baseline cannot divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Loading Data

In [2]:
# DATABASE = 'respiratory_db'
# TABLE = 'table_respiratory_train_data'
# df = consult_table_athena(DATABASE, TABLE)

# df.to_parquet('train.parquet', index=False)

# df = pd.read_csv('../data/raw/train.csv', low_memory=False)
# df.to_parquet('train.parquet', index=False)

In [18]:
# Read the raw training data from the local parquet file.
df_raw = pd.read_parquet('../data/raw/train.parquet')

## Data description

In [19]:
# Lower-case every column name; later cells refer to columns by lower-case labels.
df_raw.columns = df_raw.columns.str.lower()

In [4]:
# Number of rows per classi_fin value, ordered by class label.
df_raw['classi_fin'].value_counts().sort_index(ascending=True)

classi_fin
1     15628
2     31437
3      5935
4    380410
5    867570
Name: count, dtype: int64

In [20]:
# Balance the data set: keep every row of classes 1-3 and down-sample the two
# dominant classes (4 and 5) to the size of class 2 (31437 rows in this data).
# A fixed random_state makes the sample, and everything derived from it,
# reproducible across kernel restarts.
class1 = df_raw[df_raw['classi_fin'] == 1]
class2 = df_raw[df_raw['classi_fin'] == 2]
class3 = df_raw[df_raw['classi_fin'] == 3]
class4 = df_raw[df_raw['classi_fin'] == 4].sample(n=len(class2), random_state=42)
class5 = df_raw[df_raw['classi_fin'] == 5].sample(n=len(class2), random_state=42)
df = pd.concat([class1, class2, class3, class4, class5], ignore_index=True)
# The raw frame is no longer needed; drop the reference to release memory.
del df_raw

In [9]:
# Print dtypes, row/column counts and per-column missing-value counts.
data_description(df)

Variables:

sem_not         int64
sem_pri         int64
sg_uf_not      object
id_regiona     object
co_regiona    float64
id_municip     object
co_mun_not      int64
cs_sexo        object
nu_idade_n      int64
tp_idade        int64
cod_idade      object
cs_gestant      int64
cs_raca         int64
cs_escol_n    float64
sg_uf          object
cs_zona       float64
surto_sg      float64
nosocomial    float64
ave_suino     float64
febre         float64
tosse         float64
garganta      float64
dispneia      float64
desc_resp     float64
saturacao     float64
diarreia      float64
vomito        float64
outro_sin     float64
outro_des      object
puerpera      float64
fator_risc      int64
cardiopati    float64
hematologi    float64
sind_down     float64
hepatica      float64
asma          float64
diabetes      float64
neurologic    float64
pneumopati    float64
imunodepre    float64
renal         float64
obesidade     float64
obes_imc       object
out_morbi     float64
morb_desc      objec

In [21]:
# Downcast numeric columns to smaller dtypes; the helper modifies df in place and returns it.
df = reduce_mem_usage(df)

Mem. usage decreased to 26.19 Mb (61.0% reduction)


In [12]:
# Summary statistics, transposed so that each variable is a row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sem_not,115874.0,23.585023,15.369219,1.0,11.0,22.0,36.0,52.0
sem_pri,115874.0,24.024622,15.726942,1.0,11.0,21.0,37.0,52.0
co_regiona,102583.0,,,1331.0,1342.0,1382.0,1519.0,6256.0
co_mun_not,115874.0,352287.508785,85783.835254,110001.0,311510.0,353800.0,410940.0,530010.0
nu_idade_n,115874.0,39.512721,31.342504,-9.0,5.0,42.0,68.0,117.0
tp_idade,115874.0,2.810225,0.421424,1.0,3.0,3.0,3.0,3.0
cs_gestant,115874.0,5.831196,0.805223,0.0,6.0,6.0,6.0,9.0
cs_raca,115874.0,3.560445,2.882355,1.0,1.0,4.0,4.0,9.0
cs_escol_n,75078.0,,0.0,0.0,2.0,5.0,9.0,9.0
cs_zona,115874.0,,0.0,1.0,1.0,1.0,1.0,9.0


### Data Cleaning / NA analysis / Outliers analysis

In [22]:
# cs_sexo holds male, F and I values; the I rows are dropped because there are
# only 188 of them. .copy() detaches the filtered frame so the column
# assignment below does not write into a slice of the previous frame.
df = df[df['cs_sexo'] != 'I'].copy()
df['cs_sexo'] = df['cs_sexo'].astype('category')



# Negative ages are excluded.
# The previous condition `~df['nu_idade_n'] <= 0` applied the bitwise NOT to the
# ages before comparing (~x == -x - 1), so it kept every age >= -1 instead of
# every non-negative age.
df = df[df['nu_idade_n'] >= 0].copy()



In [23]:
# Remove demographic categories: these location, age-code and schooling
# columns are not used in the steps that follow.
df = df.drop(columns=[
    'sg_uf_not',
    'id_regiona',
    'co_regiona',
    'id_municip',
    'co_mun_not',
    'sg_uf',
    'cod_idade',
    'cs_escol_n',
])

In [24]:
# Fill the gaps of every float16 column with that column's most frequent value
# (first mode), then store the result as int8.
# NOTE(review): the int8 cast assumes every value fits in [-128, 127] - confirm.
float_cols = df.select_dtypes(include='float16').columns
df[float_cols] = (
    df[float_cols]
    .swifter.apply(lambda column: column.fillna(column.mode()[0]))
    .astype('int8')
)

# Every int8 column except the age is treated as a categorical code.
int_cols = df.select_dtypes(include='int8').columns.drop('nu_idade_n')
df[int_cols] = df[int_cols].astype('category')

Pandas Apply:   0%|          | 0/44 [00:00<?, ?it/s]

### Feature selection to remove unimportant features

In [35]:
# Target is classi_fin; the features are every other column except the
# free-text (object dtype) ones.
response = df['classi_fin']
predictors = df.drop(columns='classi_fin')
predictors = predictors.drop(columns=predictors.select_dtypes(include='object').columns)

# Stratified hold-out split (default test size) with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    predictors,
    response,
    stratify=response,
    random_state=50,
)

In [36]:
# Column groups. Every numeric dtype counts as numerical (not only int8), so a
# numeric column that was downcast to another width is not silently dropped by
# the preprocessor below. Category columns are not matched by 'number'.
numerical_cols = predictors.select_dtypes(include='number').columns.tolist()
categorical_cols = predictors.select_dtypes(include='category').columns.tolist()

# Step 1: impute the categorical columns with their most frequent value and
# pass every other column through untouched.
categorical_imputer = ColumnTransformer(
    [('cat_imputer', SimpleImputer(strategy='most_frequent'), categorical_cols)],
    remainder='passthrough'
)

# ColumnTransformer emits the columns of its listed transformers first and the
# passthrough remainder afterwards, so the output is NOT in predictors.columns
# order. Label it in the order it is actually produced; reusing
# predictors.columns would attach values to the wrong feature names.
imputed_cols = categorical_cols + [
    col for col in predictors.columns if col not in categorical_cols
]

X_train = pd.DataFrame(
    categorical_imputer.fit_transform(X_train),
    columns=imputed_cols
)
X_train[categorical_cols] = X_train[categorical_cols].astype('category')

X_val = pd.DataFrame(
    categorical_imputer.transform(X_val),
    columns=imputed_cols
)
X_val[categorical_cols] = X_val[categorical_cols].astype('category')

# Step 2: frequency-encode the categoricals; mean-impute and min-max scale the
# numericals.
categorical_transformer = Pipeline(
    steps=[
        ('encoder', CountFrequencyEncoder(encoding_method='frequency'))
    ]
)

numerical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('encoder', MinMaxScaler())
    ]
)

preprocessor = ColumnTransformer(
    [
        ('cat', categorical_transformer, categorical_cols),
        ('num', numerical_transformer, numerical_cols)
    ]
)

# Same ordering rule as above: 'cat' output first, then 'num'. Columns that are
# in neither group are dropped by the transformer (default remainder).
transformed_cols = categorical_cols + numerical_cols

X_train_transformed = pd.DataFrame(
    preprocessor.fit_transform(X_train),
    columns=transformed_cols
)

# Transformed validation split; the name is kept because the feature-selection
# cell refers to it.
X_test_transformed = pd.DataFrame(
    preprocessor.transform(X_val),
    columns=transformed_cols
)

# Encode the class labels as consecutive integers; the encoder is fit on the
# training labels only.
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_val = encoder.transform(y_val)

In [43]:
# Candidate models used to rank the features.
classifiers = [
    RandomForestClassifier(n_jobs=-1, random_state=42, class_weight='balanced_subsample'),
    # Objective name fixed: it was misspelled as 'multi:softax'.
    XGBClassifier(n_jobs=-1, random_state=42, objective='multi:softmax', max_delta_step=1),
    lgbm.LGBMClassifier(n_jobs=-1, random_state=42, class_weight='balanced')
]

# For every model report three views of feature relevance:
#   1. SelectFromModel (the model's own importances),
#   2. recursive feature elimination (RFE),
#   3. permutation importance on the validation split, scored with weighted F1.
# NOTE(review): RFE with its default step refits the model once per eliminated
# feature, which is slow here - consider a larger `step` if the run time is a problem.
for classifier in classifiers:
    pipeline_1 = Pipeline(
        steps=[
            ('feature_selection', SelectFromModel(estimator=classifier))
        ]
    )
    pipeline_2 = Pipeline(
        steps=[
            ('feature_selection', RFE(estimator=classifier))
        ]
    )

    permutation_score = permutation_importance(
        classifier.fit(X_train_transformed, y_train), X_test_transformed, y_val,
        random_state=42, scoring='f1_weighted', n_repeats=10
    )

    importance = pd.DataFrame(
        {
            'features': X_train_transformed.columns,
            'f1_weighted': permutation_score['importances_mean'],
        }
    ).sort_values(by='f1_weighted', ascending=False)

    print(
        'model: {} \n features selected based on feature importance:{} \n\n'.format(
            pipeline_1['feature_selection'].estimator,
            pipeline_1.fit(X_train_transformed, y_train).get_feature_names_out(input_features=None)
        )
    )
    print(
        'model: {} \n features_selected based on RFE:{} \n\n'.format(
            pipeline_2['feature_selection'].estimator,
            pipeline_2.fit(X_train_transformed, y_train).get_feature_names_out(input_features=None)
        )
    )
    print(importance, '\n\n\n')

# Features chosen to continue with.
# Every name needs its own comma: adjacent string literals are concatenated by
# Python, which previously merged most of these names into a few meaningless strings.
feature_selected = [
    'sem_not', 'sem_pri', 'mae_vac', 'antiviral', 'suport_ven', 'amostra',
    'perd_olft', 'perd_pala', 'vacina_cov', 'dose_1_cov', 'dose_2_cov',
    'tp_idade', 'cs_gestant', 'obesidade', 'hospital', 'uti', 'tp_antivir',
    'fnt_in_cov', 'delta_uti'
]

KeyboardInterrupt: 

# EDA

In [17]:
# Build a Sweetviz profiling report for df and render it as an HTML file.
my_report = sv.analyze(df)
my_report.show_html()

                                             |          | [  0%]   00:00 -> (? left)

Report SWEETVIZ_REPORT.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


# Mindmap hypothesis

1. People who did not take the vaccine account for the highest number of COVID cases.
2. People who live in urban areas have a higher proportion of COVID cases.
3. People who work with birds (aves) and swine (suínos) have a higher proportion of respiratory diseases.
4. Older people have a higher tendency to develop respiratory disease.
5. People who smoke did not have any respiratory disease.
6. Colder weeks have a higher proportion of respiratory disease.
7. People whose tomography suggested COVID really had COVID.
8. People who took antivirals had less severe symptoms.