# Imports and helper functions

In [None]:
import pandas as pd
from swifter import swifter
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import awswrangler as wr
from sklearn.compose import ColumnTransformer,make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,FunctionTransformer, MinMaxScaler
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder
from feature_engine.encoding import CountFrequencyEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import TransformerMixin
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.inspection import permutation_importance
from sklearn import model_selection, metrics
import sagemaker
import awscli
import os
import s3fs
import joblib
import boto3
import io



# Credentials and region are read from the environment; os.getenv returns
# None for any variable that is not set.
AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY')
# NOTE(review): the region comes from a lowercase `region` variable, while
# consult_table_athena hard-codes 'us-east-1' — confirm the two agree.
region = os.getenv('region')
# boto3 session built from the values above; this notebook uses it to create
# the SageMaker client.
session = boto3.Session(
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    region_name=region
)


# s3fs filesystem handle built from the same key pair (anonymous access off).
s3 = s3fs.S3FileSystem(
    anon=False, key=AWS_ACCESS_KEY_ID, secret=AWS_SECRET_ACCESS_KEY)


def data_description(df):
    """
    Print a quick structural summary of a DataFrame.

    The summary lists the dtype of every column, the number of rows and
    columns, and, per column, whether it holds missing values and how many.

    Parameters:
    - df (DataFrame): Input DataFrame

    Returns:
    - None: the summary is written to standard output.
    """
    print('Variables:\n\n{}'.format(df.dtypes), end='\n\n')
    print('Number of rows {}'.format(df.shape[0]), end='\n\n')
    print('Number of columns {}'.format(df.shape[1]), end='\n\n')
    # `end` is an argument of print(), not of str.format(); it used to be
    # passed to format(), where it was silently ignored.
    print('NA analysis', end='\n')
    for col in df.columns:
        # Build the missing-value mask once and reuse it for both figures.
        na_mask = df[col].isna()
        print('column {}: {} {}'.format(col, na_mask.any(), na_mask.sum()))

def consult_table_athena(database, table):
    """
    Read every row and column of an Athena table into a DataFrame.

    Parameters:
    - database (str): Athena database name; also passed as the query's database.
    - table (str): Table name inside that database.

    Returns:
    - DataFrame: Result of `SELECT *` on `database.table`.
    """
    # These are assignments on awswrangler's shared config object, so they
    # stay in effect after this function returns.
    wr.config.aws_profile = 'default'
    wr.config.region = 'us-east-1'

    # The names are interpolated into the SQL text as-is, so they must be
    # trusted identifiers.
    return wr.athena.read_sql_query(
        f"SELECT * FROM {database}.{table}",
        database=database,
    )


def unique_values_columns(df):
    """
    Collect the distinct values of every object-dtype column of a DataFrame.

    Parameters:
    - df (DataFrame): Input DataFrame

    Returns:
    - dict: Maps each object column name to the list of its unique values,
      in the order `Series.unique()` returns them.
    """

    # Columns of any other dtype are ignored.
    text_frame = df.select_dtypes(include=['object'])

    distinct = {}
    for name in text_frame.columns:
        distinct[name] = df[name].unique().tolist()

    return distinct

def reduce_mem_usage(df, verbose=True):
    """
    Downcast numeric columns, in place, to the narrowest dtype that fits.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/
    float64 are touched. Bounds are compared strictly, so a column whose
    minimum or maximum equals a type's limit moves to the next wider type.
    Integer columns that fit none of the candidates keep their dtype; float
    columns that fit neither float16 nor float32 are cast to float64.

    Parameters:
    - df (DataFrame): Frame to shrink; its columns are reassigned in place.
    - verbose (bool): When true, print the final size and the reduction.

    Returns:
    - DataFrame: The same `df` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    mem_before = df.memory_usage().sum() / 1024**2
    for name in df.columns:
        dtype = df[name].dtypes
        if dtype not in numerics:
            continue

        low = df[name].min()
        high = df[name].max()
        if str(dtype).startswith('int'):
            # First integer type whose open interval contains [low, high].
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if low > limits.min and high < limits.max:
                    df[name] = df[name].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if low > limits.min and high < limits.max:
                    df[name] = df[name].astype(candidate)
                    break
            else:
                # Neither narrow float type fits.
                df[name] = df[name].astype(np.float64)

    mem_after = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(mem_after, 100 * (mem_before - mem_after) / mem_before))
    return df

# Loading Data

In [None]:
# Loading raw data from athena
# Balancing classes
# Separate train and test data


# DATABASE = 'respiratory_db'
# TABLE = 'table_respiratory_traintrain_data'
# df = consult_table_athena(DATABASE, TABLE)

# df.to_parquet('data.parquet', index=False)

# df_raw['classi_fin'].value_counts().sort_index(ascending=True)
# class1 = df_raw[df_raw['classi_fin'] == 1].sample(31437, random_state=42)
# class2 = df_raw[df_raw['classi_fin'] == 2]
# class3 = df_raw[df_raw['classi_fin'] == 3]
# class4 = df_raw[df_raw['classi_fin'] == 4].sample(31437, random_state=42)
# class5 = df_raw[df_raw['classi_fin'] == 5].sample(31437, random_state=42)
# df_raw = pd.concat([class1, class2, class3, class4, class5], ignore_index=True)

# train, test = train_test_split(df_raw, test_size=0.15, random_state=42)

# train.to_parquet('train.parquet', index=False)
# test.to_parquet('test.parquet', index=False)

In [None]:
# Load the train parquet file; the path is relative to the notebook's
# working directory.
df = pd.read_parquet('../data/train.parquet')

## Data description

In [None]:
# Lowercase every column name; the cells below refer to columns in lowercase.
df.columns = df.columns.str.lower()

In [None]:
# Print dtypes, shape and per-column missing-value counts.
data_description(df)

In [None]:
# Downcast numeric columns; the dtypes produced here are what the later
# select_dtypes calls filter on.
df = reduce_mem_usage(df)

In [None]:
# Summary statistics, transposed so that each column of df becomes a row.
df.describe().T

### Data Cleaning / NA analysis / Outliers analysis

In [None]:
# cs_sexo holds an 'I' value besides the male and female codes; those rows
# are dropped because there are only 188 of them.
# .copy() makes the filtered frame independent, so the column assignment
# below writes to df itself instead of to a slice of the unfiltered data.
df = df.loc[df['cs_sexo'] != 'I'].copy()
df['cs_sexo'] = df['cs_sexo'].astype('category')

df = df.drop('delta_uti', axis=1)

# Keep only rows with a strictly positive age. The parentheses are required:
# `~` binds tighter than `<=`, so without them the ages were bitwise-inverted
# first and every row with age >= -1 was kept.
df = df[~(df['nu_idade_n'] <= 0)]

# Remove demographic categories
df = df.drop(
    [
        'sg_uf_not',
        'id_regiona',
        'co_regiona',
        'id_municip',
        'co_mun_not',
        'sg_uf',
        'cod_idade',
        'cs_escol_n',
    ],
    axis=1,
)

# In each of these columns the largest value present is replaced by the most
# frequent one. The result is assigned back to the column: an in-place
# replace on `df[col]` acts on a temporary object and does not update df
# under pandas Copy-on-Write.
for col in ('tp_amostra', 'dor_abd', 'perd_olft'):
    df[col] = df[col].replace(df[col].max(), df[col].mode()[0])

In [None]:
# NOTE(review): despite its name, float_cols also selects int64 columns.
float_cols = df.select_dtypes(['float16','float32','int64']).columns
# Fill the missing values of each selected column with the first value
# returned by that column's mode().
df[float_cols] = df[float_cols].swifter.apply(
    lambda x: x.fillna(x.mode()[0])
)

# NOTE(review): assumes every value in these columns fits in int8 — TODO confirm.
df[float_cols] = df[float_cols].astype('int8')

# Every int64/int8 column except sem_not, sem_pri and nu_idade_n is turned
# into a category column.
int_cols = df.select_dtypes(['int64','int8']).drop(['sem_not','sem_pri','nu_idade_n'], axis=1).columns
df[int_cols] = df[int_cols].astype('category')

### Feature selection to remove unimportant features

In [None]:
# #Excluding text columns
# predictors = df.drop('classi_fin', axis=1)
# predictors = predictors.drop(predictors.select_dtypes(['object','string']), axis=1)
# response = df['classi_fin']

# X_train, X_val, y_train,y_val = train_test_split(
#     predictors, response, random_state=50, stratify=response
# )

# numerical_cols = predictors.select_dtypes(['int8']).columns.tolist()
# categorical_cols = predictors.select_dtypes(['category']).columns.tolist()

# categorical_imputer = ColumnTransformer(
#     [('cat_imputer', SimpleImputer(strategy='most_frequent'), categorical_cols)],
#     remainder='drop'
# )


# X_train_transformed = pd.DataFrame(
#     categorical_imputer.fit_transform(X_train),
#     columns=categorical_cols,
#     index=X_train.index
# )
# X_train_transformed = pd.concat([X_train_transformed,X_train[numerical_cols]],axis=1)
# X_train_transformed[categorical_cols] = X_train_transformed[categorical_cols].astype('category')

# X_val_transformed = pd.DataFrame(
#     categorical_imputer.transform(X_val),
#     columns=categorical_cols,
#     index=X_val.index
# )
# X_val_transformed = pd.concat([X_val_transformed,X_val[numerical_cols]],axis=1)
# X_val_transformed[categorical_cols] = X_val_transformed[categorical_cols].astype('category')

# categorical_transformer = Pipeline(
#     steps=[
#         ('encoder', CountFrequencyEncoder(encoding_method='frequency'))
#     ]
# )

# numerical_transformer = Pipeline(
#     steps=[
#         ('encoder', MinMaxScaler())
#     ]    
# )

# preprocessor = ColumnTransformer(
#     [
#         ('cat', categorical_transformer, categorical_cols),
#         ('num', numerical_transformer, numerical_cols)
#     ]   
# )

# X_train_transformed = pd.DataFrame(
#     preprocessor.fit_transform(X_train),
#     columns=predictors.columns,
#     index=X_train_transformed.index
# )

# X_val_transformed = pd.DataFrame(
#     preprocessor.transform(X_val),
#     columns=predictors.columns,
#     index=X_val_transformed.index
# )

# encoder = LabelEncoder()
# y_train = encoder.fit_transform(y_train)
# y_val = encoder.transform(y_val)

# classifiers = [
#     RandomForestClassifier(n_jobs=-1, random_state=42, class_weight='balanced_subsample'),
#     XGBClassifier(n_jobs=-1, random_state=42,objective='multi:softax', max_delta_step=1),
#     lgbm.LGBMClassifier(n_jobs=-1,  random_state=42, class_weight='balanced')
# ]

# for classifier in classifiers:
#     pipeline_1 = Pipeline(
#         steps= [
#         ('feature_selection', SelectFromModel(estimator=classifier))
#         ]
#     )
#     pipeline_2 = Pipeline(
#         steps= [
#         ('feature_selection', RFE(estimator=classifier))
#         ]
#     )
    
#     permutation_score = permutation_importance(
#         classifier.fit(X_train_transformed,y_train), X_val_transformed, y_val,
#         random_state=42, scoring='f1_weighted', n_repeats=10
#     )

#     importance = pd.DataFrame(
#         {'features':X_train_transformed.columns, 
#         'f1_weighted':permutation_score['importances_mean']}).sort_values(by='f1_weighted', ascending=False
#     )

#     print(
#         'model: {} \n features selected based on feature importance:{} \n\n'.format(pipeline_1['feature_selection'].estimator,
#         pipeline_1.fit(X_train_transformed,y_train).get_feature_names_out(input_features=None)) 
#         )
#     print(
#         'model: {} \n features_selected based on RFE:{} \n\n'.format(pipeline_2['feature_selection'].estimator,
#         pipeline_2.fit(X_train_transformed,y_train).get_feature_names_out(input_features=None))
#         )
#     print(importance, '\n\n\n')



# Features chosen to continue with; the list also keeps the row identifier
# ('id') and the response ('classi_fin').
feature_selected = [
    'id', 'sem_pri', 'nu_idade_n', 'saturacao', 'antiviral',
    'tp_antivir', 'hospital', 'dose_2_cov', 'dose_ref', 'classi_fin',
    'fnt_in_cov', 'uti','raiox_res', 'dor_abd', 'perd_olft', 'tomo_res',
    'cs_raca', 'cs_zona', 'perd_pala', 'dose_1_cov','vacina_cov'
]

# Restrict df to the selected columns only.
df = df[feature_selected]

# EDA

# Response analysis

In [None]:
# Readable label for each classi_fin code; any value other than 1-4 falls
# back to the covid-19 label.
classi_fin_labels = {
    1: 'SARS by influenza',
    2: 'SARS by other respiratory virus',
    3: 'SARS by another etiological agent',
    4: 'unspecified SARS',
}
df_aux = df['classi_fin'].apply(
    lambda code: classi_fin_labels.get(code, 'SARS by covid-19')
)

# Data preparation and feature selection

In [None]:
# Replace the value 9 in every category column with that column's mode.
df_category = (
    df.select_dtypes(include='category')
    .columns.to_list()
)
# First row of mode() holds one mode per column.
column_modes = df[df_category].mode().iloc[0]
# NOTE(review): a column whose mode is itself 9 is left unchanged by this loop.
for col_name in df_category:
    df[col_name] = df[col_name].replace(9, column_modes[col_name])

In [None]:
# Imported here so the cell is self-contained: the serializer makes endpoint
# requests go out as text/csv.
from sagemaker.serializers import CSVSerializer

X = df.drop(['id','classi_fin'], axis=1)
y = df['classi_fin']

# Encode the response as consecutive integers, keeping y aligned with df's
# index.
encoder = LabelEncoder()
y = pd.Series(encoder.fit_transform(y), index=y.index)


numerical_cols = X.select_dtypes(['int8']).columns.tolist()
categorical_cols = X.select_dtypes(['category']).columns.tolist()
# Column order produced by the ColumnTransformer below: categorical first.
columns_name = categorical_cols + numerical_cols

# Seeded so that the folds (and the files uploaded per fold) are reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
role = "arn:aws:iam::513734873949:role/FULL_SAGEMAKER"
bucket = 'sagemaker-traintest-respiratory-classification'
# prefix1 = 'train'
# prefix2 = 'val'
prefixprep = 'preprocessor'
prefixestimat = 'estimator'
sagemaker_session = sagemaker.Session()

# NOTE(review): every range is categorical with exactly two candidate values;
# confirm that a two-point choice (not a continuous/integer range between the
# two numbers) is what is wanted.
hyperparameter_ranges = {
    'eta': sagemaker.tuner.CategoricalParameter([0.1, 0.5]),
    'max_depth': sagemaker.tuner.CategoricalParameter([2, 9]),
    'gamma': sagemaker.tuner.CategoricalParameter([3, 10]),
    'min_child_weight': sagemaker.tuner.CategoricalParameter([8, 15]),
    'subsample': sagemaker.tuner.CategoricalParameter([0.5, 0.7])
}

# Neither of these depends on the fold, so they are created once.
container = sagemaker.image_uris.retrieve(region=sagemaker_session.boto_region_name, framework='xgboost', version='1.7-1')
sagemaker_client = session.client('sagemaker')

cv_metric = []
cv_best_estimator = []

for i, (train_index, val_index) in enumerate(skf.split(X, y)):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # A fresh preprocessor per fold, fitted on that fold's training rows only.
    categorical_transformer = Pipeline(
        steps=[
            ('encoder', CountFrequencyEncoder(encoding_method='frequency'))
        ]
    )

    numerical_transformer = Pipeline(
        steps=[
            ('encoder', MinMaxScaler())
        ]
    )

    preprocessor = ColumnTransformer(
        [
            ('cat', categorical_transformer, categorical_cols),
            ('num', numerical_transformer, numerical_cols)
        ]
    )

    X_train = pd.DataFrame(
        preprocessor.fit_transform(X_train),
        columns=columns_name,
        index=X_train.index
    )

    X_val = pd.DataFrame(
        preprocessor.transform(X_val),
        columns=columns_name,
        index=X_val.index
    )

    # Sending to S3: label first, no header, no index.
    train_data = pd.concat([y_train, X_train], axis=1)
    val_data = pd.concat([y_val, X_val], axis=1)
    # The unnamed label series arrives as column 0 after the concat.
    train_data = train_data.rename(columns={0: 'classi_fin'})
    val_data = val_data.rename(columns={0: 'classi_fin'})

    prefix1 = 'train_fold_{}'.format(i)
    prefix2 = 'val_fold_{}'.format(i)

    train_data.to_csv('train_processed_fold_{}.csv'.format(i), header=False, index=False)
    val_data.to_csv('val_processed_fold_{}.csv'.format(i), header=False, index=False)

    train_data_path = sagemaker_session.upload_data(path='train_processed_fold_{}.csv'.format(i), bucket=bucket, key_prefix=prefix1)
    val_data_path = sagemaker_session.upload_data(path='val_processed_fold_{}.csv'.format(i), bucket=bucket, key_prefix=prefix2)

    s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/'.format(bucket, prefix1), content_type='csv')
    s3_input_val = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/'.format(bucket, prefix2), content_type='csv')

    estimator = sagemaker.estimator.Estimator(
        container,
        role,
        instance_count=1,
        instance_type='ml.c5.2xlarge',
        output_path='s3://{}/{}/output_fold_{}'.format(bucket, prefixestimat, i),
        sagemaker_session=sagemaker_session,
        hyperparameters={
            'num_round': 100,
            'num_class': 5,
            'objective': 'multi:softprob'
        }
    )

    tuner = sagemaker.tuner.HyperparameterTuner(
        estimator=estimator,
        objective_metric_name='validation:mlogloss',
        hyperparameter_ranges=hyperparameter_ranges,
        max_jobs=8,
        max_parallel_jobs=3,
        objective_type='Minimize'
    )

    tuner.fit({'train': s3_input_train, 'validation': s3_input_val})

    tuner.wait()

    best_training_job_name = tuner.best_training_job()

    best_job_details = sagemaker_client.describe_training_job(TrainingJobName=best_training_job_name)
    best_hyperparameters = best_job_details['HyperParameters']

    best_estimator = sagemaker.estimator.Estimator.attach(
        best_training_job_name, sagemaker_session=sagemaker_session
    )
    predictor = best_estimator.deploy(
        initial_instance_count=1,
        instance_type='ml.m4.xlarge',
        serializer=CSVSerializer()
    )

    # Score the in-memory validation features. Re-reading the uploaded CSV
    # without header=None lost its first row and sent the label column to the
    # endpoint as if it were a feature.
    csv_payload = X_val.to_csv(header=False, index=False)
    try:
        predictions = predictor.predict(csv_payload)
    finally:
        # Always tear the endpoint down, even if the request fails, so it
        # does not keep running.
        predictor.delete_endpoint()

    # NOTE(review): the response is assumed to be text holding one probability
    # per class for every row; brackets and line breaks are tolerated. Confirm
    # against the container's actual output.
    raw = predictions.decode('utf-8') if isinstance(predictions, bytes) else str(predictions)
    tokens = raw.replace('[', ' ').replace(']', ' ').replace('\n', ',').split(',')
    probabilities = np.array(
        [float(tok) for tok in tokens if tok.strip()]
    ).reshape(len(X_val), -1)

    metric = metrics.log_loss(
        y_val, probabilities, labels=np.arange(len(encoder.classes_))
    )
    cv_best_estimator.append(best_hyperparameters)
    cv_metric.append(metric)

    print(cv_best_estimator)
    print(cv_metric)

# Machine Learning model selection

## XGBoost

In [None]:
numerical_cols = X.select_dtypes(['int8']).columns.tolist()
categorical_cols = X.select_dtypes(['category']).columns.tolist()
# Column order produced by the ColumnTransformer below: categorical first.
columns_name = categorical_cols + numerical_cols

categorical_transformer = Pipeline(
    steps=[
        ('encoder', CountFrequencyEncoder(encoding_method='frequency'))
    ]
)

numerical_transformer = Pipeline(
    steps=[
        ('encoder', MinMaxScaler())
    ]
)

preprocessor = ColumnTransformer(
    [
        ('cat', categorical_transformer, categorical_cols),
        ('num', numerical_transformer, numerical_cols)
    ]
)

# Fit on the raw rows of the last cross-validation fold. The X_train / X_val
# variables left over from the loop are no longer the raw feature rows, so
# feeding them to the preprocessor again would encode already-encoded data.
X_train = pd.DataFrame(
    preprocessor.fit_transform(X.iloc[train_index]),
    columns=columns_name,
    index=X.index[train_index]
)

X_val = pd.DataFrame(
    preprocessor.transform(X.iloc[val_index]),
    columns=columns_name,
    index=X.index[val_index]
)

# train_index / val_index are positions, not index labels. Rows were filtered
# out of df earlier, so the labels must be taken from the feature frames to
# keep y aligned with X.
encoder = LabelEncoder()
y_train = pd.Series(
    encoder.fit_transform(y.iloc[train_index]), index=X_train.index
)
y_val = pd.Series(
    encoder.transform(y.iloc[val_index]), index=X_val.index
)

In [None]:
# Bucket name and key prefixes for the processed data. These are plain
# strings; no bucket is created here.
bucket = 'sagemaker-traintest-respiratory-classification'
prefix1 = 'train'
# NOTE(review): this prefix is used for the 'validation' channel when the
# training inputs are built — confirm 'test' is the intended location.
prefix2 = 'test'
prefixestimat = 'estimator'

# # Sending to S3
# train_data = pd.concat([y_train,X_train], axis=1)
# val_data = pd.concat([y_val,X_val], axis=1)
# train_data = train_data.rename(columns={0:'classi_fin'})
# val_data = val_data.rename(columns={0:'classi_fin'})

# train_data.to_csv('train_processed.csv', header=False, index=False)
# val_data.to_csv('validation.csv', header=False, index=False)

In [None]:
sagemaker_session = sagemaker.Session()

# train_data_path = sagemaker_session.upload_data(path='train_processed.csv', bucket=bucket, key_prefix=prefix1)
# val_data_path = sagemaker_session.upload_data(path='validation.csv', bucket=bucket, key_prefix=prefix2)

# CSV training inputs pointing at the two prefixes. The uploads above are
# commented out, so the files are presumably already in the bucket — verify.
s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/'.format(bucket, prefix1), content_type='csv')
s3_input_val = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/'.format(bucket, prefix2), content_type='csv')

In [None]:
role = "arn:aws:iam::513734873949:role/FULL_SAGEMAKER"

container = sagemaker.image_uris.retrieve(region=sagemaker_session.boto_region_name, framework='xgboost', version='1.7-1')


# The built-in XGBoost algorithm requires num_round, and a multi-class
# objective requires num_class. The values match the base hyperparameters of
# the cross-validation estimators, so this model is trained for the same
# 5-class problem.
# NOTE(review): the tuned values collected in cv_best_estimator are not
# applied here — add them if the final model should use them.
estimator = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path='s3://{}/{}/output'.format(bucket, prefixestimat),
    sagemaker_session=sagemaker_session,
    hyperparameters={
        'num_round': 100,
        'num_class': 5,
        'objective': 'multi:softprob'
    }
)

estimator.fit({'train': s3_input_train, 'validation': s3_input_val})
# The endpoint created here is not deleted anywhere in this cell.
predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.t2.large')