In [None]:
import pandas as pd
import numpy as np
import warnings
import os

from xgboost import plot_tree
import xgboost as xgb

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score, roc_auc_score, roc_curve, auc, confusion_matrix, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

import matplotlib.pyplot as plt
import seaborn as sns

# Silence every warning category for the whole session (this also hides
# deprecation and convergence warnings from the libraries used below).
warnings.filterwarnings('ignore')
# Use seaborn's plain "white" theme for all figures.
sns.set_style("white")

# Font configuration for matplotlib: prefer Arial for sans-serif text and
# render the minus sign as an ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['font.sans-serif'] = ['Arial']
plt.rcParams['axes.unicode_minus'] = False

# ===================================================================
# Funções de Auxílio
# ===================================================================

def is_number(s):
    """Return True when ``float(s)`` succeeds, False otherwise.

    Only ``ValueError`` and ``TypeError`` are treated as "not a number";
    any other exception raised by the conversion propagates to the caller.
    """
    try:
        float(s)
    except (ValueError, TypeError):
        return False
    else:
        return True

def col_miss(df):
    """Summarise missing values per column of ``df``.

    Returns a DataFrame with one row per column of ``df`` and the columns
    ``col`` (column name), ``missing_count`` (number of null cells) and
    ``missing_part`` (``missing_count`` divided by the number of rows),
    sorted by ``missing_count`` in descending order.
    """
    n_rows = len(df)
    summary = (
        df.isna()
        .sum(axis=0)
        .rename_axis('col')
        .reset_index(name='missing_count')
    )
    summary['missing_part'] = summary['missing_count'] / n_rows
    return summary.sort_values(by='missing_count', ascending=False)

def plot_roc(labels, predict_prob, model_name, fig, labels_name, k):
    """Draw one model's ROC curve on ``fig`` and record its legend text.

    Args:
        labels: True labels, forwarded to ``roc_curve``.
        predict_prob: Scores for the positive class, forwarded to ``roc_curve``.
        model_name: Name shown in the curve label.
        fig: Matplotlib figure that receives the curve.
        labels_name: List that collects the "<model> AUC = <value>" strings;
            it is mutated in place.
        k: Index of the curve. Even values are drawn dashed, odd values solid,
            and the line gets thinner as ``k`` grows.

    Returns:
        The same ``labels_name`` list with the new entry appended.
    """
    false_positive_rate, true_positive_rate, _ = roc_curve(labels, predict_prob)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    curve_label = f'{model_name} AUC = {roc_auc:.4f}'

    # Reuse the axes already on the figure so that successive calls overlay
    # their curves on one plot; only create an axes on the first call.
    ax = fig.axes[0] if fig.axes else fig.add_subplot(111)

    line_list = ['--', '-']
    ax.plot(false_positive_rate, true_positive_rate, line_list[k % 2],
            linewidth=1 + (1 - k / 5), label=curve_label)

    # Style through the Axes object: the pyplot helpers (plt.title, ...) act
    # on pyplot's *current* axes, which is not guaranteed to belong to ``fig``.
    ax.set_title('Curva ROC', fontsize=20)
    ax.tick_params(axis='both', labelsize=12)
    ax.set_ylabel('Taxa de Verdadeiro Positivo (TPR)', fontsize=14)
    ax.set_xlabel('Taxa de Falso Positivo (FPR)', fontsize=14)

    labels_name.append(curve_label)
    return labels_name

def show_confusion_matrix(validations, predictions):
    """Show the confusion matrix of ``predictions`` against ``validations``.

    The matrix is computed with ``confusion_matrix`` and rendered as an
    annotated seaborn heatmap in a new 6x4 figure; nothing is returned.
    """
    class_names = ['Sobrevivência', 'Óbito']
    heatmap_options = dict(
        cmap='coolwarm',
        linecolor='white',
        linewidths=1,
        xticklabels=class_names,
        yticklabels=class_names,
        annot=True,
        fmt='d',  # cells hold integer counts
    )

    counts = confusion_matrix(validations, predictions)

    plt.figure(figsize=(6, 4))
    sns.heatmap(counts, **heatmap_options)
    plt.title('Matriz de Confusão')
    plt.ylabel('Rótulo Verdadeiro')
    plt.xlabel('Rótulo Previsto')
    plt.show()

# ===================================================================
# Funções Principais de Pré-processamento e Modelagem
# ===================================================================

def data_preprocess(train_path, test_path, target_col='EVOLUCAO'):
    """Load the train/test CSV files and build model-ready feature matrices.

    Processing steps:
      1. Read both CSV files and downcast int64/float64 columns.
      2. Drop every column whose share of missing values in the *training*
         data exceeds 20% (the target column is never dropped).
      3. Force a fixed list of columns to ``object`` dtype so they are
         one-hot encoded, then fill remaining missing values with -1.
      4. One-hot encode the object/category columns and align the test
         columns to the training feature columns.

    Args:
        train_path: Path of the training CSV (must contain ``target_col``).
        test_path: Path of the test CSV.
        target_col: Name of the label column in the training data.

    Returns:
        ``(X_train, y_train, X_test)``. When either file does not exist an
        error message is printed and ``(None, None, None)`` is returned.

    Raises:
        KeyError: If ``target_col`` is not a column of the training data.
    """
    if not (os.path.exists(train_path) and os.path.exists(test_path)):
        print("Erro: Arquivos 'train.csv' ou 'test.csv' não encontrados no caminho especificado.")
        print(f"Por favor, verifique se a pasta 'data' está no mesmo diretório do seu script e contém os arquivos: {os.getcwd()}/data/")
        return None, None, None

    # low_memory=False makes pandas infer each column's dtype from the whole
    # file instead of chunk by chunk.
    train_df = pd.read_csv(train_path, low_memory=False)
    test_df = pd.read_csv(test_path, low_memory=False)

    def downcast_df_types(df):
        """Shrink int64/float64 columns to the smallest dtype that fits."""
        for col in df.columns:
            if df[col].dtype == 'int64':
                df[col] = pd.to_numeric(df[col], downcast='integer')
            elif df[col].dtype == 'float64':
                df[col] = pd.to_numeric(df[col], downcast='float')
        return df

    train_df = downcast_df_types(train_df)
    test_df = downcast_df_types(test_df)

    # Columns removed because more than 20% of their training values are
    # missing. The decision uses training data only and is applied to both
    # frames; the target is always kept.
    train_missing = col_miss(train_df)
    cols_to_drop = train_missing[train_missing['missing_part'] > 0.20]['col'].tolist()
    if target_col in cols_to_drop:
        cols_to_drop.remove(target_col)

    print(f"Colunas removidas devido a mais de 20% de valores ausentes: {cols_to_drop}")
    train_df = train_df.drop(columns=[col for col in cols_to_drop if col in train_df.columns])
    test_df = test_df.drop(columns=[col for col in cols_to_drop if col in test_df.columns])

    # These columns are always treated as categorical, whatever dtype
    # read_csv inferred: casting them to object makes get_dummies pick them up.
    # NOTE(review): DT_NOTIFIC and ID_MN_RESI look like high-cardinality
    # columns; one-hot encoding them may create a very wide frame - confirm
    # this is intended.
    category_cols_to_convert = ['DT_NOTIFIC', 'CS_SEXO', 'ID_MN_RESI', 'SG_UF']
    forced_train = [col for col in category_cols_to_convert if col in train_df.columns]
    forced_test = [col for col in category_cols_to_convert if col in test_df.columns]
    train_df[forced_train] = train_df[forced_train].astype('object')
    test_df[forced_test] = test_df[forced_test].astype('object')

    # NOTE(review): this also turns a missing target into the label -1 -
    # confirm whether such rows should be removed instead.
    train_df = train_df.fillna(-1)
    test_df = test_df.fillna(-1)

    categorical_cols_train = train_df.select_dtypes(include=['object', 'category']).columns
    categorical_cols_test = test_df.select_dtypes(include=['object', 'category']).columns

    train_df = pd.get_dummies(train_df, columns=categorical_cols_train, drop_first=True)
    # The test frame is encoded WITHOUT drop_first: the level dropped as
    # baseline must be the one chosen on the training data. Dropping the test
    # frame's own first level could remove a level that has a training column
    # and silently encode those rows as all zeros.
    test_df = pd.get_dummies(test_df, columns=categorical_cols_test)

    X_train = train_df.drop(columns=[target_col])
    y_train = train_df[target_col].astype(int)

    # Keep exactly the training features, in training order: unseen test
    # levels are discarded and levels absent from the test data become 0.
    X_test = test_df.reindex(columns=X_train.columns, fill_value=0)

    return X_train, y_train, X_test

def StratifiedKFold_func(x, y, model, num_iter=10, score_type='auc'):
    """Score ``model`` with 5-fold stratified cross-validation.

    Args:
        x: Feature DataFrame (rows are selected with ``.iloc``).
        y: Label Series aligned with ``x``.
        model: Estimator exposing ``fit`` and ``predict`` / ``predict_proba``;
            the same object is refitted on every fold.
        num_iter: Accepted for interface compatibility; not used by the body.
        score_type: ``'auc'`` scores ROC AUC on the positive-class
            probability; any other value scores F1 on the predicted labels.

    Returns:
        ``(mean, std)`` of the per-fold scores.
    """
    use_auc = score_type == 'auc'
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    fold_scores = []
    for fit_rows, holdout_rows in splitter.split(x, y):
        x_holdout = x.iloc[holdout_rows]
        y_holdout = y.iloc[holdout_rows]

        model.fit(x.iloc[fit_rows], y.iloc[fit_rows])

        if use_auc:
            positive_proba = model.predict_proba(x_holdout)[:, 1]
            score = roc_auc_score(y_holdout, positive_proba)
        else:
            score = f1_score(y_holdout, model.predict(x_holdout))
        fold_scores.append(score)

    return np.mean(fold_scores), np.std(fold_scores)

def features_selection(X_train, y_train, num_features=10):
    """Rank features by XGBoost importance and return the top ones.

    An ``XGBClassifier`` is fitted on the full training data, the features
    are ordered by ``feature_importances_`` (descending), the best
    ``num_features`` rows are printed and shown as a bar chart, and their
    column names are returned as a list.
    """
    print("Iniciando a seleção de features...")

    xgb_params = dict(
        max_depth=4,
        learning_rate=0.2,
        reg_lambda=1,
        n_estimators=150,
        subsample=0.9,
        colsample_bytree=0.9,
        random_state=42,
    )
    ranker = xgb.XGBClassifier(**xgb_params)
    ranker.fit(X_train, y_train)

    ranking = pd.DataFrame({
        'col': X_train.columns,
        'importance': ranker.feature_importances_,
    })
    ranking = ranking.sort_values(by='importance', ascending=False)
    best_rows = ranking.head(num_features)

    print(f"As {num_features} features mais importantes são:")
    print(best_rows)

    top_cols = best_rows['col'].tolist()

    # Horizontal bar chart of the selected features only.
    plt.figure(figsize=(10, 8))
    sns.barplot(x='importance', y='col', data=best_rows)
    plt.title('Importância das Features (XGBoost)', fontsize=16)
    plt.xlabel('Importância Relativa', fontsize=12)
    plt.ylabel('Features', fontsize=12)
    plt.tight_layout()
    plt.show()

    return top_cols

def train_and_predict(X_train, y_train, X_test, selected_cols,
                      target_col='EVOLUCAO', output_path='submission.csv'):
    """Fit the final XGBoost model and write the test predictions to CSV.

    Args:
        X_train: Training feature DataFrame.
        y_train: Training labels.
        X_test: Test feature DataFrame; must contain ``selected_cols``.
        selected_cols: Names of the features the model is trained on.
        target_col: Name of the prediction column in the output file.
            Defaults to ``'EVOLUCAO'``, the previously hard-coded name.
        output_path: Where the CSV is written. Defaults to
            ``'submission.csv'``, the previously hard-coded path.

    Returns:
        ``(final_model, submission_df)``: the fitted classifier and the
        one-column DataFrame that was written to ``output_path``.
    """
    print("Iniciando o treinamento do modelo final e a previsão...")

    X_train_sel = X_train[selected_cols]
    X_test_sel = X_test[selected_cols]

    final_model = xgb.XGBClassifier(
        max_depth=4, learning_rate=0.2, reg_lambda=1, n_estimators=150,
        subsample=0.9, colsample_bytree=0.9, random_state=42
    )
    final_model.fit(X_train_sel, y_train)

    predictions = final_model.predict(X_test_sel)

    # index=False: the file holds only the prediction column.
    submission_df = pd.DataFrame({target_col: predictions})
    submission_df.to_csv(output_path, index=False)

    print(f"Arquivo '{output_path}' gerado com sucesso!")

    return final_model, submission_df

# ===================================================================
# Execução Principal
# ===================================================================

if __name__ == '__main__':
    # Script entry point: preprocess, select features, cross-validate, then
    # train the final model and write the submission file.

    TRAIN_PATH = 'data/train.csv'
    TEST_PATH = 'data/test.csv'

    X_train, y_train, X_test = data_preprocess(TRAIN_PATH, TEST_PATH)

    # data_preprocess returns None values when an input file is missing.
    if X_train is not None and y_train is not None and X_test is not None:
        selected_cols = features_selection(X_train, y_train)

        print("\n--- Validação Cruzada com o Modelo e Features Selecionadas ---")

        # Cross-validate with the same hyperparameters that train_and_predict
        # uses, so the reported score describes the model that is submitted
        # rather than a default-configured classifier.
        # NOTE(review): the features were selected on the full training set
        # before this split, so the score may be optimistic.
        cv_model = xgb.XGBClassifier(
            max_depth=4, learning_rate=0.2, reg_lambda=1, n_estimators=150,
            subsample=0.9, colsample_bytree=0.9, random_state=42
        )
        mean_f1, std_f1 = StratifiedKFold_func(X_train[selected_cols], y_train, cv_model, score_type='f1')

        print(f"F1-Score Médio (validação cruzada): {mean_f1:.4f} (+/- {std_f1:.4f})")

        model, submission = train_and_predict(X_train, y_train, X_test, selected_cols)

        print("\n--- Previsões geradas com sucesso. Seu arquivo de submissão está pronto! ---")

Colunas removidas devido a mais de 20% de valores ausentes: ['OBES_IMC', 'PUERPERA', 'SIND_DOWN', 'HEPATICA', 'IMUNODEPRE', 'RENAL', 'PNEUMOPATI', 'OBESIDADE', 'FADIGA', 'DIABETES', 'CARDIOPATI', 'FATOR_RISC', 'CS_ESCOL_N', 'VOMITO', 'DIARREIA', 'GARGANTA', 'VACINA']
Iniciando a seleção de features...


MemoryError: Unable to allocate 1.90 MiB for an array with shape (498320,) and data type float32