In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from skopt import BayesSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks, RandomUnderSampler
from sklearn.impute import SimpleImputer, KNNImputer

In [2]:
df = pd.read_csv("titanic/train.csv")

In [3]:
imputation_methods = {
    "Média": SimpleImputer(strategy='mean'),
    "Moda": SimpleImputer(strategy='most_frequent'),
    "KNNImputer": KNNImputer(n_neighbors=5)
}

In [4]:
results = {}

In [5]:
for method_name, imputer in imputation_methods.items():
    df_copy = df.copy()
    df_copy.drop(columns=['Cabin', 'Name', 'Ticket', 'PassengerId'], inplace=True)
    
    df_copy[['Age']] = imputer.fit_transform(df_copy[['Age']])
    df_copy[['Embarked']] = df_copy[['Embarked']].fillna(df_copy['Embarked'].mode()[0])

    label_encoder = LabelEncoder()
    df_copy['Sex'] = label_encoder.fit_transform(df_copy['Sex'])
    df_copy['Embarked'] = label_encoder.fit_transform(df_copy['Embarked'])

    X = df_copy.drop(columns=['Survived'])
    y = df_copy['Survived']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    rf_search = BayesSearchCV(
        RandomForestClassifier(random_state=42),
        {
            'n_estimators': (50, 300),
            'max_depth': (1, 20),
            'min_samples_split': (2, 20),
            'min_samples_leaf': (1, 20)
        },
        n_iter=30, cv=5, n_jobs=-1, random_state=42
    )

    rf_search.fit(X_train, y_train)
    rf_best = rf_search.best_estimator_
    rf_preds = rf_best.predict(X_test)

    results[method_name] = classification_report(y_test, rf_preds, output_dict=True)
    

In [6]:
for method_name, metrics in results.items():
    print(f"\nMétodo de Imputação: {method_name}")
    print("Precisão:", metrics['weighted avg']['precision'])
    print("Recall:", metrics['weighted avg']['recall'])
    print("F1-score:", metrics['weighted avg']['f1-score'])


Método de Imputação: Média
Precisão: 0.8324681531286563
Recall: 0.8324022346368715
F1-score: 0.8306889154103546

Método de Imputação: Moda
Precisão: 0.7987081005586593
Recall: 0.7988826815642458
F1-score: 0.7961666703468986

Método de Imputação: KNNImputer
Precisão: 0.8324681531286563
Recall: 0.8324022346368715
F1-score: 0.8306889154103546
