In [7]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from skopt import BayesSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import TomekLinks, RandomUnderSampler
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

In [8]:
# Carregando a base Titanic
df = pd.read_csv("https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv")


In [9]:
# Tratamento inicial dos dados
df = df[['Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked']]
df.dropna(subset=['Embarked'], inplace=True)
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
df['Embarked'] = df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

# Separando features e labels
X = df.drop(columns=['Survived'])
y = df['Survived']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(subset=['Embarked'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Embarked'] = df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})


In [10]:
# Divisão treino/teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### QUESTÃO 1: OTIMIZAÇÃO DE HIPERPARÂMETROS

In [12]:
param_grid_rf = {'n_estimators': (10, 200), 'max_depth': (1, 20)}
param_grid_dt = {'max_depth': (1, 20), 'min_samples_split': (2, 10)}

rf_search = BayesSearchCV(RandomForestClassifier(), param_grid_rf, n_iter=30, cv=5)
dt_search = BayesSearchCV(DecisionTreeClassifier(), param_grid_dt, n_iter=30, cv=5)

rf_search.fit(X_train, y_train)
dt_search.fit(X_train, y_train)

# Melhores modelos
rf_best = rf_search.best_estimator_
dt_best = dt_search.best_estimator_

# Avaliação
y_pred_rf = rf_best.predict(X_test)
y_pred_dt = dt_best.predict(X_test)

print("Random Forest - Acurácia:", accuracy_score(y_test, y_pred_rf))
print("Árvore de Decisão - Acurácia:", accuracy_score(y_test, y_pred_dt))



Random Forest - Acurácia: 0.8258426966292135
Árvore de Decisão - Acurácia: 0.8202247191011236


### QUESTÃO 2: BALANCEAMENTO DE DADOS

In [14]:
# Imputação de valores ausentes antes de aplicar SMOTE
imp_mean = SimpleImputer(strategy='mean')
X_train_imputed = imp_mean.fit_transform(X_train)

# SMOTE
smote = SMOTE()
X_res, y_res = smote.fit_resample(X_train_imputed, y_train)

# TomekLinks
tomek = TomekLinks()
X_res_tl, y_res_tl = tomek.fit_resample(X_train_imputed, y_train)

# RandomUnderSampler
under = RandomUnderSampler()
X_res_under, y_res_under = under.fit_resample(X_train_imputed, y_train)

# ADASYN (Substituto para DSTO-GAN)
adasyn = ADASYN()
X_res_adasyn, y_res_adasyn = adasyn.fit_resample(X_train_imputed, y_train)

# Avaliação com modelos balanceados
for name, X_bal, y_bal in zip(["SMOTE", "TomekLinks", "RandomUnderSampler", "ADASYN"],
                               [X_res, X_res_tl, X_res_under, X_res_adasyn],
                               [y_res, y_res_tl, y_res_under, y_res_adasyn]):
    model = RandomForestClassifier().fit(X_bal, y_bal)
    y_pred = model.predict(X_test)
    print(f"{name} - Precisão:", precision_score(y_test, y_pred))
    print(f"{name} - Recall:", recall_score(y_test, y_pred))
    print(f"{name} - F1-Score:", f1_score(y_test, y_pred))



SMOTE - Precisão: 0.75
SMOTE - Recall: 0.8260869565217391
SMOTE - F1-Score: 0.7862068965517242




TomekLinks - Precisão: 0.7307692307692307
TomekLinks - Recall: 0.8260869565217391
TomekLinks - F1-Score: 0.7755102040816326




RandomUnderSampler - Precisão: 0.6741573033707865
RandomUnderSampler - Recall: 0.8695652173913043
RandomUnderSampler - F1-Score: 0.759493670886076
ADASYN - Precisão: 0.7402597402597403
ADASYN - Recall: 0.8260869565217391
ADASYN - F1-Score: 0.7808219178082192




### QUESTÃO 3: IMPUTAÇÃO DE DADOS AUSENTES

In [15]:
# Criando dados com valores ausentes para simulação
X_missing = X.copy()
X_missing.loc[np.random.randint(0, X.shape[0], 20), 'Age'] = np.nan

# Imputação por Média
imp_mean = SimpleImputer(strategy='mean')
X_imputed_mean = imp_mean.fit_transform(X_missing)

# Imputação por Moda
imp_mode = SimpleImputer(strategy='most_frequent')
X_imputed_mode = imp_mode.fit_transform(X_missing)

# Imputação por KNN
imp_knn = KNNImputer(n_neighbors=3)
X_imputed_knn = imp_knn.fit_transform(X_missing)

# Avaliação
models = {
    "Média": X_imputed_mean,
    "Moda": X_imputed_mode,
    "KNN": X_imputed_knn
}

for name, X_imp in models.items():
    model = RandomForestClassifier().fit(X_imp, y)
    y_pred = model.predict(X_test)
    print(f"{name} - Acurácia:", accuracy_score(y_test, y_pred))



Média - Acurácia: 0.9775280898876404




Moda - Acurácia: 0.9719101123595506
KNN - Acurácia: 0.9719101123595506


