In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from skopt import BayesSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks, RandomUnderSampler

In [3]:

df = pd.read_csv("titanic/train.csv")


In [4]:

df['Age'].fillna(df['Age'].median(), inplace=True)
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
df.drop(columns=['Cabin', 'Name', 'Ticket', 'PassengerId'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


In [5]:

label_encoder = LabelEncoder()
df['Sex'] = label_encoder.fit_transform(df['Sex'])
df['Embarked'] = label_encoder.fit_transform(df['Embarked'])


In [6]:

X = df.drop(columns=['Survived'])
y = df['Survived']


In [7]:

samplers = {
    "Original": (X, y),
    "SMOTE": SMOTE().fit_resample(X, y),
    "TomekLinks": TomekLinks().fit_resample(X, y),
    "RandomUnderSampler": RandomUnderSampler().fit_resample(X, y)
}


In [8]:

results = {}

In [9]:

for method, (X_bal, y_bal) in samplers.items():
    X_train, X_test, y_train, y_test = train_test_split(X_bal, y_bal, test_size=0.2, random_state=42)
    
    rf_search = BayesSearchCV(
        RandomForestClassifier(random_state=42),
        {
            'n_estimators': (50, 300),
            'max_depth': (1, 20),
            'min_samples_split': (2, 20),
            'min_samples_leaf': (1, 20)
        },
        n_iter=30, cv=5, n_jobs=-1, random_state=42
    )
    
    rf_search.fit(X_train, y_train)
    rf_best = rf_search.best_estimator_
    rf_preds = rf_best.predict(X_test)
    
    results[method] = classification_report(y_test, rf_preds, output_dict=True)

In [10]:

for method, metrics in results.items():
    print(f"\nMétodo: {method}")
    print("Precisão:", metrics['weighted avg']['precision'])
    print("Recall:", metrics['weighted avg']['recall'])
    print("F1-score:", metrics['weighted avg']['f1-score'])


Método: Original
Precisão: 0.8230219351142041
Recall: 0.8212290502793296
F1-score: 0.8181737439049303

Método: SMOTE
Precisão: 0.8636363636363636
Recall: 0.8636363636363636
F1-score: 0.8636363636363636

Método: TomekLinks
Precisão: 0.880488186512283
Recall: 0.8795180722891566
F1-score: 0.877775251951201

Método: RandomUnderSampler
Precisão: 0.8102189781021898
Recall: 0.8102189781021898
F1-score: 0.8102189781021898
