# 🧠 Exercício: Naive Bayes e Árvores de Decisão (Caso Titanic)
Este notebook realiza o pré-processamento da base **Titanic** e treina modelos de **Naive Bayes**, **Árvores de Decisão** e **Random Forests**.

Etapas:
1. Pré-processamento da base Titanic
2. Treino do Naive Bayes Gaussiano
3. Treino de duas Árvores de Decisão com parâmetros diferentes
4. Treino de duas Random Forests com tamanhos diferentes
5. Plotagem de uma árvore e análise de desempenho

In [None]:
!pip install seaborn scikit-learn matplotlib pandas numpy --quiet

In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report

## 📂 1. Carregar a base Titanic

In [None]:
# Load the "titanic" example dataset through seaborn and preview the first rows.
df = sns.load_dataset("titanic")
print("✅ Dataset Titanic carregado com sucesso!")
print(df.head(3))

# Label column and the predictor columns used by every model below.
target = 'survived'
features = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']

# Split the frame into the feature matrix X and the label vector y.
X, y = df[features], df[target]

## ⚙️ 2. Pré-processamento dos dados

In [None]:
# Columns are routed to one of two sub-pipelines depending on their kind.
numeric_features = ['age', 'sibsp', 'parch', 'fare']
categorical_features = ['pclass', 'sex', 'embarked']

# Numeric columns: fill missing values with the median, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the mode, then one-hot encode.
# handle_unknown='ignore' keeps transform() from failing on categories that
# were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# sparse_threshold=0 forces a dense output array. OneHotEncoder emits a sparse
# matrix and ColumnTransformer would otherwise keep the stacked result sparse
# whenever its density drops below the default threshold; GaussianNB cannot
# consume sparse input, so the dense format is pinned explicitly instead of
# depending on the current column mix.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    sparse_threshold=0,
)

# Hold out 25% of the rows for testing; stratify on y so both splits keep the
# same class proportions, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

## 🤖 3. Treinar modelos

In [None]:
from sklearn.base import clone


def _fit_and_evaluate(estimator):
    """Train ``estimator`` behind its own copy of the preprocessor.

    Pipeline does not clone its steps by default, so placing the same
    ``preprocessor`` object in several pipelines makes them all share (and
    repeatedly re-fit) one transformer. Cloning gives every pipeline an
    independent, unfitted copy.

    Returns a tuple ``(fitted_pipeline, test_predictions, test_accuracy)``.
    """
    pipeline = Pipeline([('preprocess', clone(preprocessor)), ('model', estimator)])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    return pipeline, y_pred, accuracy_score(y_test, y_pred)


# Test-set accuracy of every model, keyed by a display name.
results = {}

# Naive Bayes
gnb, y_pred_gnb, results['GaussianNB'] = _fit_and_evaluate(GaussianNB())

# Decision trees: a shallow gini tree (A) and an unbounded entropy tree (B).
tree_a, y_pred_tree_a, results['DecisionTree_A'] = _fit_and_evaluate(
    DecisionTreeClassifier(max_depth=3, criterion='gini', random_state=42)
)
tree_b, y_pred_tree_b, results['DecisionTree_B'] = _fit_and_evaluate(
    DecisionTreeClassifier(max_depth=None, criterion='entropy', random_state=42)
)

# Random forests with 50 and 200 trees.
rf_small, y_pred_rf_small, results['RandomForest_50'] = _fit_and_evaluate(
    RandomForestClassifier(n_estimators=50, random_state=42)
)
rf_large, y_pred_rf_large, results['RandomForest_200'] = _fit_and_evaluate(
    RandomForestClassifier(n_estimators=200, random_state=42)
)

print("🎯 Acurácias:")
for k, v in results.items():
    print(f"{k:20s}: {v:.4f}")

# Name of the most accurate model within each family (first one wins on ties).
best_tree_key = max([k for k in results if k.startswith('DecisionTree')], key=lambda k: results[k])
best_rf_key = max([k for k in results if k.startswith('RandomForest')], key=lambda k: results[k])

## 📊 4. Matrizes de Confusão

In [None]:
def plot_conf_matrix(y_true, y_pred, title):
    """Plot the confusion matrix of ``y_pred`` against ``y_true`` and show it.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        title: Text used as the figure title.
    """
    matrix = confusion_matrix(y_true, y_pred)
    ConfusionMatrixDisplay(matrix).plot(cmap='Blues')
    plt.title(title)
    plt.show()

# Look predictions up by the same key that goes into the title, so the plot
# always shows the model it is labelled with (previously the second tree and
# the larger forest were plotted regardless of which variant scored best).
predictions_by_model = {
    'DecisionTree_A': y_pred_tree_a,
    'DecisionTree_B': y_pred_tree_b,
    'RandomForest_50': y_pred_rf_small,
    'RandomForest_200': y_pred_rf_large,
}

plot_conf_matrix(y_test, y_pred_gnb, "Confusion Matrix - GaussianNB")
plot_conf_matrix(y_test, predictions_by_model[best_tree_key], f"Confusion Matrix - {best_tree_key}")
plot_conf_matrix(y_test, predictions_by_model[best_rf_key], f"Confusion Matrix - {best_rf_key}")

## 🌳 5. Visualizar uma Árvore de Decisão

In [None]:
# Pull the fitted classifier and the fitted one-hot encoder out of the
# entropy-tree pipeline.
tree_steps = tree_b.named_steps
final_tree = tree_steps['model']
fitted_onehot = tree_steps['preprocess'].named_transformers_['cat'].named_steps['onehot']

# The preprocessor lists the numeric block before the categorical block, so
# the feature names are the numeric column names followed by the one-hot names.
cat_names = fitted_onehot.get_feature_names_out(categorical_features)
feature_names = [*numeric_features, *cat_names]

plt.figure(figsize=(20, 10))
plot_tree(
    final_tree,
    feature_names=feature_names,
    class_names=['Não Sobreviveu', 'Sobreviveu'],
    filled=True,
    rounded=True,
    fontsize=8,
)
plt.title("Árvore de Decisão (criterion='entropy')")
plt.show()

## 🧾 6. Relatórios de Classificação

In [None]:
# Report on the predictions that belong to the model named in each heading
# (previously the second tree and the larger forest were always reported, even
# when the other variant was selected as best).
predictions_by_model = {
    'DecisionTree_A': y_pred_tree_a,
    'DecisionTree_B': y_pred_tree_b,
    'RandomForest_50': y_pred_rf_small,
    'RandomForest_200': y_pred_rf_large,
}

print("\nGaussianNB:\n", classification_report(y_test, y_pred_gnb))
print(f"\n{best_tree_key}:\n", classification_report(y_test, predictions_by_model[best_tree_key]))
print(f"\n{best_rf_key}:\n", classification_report(y_test, predictions_by_model[best_rf_key]))