# 02 - Engenharia de Features | Feature Engineering

PT-BR: Este notebook demonstra geração de novas features (polinomiais, interações, escalonamento, encoding), uso de Pipeline do scikit-learn, seleção de features, validação e visualização.

EN: This notebook demonstrates generating new features (polynomial, interactions, scaling, encoding), using scikit-learn Pipelines, feature selection, validation, and visualization.

In [None]:
"""
PT-BR: Imports e configuração.
EN: Imports and setup.
"""
import os, sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import FunctionTransformer
# Current working directory at the moment this cell runs.
ROOT = Path.cwd()
# IPython magic (only valid inside IPython/Jupyter): sets the inline
# backend's figure format to 'retina'.
%config InlineBackend.figure_format = 'retina'
# Global seaborn theme used by the plots below: white grid, 'notebook' context.
sns.set(style='whitegrid', context='notebook')


In [None]:
# 1) Synthetic data with numeric and categorical columns
def generate_dataset(n_samples=800, random_state=42, missing_frac=0.08):
    """Build a synthetic 3-class dataset with numeric and categorical columns.

    Parameters
    ----------
    n_samples : int
        Number of rows to generate.
    random_state : int
        Seed passed to ``make_classification`` and to the NumPy generator
        that draws the categorical columns and the rows set to missing.
    missing_frac : float
        Fraction of rows, in ``[0, 1]``, whose ``num_0`` value is replaced
        by NaN. The default reproduces the original hard-coded 8%.

    Returns
    -------
    pandas.DataFrame
        Columns ``num_0`` .. ``num_5`` (from ``make_classification``),
        ``cat_a`` and ``cat_b`` (drawn at random, independently of the
        labels) and ``target``.

    Raises
    ------
    ValueError
        If ``missing_frac`` is outside ``[0, 1]``.
    """
    if not 0 <= missing_frac <= 1:
        raise ValueError(f'missing_frac must be in [0, 1], got {missing_frac!r}')
    X, y = make_classification(n_samples=n_samples, n_features=6, n_informative=4,
                               n_redundant=0, n_repeated=0, n_classes=3,
                               random_state=random_state)
    df = pd.DataFrame(X, columns=[f'num_{i}' for i in range(6)])
    # The order of the draws below is kept fixed so a given seed always
    # yields the same dataset.
    rng = np.random.default_rng(random_state)
    df['cat_a'] = rng.choice(['A','B','C'], size=n_samples, p=[0.5,0.3,0.2])
    df['cat_b'] = rng.choice(['X','Y'], size=n_samples)
    # Inject missing values into a single numeric column
    n_missing = int(missing_frac * len(df))
    miss_idx = rng.choice(df.index, size=n_missing, replace=False)
    df.loc[miss_idx, 'num_0'] = np.nan
    df['target'] = y
    return df
df = generate_dataset()
df.head()


In [None]:
# 2) Train/test split
# Numeric features are every column whose name starts with 'num_';
# the two categorical features are listed explicitly.
cat_cols = ['cat_a','cat_b']
num_cols = df.columns[df.columns.str.startswith('num_')].tolist()
feature_cols = num_cols + cat_cols
X = df.loc[:, feature_cols].copy()
y = df.loc[:, 'target'].copy()
# 75/25 split, stratified on the target so both parts keep the class mix.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts
(X_train.shape, X_test.shape)


## 3) Engenharia de features numéricas

PT-BR: Vamos imputar, escalar, e adicionar termos polinomiais (incluindo interações).

EN: We will impute, scale, and add polynomial terms (including interactions).


In [None]:
# Numeric branch: median imputation -> standardisation -> degree-2 polynomial
# expansion (squares and pairwise interactions, no bias column).
# Step names are looked up by later cells, so they must stay as they are.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)),
]
numeric_poly = Pipeline(steps=numeric_steps)

# 4) Categorical encoding: most-frequent imputation -> dense one-hot encoding
# that ignores categories not seen during fit.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', drop=None, sparse_output=False)),
]
categorical_enc = Pipeline(steps=categorical_steps)

# Route each column group to its own branch; outputs are concatenated
# in the order listed here (numeric first, then categorical).
column_branches = [
    ('num', numeric_poly, num_cols),
    ('cat', categorical_enc, cat_cols),
]
preprocessor = ColumnTransformer(transformers=column_branches)
preprocessor


## 5) Seleção de features

PT-BR: Após a expansão polinomial, podemos selecionar as k melhores features por ANOVA F.

EN: After polynomial expansion, we can select the top-k features using the ANOVA F-test.


In [None]:
k_best = 25  # adjust as needed; must not exceed the number of preprocessed features
# Full model: preprocessing -> ANOVA-F top-k feature selection -> classifier.
clf = Pipeline(steps=[
    ('pre', preprocessor),
    ('select', SelectKBest(score_func=f_classif, k=k_best)),
    # `multi_class` is deliberately not passed: the parameter is deprecated in
    # recent scikit-learn releases and 'auto' was only restating the default.
    ('model', LogisticRegression(max_iter=2000))
])
clf


## 6) Validação cruzada e treino

PT-BR: Usamos StratifiedKFold para avaliar a robustez do pipeline.

EN: We use StratifiedKFold to evaluate the pipeline's robustness.


In [None]:
# 5-fold stratified CV on the training split only; the test split stays untouched
# until the final evaluation below.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(clf, X_train, y_train, cv=cv, scoring='accuracy', n_jobs=None)
# Builtin round() is used instead of the `.round()` method: it works both for
# NumPy scalars and for plain Python floats (which have no `.round` attribute),
# and prints the same value.
print('CV mean:', round(cv_scores.mean(), 4), '±', round(cv_scores.std(), 4))
# Fit the whole pipeline on the full training split.
clf.fit(X_train, y_train)
print('Train acc:', round(clf.score(X_train, y_train), 4))
print('Test acc :', round(clf.score(X_test, y_test), 4))
print(classification_report(y_test, clf.predict(X_test)))


## 7) Inspecionando importância/seleção (aproximada)

PT-BR: Extraímos os scores do SelectKBest para visualizar as features mais relevantes.

EN: We extract SelectKBest scores to visualize the most relevant features.


In [None]:
# Recover feature names after the preprocessor.
# The names are rebuilt in the same order the ColumnTransformer concatenates
# its branches: polynomial numeric features first, then one-hot columns.
fitted_pre = clf.named_steps['pre']
poly = fitted_pre.named_transformers_['num'].named_steps['poly']
poly_names = poly.get_feature_names_out(num_cols).tolist()
# Categorical names after one-hot encoding
ohe = fitted_pre.named_transformers_['cat'].named_steps['ohe']
cat_names = ohe.get_feature_names_out(cat_cols).tolist()
all_names = poly_names + cat_names
# Selection: keep only the features SelectKBest retained, with their F scores
selector = clf.named_steps['select']
scores = selector.scores_
selected_mask = selector.get_support()
selected_names = np.array(all_names)[selected_mask]
selected_scores = scores[selected_mask]
feat_importance = (pd.DataFrame({'feature': selected_names, 'score': selected_scores})
                   .sort_values('score', ascending=False))
feat_importance.head(15)


In [None]:
# 8) Visualizations
# Horizontal bar chart of the 15 highest-scoring selected features.
plt.figure(figsize=(8,5))
sns.barplot(data=feat_importance.head(15), x='score', y='feature', orient='h')
plt.title('Top 15 features por score (ANOVA F) | Top 15 features by score')
plt.tight_layout()
plt.show()
# Plotly: relationship between two selected features
if len(selected_names) >= 2:
    # The preprocessor inside `clf` is already fitted, so only transform here;
    # calling fit_transform would needlessly refit a step of the trained model.
    X_train_trans = clf.named_steps['pre'].transform(X_train)
    X_all_names = all_names
    df_plot = pd.DataFrame(X_train_trans, columns=X_all_names)
    # First two selected features in column order (not the two top-scoring ones).
    f1, f2 = selected_names[:2]
    # df_plot has a fresh 0..n-1 index while y_train keeps the row labels from
    # the split; reset the index so colours line up with rows by position.
    color_labels = y_train.astype(str).reset_index(drop=True)
    fig = px.scatter(df_plot, x=f1, y=f2, color=color_labels,
                     title='Relação entre duas features selecionadas | Relation between two selected features')
    fig.show()


## 9) Exportar artefatos

PT-BR: Salvamos o dataset transformado de treino e teste (apenas para referência).

EN: We save the transformed train and test datasets (for reference only).


In [None]:
# Output folder, relative to the current working directory; created if missing.
out_dir = Path('data/processed')
out_dir.mkdir(parents=True, exist_ok=True)
# Reuse the preprocessor already fitted as part of `clf`: both splits are only
# transformed, so the trained pipeline is not refitted as a side effect.
fitted_pre = clf.named_steps['pre']
Xt_train = fitted_pre.transform(X_train)
Xt_test = fitted_pre.transform(X_test)
# Column names are those of the preprocessor output (before SelectKBest).
Xt_cols = all_names
pd.DataFrame(Xt_train, columns=Xt_cols).to_csv(out_dir / 'train_features.csv', index=False)
pd.DataFrame(Xt_test, columns=Xt_cols).to_csv(out_dir / 'test_features.csv', index=False)
print('Saved:', out_dir / 'train_features.csv')
print('Saved:', out_dir / 'test_features.csv')


PT-BR: Fim do notebook. Próximo: modelagem e tuning.

EN: End of notebook. Next: modeling and tuning.
