In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import set_config

# Ask scikit-learn transformers to return pandas objects from transform /
# fit_transform, so column names survive every preprocessing step below.
set_config(transform_output="pandas")

# Load the "titanic" example dataset through seaborn and display it.
df = sns.load_dataset("titanic")
df

> Supongamos que queremos predecir la probabilidad de que un pasajero sobreviva o muera tras el accidente del Titanic.

In [None]:
# Bar chart with one bar per dtype, counting how many columns of df use it.
df.dtypes.value_counts().plot.bar(
    edgecolor="k",
    title="Tipos de Variable presente en Titanic",
)
plt.tight_layout()

## Supongamos que utilizaremos las siguientes variables

In [None]:
# Features: the five predictor columns used throughout the notebook.
X = df[["class", "sex", "embark_town", "fare", "age"]]
# Target: the "alive" column of the same frame.
y = df["alive"]

# Quick sanity check that features and target have matching row counts.
(X.shape, y.shape)

## EDA

In [None]:
# Partition the feature columns by dtype: numeric columns on one side and
# every remaining column (treated as categorical) on the other.
num_cols = list(X.select_dtypes(include=np.number).columns)
cat_cols = list(X.select_dtypes(exclude=np.number).columns)
print(f"Variables Numéricas: {num_cols}")
print(f"Variables Categóricas: {cat_cols}")

#### Valores Faltantes (Nulos)

In [None]:
# isnull().mean() averages the boolean missing-value mask, so every bar is the
# fraction of missing rows for that feature (between 0 and 1), not a count.
# The title names it as a proportion to match what is actually plotted.
X.isnull().mean().plot(
    kind="bar",
    edgecolor="k",
    title="Proporción de Valores Nulos en el Titanic",
)
plt.tight_layout()

## Variables Numéricas

In [None]:
# One histogram per numeric feature column, drawn without grid lines.
X.hist(grid=False, edgecolor="k")
# Title and layout are applied to the figure that hist() just created.
plt.gcf().suptitle("Distribución de Variables Numéricas")
plt.gcf().tight_layout()

## Variables Categóricas

In [None]:
# The palette and the per-plot colour use distinct names so the loop never
# rebinds the list it is iterating with.
bar_colors = ["red", "blue", "green"]

# One bar chart of category frequencies per categorical feature. The palette
# is cycled by index, so no column is silently skipped if there are more
# categorical columns than colours.
for idx, cat in enumerate(cat_cols):
    df[cat].value_counts().plot(
        kind="bar",
        edgecolor="k",
        color=bar_colors[idx % len(bar_colors)],
        title=f"Categorías para '{cat}'",
    )
    # Render each chart as its own figure instead of overlaying them.
    plt.show()

## Preprocesamiento

In [None]:
from feature_engine.imputation import CategoricalImputer

# Categorical imputation configured with the "frequent" strategy: learn the
# replacement values from X, then apply them to X.
ci = CategoricalImputer(imputation_method="frequent")
ci.fit(X)
X_imp = ci.transform(X)
X_imp

In [None]:
from feature_engine.imputation import MeanMedianImputer

# Numeric imputation with the "mean" strategy, applied on top of the frame
# that already went through the categorical imputer.
mmi = MeanMedianImputer(imputation_method="mean")
X_imp = mmi.fit(X_imp).transform(X_imp)
X_imp

In [None]:
from feature_engine.encoding import OneHotEncoder

# One-hot encode the fully imputed frame (encoder left at its defaults).
ohe = OneHotEncoder()
X_ohe = ohe.fit(X_imp).transform(X_imp)
X_ohe

In [None]:
from sklearn.preprocessing import StandardScaler

# Variant 1: standardise every column of the encoded frame, including the
# one-hot indicator columns.
sc_all = StandardScaler()
sc_all.fit(X_ohe)
X_sc_all = sc_all.transform(X_ohe)
X_sc_all

In [None]:
from feature_engine.wrappers import SklearnTransformerWrapper

# Variant 2: standardise only "fare" and "age"; the wrapper restricts the
# scaler to the listed variables and leaves the other columns as they are.
sc = SklearnTransformerWrapper(
    transformer=StandardScaler(),
    variables=["fare", "age"],
)
X_sc = sc.fit(X_ohe).transform(X_ohe)
X_sc

## Entrenamiento del Modelo

In [None]:
from sklearn.neighbors import KNeighborsClassifier


def knn_clf(X, y, k=5, prep=""):
    """Fit a Euclidean k-NN classifier on (X, y) and print its accuracy on X.

    Args:
        X: Feature matrix, used both to fit the model and to predict.
        y: Target labels, one per row of X. String labels are accepted as-is.
        k: Number of neighbours used by the classifier.
        prep: Free-text description of the preprocessing applied to X; it is
            only used in the printed message.

    Returns:
        The labels predicted by the fitted model for X itself.

    Note:
        The reported score is computed on the same data the model was fitted
        on, so it is a training accuracy rather than an estimate of
        performance on unseen passengers.
    """
    knn = KNeighborsClassifier(
        n_neighbors=k, metric="euclidean", n_jobs=-1
    )
    # Note that categorical (string) variables can be used directly as labels.
    knn.fit(X, y)
    y_pred = knn.predict(X)
    # Derive the accuracy from the predictions already computed; calling
    # knn.score(X, y) would repeat the whole neighbour search a second time.
    accuracy = float(np.mean(y_pred == np.asarray(y)))
    print(
        f"Score k = {k}, y Preprocesamiento: {prep}: {accuracy:.4f}"
    )
    return y_pred


# Sweep the odd neighbour counts from 3 to 15 and, for each one, compare the
# three preprocessing variants. After the loop the y_pred_* names hold the
# predictions of the last k.
for k in range(3, 16, 2):
    print(
        "================================================================="
    )
    y_pred_sc = knn_clf(X=X_sc, y=y, k=k, prep="StandardScaler Numérico")
    y_pred_sc_all = knn_clf(
        X=X_sc_all, y=y, k=k, prep="StandardScaler a todo"
    )
    y_pred_ohe = knn_clf(X=X_ohe, y=y, k=k, prep="Sin Escalar")

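> Conclusión: los preprocesamientos afectan de manera importante el entrenamiento de un modelo. Nota: los puntajes anteriores se calculan sobre los mismos datos usados para entrenar, por lo que no miden la capacidad de generalización del modelo.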

## Uso de Pipelines

In [None]:
from sklearn.pipeline import Pipeline


def model_pipeline(num_method, cat_method, k=5):
    """Build an unfitted preprocessing + k-NN classification pipeline.

    The steps run in this order: numeric imputation, categorical imputation,
    one-hot encoding, standard scaling, and the k-NN classifier.

    Args:
        num_method: Value passed as ``imputation_method`` to
            ``MeanMedianImputer``.
        cat_method: Value passed as ``imputation_method`` to
            ``CategoricalImputer``.
        k: Number of neighbours for the final ``KNeighborsClassifier``.

    Returns:
        The assembled, not-yet-fitted ``Pipeline``.
    """
    return Pipeline(
        steps=[
            ("num_imp", MeanMedianImputer(imputation_method=num_method)),
            ("cat_imp", CategoricalImputer(imputation_method=cat_method)),
            ("ohe", OneHotEncoder()),
            ("sc", StandardScaler()),
            # Honour the caller's k instead of a fixed neighbour count.
            ("model", KNeighborsClassifier(n_neighbors=k, n_jobs=-1)),
        ]
    )


# Assemble the pipeline with mean imputation for numeric columns, most-frequent
# imputation for categorical ones and k = 5, then display it.
pipe = model_pipeline("mean", "frequent", k=5)
pipe

In [None]:
# Fit the whole pipeline on the raw features, predict on those same rows and
# show the resulting score (measured on the data used for fitting).
y_pred = pipe.fit(X, y).predict(X)
pipe.score(X, y)

In [None]:
# Display the labels predicted by the fitted pipeline.
y_pred