In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import set_config

set_config(transform_output="pandas")

df = sns.load_dataset("titanic")
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [2]:
X = df[["class", "sex", "embark_town", "fare", "age"]]
y = df.alive

X.shape, y.shape

((891, 5), (891,))

In [3]:
num_cols = X.select_dtypes(np.number).columns.tolist()
cat_cols = [col for col in X.columns if col not in num_cols]
num_cols, cat_cols

(['fare', 'age'], ['class', 'sex', 'embark_town'])

## Holdout (Train Test Split)

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
X_train.shape, X_test.shape

((668, 5), (223, 5))

In [5]:
from sklearn.pipeline import Pipeline
from feature_engine.imputation import CategoricalImputer, MeanMedianImputer
from feature_engine.encoding import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.neighbors import KNeighborsClassifier


def make_pipeline(k, scale=None):
    if scale is not None:
        scaler = SklearnTransformerWrapper(
            StandardScaler(), variables=num_cols
        )
    else:
        scaler = StandardScaler()

    pipe = Pipeline(
        steps=[
            ("ci", CategoricalImputer(imputation_method="frequent")),
            ("mmi", MeanMedianImputer(imputation_method="mean")),
            ("ohe", OneHotEncoder()),
            ("sc", scaler),
            ("model", KNeighborsClassifier(n_neighbors=k)),
        ]
    )
    return pipe


pipe = make_pipeline(k=5)
# pipe = make_pipeline(k=5, scale = num_cols)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
pipe.score(X_test, y_test)

0.7757847533632287

## 3-way Holdout (Train Validation Test Split)


In [6]:
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, random_state=42
)

for k in [3, 5, 7, 9, 11]:
    pipe = make_pipeline(k=k)
    pipe.fit(X_train, y_train)
    metric = pipe.score(X_val, y_val)
    print(f"Puntaje del Modelo, k = {k}: {metric}")
    print("=============================================")

Puntaje del Modelo, k = 3: 0.7982062780269058
Puntaje del Modelo, k = 5: 0.7757847533632287
Puntaje del Modelo, k = 7: 0.7892376681614349
Puntaje del Modelo, k = 9: 0.8026905829596412
Puntaje del Modelo, k = 11: 0.7982062780269058


## K-Fold

In [7]:
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold


def make_kfold(X, y, k, kfold=5):
    kf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=42)
    score = []
    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y), start=1):
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx]
        X_val = X.iloc[val_idx]
        y_val = y.iloc[val_idx]

        pipe = make_pipeline(k=k)
        pipe.fit(X_train, y_train)
        metric = pipe.score(X_val, y_val)
        score.append(metric)
        print(f"Metric for Fold {fold}: {metric:.3f}")
    return score


for k in [1, 3, 5, 7, 9, 11]:
    print(f"k = {k}")
    kfold_score = make_kfold(X_trainval, y_trainval, k, kfold=5)
    mean = np.mean(kfold_score)
    std = np.std(kfold_score)
    print(f"K-Fold Score: {mean:.3f} +/- {std:.3f}")
    print("=========================================")

k = 1
Metric for Fold 1: 0.748
Metric for Fold 2: 0.811
Metric for Fold 3: 0.725
Metric for Fold 4: 0.803
Metric for Fold 5: 0.746
K-Fold Score: 0.767 +/- 0.034
k = 3
Metric for Fold 1: 0.769
Metric for Fold 2: 0.853
Metric for Fold 3: 0.789
Metric for Fold 4: 0.789
Metric for Fold 5: 0.810
K-Fold Score: 0.802 +/- 0.029
k = 5
Metric for Fold 1: 0.790
Metric for Fold 2: 0.853
Metric for Fold 3: 0.803
Metric for Fold 4: 0.754
Metric for Fold 5: 0.789
K-Fold Score: 0.798 +/- 0.032
k = 7
Metric for Fold 1: 0.783
Metric for Fold 2: 0.853
Metric for Fold 3: 0.803
Metric for Fold 4: 0.761
Metric for Fold 5: 0.817
K-Fold Score: 0.803 +/- 0.031
k = 9
Metric for Fold 1: 0.790
Metric for Fold 2: 0.846
Metric for Fold 3: 0.789
Metric for Fold 4: 0.761
Metric for Fold 5: 0.810
K-Fold Score: 0.799 +/- 0.028
k = 11
Metric for Fold 1: 0.776
Metric for Fold 2: 0.839
Metric for Fold 3: 0.796
Metric for Fold 4: 0.754
Metric for Fold 5: 0.789
K-Fold Score: 0.791 +/- 0.028


## Versión Reducida

In [8]:
from sklearn.model_selection import cross_val_score

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for k in [1, 3, 5, 7, 9, 11]:
    pipe = make_pipeline(k=k)
    vals = cross_val_score(pipe, X_trainval, y_trainval, cv=kf)
    print(f"k = {k}")
    print(f"Metric: {np.mean(vals):.3f} +/- {np.std(vals):.3f}")

k = 1
Metric: 0.767 +/- 0.034
k = 3
Metric: 0.802 +/- 0.029
k = 5
Metric: 0.798 +/- 0.032
k = 7
Metric: 0.803 +/- 0.031
k = 9
Metric: 0.799 +/- 0.028
k = 11
Metric: 0.791 +/- 0.028


## Calcular Test Scores


In [9]:
for k in [1, 3, 5, 7, 9, 11]:
    pipe = make_pipeline(k=k)
    pipe.fit(X_trainval, y_trainval)
    metric = pipe.score(X_test, y_test)
    print(f"Score for k = {k}: {metric}")

Score for k = 1: 0.7821229050279329
Score for k = 3: 0.8156424581005587
Score for k = 5: 0.770949720670391
Score for k = 7: 0.8044692737430168
Score for k = 9: 0.8212290502793296
Score for k = 11: 0.8156424581005587


> Pero, qué está ocurriendo acá?