In [1]:
# Import libraries and packages

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [2]:
# Load the data and drop columns
#
# Read the Titanic CSV with "PassengerId" as the index, then drop the
# "Name", "Ticket", "Age" and "Cabin" columns in a single chained expression.

df = (
    pd.read_csv("data/titanic.csv", index_col="PassengerId")
    .drop(columns=["Name", "Ticket", "Age", "Cabin"])
)

In [3]:
# Dataset splitting
#
# The "Survived" column is the target; every remaining column is a feature.
# 20% of the rows are held out as the test set, stratified on the target,
# with a fixed random_state so the split is repeatable.

y = df["Survived"]
x = df.drop(columns=["Survived"])

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Display the shapes of the four resulting pieces as the cell output.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((712, 6), (179, 6), (712,), (179,))

In [4]:
# Preprocessor

# Pipeline for the numerical columns: fill missing values with the column
# mean, then rescale with MinMaxScaler.
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", MinMaxScaler())
])

# Pipeline for the categorical columns: fill missing values with the most
# frequent value, then one-hot encode.
# handle_unknown="ignore" makes the encoder output all-zero one-hot columns
# for a category it did not see during fit instead of raising an error at
# transform time (e.g. a value that is absent from a CV training fold or
# that first appears at prediction time).
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Route each group of columns through its own pipeline.
preprocessor = ColumnTransformer([
    ("numeric", numerical_pipeline, ["SibSp", "Parch", "Fare"]),  # numerical columns
    ("categoric", categorical_pipeline, ["Pclass", "Sex", "Embarked"])  # categorical columns
])

In [5]:
# Pipeline
#
# Two steps: "prep" applies the column preprocessor, "algoritma" is a
# KNeighborsClassifier with default settings (tuned later by the grid search).

pipeline = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("algoritma", KNeighborsClassifier()),
    ]
)

In [6]:
# Hyper-parameter tuning with GridSearchCV

# Grid over the "algoritma" step of the pipeline:
#   n_neighbors: odd values 1, 3, ..., 49 (25 values)
#   weights:     "uniform" or "distance"
#   p:           1 or 2
# 25 * 2 * 2 = 100 candidate combinations.
parameter = dict(
    algoritma__n_neighbors=range(1, 51, 2),
    algoritma__weights=["uniform", "distance"],
    algoritma__p=[1, 2],
)

# 3-fold cross-validation over the grid, using all available workers.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=parameter,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
model.fit(x_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    7.4s finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('prep',
                                        ColumnTransformer(transformers=[('numeric',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          MinMaxScaler())]),
                                                                         ['SibSp',
                                                                          'Parch',
                                                                          'Fare']),
                                                                        ('categoric',
                                                                         Pipeline(steps=[('impu

In [8]:
# Model evaluation

# Best hyper-parameter combination found by the grid search.
print(model.best_params_)

# Score on the training set, best mean cross-validation score, and score on
# the held-out test set, printed on one line in that order.
train_score = model.score(x_train, y_train)
cv_score = model.best_score_
test_score = model.score(x_test, y_test)
print(train_score, cv_score, test_score)

{'algoritma__n_neighbors': 19, 'algoritma__p': 1, 'algoritma__weights': 'uniform'}
0.8188202247191011 0.8131995414199436 0.7877094972067039
