In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Load the Titanic passenger table, using the PassengerId column as the row index.
df = pd.read_csv(
    filepath_or_buffer="data/titanic.csv",
    index_col="PassengerId",
)

# Preview the first five rows.
df.head(5)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Remove the columns that are not used as model features.
# Rebinding `df` (instead of `inplace=True`) avoids mutating the loaded frame in
# place, and errors="ignore" keeps this cell idempotent: re-running it after the
# columns are already gone is a no-op rather than a KeyError.
df = df.drop(columns=["Name", "Age", "Ticket", "Cabin"], errors="ignore")

# Preview the reduced frame.
df.head()

PassengerId,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked
1,0,3,male,1,0,7.25,S
2,1,1,female,1,0,71.2833,C
3,1,3,female,0,0,7.925,S
4,1,1,female,1,0,53.1,S
5,0,3,male,0,0,8.05,S


In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Separate the target column from the feature columns.
y = df["Survived"]
X = df.drop(columns="Survived")

# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class balance the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Display the shapes of the four resulting pieces.
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

((712, 6), (179, 6), (712,), (179,))

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier

In [11]:
# Numeric branch: fill missing values with the column mean, then min-max scale.
numerical_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="mean")),
        ("scaling", MinMaxScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode.
categorical_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("one_hot_encoding", OneHotEncoder()),
    ]
)

In [12]:
# Route each group of columns through its matching preprocessing branch.
numeric_columns = ["SibSp", "Parch", "Fare"]
categorical_columns = ["Pclass", "Sex", "Embarked"]

preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numerical_pipeline, numeric_columns),
        ("categoric", categorical_pipeline, categorical_columns),
    ]
)

# Classifier with default settings; its hyperparameters are tuned by the grid search.
knn = KNeighborsClassifier()

In [14]:
# Full model: preprocessing followed by the classifier. The step names
# ("prep", "algo") are the prefixes used to address parameters in the grid search.
pipeline = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("algo", knn),
    ]
)

In [15]:
from sklearn.model_selection import GridSearchCV

In [23]:
# Search space for the "algo" step: 25 odd neighbour counts (1..49),
# 2 weighting schemes and 2 Minkowski powers -> 100 candidates.
parameter = dict(
    algo__n_neighbors=range(1, 51, 2),
    algo__weights=["uniform", "distance"],
    algo__p=[1, 2],
)

# Exhaustive 3-fold cross-validated search over the grid, using all CPU cores.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=parameter,
    cv=3,
    n_jobs=-1,
    verbose=1,
)

In [24]:
# Run the grid search on the training split only; the test split stays untouched.
model.fit(X=X_train, y=y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('prep',
                                        ColumnTransformer(transformers=[('numeric',
                                                                         Pipeline(steps=[('impute',
                                                                                          SimpleImputer()),
                                                                                         ('scaling',
                                                                                          MinMaxScaler())]),
                                                                         ['SibSp',
                                                                          'Parch',
                                                                          'Fare']),
                                                                        ('categoric',
                                                                         Pipeline(steps=[('impu

In [25]:
# Best hyperparameter combination and its mean cross-validation score.
(model.best_params_, model.best_score_)

({'algo__n_neighbors': 19, 'algo__p': 1, 'algo__weights': 'uniform'},
 0.8146060111808436)

In [26]:
# Score the fitted search on the training split and on the held-out test split,
# in that order, to compare fit against generalisation.
tuple(
    model.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

(0.8188202247191011, 0.7877094972067039)