In [1]:
# import  
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

# Load the Titanic training set into the DataFrame df
df = pd.read_csv('train_titanic.csv')

# ---------------------------------------------------------------------------
# Preprocessing
# ---------------------------------------------------------------------------

# Rows without an 'Embarked' value are discarded.
df = df.dropna(subset=['Embarked'])

# Missing 'Age' entries are replaced by the median age of the remaining rows.
# NOTE(review): the median is taken over every remaining row, i.e. before the
# train/test split further down - confirm that is acceptable for evaluation.
median_value = df['Age'].median()
df = df.assign(Age=df['Age'].fillna(median_value))

# Both text columns are turned into pandas categoricals before encoding.
df = df.astype({'Sex': 'category', 'Embarked': 'category'})

# One-hot encode 'Sex' and keep only the female indicator as 'Female'.
dummies = pd.get_dummies(df[['Sex']], prefix_sep='_')
df['Female'] = dummies['Sex_female']

# One-hot encode 'Embarked' and keep the 'C' and 'S' indicators; the
# remaining level is left out of df.
dummies = pd.get_dummies(df[['Embarked']], prefix_sep='_')
for port_column in ('Embarked_C', 'Embarked_S'):
    df[port_column] = dummies[port_column]

# ---------------------------------------------------------------------------
# Fitting a KNN model
# ---------------------------------------------------------------------------

# The target plus the text/categorical columns are removed; every column that
# is left becomes a feature in the NumPy array X.
# NOTE(review): if the CSV carries a numeric identifier column, it ends up in
# X as well - verify that this is intended.
non_feature_columns = ['Survived', 'Name', 'Ticket', 'Cabin', 'Embarked', 'Sex']
X = df.drop(columns=non_feature_columns).values
y = df['Survived'].values

# Hold out 30% of the rows for testing, stratified on the target and with a
# fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y)

# Scaling and KNN are chained in a Pipeline so that the scaler is refit on
# the training part of every cross-validation fold.
steps = [('scaler', StandardScaler()), ('knn', KNeighborsClassifier())]
pipeline = Pipeline(steps)

# Candidate neighbour counts: 1 through 49.
parameters = {'knn__n_neighbors': np.arange(1, 50)}

# 5-fold grid search; the fold count is given explicitly because the library
# default was 3 before scikit-learn 0.22.
cv = GridSearchCV(pipeline, param_grid=parameters, cv=5)
cv.fit(X_train, y_train)

# Predictions of the best estimator on the held-out rows
y_pred = cv.predict(X_test)

# Best hyper-parameter and its mean cross-validated score
print(cv.best_params_)
print(cv.best_score_)

# Held-out score followed by the confusion matrix and per-class report
print(cv.score(X_test, y_test))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


{'knn__n_neighbors': 14}
0.7990353697749196
0.8089887640449438
[[147  18]
 [ 33  69]]
              precision    recall  f1-score   support

           0       0.82      0.89      0.85       165
           1       0.79      0.68      0.73       102

    accuracy                           0.81       267
   macro avg       0.80      0.78      0.79       267
weighted avg       0.81      0.81      0.81       267

