In [1]:
import pandas as pd

In [2]:
# Load the Titanic sheet, keeping only the label ("Survived") and the columns
# later used as model inputs; PassengerId becomes the row index.
data = pd.read_excel(
    "./Titanic.xlsx",
    usecols=[
        "PassengerId",
        "Survived",
        "Pclass",
        "Sex",
        "Age",
        "SibSp",
        "Parch",
        "Fare",
        "Embarked",
    ],
    index_col="PassengerId",
)

In [3]:
# Preview the first rows to sanity-check the loaded columns and index
data.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,1,0,7.25,S
2,1,1,female,38.0,1,0,71.2833,C
3,1,3,female,26.0,0,0,7.925,S
4,1,1,female,35.0,1,0,53.1,S
5,0,3,male,35.0,0,0,8.05,S


In [4]:
# Encode the categorical columns as integers (Embarked: S=3, C=2, Q=1; Sex: female=0, male=1).
# The result is assigned back to the column instead of calling
# `data[col].replace(..., inplace=True)`: that form mutates an intermediate
# Series (chained assignment), which raises a FutureWarning on recent pandas and
# no longer updates `data` once Copy-on-Write is the default.
# Values that are not keys of the mapping are left unchanged, so re-running this
# cell is harmless.
data["Embarked"] = data["Embarked"].replace({'S': 3, 'C': 2, 'Q': 1})
data["Sex"] = data["Sex"].replace({'female': 0, 'male': 1})

In [5]:
# Hold out 10% of the rows as a test set; the rows are shuffled with a fixed
# seed so the same split is produced on every run.
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(
    data,
    test_size=0.1,
    shuffle=True,
    random_state=23,
)

In [6]:
from sklearn import set_config
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# --- Pipeline ---
# Chain the preprocessing transformer(s) with the estimator as the last step.
# Every step is a (name, object) pair; the name identifies the step inside the
# pipeline. Here: min-max scaling followed by a 3-nearest-neighbours classifier.
classifier = Pipeline(
    steps=[
        ("scaler", MinMaxScaler()),
        ("estimator", KNeighborsClassifier(n_neighbors=3)),
    ]
)

# Show the pipeline as a diagram when it is displayed; this becomes more useful
# as further preprocessing steps are strung together.
set_config(display="diagram")
classifier

In [7]:
# Model inputs: every loaded column except the "Survived" label
features = data.columns.drop("Survived").to_list()
features

['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

In [8]:
# Split the training rows into the feature matrix and the target column
x_train, y_train = train_data[features], train_data["Survived"]

In [9]:
# Fit the whole pipeline (scaler, then KNN estimator) on the training rows.
# `.values` hands over a plain array, so the pipeline is fitted without column
# names; later `predict` calls likewise pass bare arrays/lists.
classifier.fit(x_train.values, y_train)

In [10]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score

# Use the fitted model to make predictions on the train dataset.
# The pipeline contains no imputation step: the rows are scaled by the
# 'scaler' step (fitted on the train data) and then classified by the KNN
# 'estimator' step.
# Scores on the data the model was fitted on are only a reference point for
# comparison with the held-out test set.
train_predictions = classifier.predict(x_train.values)

print('Model performance on the train set:')
print(confusion_matrix(y_train, train_predictions))
print(classification_report(y_train, train_predictions))
print("Train accuracy:", accuracy_score(y_train, train_predictions))

Model performance on the train set:
[[442  40]
 [ 62 257]]
              precision    recall  f1-score   support

           0       0.88      0.92      0.90       482
           1       0.87      0.81      0.83       319

    accuracy                           0.87       801
   macro avg       0.87      0.86      0.87       801
weighted avg       0.87      0.87      0.87       801

Train accuracy: 0.8726591760299626


In [11]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score

# Get test data to test the classifier
x_test = test_data[features]
y_test = test_data["Survived"]

# Use the fitted model to make predictions on the test dataset.
# The pipeline contains no imputation step: the test rows are scaled by the
# 'scaler' step (already fitted on the train data, not refitted here) and then
# classified by the KNN 'estimator' step.
test_predictions = classifier.predict(x_test.values)

print('Model performance on the test set:')
print(confusion_matrix(y_test, test_predictions))
print(classification_report(y_test, test_predictions))
print("Test accuracy:", accuracy_score(y_test, test_predictions))

Model performance on the test set:
[[57 10]
 [ 5 18]]
              precision    recall  f1-score   support

           0       0.92      0.85      0.88        67
           1       0.64      0.78      0.71        23

    accuracy                           0.83        90
   macro avg       0.78      0.82      0.79        90
weighted avg       0.85      0.83      0.84        90

Test accuracy: 0.8333333333333334


In [12]:
from joblib import dump
# Persist the fitted pipeline (scaler + KNN estimator) to ./titanic.joblib.
# The call's return value, a list with the written file name, is the cell output.
dump(classifier, "./titanic.joblib")

['./titanic.joblib']

In [13]:
# Single hand-written passenger, in training feature order:
# Pclass, Sex, Age, SibSp, Parch, Fare, Embarked
# Here: 3rd class, male (Sex=1), age 38, no siblings/spouse or parents/children, fare 71.5.
# NOTE(review): Embarked=0 is not one of the codes assigned when encoding the
# column (S=3, C=2, Q=1) — confirm which port was intended.
prediction = classifier.predict([[3, 1, 38.0, 0, 0, 71.5, 0]])
prediction[0]

0

In [14]:
# Single hand-written passenger, in training feature order:
# Pclass, Sex, Age, SibSp, Parch, Fare, Embarked
# Here: 1st class, female (Sex=0), age 38, no siblings/spouse or parents/children, fare 71.5.
# NOTE(review): Embarked=0 is not one of the codes assigned when encoding the
# column (S=3, C=2, Q=1) — confirm which port was intended.
prediction = classifier.predict([[1, 0, 38.0, 0, 0, 71.5, 0]])
prediction[0]

1