In [13]:
import numpy as np


np.random.seed(42)
import pandas as pd
from sklearn.preprocessing import LabelEncoder

### Dataset preparation


In [14]:
data = pd.read_csv("../data/titanic/sklearn_dataset.csv")

print(data.head())
print(data.shape)

   Survived  Pclass     Sex   Age  SibSp  Parch     Fare Embarked Cabin_Cat
0         1       1  female  38.0      1      0  71.2833        C         C
1         1       1  female  35.0      1      0  53.1000        S         C
2         0       1    male  54.0      0      0  51.8625        S         E
3         1       3  female   4.0      1      1  16.7000        S         G
4         1       1  female  58.0      0      0  26.5500        S         C
(183, 9)


In [15]:
data_y = data.Survived
data_x = data.drop(["Survived"], axis=1)

In [16]:
print(data_x)
print(data_y)

     Pclass     Sex   Age  SibSp  Parch     Fare Embarked Cabin_Cat
0         1  female  38.0      1      0  71.2833        C         C
1         1  female  35.0      1      0  53.1000        S         C
2         1    male  54.0      0      0  51.8625        S         E
3         3  female   4.0      1      1  16.7000        S         G
4         1  female  58.0      0      0  26.5500        S         C
..      ...     ...   ...    ...    ...      ...      ...       ...
178       1  female  47.0      1      1  52.5542        S         D
179       1    male  33.0      0      0   5.0000        S         B
180       1  female  56.0      0      1  83.1583        C         C
181       1  female  19.0      0      0  30.0000        S         B
182       1    male  26.0      0      0  30.0000        C         C

[183 rows x 8 columns]
0      1
1      1
2      0
3      1
4      1
      ..
178    1
179    0
180    1
181    1
182    1
Name: Survived, Length: 183, dtype: int64


In [17]:
encder_cls = LabelEncoder

In [18]:
data_sex = data_x["Sex"]
encoder = encder_cls()
encoder.fit(data_sex)
data_x["Sex"] = encoder.transform(data_sex)

data_emb = data_x["Embarked"]
encoder = encder_cls()
encoder.fit(data_emb)
data_x["Embarked"] = encoder.transform(data_emb)

data_cab = data_x["Cabin_Cat"]
encoder = encder_cls()
encoder.fit(data_cab)
data_x["Cabin_Cat"] = encoder.transform(data_cab)

In [19]:
print(f"x shape:\n{data_x.shape}")
print(f"y shape:\n{data_y.shape}")

print(f"x:\n{data_x.iloc[:5]}")
print(f"y:\n{data_x.iloc[:5]}")

x shape:
(183, 8)
y shape:
(183,)
x:
   Pclass  Sex   Age  SibSp  Parch     Fare  Embarked  Cabin_Cat
0       1    0  38.0      1      0  71.2833         0          2
1       1    0  35.0      1      0  53.1000         2          2
2       1    1  54.0      0      0  51.8625         2          4
3       3    0   4.0      1      1  16.7000         2          6
4       1    0  58.0      0      0  26.5500         2          2
y:
   Pclass  Sex   Age  SibSp  Parch     Fare  Embarked  Cabin_Cat
0       1    0  38.0      1      0  71.2833         0          2
1       1    0  35.0      1      0  53.1000         2          2
2       1    1  54.0      0      0  51.8625         2          4
3       3    0   4.0      1      1  16.7000         2          6
4       1    0  58.0      0      0  26.5500         2          2


In [20]:
x = data_x.to_numpy()
y = data_y.to_numpy()

### Dataset Split


In [21]:
num_samples = x.shape[0]
num_features = x.shape[1]
num_classes = y.shape[0]
num_survived = np.count_nonzero(y == 1)
num_not_survived = np.count_nonzero(y == 0)

print(f"Num_samples: {num_samples}")
print(f"Num_features: {num_features}")
print(f"Survived: {num_survived}")
print(f"Not survived: {num_not_survived}")
print(f"Survived rate: {num_survived / num_samples}")
print(f"Not survived rate: {num_not_survived / num_samples}")

Num_samples: 183
Num_features: 8
Survived: 123
Not survived: 60
Survived rate: 0.6721311475409836
Not survived rate: 0.32786885245901637


In [22]:
test_size = num_samples // 3

random_idxs = np.random.permutation(num_samples)

x_train = x[random_idxs[:-test_size]]
y_train = y[random_idxs[:-test_size]]

x_test = x[random_idxs[-test_size:]]
y_test = y[random_idxs[-test_size:]]

In [23]:
print(f"x_train shape:\n{x_train.shape}")
print(f"y_train shape:\n{y_train.shape}")

print(f"x_test shape:\n{x_test.shape}")
print(f"y_test shape:\n{y_test.shape}")

x_train shape:
(122, 8)
y_train shape:
(122,)
x_test shape:
(61, 8)
y_test shape:
(61,)


### KNN Model


In [24]:
from sklearn.tree import DecisionTreeClassifier


clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)

accuracy = clf.score(x_test, y_test)
print(f"accuracy: {accuracy * 100.0:.4}%")

y_pred = clf.predict(x_test)
print(f"y_pred:\n{y_pred}")

accuracy: 68.85%
y_pred:
[1 1 1 1 0 0 1 0 0 0 1 0 1 0 1 1 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1 1 1 1 0 0 1
 0 0 1 1 0 1 1 1 1 1 1 1 1 0 1 0 1 1 0 1 0 1 0 1]
