**Teknik Klasifikasi (KNN, Decision Tree, Random Forest, XGBoost)**

## **Import package dan data**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Read the training CSV and index the rows by passenger ID
data = pd.read_csv('titanic-train.csv').set_index("PassengerId")
data

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [None]:
# Convert categorical columns to numeric codes
# Sex:      female=0, male=1
# Embarked: C=0, Q=1, S=2

# Missing embarkation ports (if any) are imputed with the most frequent port
# BEFORE encoding. Otherwise they are handed to LabelEncoder, which, depending
# on the scikit-learn version, either raises on the mixed str/NaN values or
# gives them an extra code that the mapping above does not describe.
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

label_enconder = LabelEncoder()
data['Sex'] = label_enconder.fit_transform(data['Sex'])
data['Embarked'] = label_enconder.fit_transform(data['Embarked'])

In [None]:
# Fill missing values

# Impute missing ages with the mean age, then round to whole years.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on the selection data['Age'] is chained assignment, which pandas deprecates
# and which leaves `data` unchanged under Copy-on-Write.
mean_age = data['Age'].mean()
data['Age'] = data['Age'].fillna(mean_age).round().astype(int)

## **Definisi variabel dan split data**

In [None]:
# Target: the survival flag. Features: every other column except
# Name, Ticket and Cabin.
y = data['Survived']
X = data.drop(columns=['Survived', 'Name', 'Ticket', 'Cabin'])

In [None]:
# Preview the feature matrix
X

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,3,1,22,1,0,7.2500,2
2,1,0,38,1,0,71.2833,0
3,3,0,26,0,0,7.9250,2
4,1,0,35,1,0,53.1000,2
5,3,1,35,0,0,8.0500,2
...,...,...,...,...,...,...,...
887,2,1,27,0,0,13.0000,2
888,1,0,19,0,0,30.0000,2
889,3,0,30,1,2,23.4500,2
890,1,1,26,0,0,30.0000,0


In [None]:
# Preview the target vector (Survived)
y

PassengerId
1      0
2      1
3      1
4      1
5      0
      ..
887    0
888    1
889    0
890    1
891    0
Name: Survived, Length: 891, dtype: int64

In [None]:
# Hold out 20% of the rows for testing and train on the remaining 80%;
# the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=25,
)

In [None]:
# Preview the training features
X_train

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
411,3,1,30,0,0,7.8958,2
639,3,0,41,0,5,39.6875,2
396,3,1,22,0,0,7.7958,2
222,2,1,27,0,0,13.0000,2
801,2,1,34,0,0,13.0000,2
...,...,...,...,...,...,...,...
318,2,1,54,0,0,14.0000,2
144,3,1,19,0,0,6.7500,1
475,3,0,22,0,0,9.8375,2
319,1,0,31,0,2,164.8667,2


In [None]:
# Preview the hold-out (test) features
X_test

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
761,3,1,30,0,0,14.5000,2
774,3,1,30,0,0,7.2250,0
267,3,1,16,4,1,39.6875,2
508,1,1,30,0,0,26.5500,2
525,3,1,30,0,0,7.2292,0
...,...,...,...,...,...,...,...
220,2,1,30,0,0,10.5000,2
263,1,1,52,1,1,79.6500,2
394,1,0,23,1,0,113.2750,0
463,1,1,47,0,0,38.5000,2


# **Evaluasi Model**

## Model

In [None]:
# Shared seed for the randomized models so that a fresh Restart & Run All
# reproduces the same fitted models and metrics (same value as the
# train/test split seed).
MODEL_SEED = 25

# K-Nearest Neighbors (KNN); takes no seed
# NOTE(review): KNN is distance-based and the features are not scaled here,
# so large-valued columns such as Fare probably dominate the distance.
# Consider standardizing the features before fitting.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)

# Decision Tree
dt = DecisionTreeClassifier(random_state=MODEL_SEED)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)

# Random Forest
rf = RandomForestClassifier(random_state=MODEL_SEED)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# XGBoost
xgb_model = xgb.XGBClassifier(random_state=MODEL_SEED)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)

## Confusion Matrix

In [None]:
actual = y_test  # ground-truth labels of the hold-out set

# One confusion matrix per model, all computed against the same ground truth
knn_cm, dt_cm, rf_cm, xgb_cm = (
    confusion_matrix(actual, predicted)
    for predicted in (knn_pred, dt_pred, rf_pred, xgb_pred)
)

# Convert confusion matrices to DataFrames
def confusion_matrix_to_dataframe(cm, labels=None):
    """Wrap a confusion matrix in a DataFrame with labelled axes.

    Args:
        cm: Square 2-D array-like of counts; rows are labelled 'Actual'
            and columns 'Predicted'.
        labels: Class labels in the row/column order of ``cm``. When
            omitted, the sorted unique values of the notebook-level
            ``actual`` series are used (the original behaviour).

    Returns:
        pandas.DataFrame whose index is named 'Actual' and whose columns
        are named 'Predicted'.

    Raises:
        ValueError: If the number of labels does not match the size of ``cm``.
    """
    if labels is None:
        # Backward-compatible fallback: derive labels from the global ground truth
        labels = sorted(set(actual))
    labels = list(labels)
    if len(labels) != len(cm):
        raise ValueError(
            f"Got {len(labels)} labels for a confusion matrix with {len(cm)} rows"
        )
    df_cm = pd.DataFrame(cm, index=labels, columns=labels)
    df_cm.index.name = 'Actual'
    df_cm.columns.name = 'Predicted'
    return df_cm

# Label each raw matrix with its Actual/Predicted axes
knn_cm_df, dt_cm_df, rf_cm_df, xgb_cm_df = map(
    confusion_matrix_to_dataframe, (knn_cm, dt_cm, rf_cm, xgb_cm)
)

In [None]:
# Confusion matrix of the KNN model (rows: actual, columns: predicted)
knn_cm_df

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,89,23
1,27,40


In [None]:
# Confusion matrix of the Decision Tree model (rows: actual, columns: predicted)
dt_cm_df

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,89,23
1,16,51


In [None]:
# Confusion matrix of the Random Forest model (rows: actual, columns: predicted)
rf_cm_df

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,93,19
1,17,50


In [None]:
# Confusion matrix of the XGBoost model (rows: actual, columns: predicted)
xgb_cm_df

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,92,20
1,17,50


Kesimpulan :
- Model KNN memiliki jumlah prediksi salah terbesar dan jumlah prediksi benar lebih sedikit dibanding model lain sehingga bukan merupakan model terbaik
- Model Decision Tree, Random Forest, dan XGBoost memiliki jumlah prediksi salah dan benar yang tidak jauh berbeda
- Model Random Forest merupakan model terbaik karena memiliki jumlah prediksi benar lebih besar dan jumlah prediksi salah lebih sedikit dibanding model lain. Walaupun jumlah false negative-nya lebih besar dibanding model Decision Tree, jumlah false positive-nya masih lebih sedikit. Kesalahan false negative (aktualnya selamat tetapi diprediksi tidak selamat) dianggap lebih dapat diterima daripada false positive. Apabila sebaliknya (aktualnya tidak selamat tetapi diprediksi selamat), prediksi tersebut akan memberikan harapan palsu kepada keluarga korban dan menimbulkan biaya tambahan untuk evakuasi, dll.

## Accuracy, Precision, Recall, F1

In [None]:
def evaluate(y_true, y_pred, average='weighted'):
    """Compute the standard classification metrics for one set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        average: Averaging strategy forwarded to precision, recall and F1.
            Defaults to 'weighted' (the original behaviour). Pass 'binary'
            to score only the positive class.

    Returns:
        Tuple ``(cm, accuracy, precision, recall, f1)`` where ``cm`` is the
        confusion matrix and the rest are scalar scores.

    Note:
        With ``average='weighted'`` the recall is the support-weighted mean of
        the per-class recalls, which equals the accuracy. It therefore adds no
        information beyond ``accuracy`` in that mode.
    """
    cm = confusion_matrix(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average=average)
    recall = recall_score(y_true, y_pred, average=average)
    f1 = f1_score(y_true, y_pred, average=average)
    return cm, accuracy, precision, recall, f1


# Score every model against the same hold-out labels; each call yields
# (confusion matrix, accuracy, precision, recall, F1)

# KNN
knn_cm, knn_accuracy, knn_precision, knn_recall, knn_f1 = evaluate(y_true=y_test, y_pred=knn_pred)

# Decision Tree
dt_cm, dt_accuracy, dt_precision, dt_recall, dt_f1 = evaluate(y_true=y_test, y_pred=dt_pred)

# Random Forest
rf_cm, rf_accuracy, rf_precision, rf_recall, rf_f1 = evaluate(y_true=y_test, y_pred=rf_pred)

# XGBoost
xgb_cm, xgb_accuracy, xgb_precision, xgb_recall, xgb_f1 = evaluate(y_true=y_test, y_pred=xgb_pred)

In [None]:
# Accuracy of the KNN model on the hold-out set
knn_accuracy

0.7206703910614525

In [None]:
# Accuracy of the Decision Tree model on the hold-out set
dt_accuracy

0.7821229050279329

In [None]:
# Accuracy of the Random Forest model on the hold-out set
rf_accuracy

0.7988826815642458

In [None]:
# Accuracy of the XGBoost model on the hold-out set
xgb_accuracy

0.7932960893854749

In [None]:
# F1 score of the KNN model on the hold-out set
knn_f1

0.7188232721899289

In [None]:
# F1 score of the Decision Tree model on the hold-out set
dt_f1

0.7840170550185387

In [None]:
# F1 score of the Random Forest model on the hold-out set
rf_f1

0.7994555515094458

In [None]:
# F1 score of the XGBoost model on the hold-out set
xgb_f1

0.7941563031694505

Kesimpulan :

Dari tingkat Accuracy dan F1 terlihat bahwa model Random Forest memiliki performa terbaik, dengan tingkat Accuracy hampir 80% dalam memprediksi dengan benar apakah penumpang selamat atau tidak pada kejadian tenggelamnya kapal Titanic.

# **Feature Importance**

In [None]:
feature_names = X.columns
xgb_feature_importance = xgb_model.feature_importances_

# Pair each feature with its importance score and rank from most to least important
xgb_feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': xgb_feature_importance})
    .sort_values(by='Importance', ascending=False)
)
xgb_feature_importance_df

Unnamed: 0,Feature,Importance
1,Sex,0.548223
0,Pclass,0.204491
3,SibSp,0.095668
5,Fare,0.041693
4,Parch,0.040553
2,Age,0.038691
6,Embarked,0.03068


Kesimpulan :

Dapat dilihat dari Feature Importance model XGBoost, variabel yang paling berpengaruh terhadap prediksi selamat/tidaknya penumpang kapal Titanic adalah variabel sex/gender. Bisa jadi gender memang memengaruhi kemampuan penumpang untuk menyelamatkan diri. Variabel pclass/ticket class juga cukup berpengaruh, kemungkinan karena penumpang kelas atas menempati bagian kapal yang lebih nyaman dan aman. Sedangkan embarked/pelabuhan keberangkatan menjadi variabel yang paling tidak berpengaruh karena tempat keberangkatan kurang berhubungan dengan kejadian tenggelamnya kapal Titanic.

# **Prediksi Data Test**

In [None]:
# Read the unlabeled passengers, index them by passenger ID and discard
# the Name, Ticket and Cabin columns
new_data = (
    pd.read_csv('titanic-test.csv')
    .set_index("PassengerId")
    .drop(columns=['Name', 'Ticket', 'Cabin'])
)

In [None]:
# Convert categorical columns to numeric codes using the fixed coding
# documented for the training data:
#   Sex:      female=0, male=1
#   Embarked: C=0, Q=1, S=2
# Explicit mappings replace a LabelEncoder refit on this file. A refit derives
# the codes from whatever categories happen to be present here, so the codes
# would silently differ from the training codes if a category were absent.
sex_codes = {'female': 0, 'male': 1}
embarked_codes = {'C': 0, 'Q': 1, 'S': 2}

for column, codes in (('Sex', sex_codes), ('Embarked', embarked_codes)):
    encoded = new_data[column].map(codes)
    # map() yields NaN for anything outside the mapping; fail loudly rather
    # than feed the models a code they were never trained on
    if encoded.isna().any():
        raise ValueError(
            f"Column {column!r} contains missing or unexpected values; "
            f"expected only {sorted(codes)}"
        )
    new_data[column] = encoded.astype(int)

In [None]:
# Fill missing values

# Results are assigned back to the columns: calling fillna(..., inplace=True)
# on the selection new_data['Age'] is chained assignment, which pandas
# deprecates and which leaves `new_data` unchanged under Copy-on-Write.
# NOTE(review): the means are computed from this file rather than from the
# training data; confirm that is intended.
mean_age = new_data['Age'].mean()
new_data['Age'] = new_data['Age'].fillna(mean_age).round().astype(int)
mean_fare = new_data['Fare'].mean()
new_data['Fare'] = new_data['Fare'].fillna(mean_fare)

In [None]:
# Preview the preprocessed, unlabeled passengers
new_data

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
892,3,1,34,0,0,7.8292,1
893,3,0,47,1,0,7.0000,2
894,2,1,62,0,0,9.6875,1
895,3,1,27,0,0,8.6625,2
896,3,0,22,1,1,12.2875,2
...,...,...,...,...,...,...,...
1305,3,1,30,0,0,8.0500,2
1306,1,0,39,0,0,108.9000,0
1307,3,1,38,0,0,7.2500,2
1308,3,1,30,0,0,8.0500,2


In [None]:
# Score the unlabeled passengers with each of the fitted classifiers
knn_predictions, dt_predictions, rf_predictions, xgb_predictions = (
    model.predict(new_data) for model in (knn, dt, rf, xgb_model)
)

# assign() returns a new frame, so new_data itself stays free of prediction columns
combined_data = new_data.assign(
    KNN_Predictions=knn_predictions,
    DT_Predictions=dt_predictions,
    RF_Predictions=rf_predictions,
    XGB_Predictions=xgb_predictions,
)

In [None]:
# Features of the unlabeled passengers alongside the four models' predictions
combined_data

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,KNN_Predictions,DT_Predictions,RF_Predictions,XGB_Predictions
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
892,3,1,34,0,0,7.8292,1,0,0,0,0
893,3,0,47,1,0,7.0000,2,0,0,0,0
894,2,1,62,0,0,9.6875,1,0,0,0,0
895,3,1,27,0,0,8.6625,2,1,1,1,0
896,3,0,22,1,1,12.2875,2,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1305,3,1,30,0,0,8.0500,2,0,0,0,0
1306,1,0,39,0,0,108.9000,0,1,1,1,1
1307,3,1,38,0,0,7.2500,2,0,0,0,0
1308,3,1,30,0,0,8.0500,2,0,0,0,0


Kesimpulan :

Tabel di atas menampilkan hasil prediksi dari 4 model klasifikasi yang sudah dibuat sebelumnya. Keempat prediksi tersebut dapat dibandingkan untuk melihat hasil prediksi mana yang paling meyakinkan, misalnya ketika semua model memberikan prediksi yang sama. Akan tetapi, bisa juga langsung melihat hasil prediksi dari model Random Forest saja karena merupakan model terbaik dari hasil evaluasi model sebelumnya.