In [None]:
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier

# Ignore all FutureWarning messages (motivated by SciPy's `mode`
# future-behavior warning) so they do not clutter the notebook output.
warnings.filterwarnings(action="ignore", category=FutureWarning)

# Step 1: read the training set and preview its first rows.
dataset = pd.read_csv("titanic.csv")
print("1. Dataset Train Loaded:")
train_preview = dataset.head()
display(train_preview)

# Step 2: read the test set and preview its first rows.
test_dataset = pd.read_csv("titanic_test.csv")
print("\n2. Dataset Test Loaded:")
test_preview = test_dataset.head()
display(test_preview)

# Step 3: take the (Age, Fare) features from the training set and drop every
# row in which either feature is missing. The boolean mask is kept so the
# label column can be filtered with exactly the same rows in step 5.
train_data = dataset[['Age', 'Fare']]
train_has_missing = train_data.isnull().any(axis=1)
pos_missing_train = train_has_missing[train_has_missing].index
train_data_cleaned = train_data[~train_has_missing].reset_index(drop=True)
print(f"\n3. Train Data (Age, Fare) setelah dibersihkan dari missing values. Total: {len(train_data_cleaned)} baris")
display(train_data_cleaned.head())

# Step 4: same treatment for the test set. `pos_missing_test` records which
# original row labels were removed so the test labels can be aligned later.
test_data = test_dataset[['Age', 'Fare']]
test_has_missing = test_data.isnull().any(axis=1)
pos_missing_test = test_has_missing[test_has_missing].index
test_data_cleaned = test_data[~test_has_missing].reset_index(drop=True)
print(f"\n4. Test Data (Age, Fare) setelah dibersihkan dari missing values. Total: {len(test_data_cleaned)} baris")
display(test_data_cleaned.head())

# Step 5: training labels (Survived) restricted to the rows that survived the
# cleaning in step 3, re-indexed from 0 to line up with `train_data_cleaned`.
train_label = dataset.loc[~train_has_missing, 'Survived'].reset_index(drop=True)
print("\n5. Train Label setelah disesuaikan:")
display(train_label.head())

# 6. Load the test labels and align them with the cleaned test features.
test_label = pd.read_csv("titanic_testlabel.csv")

# The filtering below removes rows by position (the index labels recorded in
# `pos_missing_test`), which is only meaningful when the label file has exactly
# one row per row of the test set.
if len(test_label) != len(test_dataset):
    raise ValueError(
        f"titanic_testlabel.csv has {len(test_label)} rows but the test set has "
        f"{len(test_dataset)}; labels cannot be aligned with the test features"
    )

# Select the label column by name. Blindly taking the first column returns
# PassengerId when the file is laid out as (PassengerId, Survived); every
# prediction is then compared against an ID and the error ratio is always 1.00.
if 'Survived' in test_label.columns:
    test_label_cleaned = test_label['Survived']
else:
    # No column named 'Survived': fall back to the first column. `iloc[:, 0]`
    # always yields a Series, unlike `squeeze()`, which collapses a one-row,
    # one-column frame into a scalar.
    test_label_cleaned = test_label.iloc[:, 0]

# Drop the labels of the test rows that were removed for missing Age/Fare and
# re-index from 0 so the labels line up with `test_data_cleaned`.
keep_rows = ~test_label_cleaned.index.isin(pos_missing_test)
test_label_cleaned = test_label_cleaned.loc[keep_rows].reset_index(drop=True)
print("\n6. Test Label setelah disesuaikan:")
display(test_label_cleaned.head())

# Step 7: fit a min-max scaler on the cleaned training features and rescale
# them with it.
scaler = MinMaxScaler()
train_data_scaled = scaler.fit_transform(train_data_cleaned)
print("\n7. Train Data setelah dinormalisasi:")
train_scaled_frame = pd.DataFrame(train_data_scaled, columns=['Age', 'Fare'])
display(train_scaled_frame.head())

# Step 8: rescale the test features with the scaler fitted on the training
# data (no re-fitting), so both sets share the same transformation.
test_data_scaled = scaler.transform(test_data_cleaned)
print("\n8. Test Data setelah dinormalisasi:")
test_scaled_frame = pd.DataFrame(test_data_scaled, columns=['Age', 'Fare'])
display(test_scaled_frame.head())

# 9-10. Classify with KNN for k = 1..10 and report the error ratio for each k.
print("\n9-10. Hasil Klasifikasi dan Error Ratio:")

# The ground-truth labels do not depend on k, so convert them once instead of
# on every iteration.
test_array = test_label_cleaned.to_numpy()

# A label value that never occurs in the training labels can never be
# predicted, so its rows always count as errors. Surface that explicitly: it
# usually means the wrong column was loaded as the test label.
unseen_labels = np.setdiff1d(test_array, train_label.to_numpy())
if unseen_labels.size:
    warnings.warn(
        f"test labels contain {unseen_labels.size} value(s) absent from the "
        f"training labels (e.g. {unseen_labels[:5].tolist()}); check that the "
        "correct label column was loaded"
    )

for k in range(1, 11):
    # Train a fresh classifier for this k and predict the cleaned test set.
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(train_data_scaled, train_label)
    class_result = knn.predict(test_data_scaled)

    # Error ratio = fraction of test rows whose prediction differs from the label.
    error_count = np.sum(class_result != test_array)
    error_ratio = error_count / len(test_array)

    print(f"k = {k}, Error Ratio = {error_ratio:.2f}")

# These run after the loop, so they preview the predictions of the last k only.
print("Prediksi:", class_result[:5])
print("Label Asli:", test_array[:5])


1. Dataset Train Loaded:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S



2. Dataset Test Loaded:


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S



3. Train Data (Age, Fare) setelah dibersihkan dari missing values. Total: 714 baris


Unnamed: 0,Age,Fare
0,22.0,7.25
1,38.0,71.2833
2,26.0,7.925
3,35.0,53.1
4,35.0,8.05



4. Test Data (Age, Fare) setelah dibersihkan dari missing values. Total: 331 baris


Unnamed: 0,Age,Fare
0,34.5,7.8292
1,47.0,7.0
2,62.0,9.6875
3,27.0,8.6625
4,22.0,12.2875



5. Train Label setelah disesuaikan:


0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64


6. Test Label setelah disesuaikan:


0    892
1    893
2    894
3    895
4    896
Name: PassengerId, dtype: int64


7. Train Data setelah dinormalisasi:


Unnamed: 0,Age,Fare
0,0.271174,0.014151
1,0.472229,0.139136
2,0.321438,0.015469
3,0.434531,0.103644
4,0.434531,0.015713



8. Test Data setelah dinormalisasi:


Unnamed: 0,Age,Fare
0,0.428248,0.015282
1,0.585323,0.013663
2,0.773813,0.018909
3,0.334004,0.016908
4,0.271174,0.023984



9-10. Hasil Klasifikasi dan Error Ratio:
k = 1, Error Ratio = 1.00
k = 2, Error Ratio = 1.00
k = 3, Error Ratio = 1.00
k = 4, Error Ratio = 1.00
k = 5, Error Ratio = 1.00
k = 6, Error Ratio = 1.00
k = 7, Error Ratio = 1.00
k = 8, Error Ratio = 1.00
k = 9, Error Ratio = 1.00
k = 10, Error Ratio = 1.00
Prediksi: [0 0 0 1 0]
Label Asli: [892 893 894 895 896]
