In [2]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
import warnings

# Abaikan warning terkait mode dan future warning lainnya
warnings.filterwarnings("ignore", category=FutureWarning)

# 1. Load train dataset
df = pd.read_csv('titanic.csv')

# 2. Load test dataset
test_df = pd.read_csv('titanic_test.csv')

# 3. Train data: Fitur (Age, Fare) dan hapus missing value
train_data_raw = df[['Age', 'Fare']]
pos_missing_train = train_data_raw[train_data_raw.isnull().any(axis=1)].index.tolist()
train_data = train_data_raw.dropna().reset_index(drop=True)

# 4. Test data: Fitur (Age, Fare) dan hapus missing value
test_data_raw = test_df[['Age', 'Fare']]
pos_missing_test = test_data_raw[test_data_raw.isnull().any(axis=1)].index.tolist()
test_data = test_data_raw.dropna().reset_index(drop=True)

# 5. Train label: kolom Survived dari train dataset, yang bukan missing
train_label = df.drop(index=pos_missing_train)['Survived'].reset_index(drop=True)

# 6. Test label: dari file titanic_testlabel.csv, yang bukan missing
test_label_all = pd.read_csv('titanic_testlabel.csv')
test_label = test_label_all.drop(index=pos_missing_test).reset_index(drop=True)
test_label = test_label.iloc[:, 0]  # pastikan hanya 1 kolom

# 7. Normalisasi Min-Max (0–1) pada train_data
scaler = MinMaxScaler()
train_data_norm = pd.DataFrame(scaler.fit_transform(train_data), columns=['Age', 'Fare'])

# Catat min dan max
min_vals = scaler.data_min_
max_vals = scaler.data_max_

# 8. Normalisasi test_data dengan min-max dari train
test_data_norm = pd.DataFrame(scaler.transform(test_data), columns=['Age', 'Fare'])

# 9. Klasifikasi KNN untuk k = 1 sampai 10
error_results = []
for k in range(1, 11):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(train_data_norm, train_label)
    class_result = knn.predict(test_data_norm)
    
    # Bandingkan hasil klasifikasi
    errors = np.sum(class_result != test_label.values)
    error_ratio = errors / len(test_label)
    
    error_results.append({
        'k': k,
        'Errors': errors,
        'Total Test Data': len(test_label),
        'Error Ratio': round(error_ratio, 4)
    })

# 10. Tampilkan hasil perbandingan dalam bentuk tabel
error_df = pd.DataFrame(error_results)
print("📊 Tabel Error Ratio untuk Setiap Nilai k:")
display(error_df)

# Tampilkan min dan max
minmax_df = pd.DataFrame([min_vals, max_vals], index=["Min", "Max"], columns=['Age', 'Fare'])
print("\n📉 Nilai Min dan Max dari Train Data:")
display(minmax_df)

# Tampilkan posisi data yang hilang
print("\n📍 Posisi Data Hilang (Train):")
print(pos_missing_train)

print("\n📍 Posisi Data Hilang (Test):")
print(pos_missing_test)


📊 Tabel Error Ratio untuk Setiap Nilai k:


Unnamed: 0,k,Errors,Total Test Data,Error Ratio
0,1,331,331,1.0
1,2,331,331,1.0
2,3,331,331,1.0
3,4,331,331,1.0
4,5,331,331,1.0
5,6,331,331,1.0
6,7,331,331,1.0
7,8,331,331,1.0
8,9,331,331,1.0
9,10,331,331,1.0



📉 Nilai Min dan Max dari Train Data:


Unnamed: 0,Age,Fare
Min,0.42,0.0
Max,80.0,512.3292



📍 Posisi Data Hilang (Train):
[5, 17, 19, 26, 28, 29, 31, 32, 36, 42, 45, 46, 47, 48, 55, 64, 65, 76, 77, 82, 87, 95, 101, 107, 109, 121, 126, 128, 140, 154, 158, 159, 166, 168, 176, 180, 181, 185, 186, 196, 198, 201, 214, 223, 229, 235, 240, 241, 250, 256, 260, 264, 270, 274, 277, 284, 295, 298, 300, 301, 303, 304, 306, 324, 330, 334, 335, 347, 351, 354, 358, 359, 364, 367, 368, 375, 384, 388, 409, 410, 411, 413, 415, 420, 425, 428, 431, 444, 451, 454, 457, 459, 464, 466, 468, 470, 475, 481, 485, 490, 495, 497, 502, 507, 511, 517, 522, 524, 527, 531, 533, 538, 547, 552, 557, 560, 563, 564, 568, 573, 578, 584, 589, 593, 596, 598, 601, 602, 611, 612, 613, 629, 633, 639, 643, 648, 650, 653, 656, 667, 669, 674, 680, 692, 697, 709, 711, 718, 727, 732, 738, 739, 740, 760, 766, 768, 773, 776, 778, 783, 790, 792, 793, 815, 825, 826, 828, 832, 837, 839, 846, 849, 859, 863, 868, 878, 888]

📍 Posisi Data Hilang (Test):
[10, 22, 29, 33, 36, 39, 41, 47, 54, 58, 65, 76, 83, 84, 85, 88, 91, 93, 102