In [77]:
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay
from sklearn.preprocessing import MinMaxScaler, StandardScaler

X_train = pd.read_csv('../data/X_train.csv')
X_test = pd.read_csv('../data/X_test.csv')
Y_train = pd.read_csv('../data/Y_train.csv')
Y_test = pd.read_csv('../data/Y_test.csv')

enc = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="kmeans")
Y_train_binned = enc.fit_transform(Y_train)
Y_test_binned = enc.fit_transform(Y_test)

X_train.shape, X_test.shape, Y_train.shape, Y_test.shape, Y_train_binned.shape, Y_test_binned.shape

((2344, 136), (586, 136), (2344, 1), (586, 1), (2344, 1), (586, 1))

# k-Nearest-Neighbor Classifiers

In [78]:
kNN = KNeighborsClassifier(n_neighbors=2)
kNN.fit(X_train, Y_train_binned.ravel())

y_pred_train = kNN.predict(X_train)
print("Accuracy: {:.3f}".format(accuracy_score(y_true=Y_train_binned,
                                               y_pred=y_pred_train)))

y_pred_test = kNN.predict(X_test)
print("Accuracy: {:.3f}".format(accuracy_score(y_true=Y_test_binned,
                                               y_pred=y_pred_test)))

Accuracy: 0.774
Accuracy: 0.502


# Scaling

In [79]:
for f in X_train.columns:
    print("Feature {:s} -> Scale: {:7.2f}".format(f, X_train.loc[:, f].max() - X_train.loc[:, f].min()))

Feature Lot_Frontage -> Scale:  313.00
Feature Lot_Area -> Scale: 213775.00
Feature Utilities -> Scale:    1.00
Feature Land_Slope -> Scale:    2.00
Feature Overall_Qual -> Scale:    9.00
Feature Overall_Cond -> Scale:    8.00
Feature Year_Built -> Scale:  138.00
Feature Year_Remod_Add -> Scale:   60.00
Feature Mas_Vnr_Area -> Scale: 1600.00
Feature Exter_Qual -> Scale:    3.00
Feature Exter_Cond -> Scale:    3.00
Feature BsmtFin_SF_1 -> Scale:    7.00
Feature BsmtFin_SF_2 -> Scale: 1474.00
Feature Bsmt_Unf_SF -> Scale: 2336.00
Feature Total_Bsmt_SF -> Scale: 5095.00
Feature First_Flr_SF -> Scale: 4761.00
Feature Second_Flr_SF -> Scale: 1872.00
Feature Low_Qual_Fin_SF -> Scale: 1064.00
Feature Gr_Liv_Area -> Scale: 4761.00
Feature Bsmt_Full_Bath -> Scale:    3.00
Feature Bsmt_Half_Bath -> Scale:    2.00
Feature Full_Bath -> Scale:    4.00
Feature Half_Bath -> Scale:    2.00
Feature Bedroom_AbvGr -> Scale:    8.00
Feature Kitchen_AbvGr -> Scale:    3.00
Feature Kitchen_Qual -> Scale:   

## Applico MinMaxScaler

In [80]:
scaler = MinMaxScaler()
scaler.fit(X_train)

best_accuracy = 0
best_k = 1
for k in range(1, 100):
    kNN = KNeighborsClassifier(n_neighbors=k)
    kNN.fit(scaler.transform(X_train), Y_train_binned.ravel())
    y_pred = kNN.predict(scaler.transform(X_test))

    # compute Accuracy
    acc = accuracy_score(y_true=Y_test_binned, y_pred=y_pred)
    best_accuracy = acc if acc > best_accuracy else best_accuracy
    best_k = k if best_accuracy == acc else best_k
    print("k: {:2d} | Accuracy {:.3f}".format(k, acc))

print("\nBest k: {:2d} | Accuracy {:.3f}".format(best_k, best_accuracy))

k:  1 | Accuracy 0.520
k:  2 | Accuracy 0.491
k:  3 | Accuracy 0.529
k:  4 | Accuracy 0.532
k:  5 | Accuracy 0.553
k:  6 | Accuracy 0.548
k:  7 | Accuracy 0.551
k:  8 | Accuracy 0.546
k:  9 | Accuracy 0.563
k: 10 | Accuracy 0.555
k: 11 | Accuracy 0.549
k: 12 | Accuracy 0.529
k: 13 | Accuracy 0.543
k: 14 | Accuracy 0.534
k: 15 | Accuracy 0.527
k: 16 | Accuracy 0.536
k: 17 | Accuracy 0.541
k: 18 | Accuracy 0.539
k: 19 | Accuracy 0.534
k: 20 | Accuracy 0.532
k: 21 | Accuracy 0.536
k: 22 | Accuracy 0.526
k: 23 | Accuracy 0.539
k: 24 | Accuracy 0.532
k: 25 | Accuracy 0.536
k: 26 | Accuracy 0.527
k: 27 | Accuracy 0.526
k: 28 | Accuracy 0.531
k: 29 | Accuracy 0.529
k: 30 | Accuracy 0.520
k: 31 | Accuracy 0.526
k: 32 | Accuracy 0.531
k: 33 | Accuracy 0.529
k: 34 | Accuracy 0.526
k: 35 | Accuracy 0.532
k: 36 | Accuracy 0.532
k: 37 | Accuracy 0.532
k: 38 | Accuracy 0.529
k: 39 | Accuracy 0.526
k: 40 | Accuracy 0.531
k: 41 | Accuracy 0.534
k: 42 | Accuracy 0.538
k: 43 | Accuracy 0.532
k: 44 | Acc

In [81]:
scaler = StandardScaler()
scaler.fit(X_train)

best_accuracy = 0
best_k = 1
for k in range(1, 100):
    kNN = KNeighborsClassifier(n_neighbors=k)
    kNN.fit(scaler.transform(X_train), Y_train_binned.ravel())
    y_pred = kNN.predict(scaler.transform(X_test))

    # compute Accuracy
    acc = accuracy_score(y_true=Y_test_binned, y_pred=y_pred)
    best_accuracy = acc if acc > best_accuracy else best_accuracy
    best_k = k if best_accuracy == acc else best_k
    print("k: {:2d} | Accuracy {:.3f}".format(k, acc))

print("\nBest k: {:2d} | Accuracy {:.3f}".format(best_k, best_accuracy))

k:  1 | Accuracy 0.531
k:  2 | Accuracy 0.503
k:  3 | Accuracy 0.546
k:  4 | Accuracy 0.549
k:  5 | Accuracy 0.570
k:  6 | Accuracy 0.555
k:  7 | Accuracy 0.570
k:  8 | Accuracy 0.561
k:  9 | Accuracy 0.565
k: 10 | Accuracy 0.563
k: 11 | Accuracy 0.549
k: 12 | Accuracy 0.548
k: 13 | Accuracy 0.544
k: 14 | Accuracy 0.544
k: 15 | Accuracy 0.541
k: 16 | Accuracy 0.560
k: 17 | Accuracy 0.558
k: 18 | Accuracy 0.553
k: 19 | Accuracy 0.549
k: 20 | Accuracy 0.549
k: 21 | Accuracy 0.549
k: 22 | Accuracy 0.539
k: 23 | Accuracy 0.536
k: 24 | Accuracy 0.555
k: 25 | Accuracy 0.543
k: 26 | Accuracy 0.543
k: 27 | Accuracy 0.539
k: 28 | Accuracy 0.543
k: 29 | Accuracy 0.546
k: 30 | Accuracy 0.546
k: 31 | Accuracy 0.549
k: 32 | Accuracy 0.553
k: 33 | Accuracy 0.560
k: 34 | Accuracy 0.546
k: 35 | Accuracy 0.556
k: 36 | Accuracy 0.544
k: 37 | Accuracy 0.546
k: 38 | Accuracy 0.549
k: 39 | Accuracy 0.546
k: 40 | Accuracy 0.546
k: 41 | Accuracy 0.544
k: 42 | Accuracy 0.546
k: 43 | Accuracy 0.549
k: 44 | Acc