In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import heapq
import statistics

In [None]:
data = pd.read_csv("kendaraan_clean_pca.csv")

In [None]:
X = data.drop(columns=["Tertarik"])
y = data[["Tertarik"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(398857, 2) (99715, 2) (398857, 1) (99715, 1)


In [40]:
class KDNode:
    def __init__(self, points, y, left = None, right = None, distance = np.inf):
        self.points = points
        self.y = y
        self.left = left
        self.right = right
        self.distance = distance
    def __lt__(self, other):
        return self.distance < other.distance

In [44]:
class KDTree:
    def __init__(self, k = 2):
        self.tree = None
        self.k = k
        self.depth = 0
        self.heap = []
    
    def fit(self, X, y):
        self.X = X
        self.y = y

        if isinstance(X, pd.DataFrame):
            X = X.values
        
        if isinstance(y, pd.DataFrame):
            y = y.values

        self.tree = self._construct_tree(np.array(X), np.array(y).reshape(-1), 0)
    
    def _construct_tree(self, points, y, depth):
        if len(points) == 0:
            return None
        
        k = len(points[0])
        axis = depth % k
        
        sort_by_axis = np.argsort(points[:, axis])
        sorted_points = points[sort_by_axis]
        sorted_y =  y[sort_by_axis]
        mid = len(sorted_points) // 2

        return KDNode(
            sorted_points[mid],
            sorted_y[mid],
            self._construct_tree(sorted_points[:mid], sorted_y[:mid], depth + 1),
            self._construct_tree(sorted_points[mid + 1:],sorted_y[mid + 1:], depth + 1)
        )
    
    def _height(self, root):
        if root is None:
            return 0
        return max(self._height(root.left), self._height(root.right)) + 1

    def _isBalanced(self, root):
        if root is None:
            return True
        
        left_height = self._height(root.left)
        right_height = self._height(root.right)

        return abs(left_height - right_height) <= 1 and (self._isBalanced(root.left) and self._isBalanced(root.right) is True)

    def isBalanced(self):
        return self._isBalanced(self.tree)

    def height(self):
        return self._height(self.tree)

    def nearest_neighbour_search(self, query_point):
        k = len(query_point)
        # heapq.heapify(self.heap)
        best = None

        def search(tree, depth):
            if tree is None:
                return

            nonlocal best


            
            # d = euclidean(tree.points, query_point)
            d = np.linalg.norm(query_point - tree.points)
            tree.distance = d

            if best is None or d < best.distance:
                best = KDNode(tree.points, tree.y, distance=d)
            
            axis = depth % k
            diff = query_point[axis] - tree.points[axis]
            
            if diff <= 0:
                close, away = tree.left, tree.right
            else:
                close, away = tree.right, tree.left

            search(close, depth + 1)

            # if len(self.heap) < self.k:
            #     heapq.heappush(self.heap, tree)
            # else:
            #     if heapq.nsmallest(1, self.heap)[0].distance < -d:
            #         heapq.heapreplace(self.heap, tree)

            if diff * diff < best.distance:
                search(away, depth + 1)

        search(self.tree, 0)
        # nds = heapq.nlargest(self.k, self.heap)
        return best

    def predict(self, X_test):
        results = []
        if isinstance(X_test, pd.DataFrame):
            X_test = X_test.values
            
        for test in X_test:
            result = self.nearest_neighbour_search(test)
            # predict_values = [x.y for x in result]
            # results.append(statistics.mode(predict_values))
            results.append(result.y)
        return results

In [45]:
kdtree = KDTree(k = 1)
kdtree.fit(X_train, y_train)

In [46]:
y_pred = kdtree.predict(X_test)

In [48]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

precision = accuracy_score(y_pred, y_test) * 100
class_report = classification_report(y_pred, y_test)
print("Accuracy with K-NN: {0:.2f}%".format(precision))
print(class_report)
print(confusion_matrix(y_pred, y_test))

Accuracy with K-NN: 94.04%
              precision    recall  f1-score   support

           0       0.88      1.00      0.94     44056
           1       1.00      0.90      0.94     55659

    accuracy                           0.94     99715
   macro avg       0.94      0.95      0.94     99715
weighted avg       0.95      0.94      0.94     99715

[[43957    99]
 [ 5844 49815]]


In [49]:
df_test = pd.read_csv("kendaraan_clean_pca_test.csv")
y_pred_2_scratch = kdtree.predict(df_test.drop(columns=["Tertarik"]))

In [50]:
precision3 = accuracy_score(y_pred_2_scratch, df_test[["Tertarik"]]) * 100
class_report3 = classification_report(y_pred_2_scratch, df_test[["Tertarik"]])
print("Accuracy with K-NN: {0:.2f}%".format(precision3))
print(class_report3)
print(confusion_matrix(y_pred_2_scratch, df_test[["Tertarik"]]))

Accuracy with K-NN: 98.80%
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    243532
           1       1.00      0.98      0.99    255040

    accuracy                           0.99    498572
   macro avg       0.99      0.99      0.99    498572
weighted avg       0.99      0.99      0.99    498572

[[243421    111]
 [  5865 249175]]


In [51]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1, p=2, algorithm='kd_tree')
knn.fit(X_train, y_train)
y_pred_1 = knn.predict(X_test)

  return self._fit(X, y)


In [52]:
precision1 = accuracy_score(y_pred_1, y_test) * 100
class_report = classification_report(y_pred_1, y_test)
print("Accuracy with K-NN: {0:.2f}%".format(precision1))
print(class_report)
print(confusion_matrix(y_pred_1, y_test))

Accuracy with K-NN: 94.04%
              precision    recall  f1-score   support

           0       0.88      1.00      0.94     44059
           1       1.00      0.90      0.94     55656

    accuracy                           0.94     99715
   macro avg       0.94      0.95      0.94     99715
weighted avg       0.95      0.94      0.94     99715

[[43959   100]
 [ 5842 49814]]


In [53]:
df_test = pd.read_csv("kendaraan_clean_pca_test.csv")
y_pred_2 = knn.predict(df_test.drop(columns=["Tertarik"]))

In [54]:
precision2 = accuracy_score(y_pred_2, df_test[["Tertarik"]]) * 100
class_report2 = classification_report(y_pred_2, df_test[["Tertarik"]])
print("Accuracy with K-NN: {0:.2f}%".format(precision2))
print(class_report2)
print(confusion_matrix(y_pred_2, df_test[["Tertarik"]]))

Accuracy with K-NN: 98.80%
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    243552
           1       1.00      0.98      0.99    255020

    accuracy                           0.99    498572
   macro avg       0.99      0.99      0.99    498572
weighted avg       0.99      0.99      0.99    498572

[[243426    126]
 [  5860 249160]]
