In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import heapq
import statistics

In [5]:
data = pd.read_csv("kendaraan_clean_full.csv")

In [6]:
X = data.drop(columns=["Tertarik"])
y = data[["Tertarik"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [7]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(227225, 14) (56807, 14) (227225, 1) (56807, 1)


In [8]:
class KDNode:
    def __init__(self, points, y, left = None, right = None, distance = np.inf):
        self.points = points
        self.y = y
        self.left = left
        self.right = right
        self.distance = distance
    def __lt__(self, other):
        return self.distance < other.distance

In [15]:
class KDTree:
    def __init__(self, k = 2):
        self.tree = None
        self.k = k
        self.depth = 0
        self.heap = []
    
    def fit(self, X, y):
        self.X = X
        self.y = y

        if isinstance(X, pd.DataFrame):
            X = X.values
        
        if isinstance(y, pd.DataFrame):
            y = y.values

        self.tree = self._construct_tree(np.array(X), np.array(y).reshape(-1), 0)
    
    def _construct_tree(self, points, y, depth):
        if len(points) == 0:
            return None
        
        k = len(points[0])
        axis = depth % k
        
        sort_by_axis = np.argsort(points[:, axis])
        sorted_points = points[sort_by_axis]
        sorted_y =  y[sort_by_axis]
        mid = len(sorted_points) // 2

        return KDNode(
            sorted_points[mid],
            sorted_y[mid],
            self._construct_tree(sorted_points[:mid], sorted_y[:mid], depth + 1),
            self._construct_tree(sorted_points[mid + 1:],sorted_y[mid + 1:], depth + 1)
        )
    
    def _height(self, root):
        if root is None:
            return 0
        return max(self._height(root.left), self._height(root.right)) + 1

    def _isBalanced(self, root):
        if root is None:
            return True
        
        left_height = self._height(root.left)
        right_height = self._height(root.right)

        return abs(left_height - right_height) <= 1 and (self._isBalanced(root.left) and self._isBalanced(root.right) is True)

    def isBalanced(self):
        return self._isBalanced(self.tree)

    def height(self):
        return self._height(self.tree)

    def nearest_neighbour_search(self, query_point):
        k = len(query_point)
        heapq.heapify(self.heap)

        def search(tree, depth):
            if tree is None:
                return

            axis = depth % k
            diff = query_point[axis] - tree.points[axis]
            
            if diff <= 0:
                close, away = tree.left, tree.right
            else:
                close, away = tree.right, tree.left

            
            # d = euclidean(tree.points, query_point)
            d = np.linalg.norm(query_point - tree.points)
            tree.distance = -d
            
            search(close, depth + 1)

            if len(self.heap) < self.k:
                heapq.heappush(self.heap, tree)
            else:
                if heapq.nsmallest(1, self.heap)[0].distance < -d:
                    heapq.heapreplace(self.heap, tree)

            if len(self.heap) < self.k or abs(heapq.nsmallest(1, self.heap)[0].distance) > abs(d):
                search(away, depth + 1)

        search(self.tree, 0)
        nds = heapq.nsmallest(self.k, self.heap)
        return nds

    def predict(self, X_test):
        results = []
        if isinstance(X_test, pd.DataFrame):
            X_test = X_test.values
            
        for test in X_test:
            result = self.nearest_neighbour_search(test)
            predict_values = [x.y for x in result]
            results.append(statistics.mode(predict_values))
        return results

In [19]:
kdtree = KDTree(k = 5)
kdtree.fit(X_train, y_train)

In [20]:
y_pred = kdtree.predict(X_test)

In [21]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

precision = accuracy_score(y_pred, y_test) * 100
class_report = classification_report(y_pred, y_test)
print("Accuracy with K-NN: {0:.2f}%".format(precision))
print(class_report)
print(confusion_matrix(y_pred, y_test))

Accuracy with K-NN: 86.73%
              precision    recall  f1-score   support

           0       0.99      0.88      0.93     56056
           1       0.01      0.13      0.03       751

    accuracy                           0.87     56807
   macro avg       0.50      0.51      0.48     56807
weighted avg       0.97      0.87      0.92     56807

[[49165  6891]
 [  650   101]]


In [22]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, p=2, algorithm='kd_tree')
knn.fit(X_train, y_train)
y_pred_1 = knn.predict(X_test)

  return self._fit(X, y)


In [None]:
precision1 = accuracy_score(y_pred_1, y_test) * 100
class_report = classification_report(y_pred_1, y_test)
print("Accuracy with K-NN: {0:.2f}%".format(precision1))
print(class_report)
print(confusion_matrix(y_pred_1, y_test))

Accuracy with K-NN: 84.68%
              precision    recall  f1-score   support

           0       0.94      0.89      0.91     52204
           1       0.21      0.31      0.25      4603

    accuracy                           0.85     56807
   macro avg       0.57      0.60      0.58     56807
weighted avg       0.88      0.85      0.86     56807

[[46659  5545]
 [ 3156  1447]]
