In [337]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from queue import PriorityQueue
import heapq
import statistics
from collections import Counter

In [338]:
data = pd.read_csv("kendaraan_clean_PCA_original.csv")
# data = data.sample(10000)

In [339]:
X = data.drop(columns=["Tertarik"])
y = data[["Tertarik"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [340]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(227225, 2) (56807, 2) (227225, 1) (56807, 1)


In [341]:
class KDNode:
    def __init__(self, points, y, left = None, right = None, distance = np.inf):
        self.points = points
        self.y = y
        self.left = left
        self.right = right
        self.distance = distance
    def __lt__(self, other):
        return self.distance < other.distance

In [342]:
class KDTree:
    def __init__(self, k = 2, p = 0):
        self.tree = None
        self.k = k
        self.depth = 0
        self.heap = []
    
    def fit(self, X, y):
        self.X = X
        self.y = y

        if isinstance(X, pd.DataFrame):
            X = X.values
        
        if isinstance(y, pd.DataFrame):
            y = y.values

        self.tree = self._construct_tree(np.array(X), np.array(y).reshape(-1), 0)
    
    def _construct_tree(self, points, y, depth):
        if len(points) == 0:
            return None
        
        k = len(points[0])
        axis = depth % k
        
        sort_by_axis = np.argsort(points[:, axis])
        sorted_points = points[sort_by_axis]
        sorted_y =  y[sort_by_axis]
        mid = len(sorted_points) // 2

        return KDNode(
            sorted_points[mid],
            sorted_y[mid],
            self._construct_tree(sorted_points[:mid], sorted_y[:mid], depth + 1),
            self._construct_tree(sorted_points[mid + 1:],sorted_y[mid + 1:], depth + 1)
        )
  
    def nearest_neighbour_search(self, query_point):
        k = len(query_point)
        heap = []
        queue = PriorityQueue()
        
        def search(node, depth):
            if node == None:
                return

            nonlocal heap
            nonlocal queue

            # scanned_nodes.append(node)
            
            d = np.linalg.norm(query_point - node.points, or)
            node.distance = -d
            # queue.put(node)
            heapq.heappush(heap, node)
            if len(heap) < self.k:
                heapq.heappush(heap, node)
            else:
                heapq.heappushpop(heap, node)

            axis = depth % k

            if query_point[axis] < node.points[axis]:
                close, other = node.left, node.right
            else:
                close, other = node.right, node.left

            search(close, depth + 1)

            delta = abs(query_point[axis] - node.points[axis])
            # nearest = queue.queue[0].distance
            nearest = abs(heap[-1].distance)
            # isFull = not queue.full()
            isFull = len(heap) > self.k
            if len(heap) < self.k or delta < nearest:
                search(other, depth + 1)

        # search(self.tree, 0)
        search(self.tree, 0)
        # nds = heapq.nlargest(self.k, self.heap)
        # neighbours = []
        # for _ in range(self.k):
            # neighbours.append(heapq.heappop(heap))
            # neighbours.append(queue.get())
        return heap

    def predict(self, X_test):
        results = []
        if isinstance(X_test, pd.DataFrame):
            X_test = X_test.values
            
        for test in X_test:
            result = self.nearest_neighbour_search(test)
            predict_values = []
            for item in result:
                predict_values.append(item.y)
            # predict_values = [item for item in result]
            # print(result)
            counter = Counter(predict_values)
            results.append(counter.most_common(1)[0][0])
            # results.append(result.y)
        return results

In [343]:
kdtree = KDTree(k = 2)
kdtree.fit(X_train, y_train)

In [344]:
y_pred = kdtree.predict(X_test)

In [345]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

precision = accuracy_score(y_pred, y_test) * 100
class_report = classification_report(y_pred, y_test)
print("Accuracy with K-NN: {0:.2f}%".format(precision))
print(class_report)
print(confusion_matrix(y_pred, y_test))

Accuracy with K-NN: 86.96%
              precision    recall  f1-score   support

           0       0.98      0.88      0.93     55381
           1       0.07      0.35      0.12      1426

    accuracy                           0.87     56807
   macro avg       0.53      0.62      0.52     56807
weighted avg       0.96      0.87      0.91     56807

[[48895  6486]
 [  920   506]]


In [348]:
# df_test = pd.read_csv("kendaraan_clean_pca_test.csv")
# y_pred_2_scratch = kdtree.predict(df_test.drop(columns=["Tertarik"]))

In [354]:
# precision3 = accuracy_score(y_pred_2_scratch, df_test[["Tertarik"]]) * 100
# class_report3 = classification_report(y_pred_2_scratch, df_test[["Tertarik"]])
# print("Accuracy with K-NN: {0:.2f}%".format(precision3))
# print(class_report3)
# print(confusion_matrix(y_pred_2_scratch, df_test[["Tertarik"]]))

In [355]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=2, p=2, algorithm='kd_tree')
knn.fit(X_train, y_train)
y_pred_1 = knn.predict(X_test)

  return self._fit(X, y)


In [356]:
precision1 = accuracy_score(y_pred_1, y_test) * 100
class_report = classification_report(y_pred_1, y_test)
print("Accuracy with K-NN: {0:.2f}%".format(precision1))
print(class_report)
print(confusion_matrix(y_pred_1, y_test))

Accuracy with K-NN: 86.44%
              precision    recall  f1-score   support

           0       0.97      0.88      0.93     54881
           1       0.09      0.32      0.14      1926

    accuracy                           0.86     56807
   macro avg       0.53      0.60      0.53     56807
weighted avg       0.94      0.86      0.90     56807

[[48497  6384]
 [ 1318   608]]


In [352]:
df_test = pd.read_csv("kendaraan_clean_pca_test.csv")
y_pred_2 = knn.predict(df_test.drop(columns=["Tertarik"]))

In [353]:
precision2 = accuracy_score(y_pred_2, df_test[["Tertarik"]]) * 100
class_report2 = classification_report(y_pred_2, df_test[["Tertarik"]])
print("Accuracy with K-NN: {0:.2f}%".format(precision2))
print(class_report2)
print(confusion_matrix(y_pred_2, df_test[["Tertarik"]]))

Accuracy with K-NN: 50.88%
              precision    recall  f1-score   support

           0       0.99      0.50      0.67    489139
           1       0.03      0.73      0.05      9433

    accuracy                           0.51    498572
   macro avg       0.51      0.62      0.36    498572
weighted avg       0.97      0.51      0.66    498572

[[246766 242373]
 [  2520   6913]]
