In [1]:
import numpy as np
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
class KNN:
    def __init__(self, top_k=5) -> None:
        self.top_k = top_k
        self.X_train = None
        self.y_train = None
    
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    @staticmethod
    def distance(x1, x2):
        return np.linalg.norm(x1 - x2)
    def predict_batch(self, X):
        y_pred = [self.predict(x) for x in X]
        return y_pred
    def predict(self, x):
        # Compute distance to all points in train set 
        distance = [self.distance(x, x_train) for x_train in self.X_train]
        # Sort the distance with index
        top_idx = np.argsort(distance)
        # Get top K label
        k_nearests = self.y_train[top_idx]
        # Predict the label
        label = Counter(k_nearests).most_common(1)[0][0]

        return label


In [3]:
import pickle 

with open('preprocessing/X.pkl', 'rb') as f:
    X = pickle.load(f)
    X = np.array(X)
with open('preprocessing/y.pkl', 'rb') as f:
    y = pickle.load(f)
    y = np.array(y)

print('So luong du lieu: ', len(X))
print('So luong nhan: ', len(y))
y.shape

So luong du lieu:  70000
So luong nhan:  70000


(70000,)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Load model
model = KNN(top_k=5)
model.fit(X_train, y_train)

# Sample predict model
y_pred = model.predict_batch(X_test)

# Caculate accuracy score
acc = accuracy_score(y_pred, y_test)

In [7]:
acc

0.8024285714285714

In [16]:
from sklearn.neighbors import KNeighborsClassifier

neigh = KNeighborsClassifier(n_neighbors=9)
# train
neigh.fit(X_train, y_train)
 
# predict
results = neigh.predict(X_test)

# Caculate accuracy score
acc = accuracy_score(results, y_test)

acc

0.791