<a href="https://colab.research.google.com/github/g40rgeLE/ml_from_scratch/blob/main/KNNClf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

In [None]:
class MyKNNClf:
    """Binary k-nearest-neighbours classifier built on pandas.

    The model memorises the training table in ``fit`` and, for every query
    row, votes among the ``k`` training rows that are closest under the
    chosen metric. Target labels are expected to be 0/1: the vote helpers
    only count those two classes.

    Args:
        k: Number of neighbours that take part in the vote.
        metric: Name of the distance method to use: ``'euclidean'``,
            ``'chebyshev'``, ``'manhattan'`` or ``'cosine'``.
        weight: Name of the voting scheme: ``'uniform'``, ``'rank'`` or
            ``'distance'`` (dispatched to ``_<weight>_vote``).
    """

    def __init__(self,
                 k: int = 3,
                 metric: str = 'euclidean',
                 weight: str = 'uniform'):
        self.k = k
        self.metric = metric
        self.weight = weight

        # Shape of the training frame; stays None until fit() is called.
        self.train_size = None

    def __repr__(self):
        # The stored training data is left out on purpose: once fit() has run,
        # formatting it would dump the whole table into every print of the model.
        params = [f'{key}={value}' for key, value in self.__dict__.items()
                  if key not in ('X_train', 'y_train')]
        return 'MyKNNClf class: ' + ' '.join(params)

    def __str__(self):
        return self.__repr__()

    def fit(self, X: pd.DataFrame, y: pd.Series):
        """Store deep copies of the training features and target.

        Rows of ``X`` and ``y`` should correspond to each other (same index).
        """
        self.X_train = X.copy(deep=True)
        self.y_train = y.copy(deep=True)
        self.train_size = X.shape

    def euclidean(self, row: pd.Series):
        """Return the Euclidean distance from ``row`` to every training row."""
        return (self.X_train - row).pow(2).sum(axis=1).pow(.5)

    def chebyshev(self, row: pd.Series):
        """Return the Chebyshev (max absolute difference) distance to every training row."""
        return (self.X_train - row).abs().max(axis=1)

    def manhattan(self, row: pd.Series):
        """Return the Manhattan (sum of absolute differences) distance to every training row."""
        return (self.X_train - row).abs().sum(axis=1)

    def cosine(self, row: pd.Series):
        """Return the cosine distance (1 - cosine similarity) to every training row.

        Not defined for all-zero vectors: the norm product in the denominator
        is then zero.
        """
        norms_train = self.X_train.pow(2).sum(axis=1).pow(.5)
        norm_row = np.sqrt(row.pow(2).sum())
        return 1 - (self.X_train @ row) / (norms_train * norm_row)

    def _k_nearest(self, dist: pd.Series):
        """Return ``(labels, distances)`` of the ``k`` closest training rows, nearest first.

        Both results are Series re-indexed 0..k-1 so they can mask each other.
        """
        values = dist.to_numpy(dtype=float)
        # Stable sort: rows at equal distance keep their training order, so
        # ties at the k-th place are resolved the same way on every run.
        order = np.argsort(values, kind='stable')[:self.k]
        if self.y_train.index.equals(dist.index):
            # Positional lookup: a label lookup would return extra rows when
            # the training index contains duplicate labels.
            labels = self.y_train.iloc[order]
        else:
            # Indexes differ (e.g. y covers more rows than X): match by label.
            labels = self.y_train[dist.index[order]]
        return labels.reset_index(drop=True), pd.Series(values[order])

    def _uniform_vote(self, dist: pd.Series, proba=False):
        """Vote with equal weights: the share of class 1 among the neighbours.

        Returns that share when ``proba`` is true, otherwise 1 if the share
        is at least 0.5 and 0 if not.
        """
        labels, _ = self._k_nearest(dist)
        prob = labels.mean()
        if proba:
            return prob
        return 1 if prob >= .5 else 0

    def _rank_vote(self, dist: pd.Series, proba=False):
        """Vote with weights 1/rank, where the nearest neighbour has rank 1.

        Returns the normalised weight of class 1 when ``proba`` is true,
        otherwise 1 if class 1 weighs at least as much as class 0, else 0.
        """
        labels, _ = self._k_nearest(dist)
        classes = labels.to_numpy()
        inv_rank = 1 / np.arange(1, len(classes) + 1)
        total = inv_rank.sum()
        q_0 = inv_rank[classes == 0].sum() / total
        q_1 = inv_rank[classes == 1].sum() / total

        if proba:
            return q_1
        return 1 if q_1 >= q_0 else 0

    def _distance_vote(self, dist: pd.Series, proba=False):
        """Vote with weights 1/distance.

        Returns the normalised weight of class 1 when ``proba`` is true,
        otherwise 1 if class 1 weighs at least as much as class 0, else 0.
        """
        labels, nearest = self._k_nearest(dist)
        exact = nearest == 0
        if exact.any():
            # 1/0 would give infinite weights and inf/inf = NaN shares, so
            # neighbours at zero distance take the whole vote between them.
            weights = exact.astype(float)
        else:
            weights = 1 / nearest
        total = weights.sum()
        q_0 = weights[labels == 0].sum() / total
        q_1 = weights[labels == 1].sum() / total

        if proba:
            return q_1
        return 1 if q_1 >= q_0 else 0

    def __predict_unit(self, row: pd.Series, proba=False):
        """Score one query row: compute its distances, then apply the configured vote."""
        dist = getattr(self, self.metric)(row)

        return getattr(self, '_' + self.weight + '_vote')(dist, proba)

    def predict_proba(self, X: pd.DataFrame):
        """Return, for each row of ``X``, the voted share of class 1 (indexed like ``X``)."""
        return X.apply(self.__predict_unit, args=(True,), axis=1)

    def predict(self, X: pd.DataFrame):
        """Return, for each row of ``X``, the predicted label 0 or 1 (indexed like ``X``)."""
        return X.apply(self.__predict_unit, args=(False,), axis=1)

In [None]:
# Build a small synthetic binary-classification dataset to exercise MyKNNClf.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
# NOTE(review): no random_state is passed here, so the generated data
# presumably differs between runs — confirm whether that is intended.
X, y = make_classification(n_samples=100, n_features=5, n_informative=2)
# Wrap the generated arrays in pandas objects and give the columns readable names.
X, y = pd.DataFrame(X), pd.Series(y)
X.columns = [f'col_{i}' for i in X.columns]
X.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4
0,0.430942,-1.196676,-0.582052,1.113814,-0.945287
1,-1.458563,-0.5374,-0.42492,1.094116,0.62988
2,0.483484,1.131633,0.527506,-0.970071,1.041619
3,0.969672,-0.081629,0.148551,-0.607733,-1.278255
4,-1.666354,-0.614535,-0.229232,0.318947,-0.934646


In [None]:
# Hold out 30% of the rows for evaluation; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    random_state=42)

In [None]:
# Default hyperparameters: k=3, euclidean metric, uniform weights.
clf = MyKNNClf()
clf.fit(X_train, y_train)

In [None]:
# Predict a label for every held-out row; the bare name displays the result.
y_pred = clf.predict(X_test)
y_pred

83    0
53    0
70    0
45    0
44    1
39    1
22    0
80    1
10    1
0     0
18    0
30    1
73    1
33    1
90    1
4     0
76    0
77    0
12    0
31    1
55    1
88    1
26    0
42    0
69    0
15    0
40    1
96    0
9     1
72    1
dtype: int64

In [None]:
# Compare the predicted labels with the true held-out labels.
from sklearn.metrics import accuracy_score
score = accuracy_score(y_test, y_pred)

In [None]:
# Display the accuracy stored in `score`.
score

0.9