In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.base import clone

In [7]:
def euclidean_distance(p, q):
    return np.sqrt(np.sum((np.array(p) - np.array(q)) ** 2))

def manhattan_distance(p, q):
    return np.sum(np.absolute(np.array(p) - np.array(q)))

def chebyshev_distance(p, q):
    return np.max(np.absolute(np.array(p) - np.array(q)))

def calculate_distance(metric, p, q):
    if metric == "manhattan":
        return manhattan_distance(p, q)
    elif metric == "chebyshev":
        return chebyshev_distance(p, q)

    return euclidean_distance(p, q)

class KNearestNeighbors(object):
    
    def __init__(self):
        self.encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

    def fit(self, X, y, k=3, metric="manhattan"):
        self.k = k
        self.metric = metric
        self.points = self.encoder.fit_transform(X)
        self.labels = y

    def predict(self, new_points):
        prediction = []
        
        for new_point in new_points:
            distances = np.array([])
            encoded_point = self.encoder.transform(new_point)
            print(encoded_point)
    
            for point in self.points:
                distance = calculate_distance(self.metric, point, encoded_point)
                distances = np.append(distances, distance)
    
            min_indexes = distances.argsort()[:self.k]
    
            labels_count = {}
            for index in min_indexes:
                if not self.labels[index] in labels_count.keys():
                    labels_count[self.labels[index]] = 1
                else:
                    labels_count[self.labels[index]] += 1

            # to get max value from labels_count
            prediction.append(max(labels_count, key=labels_count.get)[0])
        return prediction

    def get_params(self, deep = False):
        return {}

In [3]:
def cross_validation(df, clf, label_field):
    clone_classifier = clone(clf)
    df_train, df_test = train_test_split(data, test_size=0.3, random_state=77)

    y_train = df_train[label_field].to_numpy()
    X_train = df_train.drop(label_field, axis=1).to_numpy()
    
    y_test = df_test[label_field].to_numpy()
    X_test = df_test.drop(label_field, axis=1).to_numpy()
    clone_classifier.fit(X_train, y_train)

    X_test_reshaped = [];
    for item in X_test:
        X_test_reshaped.append(np.array([item]))
    
    labels_predict = clone_classifier.predict(X_test_reshaped)
    n_correct = sum(labels_predict == y_test)
    return n_correct / len(labels_predict)

In [8]:
data = pd.read_csv('data/mushrooms.csv')
y_mush = data['class']
x_mush = data.drop("class", axis=1)

clf_mushrooms = KNearestNeighbors()
results = clf_mushrooms.fit(x_mush, y_mush)

# prediction = clf_mushrooms.predict([
#     [['f','f','n','f','n','f','c','n','k','e','e','s','s','w','w','p','w','o','p','k','y','u']],
#     [['b','y','w','t','l','f','c','b','n','e','c','s','s','w','w','p','w','o','p','n','s','m']],
#     [['x','y','w','t','p','f','c','n','p','e','e','s','s','w','w','p','w','o','p','k','v','g']]
# ])

# print(f"prediction = {prediction}")
cross_val = cross_validation(data, clf_mushrooms, "class")
print(f"cross validation = {cross_val}")


[[0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
  1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
  0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0.
  1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0.]]
[[0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
  0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1.
  0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0.
  0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]
[[0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0.
  0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 1.

In [5]:
class KNearestNeighborsRegression(object):
    
    def __init__(self):
        self.encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

    def fit(self, X, y, k=3, metric="manhattan"):
        self.k = k
        self.metric = metric
        self.points = self.encoder.fit_transform(X)
        self.labels = y

    def predict(self, new_points):
        prediction = []
        
        for new_point in new_points:
            distances = np.array([])
            print(new_point)
            encoded_point = self.encoder.transform(new_point)
    
            for point in self.points:
                distance = calculate_distance(self.metric, point, encoded_point)
                distances = np.append(distances, distance)
    
            min_indexes = distances.argsort()[:self.k]
    
            total = 0
            for index in min_indexes:
                print(self.labels[index])
                total += self.labels[index]

            average = total / self.k
            print(average)

            # to get max value from labels_count
            prediction.append(average)
        return prediction

    def get_params(self, deep = False):
         return {}


In [6]:
data = pd.read_csv('data/cars.csv')
data = data.drop("Car_ID", axis=1)
y_cars = data['Price']
x_cars = data.drop("Price", axis=1)

clf_cars = KNearestNeighborsRegression()
results = clf_cars.fit(x_cars, y_cars)

# print(f"prediction = {prediction}")
cross_val = cross_validation(data, clf_cars, "Price")
print(f"Accuracy = {cross_val} (no sense to use accuracy in such way, it's wrong)")

[['Tata' 'Nexon' 2019 35000 'Petrol' 'Manual' 'First' 17 1198 108 5]]
700000
650000
700000
683333.3333333334
[['Audi' 'A5' 2018 28000 'Diesel' 'Automatic' 'First' 17 1968 187 5]]
3200000
2400000
1600000
2400000.0
[['Hyundai' 'Santro' 2019 26000 'Petrol' 'Manual' 'Third' 20 1086 68 5]]
450000
600000
750000
600000.0
[['Toyota' 'Innova Crysta' 2017 38000 'Diesel' 'Manual' 'Second' 13 2755
  171 7]]
1400000
2500000
1500000
1800000.0
[['Volkswagen' 'Tiguan' 2018 32000 'Diesel' 'Automatic' 'First' 17 1968
  141 5]]
1600000
2400000
2300000
2100000.0
[['Tata' 'Tigor' 2018 42000 'Diesel' 'Manual' 'First' 24 1047 69 5]]
1300000
1300000
750000
1116666.6666666667
[['Maruti' 'S-Cross' 2020 15000 'Petrol' 'Automatic' 'Second' 18 1462 103
  5]]
700000
2800000
700000
1400000.0
[['Mahindra' 'XUV300' 2019 26000 'Diesel' 'Manual' 'Second' 20 1497 115 5]]
2700000
850000
600000
1383333.3333333333
[['Ford' 'Mustang' 2019 22000 'Petrol' 'Automatic' 'First' 13 2261 396 4]]
2700000
2500000
2900000
2700000.0
[[