In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import datasets
from collections import Counter

In [2]:
iris = datasets.load_iris()
X = iris['data']
y = iris['target']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [3]:
def most_common(lst):
    return max(set(lst), key=lst.count)

In [4]:
def euclidean_distance(point, data):
    return np.sqrt(np.sum(point - data)** 2)

In [6]:
class KNNClassifier:
    def __init__(self,k=5):
        self.k = k
        self.X_train = None
        self.y_train = None

    def fit(self,X,y):
        self.X_train = X
        self.y_train = y
    
    def _compute_distances(self, X_test):
        distances = []
        n_test = X_test.shape[0]
        n_train = self.X_train.shape[0]
        distances = np.zeros((n_test, n_train))
        for i in range(n_test):
            for j in range(n_train):
                distances[i,j] = np.sqrt(np.sum((X_test[i] - self.X_train[j]) ** 2))
        return distances

    def predict(self, X_test):
        distances = self._compute_distances(X_test)
        k_neighbours_indices = np.argsort(distances, axis=1)[:, :self.k]
        k_neighbours_labels = self.y_train[k_neighbours_indices]
        predictions = [Counter(neighbours).most_common(1)[0][0] for neighbours in k_neighbours_labels]
        return np.array(predictions)

    def score(self, X_test, y_test):
        predictions = self.predict(X_test)
        accuracy = np.mean(predictions == y_test)
        return accuracy
       

In [None]:
knn_classifier = KNNClassifier()
knn_classifier.fit(X=X_train,y=y_train)
knn_classifier.score(X_test=X_test, y_test=y_test)

In [None]:
from sklearn import neighbors

knn_classifier_sklearn = neighbors.KNeighborsClassifier()
knn_classifier_sklearn.fit(X=X_train, y=y_train)
knn_classifier_sklearn.score(X=X_test, y=y_test)
knn_classifier_sklearn_labels = knn_classifier_sklearn.predict(X_test)

In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

breast_cancer = datasets.load_breast_cancer()
breast_cancer

In [12]:
X = breast_cancer['data']
y = breast_cancer['target']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,stratify=y)

In [None]:
from algorithms import CustomKNNClassifier as KNNClassifier

knn_classifier = KNNClassifier(k=3)
knn_classifier.fit(X=X_train, y=y_train)
y_pred = knn_classifier.predict(X_test=X_test)

In [None]:
knn_classifier.score(X_test=X_test, y_test=y_test)

In [14]:
import pandas as pd
from sklearn.metrics import confusion_matrix

In [None]:
y_test, y_pred

In [16]:
confusion_matrix = confusion_matrix(y_test, y_pred)

In [None]:
confusion_matrix

In [18]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
y_pred_gnb = gnb.fit(X_train, y_train).predict(X_test)

In [None]:
gnb.score(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred_gnb)

In [26]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

dt_classifier = DecisionTreeClassifier()
dt_classifier = dt_classifier.fit(X_train,y_train)
y_pred_dt_classifier = dt_classifier.predict(X_test)

In [None]:
plot_tree(dt_classifier)

In [None]:
y_test, y_pred_dt_classifier

In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay

from ucimlrepo import fetch_ucirepo

# Models Import
from algorithms import CustomKNNClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
# KNN Classifier pre-requisites

k = 5
metric = 'euclidean'

In [3]:
dataset_dict = {
    0 : {
        'label' : 'd1',
        'name' : 'Breast Cancer',
        'data' : fetch_ucirepo(id=17)
    },
    1 : {
        'label' : 'd2',
        'name' : 'Wine Quality',
        'data' : fetch_ucirepo(id=186)
    },
    2 : {
        'label' : 'd3',
        'name' : 'iris',
        'data' : fetch_ucirepo(id=53)
    }
}

classifiers = {
    0 : {
        'label' : 'c1',
        'name' : 'CustomKNNClassifier',
        'model' : CustomKNNClassifier(k=k, metric=metric),
    },
    1 : {
        'label' : 'c2',
        'name' : 'KNeighboursClassifier',
        'model' : KNeighborsClassifier(n_neighbors=k, metric=metric)
    },
    2 : {
        'label' : 'c3',
        'name' : 'GaussianNB',
        'model' : GaussianNB()
    }
}

In [4]:
from sklearn import datasets

test_data = datasets.load_breast_cancer()
X = test_data['data']
y = test_data['target']

X[0], y[0]

(array([1.799e+01, 1.038e+01, 1.228e+02, 1.001e+03, 1.184e-01, 2.776e-01,
        3.001e-01, 1.471e-01, 2.419e-01, 7.871e-02, 1.095e+00, 9.053e-01,
        8.589e+00, 1.534e+02, 6.399e-03, 4.904e-02, 5.373e-02, 1.587e-02,
        3.003e-02, 6.193e-03, 2.538e+01, 1.733e+01, 1.846e+02, 2.019e+03,
        1.622e-01, 6.656e-01, 7.119e-01, 2.654e-01, 4.601e-01, 1.189e-01]),
 0)

In [5]:
for _, dataset in dataset_dict.items():
    X = dataset['data'].data.features.to_numpy()
    y = dataset['data'].data.targets.to_numpy().flatten()

    # print(X[0], y[0])
    # print(type(X), type(y))
    # print(X.shape, y.shape)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    for _ , classifer in classifiers.items():

        model = classifer['model']

        start_train = time.time()
        model = model.fit(X_train, y_train)
        train_time = time.time() - start_train

        start_predict = time.time()
        y_pred = model.predict(X_test)
        predict_time = time.time() - start_predict

        accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
        conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
        
        df = pd.DataFrame(conf_matrix,
                          index=[f'Actual_{i}' for i in range(conf_matrix.shape[0])],
                          columns=[f'Predicted_{i}' for i in range(conf_matrix.shape[1])])
        
        df.to_csv(f'results/g018_{dataset['label']}_{classifer['label']}.csv')

        print('Model - ', classifer['name'], ' | Dataset - ', dataset['name'])
        print('Accuracy - ', accuracy)
        print('Training time - ', train_time)
        print('Prediction time - ', predict_time)
        print(f'Confusion Matrix - ', conf_matrix)
        print('-' * 50)

Model -  CustomKNNClassifier  | Dataset -  Breast Cancer
Accuracy -  0.8859649122807017
Training time -  2.1457672119140625e-06
Prediction time -  0.1716759204864502
Confusion Matrix -  [[66  2]
 [11 35]]
--------------------------------------------------
Model -  KNeighboursClassifier  | Dataset -  Breast Cancer
Accuracy -  0.8859649122807017
Training time -  0.0004019737243652344
Prediction time -  0.027029037475585938
Confusion Matrix -  [[66  2]
 [11 35]]
--------------------------------------------------
Model -  GaussianNB  | Dataset -  Breast Cancer
Accuracy -  0.9210526315789473
Training time -  0.0005440711975097656
Prediction time -  9.393692016601562e-05
Confusion Matrix -  [[67  1]
 [ 8 38]]
--------------------------------------------------
Model -  CustomKNNClassifier  | Dataset -  Wine Quality
Accuracy -  0.5007692307692307
Training time -  4.76837158203125e-06
Prediction time -  15.92990517616272
Confusion Matrix -  [[  0   0   4   3   0   0]
 [  0   1  21  17   2   0]
