# Lab03 - Build Your Own Classifier
We start by downloading the three datasets used in this lab:
- Iris dataset
- MNIST dataset
- Ames Housing dataset

In [1]:
!curl -L https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data -o iris.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  4551    0  4551    0     0   8121      0 --:--:-- --:--:-- --:--:--  8112


In [2]:
!curl -L https://raw.githubusercontent.com/dbdmg/data-science-lab/master/datasets/mnist_test.csv -o mnist_test.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 17.4M  100 17.4M    0     0  1256k      0  0:00:14  0:00:14 --:--:-- 1460k


In [3]:
!pip install kaggle
!kaggle datasets download shashanknecrothapa/ames-housing-dataset
!unzip ames-housing-dataset.zip

Dataset URL: https://www.kaggle.com/datasets/shashanknecrothapa/ames-housing-dataset
License(s): unknown
Downloading ames-housing-dataset.zip to /Users/giorgiozoccatelli/Documents/DSML-LAB/Lab03
  0%|                                                | 0.00/185k [00:00<?, ?B/s]
100%|█████████████████████████████████████████| 185k/185k [00:00<00:00, 283MB/s]
Archive:  ames-housing-dataset.zip
  inflating: AmesHousing.csv         


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## KNN Design and Implementation

In [8]:
# Column names for iris.csv, which is read without a header row; the last
# entry is the class label and the preceding ones are the features.
colonne = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
    'species',
]
df_iris = pd.read_csv("iris.csv", header=None, names=colonne, sep=",")
df_iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [27]:
# Hold out a reproducible random 20% of the rows as the test set.
df_test = df_iris.sample(frac=0.2, random_state=42)
indici_test = df_test.index

# Every row that was not sampled for the test set is used for training.
df_train = df_iris.drop(index=indici_test)

# Features are all columns but the last one; the label is the last column.
X_train, y_train = df_train[colonne[:-1]].values, df_train[colonne[-1]].values
X_test, y_test = df_test[colonne[:-1]].values, df_test[colonne[-1]].values

In [28]:
def euclidean_distance(p, q):
    """Return the Euclidean (L2) distance between two points.

    Args:
        p: Array-like of numbers (first point).
        q: Array-like of numbers, broadcast-compatible with ``p``.

    Returns:
        The square root of the sum of squared coordinate differences.
    """
    # Compute in float64 so that plain lists are accepted and unsigned integer
    # inputs (e.g. uint8) do not wrap around when subtracted.
    diff = np.asarray(p, dtype=float) - np.asarray(q, dtype=float)
    return np.sqrt(np.sum(diff ** 2))

def cosine_distance(p, q):
    """Return the cosine distance ``1 - |cos(p, q)|`` between two vectors.

    Because of the absolute value the measure ignores orientation: parallel
    and anti-parallel vectors both have distance 0, orthogonal vectors have
    distance 1.

    Args:
        p: Array-like of numbers (first vector).
        q: Array-like of numbers with the same shape as ``p``.

    Returns:
        ``1 - |cosine similarity|``. If either vector has zero norm the angle
        is undefined; in that case 1.0 (the distance of orthogonal vectors)
        is returned instead of NaN.
    """
    # Compute in float64 so that plain lists are accepted and small integer
    # dtypes (e.g. uint8) do not overflow in the products below.
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    norm_product = np.sqrt(np.sum(p ** 2)) * np.sqrt(np.sum(q ** 2))
    if norm_product == 0:
        # Avoid 0/0: treat a zero vector as maximally distant.
        return 1.0
    cosine_similarity = np.sum(p * q) / norm_product
    return 1 - np.abs(cosine_similarity)

def manhattan_distance(p, q):
    """Return the Manhattan (L1) distance between two points.

    Args:
        p: Array-like of numbers (first point).
        q: Array-like of numbers, broadcast-compatible with ``p``.

    Returns:
        The sum of absolute coordinate differences, as a float.
    """
    # Compute in float64 so that plain lists are accepted and unsigned integer
    # inputs (e.g. uint8) do not wrap around when subtracted.
    diff = np.asarray(p, dtype=float) - np.asarray(q, dtype=float)
    return np.sum(np.abs(diff))

class KNearestNeighbors:
    """Brute-force k-nearest-neighbours classifier with majority voting.

    Args:
        k: Number of nearest training samples that vote on each prediction.
            Must be a positive integer.
        distance_metric: Name of the distance to use: "euclidean", "cosine"
            or "manhattan". An unsupported name raises ``ValueError`` the
            first time a distance is computed.

    Raises:
        ValueError: If ``k`` is not a positive integer.
    """

    def __init__(self, k, distance_metric):
        # k == 0 would leave no neighbours to vote and a negative k would make
        # the slice in predict() silently select "all but the last |k|".
        if not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError("k must be a positive integer")
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        """Store the training samples and their labels.

        Args:
            X: Array-like of training samples, one sample per row.
            y: Array-like of labels, one per row of ``X``.

        Raises:
            ValueError: If ``X`` and ``y`` have different lengths.
        """
        # predict() indexes the labels with an index array, which requires a
        # NumPy array rather than a plain list.
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")
        self.X_train = X
        self.y_train = y

    def _compute_distance(self, p, q):
        """Return the distance between ``p`` and ``q`` under the configured metric.

        Raises:
            ValueError: If ``self.distance_metric`` is not a supported name.
        """
        if self.distance_metric == "euclidean":
            return euclidean_distance(p, q)
        elif self.distance_metric == "cosine":
            return cosine_distance(p, q)
        elif self.distance_metric == "manhattan":
            return manhattan_distance(p, q)
        else:
            raise ValueError("Unsupported distance metric!")

    def predict(self, X):
        """Predict a label for every sample in ``X``.

        Each sample receives the most frequent label among its ``k`` nearest
        training samples. If ``k`` exceeds the number of training samples,
        all of them vote.

        Args:
            X: Array-like of samples, one sample per row.

        Returns:
            NumPy array with one predicted label per row of ``X``.
        """
        predictions = []
        for x in np.asarray(X):
            distances = np.array([self._compute_distance(x, train_x) for train_x in self.X_train])
            # A stable sort resolves equal distances by training order, so the
            # selected neighbours are reproducible.
            neighbors_idx = np.argsort(distances, kind="stable")[:self.k]
            neighbors_labels = self.y_train[neighbors_idx]

            unique_labels, counts = np.unique(neighbors_labels, return_counts=True)
            # np.unique returns sorted labels and argmax returns the first
            # maximum, so vote ties go to the label that sorts first.
            predictions.append(unique_labels[np.argmax(counts)])
        return np.array(predictions)

In [55]:
# Fit the classifier on the training split and label the held-out samples.
knn = KNearestNeighbors(distance_metric="euclidean", k=50)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Accuracy is the fraction of test samples whose predicted label matches.
total = len(y_test)
correct = (y_pred == y_test).sum()
accuracy = correct / total

print(f"Accuracy: {accuracy*100:.2f}%")

Accuracy: 96.67%
