# k-Nearest Neighbors
Modello predittivo utilizzato per classificazione e regressione. 
Utilizza un concetto di vicinanza di caratteristiche per predirre il comportamento o la classificazione di un soggetto a seconda di quelli a lui più vicini, vale a dire più simili

In [14]:
from typing import List, NamedTuple
import math 
from typing import List
from collections import Counter

def raw_majority_vote(labels: List[str]) -> str:
    votes = Counter(labels)
    winner, _ = votes.most_common(1)[0]
    return winner

assert raw_majority_vote(['a', 'b', 'c', 'b']) == 'b'

def majority_vote(labels: List[str]) -> str:
    """Assumes that labels are ordered from nearest to farthest."""
    vote_counts = Counter(labels)
    winner, winner_count = vote_counts.most_common(1)[0]
    num_winners = len([count
                       for count in vote_counts.values()
                       if count == winner_count])

    if num_winners == 1:
        return winner                     # unique winner, so return it
    else:
        return majority_vote(labels[:-1]) # try again without the farthest

In [None]:
from typing import NamedTuple
from linear_algebra import Vector, distance

class LabeledPoint(NamedTuple):
    point: Vector
    label: str

def knn_classify(k: int,
                 labeled_points: List[LabeledPoint],
                 new_point: Vector) -> str:

    # Order the labeled points from nearest to farthest.
    by_distance = sorted(labeled_points,
                         key=lambda lp: distance(lp.point, new_point))

    # Find the labels for the k closest
    k_nearest_labels = [lp.label for lp in by_distance[:k]]

    # and let them vote.
    return majority_vote(k_nearest_labels)


import random

def random_point(dim: int) -> Vector:
    return [random.random() for _ in range(dim)]

def random_distances(dim: int, num_pairs: int) -> List[float]:
    return [distance(random_point(dim), random_point(dim))
            for _ in range(num_pairs)]

# Caso studio: Iris dataset
**Approccio 1: "From Scratch"**




*   Import del dataset dal modulo sklearn




In [16]:
from sklearn.datasets import load_iris
iris_data = load_iris()

In [17]:
import requests
from typing import Dict
import csv
from collections import defaultdict

data = requests.get("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data")

def parse_iris_row(row):
        """
        sepal_length, sepal_width, petal_length, petal_width, class
        """
        measurements = [float(value) for value in row[:-1]]
        # class is e.g. "Iris-virginica"; we just want "virginica"
        label = row[:-1]
    
        return LabeledPoint(measurements, label)
    
with open('iris.dat') as f:
        reader = csv.reader(f)
        iris_data = [parse_iris_row(row) for row in reader]

FileNotFoundError: ignored



*   Ottenimento variabili indipendenti X e variabili da predirre y



In [19]:
def split_data(data, prob):
    """split data into fractions [prob, 1 - prob]"""
    results = [], []
    for row in data:
        results[0 if random.random() < prob else 1].append(row)
    return results
iris_train, iris_test = split_data(iris_data, 0.70)

In [18]:
confusion_matrix = {}
num_correct = 0
    
for iris in iris_test:
  predicted = knn_classify(5, iris_train, iris.point)
  actual = iris.label
    
  if predicted == actual:
      num_correct += 1
    
confusion_matrix[(predicted, actual)] += 1
    
pct_correct = num_correct / len(iris_test)
print(pct_correct, confusion_matrix)

NameError: ignored

**Approccio 2: Scikit-learn**

In [None]:
from sklearn.datasets import load_iris
iris_data = load_iris()



*   Scomponimento del dataset in variabili X indipendenti e classi y da predire



In [None]:
iris_X = pd.DataFrame(iris_data["data"], columns=iris_data["feature_names"])
iris_y = pd.Series(iris_data["target_names"][iris_data["target"]])



*   Divisione in trainig set e test set



In [None]:
X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size = 1/3)

Scikit-learn fornisce *KNeighborsClassifier*, un classificatore che implementa l'algoritmo k-nearest neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=2)
neigh.fit(X_train, y_train)
neigh.score(X_test, y_test)

0.94



*   Con una grid search possono essere trovati gli iperparametri ideali. Gli iperparametri principali per questo modello sono il numero di vicini da utilizzare e lo specifico algoritmo per computare i vicini



In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold
skf = StratifiedKFold(3, shuffle=True)
model = Pipeline([
    ("neigh", KNeighborsClassifier())
])

grid = {
    "neigh__n_neighbors": range(1, 11),
    "neigh__algorithm": ["ball_tree", "kd_tree", "brute"]
}
gs = GridSearchCV(model, grid, cv=skf)
gs.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=True),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('neigh',
                                        KNeighborsClassifier(algorithm='auto',
                                                             leaf_size=30,
                                                             metric='minkowski',
                                                             metric_params=None,
                                                             n_jobs=None,
                                                             n_neighbors=5, p=2,
                                                             weights='uniform'))],
                                verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'neigh__algorithm': ['ball_tree', 'kd_tree', 'brute'],
                         'neigh__n_neighbors': range(1, 11)},
          

In [None]:
gs.score(X_test, y_test)

{'neigh__algorithm': 'ball_tree', 'neigh__n_neighbors': 6}

In [None]:
gs.best_params_