# k-Nearest Neighbors Classifier (k-NN)

The k-NN is one of the simplest classifiers in machine learning. Unlike neural networks, it has no explicit **learning** phase; instead, the algorithm computes the distance between the object to be classified and the other feature vectors. Because of its simplicity, it is widely used as a baseline when benchmarking more complex classifiers, such as Artificial Neural Networks (**ANN**) and Support Vector Machines (**SVM**).

## Implementation

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
import random

In [2]:
# Load the original dataset; the class label is stored in the last column.
dataset = pd.read_csv('datasets/new_dataset_covertype.csv')

# Split into the feature matrix (every column but the last) and the targets.
# The former mid-cell `dataset.head()` was dropped: only the final expression
# of a cell is rendered, so that call never displayed anything.
data = dataset.iloc[:, :-1]
target = dataset.iloc[:, -1]
data.head()

Unnamed: 0,elevation,aspect,slope,horiz_dist_hydro,vert_dist_hydro,horiz_dist_road,hillshade_9,hill_shade_noon,hill_shade_15,horiz_dist_fire,...,soil_type_30,soil_type_31,soil_type_32,soil_type_33,soil_type_34,soil_type_35,soil_type_36,soil_type_37,soil_type_38,soil_type_39
0,3254,75,7,365,49,3034,228,228,133,4708,...,0,0,0,0,0,0,0,0,0,1
1,3149,341,16,216,30,3241,186,215,167,3085,...,0,0,0,0,0,0,0,0,0,0
2,2972,321,10,150,13,4796,194,230,176,4607,...,0,0,0,0,0,0,0,0,0,0
3,3097,265,21,430,60,3290,162,244,218,1503,...,0,0,0,0,0,0,0,0,0,0
4,3321,286,7,660,118,797,201,240,179,968,...,0,1,0,0,0,0,0,0,0,0


In [3]:
# Load the attribute selections produced by the genetic algorithm
ga = pd.read_csv('results/genetic_algorithm.csv')

# Print a banner, then the table shape and its first record
print('---------------- Genetic Algorithm ----------------')
for label, value in (("Shape: ", ga.shape), ("First Data:\n", ga.iloc[0])):
    print(label, value)

---------------- Genetic Algorithm ----------------
Shape:  (2430, 59)
First Data:
 population             25
max_generations        50
combination_prob      0.7
mutation_prob        0.05
iteration               1
elevation           False
aspect              False
slope               False
horiz_dist_hydro    False
vert_dist_hydro     False
horiz_dist_road     False
hillshade_9         False
hill_shade_noon      True
hill_shade_15       False
horiz_dist_fire     False
wild_area_0          True
wild_area_1          True
wild_area_2          True
wild_area_3         False
soil_type_0         False
soil_type_1         False
soil_type_2         False
soil_type_3          True
soil_type_4         False
soil_type_5         False
soil_type_6         False
soil_type_7         False
soil_type_8          True
soil_type_9          True
soil_type_10        False
soil_type_11        False
soil_type_12        False
soil_type_13        False
soil_type_14        False
soil_type_15        False
soil_t

In [4]:
# Load the attribute selections produced by particle swarm optimization
pso = pd.read_csv('results/pso_selected_attributes.csv')

# Print a banner, then the table shape and its first record
print('---------------- Particle Swarm Optimization ----------------')
for label, value in (("Shape: ", pso.shape), ("First Data:\n", pso.iloc[0])):
    print(label, value)

---------------- Particle Swarm Optimization ----------------
Shape:  (270, 57)
First Data:
 swarm_size                25
max_iterations            50
fitness             0.897974
elevation              False
aspect                  True
slope                  False
horiz_dist_hydro        True
vert_dist_hydro         True
horiz_dist_road         True
hillshade_9             True
hill_shade_noon         True
hill_shade_15          False
horiz_dist_fire        False
wild_area_0             True
wild_area_1             True
wild_area_2             True
wild_area_3             True
soil_type_0            False
soil_type_1             True
soil_type_2            False
soil_type_3            False
soil_type_4            False
soil_type_5             True
soil_type_6            False
soil_type_7            False
soil_type_8             True
soil_type_9             True
soil_type_10           False
soil_type_11            True
soil_type_12           False
soil_type_13           False
soil_typ

In [5]:
# Neighbourhood sizes to evaluate, and the matching result-column labels
# ('k7', 'k10', ...) derived from them so the two lists cannot drift apart.
ks = [7, 10, 15, 20, 25]
acc_list = ['k' + str(k) for k in ks]

def perform_knn(data, attr):
    """Cross-validate k-NN on every attribute subset listed in ``attr``.

    Each row of ``attr`` consists of some leading metadata columns followed by
    one boolean flag per column of ``data``; the flags pick the columns of
    ``data`` used for that row. For every row and every ``k`` in the
    module-level ``ks``, a ``KNeighborsClassifier`` is scored with 10-fold
    cross-validation against the module-level ``target``.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature matrix whose columns are selected by the boolean flags.
    attr : pandas.DataFrame
        Metadata columns followed by ``data.shape[1]`` boolean flag columns.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``attr`` (same positional order): that row's
        metadata values followed by the mean cross-validation accuracy for
        each ``k``, in columns named by the module-level ``acc_list``.
    """
    # The mask must have exactly one flag per feature column, otherwise the
    # boolean column selection below would fail; derive the width from `data`
    # instead of hard-coding it.
    n_features = data.shape[1]
    meta_columns = attr.columns.tolist()[:-n_features]

    records = []
    for index in range(len(attr)):
        mask = attr.iloc[index, -n_features:].tolist()
        sliced_data = data.iloc[:, mask]

        # Mean 10-fold accuracy for each neighbourhood size
        scores = [
            cross_val_score(KNeighborsClassifier(n_neighbors=k), sliced_data, target, cv=10).mean()
            for k in ks
        ]

        # Bug fix: take the metadata of the row being evaluated. The previous
        # code always read row 0, so every result row carried the first row's
        # metadata.
        records.append(attr.iloc[index, :-n_features].tolist() + scores)

    # Build the frame once rather than growing it row by row.
    return pd.DataFrame(records, columns=meta_columns + acc_list)

In [6]:
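# NOTE: the perform_knn runs below are left disabled. Uncomment all four
# statements to regenerate results/knn_pso.csv and results/knn_ga.csv, which
# are read back in the "Comparative results" section.

# Perform knn for PSO
#knn_pso_result = perform_knn(data, pso)
# Perform knn for GA
#knn_ga_result = perform_knn(data, ga)

# Store results
#knn_pso_result.to_csv('results/knn_pso.csv')
#knn_ga_result.to_csv('results/knn_ga.csv')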

# Comparative results

Below are the best results of each algorithm for each *k*.

In [30]:
# Load the stored k-NN scores for the PSO and GA attribute selections
knn_pso = pd.read_csv('results/knn_pso.csv')
knn_ga = pd.read_csv('results/knn_ga.csv')

# Baseline: mean 10-fold accuracy of k-NN on the unreduced `data`, one row per k
knn_original = pd.DataFrame(columns=['n_attr', 'accuracy', 'ks'])
for position, n_neighbors in enumerate(ks):
    baseline_scores = cross_val_score(
        KNeighborsClassifier(n_neighbors=n_neighbors), data, target, cv=10
    )
    knn_original.loc[position] = [54, baseline_scores.mean(), n_neighbors]
    

## Original Dataset

In [31]:
# Render the baseline accuracies (k-NN on the unreduced `data`) as a styled table
knn_original.style

Unnamed: 0,n_attr,accuracy,ks
0,54,0.816998,7
1,54,0.803062,10
2,54,0.783195,15
3,54,0.768581,20
4,54,0.75813,25


## Particle Swarm Optimization

In [32]:
def get_n_selected(data, index):
    """Count the attributes flagged as selected in one row of a selection table.

    Looks up the row labelled ``index`` in ``data`` and returns how many of
    its last 54 values compare equal to ``True``.
    """
    flags = data.loc[index].iloc[-54:].tolist()
    return sum(1 for flag in flags if flag == True)  # noqa: E712 - equality, as list.count does

# Summary table: for each accuracy column, the knn_pso row with the highest score
pso_columns = ['n_attr', 'swarm_size', 'max_iterations', 'accuracy', 'ks']
pso_best_results = pd.DataFrame(columns=pso_columns)

# The last five columns of knn_pso hold the per-k accuracies
for position, k_label in enumerate(knn_pso.columns.tolist()[-5:]):
    best = knn_pso.loc[knn_pso[k_label].idxmax()]
    # 'Unnamed: 0' is used as the label of the matching row in `pso`
    n_selected = get_n_selected(pso, int(best['Unnamed: 0']))
    pso_best_results.loc[position] = [
        n_selected, best['swarm_size'], best['max_iterations'], best[k_label], k_label
    ]

pso_best_results.style

Unnamed: 0,n_attr,swarm_size,max_iterations,accuracy,ks
0,33,25,50,0.812162,k7
1,33,25,50,0.796924,k10
2,33,25,50,0.778096,k15
3,26,25,50,0.763641,k20
4,26,25,50,0.754022,k25


## Genetic Algorithm

In [None]:
# Summary table: for each accuracy column, the knn_ga row with the highest score.
# This cell used to be a verbatim copy of the PSO cell (it read knn_pso / pso
# and overwrote pso_best_results); it now works on the GA tables and uses the
# GA hyper-parameter names.
ga_columns = ['n_attr', 'population', 'max_generations',
              'combination_prob', 'mutation_prob', 'accuracy', 'ks']
ga_best_results = pd.DataFrame(columns=ga_columns)

# The last five columns of knn_ga hold the per-k accuracies
for position, k_label in enumerate(knn_ga.columns.tolist()[-5:]):
    best = knn_ga.loc[knn_ga[k_label].idxmax()]
    # 'Unnamed: 0' is used as the label of the matching row in `ga`
    ga_index = int(best['Unnamed: 0'])
    # Read the hyper-parameters from `ga` itself so they are guaranteed to
    # belong to the same row as the attribute mask being counted.
    ga_run = ga.loc[ga_index]
    ga_best_results.loc[position] = [
        get_n_selected(ga, ga_index),
        ga_run['population'],
        ga_run['max_generations'],
        ga_run['combination_prob'],
        ga_run['mutation_prob'],
        best[k_label],
        k_label,
    ]

ga_best_results.style