# K Nearest Neighbors
## Danae Sánchez Villegas 136040
This code implements the k-nearest neighbors algorithm for numerical values using Euclidean distance.

We will work with the auto-mpg.csv file, but the code works for any numerical dataset that has a target column.

In [2]:
#Importing necessary libraries 
import pandas as pd
import math as m
import numpy as np

In [3]:
# Load the dataset used throughout the notebook and preview its first rows.
base = pd.read_csv(filepath_or_buffer='auto-mpg.csv', sep=',')
base.head(n=5)

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,mpg
0,8,307.0,130.0,3504,12.0,70,1,18.0
1,8,350.0,165.0,3693,11.5,70,1,15.0
2,8,318.0,150.0,3436,11.0,70,1,18.0
3,8,304.0,150.0,3433,12.0,70,1,16.0
4,8,302.0,140.0,3449,10.5,70,1,17.0


**Standardize Function**

-input: the complete base (attributes and target), the example to be evaluated, the name of the target column

-output: the standardized base and example, and the separated target column

In [4]:
def standarize(complete_base, ex, target_name):
    """Standardize the attribute columns and the example (z-score).

    Every attribute column is rescaled as ``(x - mean) / std`` using the
    column mean and standard deviation computed by pandas, and the example
    is rescaled with those same statistics, matched to the attribute
    columns by position.

    Args:
        complete_base: DataFrame with the attribute columns and the target
            column.
        ex: Iterable with one value per attribute, in attribute-column
            order. Each value is converted with ``float``, so numeric
            strings are accepted.
        target_name: Name of the target column.

    Returns:
        A tuple ``(s_base, s_ex, target)`` where ``s_base`` is a DataFrame
        of standardized attributes (fresh default index), ``s_ex`` is the
        list of standardized example values and ``target`` is a
        one-column DataFrame holding the target.

    Raises:
        IndexError: If ``ex`` has more values than there are attributes.
    """
    atts = [col for col in complete_base.columns if col != target_name]
    base, target = complete_base[atts], complete_base[[target_name]]

    # Plain arrays so the statistics can be addressed by position; integer
    # lookups on a label-indexed Series are deprecated in pandas.
    mean = base.mean().values
    std = base.std().values

    # astype(float) always yields a fresh float array: the caller's data is
    # never written to, and integer-only columns are not truncated when the
    # standardized (fractional) values are stored.
    values = base.values.astype(float)
    s_base = pd.DataFrame((values - mean) / std, columns=atts)

    # Same formula for the example, using the statistics of the base.
    s_ex = [(float(value) - mean[ind]) / std[ind]
            for ind, value in enumerate(ex)]

    return s_base, s_ex, target

**Euclidean Distance Function**

-input: the complete base (standardized if necessary) and the example or observation to be evaluated

-output: a list of the Euclidean distances between each observation and the example

In [5]:
def eucDistance(base, ex):
    """Return the Euclidean distance from every row of ``base`` to ``ex``.

    The example is matched to the columns of ``base`` by position. A cell
    of ``base`` that is NaN is skipped, i.e. it contributes nothing to the
    distance of its row.

    Args:
        base: DataFrame of numeric attributes, one observation per row.
        ex: Iterable of example values in column order. Each value is
            converted with ``float``.

    Returns:
        A list of floats, one distance per row of ``base``, in row order.

    Raises:
        IndexError: If ``ex`` has more values than ``base`` has columns.
    """
    rows = base.values.astype(float)
    if rows.shape[0] == 0:                      # nothing to measure against
        return []

    point = np.array([float(val) for val in ex], dtype=float)
    if point.size > rows.shape[1]:
        raise IndexError('example has more values than the base has columns')

    # Only the leading columns covered by the example take part.
    used = rows[:, :point.size]
    diff = used - point
    diff[np.isnan(used)] = 0.0                  # missing base values are skipped

    return np.sqrt((diff ** 2).sum(axis=1)).tolist()

**Get Neighbors Function**

-input: the distances list, the number of neighbors to get, the target column

-output: a list with the result of the k nearest neighbors

In [6]:
def get_neighbors(dist_list, k, cat):
    """Return the target values of the k nearest neighbors.

    Observations are ranked by distance. Ties are kept together: the
    result holds every observation whose distance is one of the ``k``
    smallest *distinct* distances, so it can contain more than ``k``
    values.

    Args:
        dist_list: Distances, one per row of ``cat`` and in the same order.
        k: Number of distinct nearest distances to keep.
        cat: DataFrame whose first column is the target. It is not
            modified.

    Returns:
        A list of floats with the target values of the selected
        neighbors, nearest first. Empty when there are no observations.
    """
    # Work on a copy so the caller's frame does not gain a 'dist' column
    # (which would also make a second call with the same frame fail).
    ranked = cat.copy()
    ranked.insert(len(ranked.columns), 'dist', dist_list)
    ranked = ranked.sort_values(by='dist')

    distances = ranked['dist'].tolist()
    results = ranked.iloc[:, 0].tolist()        # first column is the target
    if not distances:
        return []

    neighbors = []
    distinct = 1                                # distinct distances seen so far
    prev = distances[0]                         # the nearest distance
    for d, result in zip(distances, results):
        if d != prev:                           # reached a new, larger distance
            if distinct >= k:                   # already have k distinct ones
                break
            distinct += 1
            prev = d
        neighbors.append(float(result))

    return neighbors


**Mean Function**

-input: list with the result of the neighbors

-output: mean of the list

In [7]:
def mean_neighbors(neighbors):
    """Return the arithmetic mean of the neighbors' results."""
    average = np.mean(neighbors)
    return average

**Evaluating Algorithm**

-input: the complete base (attributes and target), the example to be evaluated, the number of neighbors to take into account, the target column name

-output: the evaluation of the example

In [8]:
def eval_alg(complete_base, ex, k, target_name):
    """Evaluate one example against an already loaded base.

    Standardizes the base and the example, measures the distance from the
    example to every observation, picks the k nearest neighbors and
    returns the mean of their results.
    """
    scaled_base, scaled_ex, target = standarize(complete_base, ex, target_name)
    nearest = get_neighbors(eucDistance(scaled_base, scaled_ex), k, target)
    return mean_neighbors(nearest)

**K Nearest Neighbors Function**

-input: the name of the file, the column separator used in the file, the name of the target column, the example to be evaluated, the number of neighbors to take into account

-output: the evaluation of the example

In [9]:
def k_neighbors(name_base, symb, target_name, ex, k):
    """Read the base from a delimited file and evaluate the example.

    name_base is the file to read, symb its column separator, target_name
    the target column, ex the example to evaluate and k the number of
    neighbors. Returns the value produced by eval_alg.
    """
    return eval_alg(pd.read_csv(name_base, sep=symb), ex, k, target_name)
    
    

In [10]:
# Example observation, given as strings; values are matched to the
# attribute columns by position.
example = pd.Series(['4', '96', '69', '2189', '18', '72', '2'])
k_neighbors(name_base='auto-mpg.csv', symb=',', target_name='mpg', ex=example, k=3)

26.0