# KNN

## Data Preprocessing 

### Training Data

In [None]:
import csv
import numpy as np
import pandas as pd

# Example of preprocessing the training set with pandas.
# Both frames start out empty; each one is then loaded from its CSV file.
train, test = pd.DataFrame(), pd.DataFrame()

train = pd.read_csv(
    'trainData.csv',
    dtype={'Color': str, 'Radius (cm)': float, 'Weight (grams)': float},
)

# ------------------------------ cleaning ------------------------------
# Remove repeated rows, turn every zero entry into NaN, then discard any
# row that contains a NaN.
train = train.drop_duplicates().replace(0, np.nan).dropna()

# ------------------------- one-hot encoding ---------------------------
# Expand the nominal feature held in the first column into dummy columns
# and put those dummy columns at the front of the frame.
newCols = pd.get_dummies(train.iloc[:, 0])
train = pd.concat([newCols, train], axis=1)
# Drop the column now sitting at position 3.
# NOTE(review): this is the original nominal column only when exactly three
# dummy columns were generated - confirm against the data.
train = train.drop(train.columns[3], axis=1)

# ----------------------- min-max normalisation ------------------------
# The bounds are kept as globals because the test set is scaled with the
# same training-set bounds.
min_radius, max_radius = min(train.iloc[:, 3]), max(train.iloc[:, 3])
train['Radius (cm)'] = [
    (float(value) - min_radius) / (max_radius - min_radius)
    for value in train.iloc[:, 3]
]

min_weight, max_weight = min(train.iloc[:, 4]), max(train.iloc[:, 4])
train['Weight (grams)'] = [
    (float(value) - min_weight) / (max_weight - min_weight)
    for value in train.iloc[:, 4]
]

train

### Test Data

In [None]:
test = pd.DataFrame()
test = pd.read_csv(
    'testData.csv',
    dtype={'Color': str, 'Radius (cm)': float, 'Weight (grams)': float},
)

# ------------------------------ cleaning ------------------------------
# Only repeated rows are removed here.
test = test.drop_duplicates()

# ------------------------- one-hot encoding ---------------------------
# Expand the nominal feature held in the first column into dummy columns
# and put those dummy columns at the front of the frame.
newCols = pd.get_dummies(test.iloc[:, 0])
test = pd.concat([newCols, test], axis=1)
# Drop the column now sitting at position 3.
# NOTE(review): this is the original nominal column only when exactly three
# dummy columns were generated - confirm against the data.
test = test.drop(test.columns[3], axis=1)

# ----------------------- min-max normalisation ------------------------
# Scale with the bounds computed on the training set (min_radius,
# max_radius, min_weight, max_weight) so both sets share one scale.
test['Radius (cm)'] = [
    (float(value) - min_radius) / (max_radius - min_radius)
    for value in test.iloc[:, 3]
]

test['Weight (grams)'] = [
    (float(value) - min_weight) / (max_weight - min_weight)
    for value in test.iloc[:, 4]
]

test

# kNN Classification with Manual K

In [None]:
from math import sqrt
 
# calculate the Euclidean distance between two vectors
def euclidean_distance(row1, row2):
    """Return the Euclidean distance between the feature parts of two rows.

    The last element of ``row1`` is skipped (the caller in this file passes
    a training row whose final entry is read as the class label), so only
    the first ``len(row1) - 1`` positions of each row are compared.
    ``row2`` may be longer than that; its extra trailing entries are ignored.

    Args:
        row1: Sequence (list, tuple, pandas Series, ...) of feature values
            followed by one trailing non-feature entry.
        row2: Sequence holding at least ``len(row1) - 1`` feature values in
            the same order as ``row1``.

    Returns:
        The distance as a float.

    Raises:
        IndexError: If ``row2`` has fewer than ``len(row1) - 1`` entries.
    """
    # Materialise the values so they are always read by position. Indexing a
    # label-indexed pandas Series with a plain integer (row[i]) depends on a
    # deprecated positional fallback; iterating yields the values in order
    # for Series and ordinary sequences alike.
    values1 = list(row1)
    values2 = list(row2)

    squared_sum = 0.0
    # Accumulate squared per-feature differences, excluding row1's last entry.
    for i in range(len(values1) - 1):
        squared_sum += (values1[i] - values2[i]) ** 2
    return sqrt(squared_sum)

# Locate the most similar neighbors
def get_neighbors(train, test_row, K):
    """Return the ``K`` rows of ``train`` that lie closest to ``test_row``.

    Args:
        train: Object exposing ``iterrows()`` (a pandas DataFrame in this
            file); every yielded row is compared against ``test_row``.
        test_row: The query row handed to ``euclidean_distance``.
        K: Number of neighbours to return.

    Returns:
        A list of ``K`` training rows ordered from nearest to farthest.

    Raises:
        IndexError: If ``K`` is larger than the number of rows in ``train``.
    """
    # Pair each training row with its distance to the query row.
    scored = [
        (candidate, euclidean_distance(candidate, test_row))
        for _, candidate in train.iterrows()
    ]
    # Ascending by distance; the sort is stable, so equally distant rows
    # keep their original relative order.
    ranked = sorted(scored, key=lambda pair: pair[1])
    # Explicit indexing (rather than slicing) keeps the IndexError when more
    # neighbours are requested than there are training rows.
    return [ranked[position][0] for position in range(K)]

# Make a classification prediction with neighbors
def predict_classification(train, test_row, K):
    """Predict the class of ``test_row`` by majority vote of its K neighbours.

    The class of each neighbour is taken from the last value of that row.
    When several classes receive the same number of votes, the class that
    appears first in the distance-ordered neighbour list wins, which makes
    the result reproducible from run to run.

    Args:
        train: Training data forwarded to ``get_neighbors``.
        test_row: The row to classify.
        K: Number of neighbours that vote.

    Returns:
        The winning class label.

    Raises:
        ValueError: If no neighbours are returned (for example ``K`` is 0).
    """
    neighbors = get_neighbors(train, test_row, K)

    # list(row)[-1] reads the final value by position for plain sequences and
    # for label-indexed pandas Series alike, without relying on the
    # deprecated integer fallback of Series.__getitem__.
    labels = [list(row)[-1] for row in neighbors]

    # Tally votes in neighbour order. A dict preserves insertion order and
    # max() returns the first maximal key, so ties go to the class that was
    # encountered first instead of depending on set iteration order.
    votes = {}
    for label in labels:
        votes[label] = votes.get(label, 0) + 1
    return max(votes, key=votes.get)

# main program, method for easy calling
def my_main_method(K):
    """Classify every row of the global ``test`` frame with k-nearest neighbours.

    Each test row is classified against the global ``train`` frame and the
    predicted class is written into position 5 of a copy of that row.

    Args:
        K: Number of neighbours used for each prediction.

    Returns:
        A new pandas DataFrame built from the updated rows. The global
        ``test`` frame is not assigned to by this function.
    """
    rows_list = []  # collects the updated rows
    for _, rw in test.iterrows():
        prediction = predict_classification(train, rw, K)
        # Write by position with .iloc: a bare rw[5] on a label-indexed
        # Series depends on a deprecated positional fallback.
        # NOTE(review): position 5 is presumably the 'Predicted (class)'
        # column - confirm against the layout of the test CSV.
        rw.iloc[5] = prediction
        rows_list.append(rw)
    return pd.DataFrame(rows_list)
    
    
########################################################################################
#######                start the program with user input                ################
########################################################################################
# Defined up front so the display expression at the end of the cell cannot
# raise NameError (or show a stale result) when the input is rejected.
prediction_result = None
try:
    K = int(input("Enter Value for K:"))
    # At least one neighbour is needed for a vote; reject smaller values here
    # instead of failing later inside the classifier.
    if K < 1:
        raise ValueError("K must be at least 1")
except ValueError:
    print("Error!!")
else:
    prediction_result = my_main_method(K)  # run the classifier with the K entered by the user
prediction_result

In [None]:
# Calculating precision

from sklearn.metrics import precision_score

# making method for easy calling
def calculate_Precision(y_true, y_pred):    
    """Return the micro-averaged precision of ``y_pred`` against ``y_true``.

    Thin wrapper that forwards both label sequences to
    ``sklearn.metrics.precision_score`` with ``average='micro'``.

    Args:
        y_true: Sequence of expected class labels.
        y_pred: Sequence of predicted class labels, in the same order.

    Returns:
        Whatever ``precision_score`` returns for these arguments.
    """
    return precision_score(y_true, y_pred, average='micro')

# Expected and predicted labels, taken from the result frame produced by the
# manual-K run above.
y_true = prediction_result['Original (class)'].tolist()
y_pred = prediction_result['Predicted (class)'].tolist()

# The score is multiplied by 100 for display, next to the K that produced it.
p=calculate_Precision(y_true, y_pred)
print('precision = %.1f when K = %.d' % (p * 100, K))

#  kNN Classification with Automatic K Estimation

In [None]:
K = 1  # starting value of K
max_K = len(train)  # upper bound for K (number of training rows) so the search always terminates
program_precision = 0  # precision of the latest run; 0 means "nothing evaluated yet"

try:
    Min_Precision = float(input("Enter minimum precision required:"))
    Min_Precision = Min_Precision / 100  # the value is entered as a percentage; convert it to a fraction
except ValueError:
    print("Error!!")
else:
    # Raise K one step at a time until the target precision is met or K
    # reaches max_K. Each K is evaluated exactly once.
    while program_precision < Min_Precision:
        prediction_result = my_main_method(K)

        y_true = prediction_result['Original (class)'].tolist()
        y_pred = prediction_result['Predicted (class)'].tolist()

        program_precision = calculate_Precision(y_true, y_pred)
        print('precision = %.1f when K = %.d' % (program_precision * 100, K))

        if program_precision < Min_Precision:
            if K >= max_K:
                # Every allowed K has been tried. K stays at the last value
                # that was actually evaluated, matching the line printed above.
                print('')
                print('*****************************************')
                print('***********   Max K Reached   ***********')
                print('*****************************************')
                break
            K = K + 1  # try a larger neighbourhood on the next pass
