# 1-NN using 3 metrics: Manhattan, Euclidean and Chessboard

First of all, let's import the dataset.

In [1]:
# Load the dataset from disk into the variable `df` as a list of rows,
# each row being a list of floats.
import csv

# Path of the CSV file on the author's machine.
p= "D:\\CIC\\CIP\\Tareas\\Tarea_12\\sonar.csv"

with open(p, newline='') as csvfile:
    csvreader = csv.reader(csvfile, delimiter=',')

    # csv.reader yields an empty list for a blank line; skipping those keeps
    # an empty "pattern" from entering the dataset and breaking the
    # column indexing done in later cells.
    df = [[float(x) for x in row] for row in csvreader if row]

# Report how many patterns were read.
print(len(df))

208


The next code will help us check whether there are repeated or redundant patterns.

In [2]:
import pandas as pd

# This function cleans the dataset.

def process_dataset(lst):
    """Return a cleaned copy of a dataset given as a list of rows.

    The last column of every row is treated as the class label and all
    preceding columns as features. Three clean-up steps are applied in order:

    1. Columns whose value is identical in every row are removed.
    2. Exact duplicate rows are collapsed to a single occurrence.
    3. Rows that share all feature values with another row (and therefore,
       after step 2, differ only in the label) are ALL removed.

    Args:
        lst: List of equally long rows (lists of numbers).

    Returns:
        The cleaned dataset as a list of lists. An empty input yields ``[]``.
    """
    df = pd.DataFrame(lst)

    # Nothing to compare against when there are no rows; `df.iloc[0]` below
    # would raise IndexError.
    if df.shape[0] == 0:
        return []

    # Keep only the columns where at least one entry differs from the first row.
    varying_columns = (df != df.iloc[0]).any()
    df = df.loc[:, varying_columns]

    # Non-in-place calls avoid mutating a frame that was derived by slicing.
    df = df.drop_duplicates()

    # keep=False discards every member of a group of rows with equal features.
    feature_columns = df.columns[:-1].tolist()
    df = df.drop_duplicates(subset=feature_columns, keep=False)

    return df.values.tolist()

In [3]:
# Replace the raw rows with the cleaned ones and report how many remain.
df = process_dataset(df)
remaining_patterns = len(df)
print(remaining_patterns)

208


### Note that the dataset was already clean

### Let's define a function for the Minkowski metric

In [3]:
def mink_metric(k,a,b):
    """Return the Minkowski distance of order ``k`` between ``a`` and ``b``.

    ``k = 1`` gives the Manhattan (city-block) distance and ``k = 2`` the
    Euclidean distance. An infinite ``k`` gives the Chebyshev (chessboard)
    distance, which is the limit of the Minkowski family.

    Args:
        k: Order of the metric; must be a positive number (may be infinite).
        a: First vector (sequence of numbers).
        b: Second vector, of the same length as ``a``.

    Returns:
        The distance between ``a`` and ``b``.

    Raises:
        ValueError: If the vectors differ in length or ``k`` is not positive.
    """
    if len(a) != len(b):
        raise ValueError("a and b must have the same length")
    if k <= 0:
        # k == 0 would otherwise fail later with a ZeroDivisionError in 1 / k.
        raise ValueError("k must be a positive number")

    gaps = [abs(x - y) for x, y in zip(a, b)]

    # The power formula breaks down for an infinite order (x**inf followed by
    # **0), so the limit is computed directly as the largest coordinate gap.
    if k == float("inf"):
        return max(gaps, default=0)

    return sum(gap ** k for gap in gaps) ** (1 / k)

In [4]:
# Quick check of the Minkowski metric of order 5 on two 2-D points.
a, b = [2, 3], [5, 7]

s = mink_metric(5, a, b)
print(s)

4.174027662897746


### Let's define a function that computes the Chebyshev/Chessboard distance

In [5]:
def chebyshev_metric(a,b):
    """Return the Chebyshev (chessboard) distance between ``a`` and ``b``.

    The distance is the largest absolute difference between coordinates at
    the same position. Positions are taken from ``a``, so ``b`` must have at
    least as many entries as ``a``.

    Args:
        a: First vector (sequence of numbers).
        b: Second vector, expected to have the same length as ``a``.

    Returns:
        The maximum of ``|a[i] - b[i]|`` over all positions of ``a``.
    """
    coordinate_gaps = [abs(a[position] - b[position]) for position in range(len(a))]
    return max(coordinate_gaps)

In [6]:
# Quick check of the Chebyshev metric on two 2-D points.
a, b = [2, 3], [5, 7]

s = chebyshev_metric(a, b)
print(s)

4


### Let's proceed to do the 1-NN with LOOCV

### GOAL: 1-NN with LOOCV from the data set, compute the confusion matrix using 3 different metrics

### METRIC: City Block/Manhattan

In [7]:
# Number of columns per row; the last one holds the class label.
num_attributes = len(df[0])
label_column = num_attributes - 1

# Index of the last row whose label equals 1 (0 when no row has label 1).
last_pos_index = max(
    (position for position, pattern in enumerate(df) if pattern[label_column] == 1),
    default=0,
)

print(last_pos_index)

110


In [8]:
# Strip the last column from every row, in place.
# NOTE: running this cell again removes one more column each time.
for pattern in df:
    del pattern[-1]

In [13]:
from collections import Counter
import math
from tabulate import tabulate

#K-NN with Leave One Out Cross Validation
def _pattern_distance(order, a, b):
    """Return the Minkowski distance of the given order between a and b.

    An infinite order selects the Chebyshev (chessboard) distance.
    """
    gaps = [abs(x - y) for x, y in zip(a, b)]
    if order == float("inf"):
        return max(gaps, default=0)
    return sum(gap ** order for gap in gaps) ** (1 / order)


def knn_LOOCV(k,minkowski_value,df_validation,class_positive,class_negative,index_test,last_positive_index,TP,FN,FP,TN):
    """Classify one held-out pattern with k-NN and update the confusion counts.

    The pattern at ``index_test`` is compared with every other pattern of
    ``df_validation``; the most frequent class among its ``k`` nearest
    neighbours is the prediction. Classes are derived from position only:
    patterns at indices ``0..last_positive_index`` are positive and all later
    ones negative, so the dataset must be ordered that way.

    Args:
        k: Number of neighbours that vote (at least 1).
        minkowski_value: Order of the Minkowski metric used for distances:
            1 is Manhattan, 2 is Euclidean, ``float("inf")`` is Chebyshev.
        df_validation: List of feature vectors (labels already removed).
        class_positive: Label reported for the positive class.
        class_negative: Label reported for the negative class.
        index_test: Index of the pattern to leave out and classify.
        last_positive_index: Index of the last positive pattern.
        TP, FN, FP, TN: Confusion-matrix counts accumulated so far.

    Returns:
        The tuple ``(TP, FN, FP, TN)`` with exactly one count incremented.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``minkowski_value`` is not
            positive.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    if minkowski_value <= 0:
        raise ValueError("minkowski_value must be positive")

    def class_of(position):
        # Class membership is encoded purely by the position in the dataset.
        if position <= last_positive_index:
            return class_positive
        return class_negative

    test_pattern = df_validation[index_test]
    class_of_test_pattern = class_of(index_test)

    # One [distance, 1-based pattern number, class] entry per pattern.
    # Distances are rounded to 4 decimals, so patterns whose distances agree
    # to that precision count as tied and keep their dataset order (the sort
    # below is stable).
    distances = [
        [
            round(_pattern_distance(minkowski_value, test_pattern, pattern), 4),
            position + 1,
            class_of(position),
        ]
        for position, pattern in enumerate(df_validation)
    ]

    # Leave-one-out: the test pattern must not vote for itself.
    del distances[index_test]

    nearest = sorted(distances, key=lambda entry: entry[0])[:k]

    # Majority vote among the k nearest neighbours.
    votes = Counter(entry[2] for entry in nearest)
    most_common_class = votes.most_common(1)[0][0]

    hit = most_common_class == class_of_test_pattern
    if class_of_test_pattern == class_positive:
        if hit:
            TP += 1
        else:
            FN += 1
    elif class_of_test_pattern == class_negative:
        if hit:
            TN += 1
        else:
            FP += 1

    return TP,FN,FP,TN

### We're going to suppose that the positive class is 1 and negative is 2

In [11]:
# Labels reported for the two classes.
class_positive, class_negative = 1, 2

n = last_pos_index

# Confusion-matrix counters, accumulated over all leave-one-out rounds.
TP = FN = FP = TN = 0

# 1-NN over every pattern; the second argument is the Minkowski order
# requested from knn_LOOCV (1 = Manhattan).
for i in range(len(df)):
    TP, FN, FP, TN = knn_LOOCV(
        1, 1, df, class_positive, class_negative, i, n, TP, FN, FP, TN
    )

CM = [[TP, FN], [FP, TN]]
print(CM)

[[98, 13], [18, 79]]


### METRIC: Euclidean

In [12]:
# Labels reported for the two classes.
class_positive, class_negative = 1, 2

n = last_pos_index

# Confusion-matrix counters, accumulated over all leave-one-out rounds.
TP = FN = FP = TN = 0

# 1-NN over every pattern; the second argument is the Minkowski order
# requested from knn_LOOCV (2 = Euclidean).
for i in range(len(df)):
    TP, FN, FP, TN = knn_LOOCV(
        1, 2, df, class_positive, class_negative, i, n, TP, FN, FP, TN
    )

CM = [[TP, FN], [FP, TN]]
print(CM)

[[96, 15], [21, 76]]


### Metric: Chebyshev

In [14]:
# Labels reported for the two classes.
class_positive=1
class_negative=2

n=last_pos_index

# Confusion-matrix counters, accumulated over all leave-one-out rounds.
TP=0
FN=0
FP=0
TN=0

# Chebyshev is the limit of the Minkowski metric as its order grows without
# bound, so an infinite order is requested here. Order 1 denotes Manhattan
# and would make this call identical to the Manhattan experiment's call.
chebyshev_order = float("inf")

for i in range(len(df)):
    TP,FN,FP,TN=knn_LOOCV(1,chebyshev_order,df,class_positive,class_negative,i,n,TP,FN,FP,TN)

CM=[[TP,FN],[FP,TN]]
print(CM)

[[96, 15], [28, 69]]
