In [3]:
# Author: Onur Alp Bicer
#         Shengian (sorry i don't know your last name)
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import csv

# floating point percentage of the data to use as training data, rest will be used as test data
split_ratio = 0.8

# k 
k = 5

In [4]:
# Attribute mappings for Automobile dataset
def getDict(list_attr):
    nums = range(len(list_attr))
    return dict(zip(list_attr, nums))

make = getDict(['alfa-romero', 'audi', 'bmw', 'chevrolet', 'dodge', 'honda', 'isuzu', 'jaguar', 'mazda', 
        'mercedes-benz', 'mercury', 'mitsubishi', 'nissan', 'peugot', 'plymouth', 'porsche',
        'renault', 'saab', 'subaru', 'toyota', 'volkswagen', 'volvo'])

fuel_type = getDict(['diesel', 'gas'])
aspiration = getDict(['std', 'turbo'])
num_doors = getDict(['four', 'two'])
num_doors['?'] = -1

body_style = getDict(['hardtop', 'wagon', 'sedan', 'hatchback', 'convertible'])

drive_wheels = getDict(['4wd', 'fwd', 'rwd'])

engine_location = getDict(['front', 'rear'])
engine_type = getDict(['dohc', 'dohcv', 'l', 'ohc', 'ohcf', 'ohcv', 'rotor'])
num_cylinders = getDict(['eight', 'five', 'four', 'six', 'three', 'twelve', 'two'])
fuel_system = getDict(['1bbl', '2bbl', '4bbl', 'idi', 'mfi', 'mpfi', 'spdi', 'spfi'])

In [5]:
# Helper function to split data
def split_data(data, split_ratio):
    np.random.shuffle(data)
    
    n = int(split_ratio * len(data))
    train_data_X = data[:n,:-1]
    train_data_Y = data[:n, -1]
    test_data_X = data[n:, :-1]
    test_data_Y = data[n:, -1]
    
    return train_data_X, train_data_Y, test_data_X, test_data_Y    

In [6]:
# Unpack data from ionosphere data by UCI https://archive.ics.uci.edu/ml/datasets/ionosphere
def getIonosphereData(split_ratio):
    data = []
    with open('ionosphere.data') as file:
        csv_reader = csv.reader(file, delimiter=',')
        for row in csv_reader:
            if row[34] == 'g':
                row[34] = 1
            elif row[34] == 'b':
                row[34] =  0
            else:
                print("Unknown label encountered while parsing dataset")
                exit(1)
            data.append(np.asarray(row, dtype=float))

    data = np.array(data)
    
    return split_data(data, split_ratio)
    
train_data_X, train_data_Y, test_data_X, test_data_Y = getIonosphereData(split_ratio)

In [7]:
# Unpack data from automobile data by UCI https://archive.ics.uci.edu/ml/datasets/automobile
def getAutomobileData(split_ratio, noncontinuous=False):
    data = []
    
    with open('imports-85.data') as file:
        csv_reader = csv.reader(file, delimiter=',')
        if noncontinuous == True:
            for row in csv_reader:
                add = True
                row[2] = make[row[2]] 
                row[3] = fuel_type[row[3]]
                row[4] = aspiration[row[4]]
                row[5] = num_doors[row[5]]
                row[6] = body_style[row[6]]
                row[7] = drive_wheels[row[7]]
                row[8] = engine_location[row[8]]
                row[14] = engine_type[row[14]]
                row[15] = num_cylinders[row[15]]
                row[17] = fuel_system[row[17]]

                for i in range(len(row)):
                    if row[i] == '?':
                        add = False

                if add == True:
                    data.append(np.asarray(row, dtype=float))
                    
        elif noncontinuous == False:
            for row in csv_reader:
                add = True
                row = [row[0], row[1], row[9], row[10], row[11], row[12], row[13], row[16], row[18],
                       row[19], row[20], row[21], row[22], row[23], row[24], row[25]]
                
                for i in range(len(row)):
                    if row[i] == '?':
                        add = False

                if add == True:
                    data.append(np.asarray(row, dtype=float))
        else:
            print("Unknown parameter")
            exit(1)
            
    data = np.array(data)
    
    return split_data(data, split_ratio)
    
# train_data_X, train_data_Y, test_data_X, test_data_Y = getAutomobileData(split_ratio, False)

In [13]:
# https://math.stackexchange.com/questions/139600/how-do-i-calculate-euclidean-and-manhattan-distance-by-hand

# This function returns the pairwise manhattan distance of 2 points in n dimensions(formula on the link above)
def manhattan_distance(x1, x2):
    return np.abs((x1 - x2)).sum()
    
# This function returns the pairwise euclidean distance of 2 points in n dimensions(formula on the link above)
def euclidean_distance(x1, x2):
    return np.sqrt(np.square((x1 - x2)).sum())

In [14]:
# Testing the distances
x1 = np.array([0, 0])
x2 = np.array([3, 4])
print("Expected Euclidean Distance for (0,0) and (3,4) = 5")
print("Actual Euclidean Distance = " + str(euclidean_distance(x1, x2)))
print("\nExpected Manhattan Distance for (0,0) and (3,4) = 7")
print("Actual Manhattan Distance = " + str(manhattan_distance(x1, x2)))

Expected Euclidean Distance for (0,0) and (3,4) = 5
Actual Euclidean Distance = 5.0

Expected Manhattan Distance for (0,0) and (3,4) = 7
Actual Manhattan Distance = 7


In [22]:


def getFirstElement(val):
    return val[0]

# Pseudocode for algorithm in the following website 
# https://www.analyticsvidhya.com/blog/2018/03/introduction-k-neighbours-algorithm-clustering/
def knn(X_train, Y_train, X_test, k=5, distance='Euclidean'):
    vals = []
    for i in range(len(X_train)):
        if distance == 'Euclidean':
            d = euclidean_distance(X_train[i], X_test)
        elif distance == 'Manhattan':
            d = manhattan_distance(X_train[i], X_test)
        else:
            print("Unknown distance")
            exit(1)
            
        vals.append([d, Y_train[i]])
        
    print(vals[:5])
    vals = vals.sort(key=vals)
    print(vals[:5])
        
knn(train_data_X, train_data_Y, test_data_X[0], k=5)

[[4.709701307737042, 0.0], [4.400174445985068, 1.0], [3.7439136405638416, 1.0], [4.873365119422102, 1.0], [6.0751965965473085, 0.0]]


TypeError: 'NoneType' object is not subscriptable