# Instance based methods: Exercise 2

## Apply the k-nearest neighbors algorithm to the Iris dataset

### Import iris dataset

In [77]:
from csv import reader
from math import sqrt
 
def load_iris_dataset(filename):
    """Read a CSV file into a list of rows, skipping empty lines.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list containing one list of string fields per non-empty CSV row.
        No type conversion is performed here.
    """
    # newline='' hands line-ending handling to the csv module, as the csv
    # documentation requires; without it, newlines embedded in quoted fields
    # are translated by the file object before the reader sees them.
    with open(filename, 'r', newline='') as file:
        # The reader yields an empty list for a blank line; drop those.
        return [row for row in reader(file) if row]

### Convert numeric string values in dataset to floats

In [66]:
def str_column_to_float(dataset, column):
    """Convert one column of every row from text to float, in place.

    Args:
        dataset: List of rows (mutable sequences); modified in place.
        column: Index of the column to convert. Each value in it must be a
            string that float() can parse once surrounding whitespace is
            stripped.

    Returns:
        None. The rows of ``dataset`` are updated directly.
    """
    for record in dataset:
        text = record[column]
        record[column] = float(text.strip())

### Convert target classes to integer values

In [67]:
def str_column_to_int(dataset, column):
    """Replace the class values in one column with integer codes, in place.

    Codes are assigned by enumerating the distinct values in sorted order, so
    the mapping is the same on every run. Iterating the raw set instead would
    let the codes depend on string hash randomization.

    Args:
        dataset: List of rows (mutable sequences); modified in place.
        column: Index of the column holding the class values. The values
            must be hashable and mutually orderable (e.g. all strings).

    Returns:
        A dict mapping each original class value to its integer code.
    """
    unique = sorted(set(row[column] for row in dataset))
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
        # Echo the mapping so the reader can interpret predicted codes.
        print('[%s] => %d' % (value, i))
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

### Find the min and max values for each column

In [68]:
def min_max(data):
    """Compute the minimum and maximum of every column.

    The number of columns is taken from the first row, so ``data`` must
    contain at least one row.

    Args:
        data: List of equally long rows of mutually comparable values.

    Returns:
        A list with one ``[minimum, maximum]`` pair per column, in column
        order.
    """
    width = len(data[0])
    # Lazily build each column as a list, then reduce it to its two extremes.
    columns = ([record[idx] for record in data] for idx in range(width))
    return [[min(values), max(values)] for values in columns]

### Normalize dataset to the interval [0, 1]

In [69]:
def normalize_dataset(dataset, minmax):
    """Rescale every value to the range [0, 1] using min-max scaling, in place.

    Each value becomes ``(value - min) / (max - min)`` for its column. A
    column whose minimum equals its maximum has no spread to scale by; its
    values are set to 0.0 instead of raising ZeroDivisionError.

    Args:
        dataset: List of rows of numbers; modified in place.
        minmax: One ``[minimum, maximum]`` pair per column, indexed like the
            rows (for example the result of ``min_max``).

    Returns:
        None. The rows of ``dataset`` are updated directly.
    """
    for row in dataset:
        for i in range(len(row)):
            low = minmax[i][0]
            span = minmax[i][1] - low
            # Guard against constant columns (span == 0).
            row[i] = (row[i] - low) / span if span else 0.0

### Calculate euclidean distance

In [70]:
from math import sqrt

def calculate_distance(instance1, instance2):
    """Return the Euclidean distance between two instances.

    The final position of ``instance1`` is excluded from the computation, so
    only the first ``len(instance1) - 1`` positions of both instances are
    compared (the datasets in this notebook keep the class in the last
    position).

    Args:
        instance1: Sequence of numbers followed by one trailing slot that is
            ignored.
        instance2: Sequence with at least ``len(instance1) - 1`` numbers.

    Returns:
        The square root of the summed squared differences, as a float.
    """
    feature_count = len(instance1) - 1
    squared_sum = 0.0
    for idx in range(feature_count):
        delta = instance1[idx] - instance2[idx]
        squared_sum += delta ** 2
    return sqrt(squared_sum)

### Get neighbors

In [71]:
def get_neighbors(training_data, test_instance, k):
    """Return the ``k`` training instances closest to ``test_instance``.

    Distances come from ``calculate_distance(test_instance, train_instance)``.
    Instances at equal distance keep their original relative order because
    the sort is stable.

    Args:
        training_data: Iterable of training instances.
        test_instance: The instance to find neighbors for.
        k: Number of neighbors to return; must not exceed the number of
            training instances, otherwise an IndexError is raised.

    Returns:
        A list of the ``k`` nearest training instances, nearest first.
    """
    scored = [
        (candidate, calculate_distance(test_instance, candidate))
        for candidate in training_data
    ]
    ranked = sorted(scored, key=lambda pair: pair[1])
    return [ranked[rank][0] for rank in range(k)]

### Make prediction on class

In [72]:
def predict_class(training_data, test_instance, k):
    """Predict a class by majority vote among the ``k`` nearest neighbors.

    The class of each neighbor is read from its last position. When several
    classes share the highest vote count, the winner is whichever of them
    ``max`` encounters first while iterating the set of candidate classes.

    Args:
        training_data: Training instances whose last element is the class.
        test_instance: The instance to classify.
        k: Number of neighbors that take part in the vote.

    Returns:
        The class value that occurs most often among the neighbors.
    """
    nearest = get_neighbors(training_data, test_instance, k)
    votes = [neighbor[-1] for neighbor in nearest]
    candidates = set(votes)
    return max(candidates, key=votes.count)

Wire everything up, make a prediction for [4.8, 2.5, 5.3, 2.4], and print the result.

In [76]:
# Load the raw rows; every field is still a string at this point.
data = load_iris_dataset('iris.csv')
# The class label sits in the last column; everything before it is a feature.
label_column = len(data[0]) - 1
for i in range(label_column):
    str_column_to_float(data, i)
str_column_to_int(data, label_column)
k = 5
test = [4.8, 2.5, 5.3, 2.4]
# calculate_distance ignores the last position of its first argument (it
# treats it as the class label). Pad the query with a placeholder label so
# that all four measurements take part in the distance.
prediction = predict_class(data, test + [None], k)
# Report the query itself; no variable named `row` exists at notebook scope.
print('Data=%s, Predicted: %s' % (test, prediction))

[Iris-setosa] => 0
[Iris-versicolor] => 1
[Iris-virginica] => 2
Data=[4.8, 2.5, 5.3, 2.4], Predicted: 2
