In [32]:
from random import seed, randrange
from math import sqrt
from csv import reader


seed(42)


def load_csv(filepath):
  """Read a CSV file and return its non-empty rows.

  Args:
    filepath: Path of the CSV file to read.

  Returns:
    A list of rows; each row is a list of the raw string fields. Rows that
    the CSV reader yields as empty (blank lines) are skipped.
  """
  # newline="" is what the csv module documents for file objects: it lets the
  # reader handle line endings itself, so newlines embedded inside quoted
  # fields are preserved instead of being translated by the text layer.
  with open(filepath, "r", newline="") as file:
    return [row for row in reader(file) if row]


def str_col_to_float(dataset, col):
  """Convert column ``col`` of every row from a string to a float, in place.

  Surrounding whitespace is stripped from each value before conversion.
  """
  for record in dataset:
    raw_text = record[col]
    record[col] = float(raw_text.strip())


def str_col_to_int(dataset, col):
  """Replace the values in column ``col`` with integer codes, in place.

  Args:
    dataset: List of rows (lists); modified in place.
    col: Index of the column to encode.

  Returns:
    A dict mapping each distinct original value to its integer code.
  """
  # Sort the distinct values before numbering them. Iterating a bare set
  # yields an order that can change between interpreter runs (string hashes
  # are randomized), which would make the encoding non-reproducible.
  # This requires the column values to be mutually comparable.
  distinct_values = sorted({row[col] for row in dataset})
  class_lookup = {value: code for code, value in enumerate(distinct_values)}

  for row in dataset:
    row[col] = class_lookup[row[col]]

  return class_lookup


def dataset_minmax(dataset):
  """Compute the minimum and maximum of every column.

  Args:
    dataset: List of rows, each a sequence of comparable values.

  Returns:
    A list with one ``[min, max]`` pair per column, in column order. An empty
    dataset yields an empty list.
  """
  # zip(*dataset) transposes rows into columns, so the loop runs once per
  # column rather than once per row.
  return [[min(column), max(column)] for column in zip(*dataset)]


def normalize_dataset(dataset):
  """Rescale every column of ``dataset`` to the range [0, 1], in place.

  Each value becomes ``(value - column_min) / (column_max - column_min)``,
  using the per-column extremes reported by ``dataset_minmax``.

  Args:
    dataset: List of rows (lists) of numeric values; modified in place.
  """
  minmax = dataset_minmax(dataset=dataset)
  for row in dataset:
    for i, value in enumerate(row):
      low, high = minmax[i]
      span = high - low
      # A constant column has no spread; map it to 0.0 rather than dividing
      # by zero.
      row[i] = (value - low) / span if span else 0.0


def cross_validation_split(dataset, n_folds):
  """Randomly partition ``dataset`` into ``n_folds`` folds of equal size.

  Each fold holds ``int(len(dataset) / n_folds)`` rows drawn without
  replacement; any leftover rows are not assigned to a fold. The input list
  itself is left untouched (rows are drawn from a shallow copy).
  """
  rows_per_fold = int(len(dataset) / n_folds)
  pool = dataset.copy()
  folds = []

  for _ in range(n_folds):
    # Pull random rows out of the shrinking pool until this fold is full.
    picked = [pool.pop(randrange(len(pool))) for _slot in range(rows_per_fold)]
    folds.append(picked)

  return folds


def accuracy_metric(actual, predicted):
  """Return the percentage of positions where ``predicted`` equals ``actual``.

  Both sequences must have the same length. The result is a float between
  0.0 and 100.0.
  """
  assert len(actual) == len(predicted)

  total = len(actual)
  hits = sum(1 for truth, guess in zip(actual, predicted) if truth == guess)

  return (hits / float(total)) * 100.0


def evaluate_algorithm(dataset, algorithm, n_folds, *args):
  """Score ``algorithm`` with k-fold cross-validation.

  The dataset is split into ``n_folds`` folds. Each fold in turn is held out
  as the test set while the remaining folds are concatenated into the
  training set. ``algorithm(train, test, *args)`` must return one prediction
  per test row; predictions are compared against the last element of each
  held-out row.

  Returns:
    A list with one accuracy percentage per fold.
  """
  folds = cross_validation_split(dataset=dataset, n_folds=n_folds)
  scores = []

  for held_out in folds:
    # Training data: every fold except the held-out one, flattened to rows.
    remaining = folds.copy()
    remaining.remove(held_out)
    train = [row for part in remaining for row in part]

    # Test data: copies of the held-out rows with the label blanked out so
    # the algorithm cannot read the answer.
    test = []
    for row in held_out:
      masked = row.copy()
      masked[-1] = None
      test.append(masked)

    predicted = algorithm(train, test, *args)
    actual = [row[-1] for row in held_out]
    scores.append(accuracy_metric(actual=actual, predicted=predicted))

  return scores


def euclidean_distance(row1, row2):
  """Return the Euclidean distance between two equal-length rows.

  The final element of each row is excluded from the computation; only the
  elements before it contribute to the distance.
  """
  assert len(row1) == len(row2)

  squared_total = 0.0
  # Pair up all elements except the last one of each row.
  for left, right in zip(row1[:-1], row2[:-1]):
    squared_total += (left - right) ** 2

  return sqrt(squared_total)


def get_k_neighbors(dataset, row, k):
  """Return the ``k`` rows of ``dataset`` closest to ``row``.

  Closeness is measured with ``euclidean_distance``. The result is ordered
  from nearest to farthest; rows at equal distance keep their relative order
  from ``dataset``.
  """
  # sorted() is stable and evaluates the key once per row.
  ranked = sorted(
    dataset,
    key=lambda candidate: euclidean_distance(row, candidate),
  )

  return [ranked[position] for position in range(k)]


def predict_classification(dataset, row, k):
  """Predict the label of ``row`` by majority vote among its ``k`` neighbors.

  The label of a neighbor is its last element. The most frequent label among
  the neighbors returned by ``get_k_neighbors`` is the prediction.
  """
  nearest = get_k_neighbors(dataset=dataset, row=row, k=k)
  labels = [neighbor[-1] for neighbor in nearest]
  candidates = set(labels)

  return max(candidates, key=labels.count)


def k_nearest_neighbors(train, test, k):
  """Classify every row in ``test`` using k-nearest-neighbors over ``train``.

  Returns:
    A list of predicted labels, one per test row, in the order of ``test``.
  """
  return [
    predict_classification(dataset=train, row=sample, k=k)
    for sample in test
  ]


# Driver: load the dataset, convert its columns to numbers, and evaluate
# k-nearest-neighbors with cross-validation.
filepath = "../datasets/abalone.csv"
dataset = load_csv(filepath=filepath)
# Every column except the first is parsed as a float.
for col in range(1, len(dataset[0])):
  str_col_to_float(dataset=dataset, col=col)
# The first column is replaced by integer codes (the returned lookup table is
# discarded here).
str_col_to_int(dataset=dataset, col=0)
n_folds = 5
k_neighbors = 5
# evaluate_algorithm treats the last element of each row as the label to
# predict and returns one accuracy percentage per fold.
scores = evaluate_algorithm(dataset, k_nearest_neighbors, n_folds, k_neighbors)
# NOTE(review): `display` is not defined in this file; presumably the
# IPython/Jupyter built-in. Confirm before running outside a notebook.
display(f"scores={scores}")
display(f"mean accuracy={sum(scores) / len(scores)}")

'scores=[22.395209580838323, 24.550898203592812, 26.82634730538922, 26.107784431137727, 24.67065868263473]'

'mean accuracy=24.910179640718564'