In [33]:
from random import seed, randrange
from math import sqrt, exp, pi
from csv import reader


# Seed the random module so the randrange-based fold assignment in
# cross_validation_split is repeatable from run to run.
seed(42)


def load_csv(filepath):
  """Read a CSV file into a list of rows, skipping empty lines.

  Args:
    filepath: Path of the CSV file to read.

  Returns:
    A list holding one list of string fields per non-empty CSV row.
  """
  # newline="" leaves line-ending handling to the csv module, as its
  # documentation asks; without it the text layer rewrites "\r\n" inside
  # quoted fields before the reader ever sees them.
  with open(filepath, "r", newline="") as file:
    return [row for row in reader(file) if row]


def str_col_to_float(dataset, col):
  """Convert column `col` of every row from text to float, in place.

  Surrounding whitespace is stripped from each value before conversion.
  Nothing is returned; the rows of `dataset` are modified directly.
  """
  for record in dataset:
    text = record[col]
    record[col] = float(text.strip())


def str_col_to_int(dataset, col):
  """Replace the values in column `col` with integer class ids, in place.

  Ids are assigned in order of first appearance in `dataset`: the first
  distinct value becomes 0, the next 1, and so on. The previous
  implementation enumerated a set, whose iteration order for strings can
  change between interpreter runs, so the same file could be encoded
  differently each time.

  Args:
    dataset: List of rows (lists); modified in place.
    col: Index of the column holding the class values (must be hashable).
  """
  class_idx_map = {}

  # First pass: give every distinct value the next unused id.
  for row in dataset:
    class_idx_map.setdefault(row[col], len(class_idx_map))

  # Second pass: overwrite the column with the assigned ids.
  for row in dataset:
    row[col] = class_idx_map[row[col]]


def cross_validation_split(dataset, n_folds):
  """Randomly partition `dataset` into `n_folds` equally sized folds.

  Every fold receives len(dataset) // n_folds rows drawn without
  replacement using randrange; rows left over when the length is not a
  multiple of `n_folds` end up in no fold. The input list is not modified
  (a shallow copy is consumed instead).

  Returns:
    A list of `n_folds` lists of rows.
  """
  rows_per_fold = len(dataset) // n_folds
  remaining = dataset.copy()

  return [
    [remaining.pop(randrange(len(remaining))) for _ in range(rows_per_fold)]
    for _ in range(n_folds)
  ]


def accuracy_metric(actual, predicted):
  """Return the percentage (0-100) of positions where the labels agree.

  Both sequences must have the same length; this is checked with an assert.
  """
  total = len(actual)
  assert total == len(predicted)

  hits = sum(1 for truth, guess in zip(actual, predicted) if truth == guess)

  return (hits / float(total)) * 100


def evaluate_algorithm(dataset, algorithm, n_folds, *args):
  """Score `algorithm` with k-fold cross-validation.

  For each fold the algorithm is called as algorithm(train, test, *args),
  where `train` is the concatenated rows of all other folds and `test` is a
  copy of the held-out fold with the last element of every row replaced by
  None. The returned predictions are compared with the last elements of the
  held-out rows.

  Returns:
    A list with one accuracy percentage per fold.
  """
  folds = cross_validation_split(dataset=dataset, n_folds=n_folds)
  scores = []

  for held_out in folds:
    # Training data: every fold except the held-out one, flattened.
    other_folds = folds.copy()
    other_folds.remove(held_out)
    train = [row for part in other_folds for row in part]

    # Test data: copies of the held-out rows with the label blanked out so
    # the algorithm cannot read it.
    test = []
    for row in held_out:
      masked = row.copy()
      masked[-1] = None
      test.append(masked)

    predicted = algorithm(train, test, *args)
    actual = [row[-1] for row in held_out]
    scores.append(accuracy_metric(actual=actual, predicted=predicted))

  return scores


def separate_by_class(dataset):
  """Group rows by class label, taken from the last element of each row.

  Returns:
    A dict mapping each label to the list of rows that carry it; rows keep
    their original relative order.
  """
  grouped = {}

  for row in dataset:
    grouped.setdefault(row[-1], []).append(row)

  return grouped


def mean(numbers):
  """Return the arithmetic mean of `numbers`."""
  total = sum(numbers)
  count = len(numbers)
  return total / count


def stddev(numbers):
  """Return the sample standard deviation of `numbers`.

  The squared deviations from the mean are summed and divided by
  len(numbers) - 1 before taking the square root.
  """
  centre = mean(numbers)
  squared_devs = [(value - centre) ** 2 for value in numbers]

  return sqrt(sum(squared_devs) / float(len(numbers) - 1))


def summarize_dataset(dataset):
  """Compute per-feature statistics for a list of rows.

  The last element of every row is treated as the class label and gets no
  summary.

  Args:
    dataset: List of rows; every element except the last must be numeric.

  Returns:
    A list with one (mean, stddev, count) tuple per feature column, in
    column order. Empty when `dataset` has no rows or no feature columns.
  """
  # Drop the label column before computing anything. The old code ran
  # mean/stddev on it and then deleted the result, which failed for
  # non-numeric labels and raised IndexError on an empty dataset.
  feature_columns = list(zip(*dataset))[:-1]

  return [(mean(col), stddev(col), len(col)) for col in feature_columns]


def summarize_by_class(dataset):
  """Return the column summaries of `dataset` computed separately per class.

  Rows are grouped with separate_by_class and each group is passed to
  summarize_dataset.

  Returns:
    A dict mapping each class value to that class's list of summaries.
  """
  return {
    label: summarize_dataset(dataset=members)
    for label, members in separate_by_class(dataset=dataset).items()
  }


def gaussian_pdf(x, mean_val, stddev_val):
  """Evaluate the Gaussian probability density function at `x`.

  Args:
    x: Point at which to evaluate the density.
    mean_val: Mean of the distribution.
    stddev_val: Standard deviation of the distribution.

  Returns:
    The density of a normal distribution with the given mean and standard
    deviation at `x`.
  """
  squared_dev = (x - mean_val) ** 2
  twice_variance = 2 * stddev_val ** 2
  normaliser = 1 / (sqrt(2 * pi) * stddev_val)

  return normaliser * exp(-(squared_dev / twice_variance))


def calculate_class_probabilities(summaries, row):
  """Score `row` against every class described in `summaries`.

  For each class the score starts as the class's share of all summarised
  rows (read from the count stored in its first column summary) and is then
  multiplied by gaussian_pdf of each feature value under that class's
  (mean, stddev) for the corresponding column.

  Args:
    summaries: Dict mapping class value to a list of (mean, stddev, count)
      tuples, one per feature column.
    row: Sequence whose first len(summaries[class]) elements are the
      feature values.

  Returns:
    A dict mapping each class value to its score. The scores are not
    rescaled to sum to 1.
  """
  total_rows = sum([stats[0][2] for stats in summaries.values()])
  scores = {}

  for label, stats in summaries.items():
    score = stats[0][2] / float(total_rows)
    for idx, (mu, sigma, _count) in enumerate(stats):
      score *= gaussian_pdf(x=row[idx], mean_val=mu, stddev_val=sigma)
    scores[label] = score

  return scores


def predict(summaries, row):
  """Return the class value with the highest score for `row`.

  Scores come from calculate_class_probabilities. When several classes tie
  for the highest score, the one encountered first wins. Returns None if
  there are no classes to choose from.
  """
  scores = calculate_class_probabilities(summaries=summaries, row=row)

  return max(scores, key=scores.get, default=None)


def naive_bayes(train, test):
  """Fit per-class summaries on `train` and predict a class for each test row.

  Returns:
    A list of predicted class values, one per row of `test`, in order.
  """
  model = summarize_by_class(dataset=train)

  return [predict(summaries=model, row=sample) for sample in test]


# Run configuration.
filepath = "../datasets/iris.csv"
n_folds = 5

# Load the data; every column but the last is a numeric feature, the last
# column is the class label.
dataset = load_csv(filepath=filepath)
label_col = len(dataset[0]) - 1
for i in range(label_col):
  str_col_to_float(dataset=dataset, col=i)
str_col_to_int(dataset=dataset, col=label_col)

# Cross-validate naive Bayes and show per-fold and mean accuracy.
scores = evaluate_algorithm(dataset, naive_bayes, n_folds)
display(f"scores={scores}")
display(f"mean accuracy={sum(scores) / len(scores)}")

'scores=[96.66666666666667, 96.66666666666667, 100.0, 93.33333333333333, 90.0]'

'mean accuracy=95.33333333333334'