In [29]:
from random import seed, randrange
from csv import reader
from math import exp


# Seed the global RNG so the random draws made through randrange are
# repeatable from one run to the next.
seed(42)


def load_csv(filepath):
  """Read a CSV file into a list of rows.

  Args:
    filepath: Path of the CSV file to read.

  Returns:
    A list with one entry per non-empty record. Each entry is the list of
    string fields produced by ``csv.reader`` for that record.
  """
  # newline="" hands line-ending handling to the csv module, as its
  # documentation requires; otherwise line breaks embedded in quoted fields
  # are rewritten by universal-newline translation.
  with open(filepath, "r", newline="") as file:
    # Blank lines come back from the reader as empty lists; skip them.
    return [row for row in reader(file) if row]


def str_col_to_float(dataset, col):
  """Convert one column of every row from text to ``float``, in place.

  Args:
    dataset: List of rows; each row is mutated.
    col: Index of the column to convert.

  Leading and trailing whitespace is stripped from each value before it is
  parsed.
  """
  for record in dataset:
    text = record[col]
    record[col] = float(text.strip())


def dataset_minmax(dataset):
  """Return the smallest and largest value of every column.

  Args:
    dataset: Non-empty list of rows. The number of columns is taken from
      the first row.

  Returns:
    A list of ``[min, max]`` pairs, one per column, in column order.
  """
  width = len(dataset[0])
  # Lazily build each column as a list so min() and max() can both scan it.
  columns = ([record[idx] for record in dataset] for idx in range(width))
  return [[min(values), max(values)] for values in columns]


def dataset_normalize(dataset):
  """Rescale every column of ``dataset`` to the range [0, 1], in place.

  Each value becomes ``(value - column_min) / (column_max - column_min)``.
  Every column is rescaled, including the last one.

  Args:
    dataset: List of numeric rows; each row is mutated. An empty dataset is
      left untouched.
  """
  if not dataset:
    # Nothing to scale, and dataset_minmax needs at least one row.
    return

  minmax = dataset_minmax(dataset)

  for row in dataset:
    for i, value in enumerate(row):
      low, high = minmax[i]
      span = high - low
      # A constant column has no spread; map it to 0.0 rather than dividing
      # by zero.
      row[i] = (value - low) / span if span else 0.0


def cross_validation_split(dataset, n_folds):
  """Randomly partition ``dataset`` into ``n_folds`` equally sized folds.

  Args:
    dataset: List of rows. It is not modified; rows are drawn from a shallow
      copy.
    n_folds: Number of folds to produce.

  Returns:
    A list of ``n_folds`` lists, each holding ``int(len(dataset) / n_folds)``
    rows. Rows left over after the even split are not placed in any fold.
  """
  rows_per_fold = int(len(dataset) / n_folds)
  remaining = dataset.copy()
  partitions = []

  for _ in range(n_folds):
    # Draw without replacement so no row lands in more than one fold.
    partition = [
      remaining.pop(randrange(len(remaining)))
      for _draw in range(rows_per_fold)
    ]
    partitions.append(partition)

  return partitions


def accuracy_metric(actual, predicted):
  """Return the percentage of positions where the prediction is correct.

  Args:
    actual: Sequence of true labels.
    predicted: Sequence of predicted labels; must have the same length as
      ``actual``.

  Returns:
    A float between 0.0 and 100.0.
  """
  assert len(actual) == len(predicted)

  total = len(actual)
  hits = sum(1 for truth, guess in zip(actual, predicted) if truth == guess)

  return (hits / total) * 100.0


def evaluate_algorithm(dataset, algorithm, n_folds, *args):
  """Score ``algorithm`` with k-fold cross-validation.

  Args:
    dataset: List of rows whose last element is the label.
    algorithm: Callable invoked as ``algorithm(train, test, *args)`` that
      returns one prediction per test row.
    n_folds: Number of folds to split ``dataset`` into.
    *args: Extra positional arguments forwarded to ``algorithm``.

  Returns:
    A list with one accuracy percentage per fold.
  """
  folds = cross_validation_split(dataset=dataset, n_folds=n_folds)
  scores = []

  for held_out in folds:
    # Training data is every row of the other folds, flattened into one list.
    others = folds.copy()
    others.remove(held_out)
    train = [record for part in others for record in part]

    # Blank the label on copies so the algorithm never sees the answer and
    # the rows stored in the fold keep theirs.
    test = []
    for record in held_out:
      masked = record.copy()
      masked[-1] = None
      test.append(masked)

    predicted = algorithm(train, test, *args)
    actual = [record[-1] for record in held_out]
    scores.append(accuracy_metric(actual=actual, predicted=predicted))

  return scores


def predict(row, coefs):
  """Return the logistic-regression output for one row.

  Args:
    row: Feature values followed by one trailing element, which is ignored
      (it is never read here).
    coefs: ``coefs[0]`` is the intercept and ``coefs[i + 1]`` is the weight
      applied to ``row[i]``.

  Returns:
    The sigmoid of the weighted sum, a float between 0.0 and 1.0.
  """
  yhat = coefs[0]

  for i in range(len(row) - 1):
    yhat += coefs[i + 1] * row[i]

  try:
    return 1.0 / (1.0 + exp(-yhat))
  except OverflowError:
    # exp() overflows once -yhat exceeds roughly 709. The sigmoid is then
    # smaller than any normal float, so report its limit instead of crashing.
    return 0.0


def coefficients_sdg(train, l_rate, n_epochs):
  """Estimate logistic-regression coefficients by per-row gradient updates.

  Args:
    train: Non-empty list of rows whose last element is the label.
    l_rate: Learning rate that scales every update.
    n_epochs: Number of full passes over ``train``.

  Returns:
    A list with one coefficient per element of a row: the intercept first,
    then one weight per feature.
  """
  weights = [0.0] * len(train[0])

  for _ in range(n_epochs):
    for sample in train:
      output = predict(row=sample, coefs=weights)
      residual = sample[-1] - output
      # Factor shared by the intercept update and every weight update.
      step = l_rate * residual * output * (1.0 - output)
      weights[0] = weights[0] + step
      for idx, feature in enumerate(sample[:-1]):
        weights[idx + 1] = weights[idx + 1] + step * feature

  return weights


def logistic_regression(train, test, l_rate, n_epochs):
  """Fit coefficients on ``train`` and classify every row of ``test``.

  Args:
    train: Rows used to estimate the coefficients.
    test: Rows to classify.
    l_rate: Learning rate passed to the coefficient estimation.
    n_epochs: Number of training passes passed to the coefficient estimation.

  Returns:
    A list with one rounded prediction per test row, in order.
  """
  fitted = coefficients_sdg(train=train, l_rate=l_rate, n_epochs=n_epochs)
  return [round(predict(row=sample, coefs=fitted)) for sample in test]

# Experiment settings.
n_folds = 5
l_rate = 0.1
n_epochs = 100

# Load the data, convert every column from text to float, then rescale.
filepath = "../datasets/pima-indians-diabetes.csv"
dataset = load_csv(filepath=filepath)
for i in range(len(dataset[0])):
  str_col_to_float(dataset=dataset, col=i)
dataset_normalize(dataset=dataset)

# Cross-validate and report the per-fold and mean accuracy.
scores = evaluate_algorithm(
  dataset, logistic_regression, n_folds, l_rate, n_epochs
)
display(f"scores={scores}")
display(f"mean accuracy={sum(scores) / float(len(scores))}")

'scores=[78.43137254901961, 79.73856209150327, 73.20261437908496, 79.08496732026144, 70.58823529411765]'

'mean accuracy=76.20915032679738'