In [65]:
from csv import reader
from random import seed, randrange
from math import sqrt


# Seed the `random` module's generator with a fixed value so that the
# `randrange` draws made later in this file are repeatable across runs.
seed(42)


def load_csv(file_path):
  """Read a CSV file into a list of rows, skipping empty lines.

  Args:
    file_path: Path of the CSV file to read.

  Returns:
    A list of rows, each row being the list of string fields produced by
    ``csv.reader``. No type conversion is performed here.
  """
  # newline="" leaves line-ending handling to the csv module, so line breaks
  # embedded in quoted fields are kept exactly as written in the file.
  with open(file_path, "r", newline="") as file:
    # csv.reader yields an empty list for blank lines; those are dropped.
    return [row for row in reader(file) if row]


def str_col_to_float(dataset, col):
  """Convert one column of every row from text to ``float``, in place.

  Args:
    dataset: Iterable of mutable rows (e.g. lists) holding string fields.
    col: Index of the column to convert.

  Returns:
    None. Each row is modified directly.

  Raises:
    ValueError: If a field cannot be parsed by ``float``.
  """
  for record in dataset:
    raw_text = record[col]
    # Surrounding whitespace is stripped before parsing.
    record[col] = float(raw_text.strip())


def train_test_split(dataset, split):
  """Randomly partition ``dataset`` into a train part and a test part.

  Rows are drawn without replacement using ``random.randrange`` until the
  train part holds at least ``len(dataset) * split`` rows; whatever is left
  becomes the test part. The input itself is not modified (only a shallow
  copy is consumed).

  Args:
    dataset: Iterable of rows.
    split: Fraction of rows to place in the train part. A value of 0 or
      less yields an empty train part.

  Returns:
    A ``(train, test)`` tuple of lists.

  Raises:
    ValueError: If ``split`` is greater than 1, since more rows than exist
      would have to be drawn.
  """
  # Fail fast with a clear message; otherwise the loop below would drain the
  # pool and then fail inside randrange(0) with an unrelated error text.
  if split > 1:
    raise ValueError(f"split must not exceed 1, got {split!r}")

  # list() takes a shallow copy and also accepts tuples or other iterables.
  test = list(dataset)
  train_size = len(test) * split
  train = []

  while len(train) < train_size:
    train.append(test.pop(randrange(len(test))))

  return train, test


def rmse(actual, predicted):
  """Return the root mean squared error between two sequences.

  Args:
    actual: Sequence of observed values.
    predicted: Sequence of predicted values, same length as ``actual``.

  Returns:
    The square root of the mean of ``(predicted - actual) ** 2``.

  Raises:
    ValueError: If the sequences differ in length or are empty.
  """
  count = len(actual)

  # Explicit checks instead of `assert`: assertions are removed under -O,
  # after which a length mismatch would be silently truncated.
  if count != len(predicted):
    raise ValueError(
      f"actual and predicted differ in length: {count} != {len(predicted)}"
    )
  if count == 0:
    raise ValueError("rmse is undefined for empty input")

  squared_error_sum = sum((p - a) ** 2 for a, p in zip(actual, predicted))

  return sqrt(squared_error_sum / count)


def evaluate_algorithm(dataset, algorithm):
  """Score ``algorithm`` on ``dataset`` with RMSE.

  The algorithm is called as ``algorithm(dataset, masked_rows)``, where
  ``masked_rows`` is a copy of the dataset whose last column is replaced by
  ``None``. Its predictions are compared against the last column of the
  original rows, so the same rows are used for both arguments.

  Args:
    dataset: Sequence of rows; the last element of each row is the target.
    algorithm: Callable taking ``(dataset, masked_rows)`` and returning one
      prediction per row.

  Returns:
    The RMSE between the true last-column values and the predictions.
  """
  def hide_target(row):
    # Copy the row so the caller's data is untouched, then blank the target.
    masked = list(row)
    masked[-1] = None
    return masked

  masked_rows = [hide_target(row) for row in dataset]
  predicted = algorithm(dataset, masked_rows)
  actual = [row[-1] for row in dataset]

  return rmse(actual, predicted)


def mean(vals):
  """Return the arithmetic mean of ``vals``.

  Args:
    vals: Sized collection of numbers.

  Returns:
    The sum of the values divided by their count.

  Raises:
    ZeroDivisionError: If ``vals`` is empty.
  """
  total = sum(vals)
  count = float(len(vals))

  return total / count


def variance(vals):
  """Return the sum of squared deviations of ``vals`` from their mean.

  Note that the sum is not divided by the number of values.

  Args:
    vals: Sized collection of numbers.

  Returns:
    ``sum((v - mean(vals)) ** 2 for v in vals)``.
  """
  center = mean(vals)

  return sum((value - center) ** 2 for value in vals)


def covariance(vals1, vals2):
  """Return the sum of products of paired deviations from the means.

  Note that the sum is not divided by the number of pairs. Values are paired
  positionally; if the inputs differ in length, pairing stops at the shorter
  one.

  Args:
    vals1: Sized collection of numbers.
    vals2: Sized collection of numbers.

  Returns:
    The sum over pairs of ``(v1 - mean(vals1)) * (v2 - mean(vals2))``.
  """
  center_1 = mean(vals1)
  center_2 = mean(vals2)

  return sum((a - center_1) * (b - center_2) for a, b in zip(vals1, vals2))


def coefficients(dataset):
  """Estimate the intercept and slope of a line fitted to ``dataset``.

  The first element of each row is used as x and the last element as y.

  Args:
    dataset: Iterable of rows of numbers.

  Returns:
    A ``(b0, b1)`` tuple where ``b1 = covariance(x, y) / variance(x)`` and
    ``b0 = mean(y) - b1 * mean(x)``.
  """
  x_vals = [row[0] for row in dataset]
  y_vals = [row[-1] for row in dataset]

  slope = covariance(x_vals, y_vals) / variance(x_vals)
  intercept = mean(y_vals) - slope * mean(x_vals)

  return intercept, slope


def simple_linear_regresion(train, test):
  """Fit a line on ``train`` and predict a value for every row of ``test``.

  Args:
    train: Rows used to estimate the coefficients via ``coefficients``.
    test: Rows to predict for; only the first element of each row is read.

  Returns:
    A list with one prediction ``b1 * row[0] + b0`` per row of ``test``.
  """
  intercept, slope = coefficients(train)

  return [slope * row[0] + intercept for row in test]


# Load the dataset; the path is relative to the current working directory.
filepath = "../datasets/auto-insurance-sweden.csv"
dataset = load_csv(file_path=filepath)

# load_csv returns string fields, so convert every column to float in place.
# The number of columns is taken from the first row.
for i in range(len(dataset[0])):
  str_col_to_float(dataset, i)

# Fit the line on the full dataset and show its coefficients.
# NOTE(review): `display` is not imported in this file; presumably the
# notebook environment provides it — confirm before running as a script.
b0, b1 = coefficients(dataset)
display(f"coefficients: b0={b0:.2f}, b1={b1:.2f}; formula: y = {b1:.2f}x + {b0:.2f}")
# evaluate_algorithm passes the same rows to the algorithm for fitting and
# for prediction, so this RMSE is measured on the data the line was fit on.
display(f"metrics: rmse={evaluate_algorithm(dataset, simple_linear_regresion):.2f}")

'coefficients: b0=19.99, b1=3.41; formula: y = 3.41x + 19.99'

'metrics: rmse=35.37'