In [1]:
def predict(row, coefs):
  """Evaluate the linear model described by ``coefs`` on one ``row``.

  Args:
    row: Sequence whose last element is the target slot; it is never read
      here, so it may hold any placeholder. All earlier elements are the
      input features.
    coefs: ``coefs[0]`` is the intercept (b0); ``coefs[k + 1]`` is the weight
      applied to ``row[k]``.

  Returns:
    The intercept plus the weighted sum of the features.
  """
  total = coefs[0]

  # Accumulate term by term, in feature order, skipping the trailing target.
  for position, feature in enumerate(row[:-1], start=1):
    total += coefs[position] * feature

  return total


def coefficients_sgd(train, learning_rate, epochs):
  """Fit linear-regression coefficients with stochastic gradient descent.

  Args:
    train: List of rows; the last element of each row is the target and the
      preceding elements are the features.
    learning_rate: Step size applied to every coefficient update.
    epochs: Number of full passes over ``train``.

  Returns:
    A list with one coefficient per column of ``train[0]``: the intercept at
    index 0 followed by one weight per feature.

  Side effects:
    Prints the sum of squared errors accumulated during each epoch.
  """
  weights = [0.0] * len(train[0])

  for epoch in range(epochs):
    squared_error_total = 0
    for sample in train:
      # The residual is computed with the weights as they stand before this
      # sample's update; the same residual drives every update below.
      residual = predict(sample, weights) - sample[-1]
      squared_error_total += residual ** 2
      step = learning_rate * residual
      weights[0] -= step
      for position, feature in enumerate(sample[:-1], start=1):
        weights[position] -= step * feature
    print(f"epoch={epoch}; lrate={learning_rate}; error={squared_error_total}")

  return weights


# Toy dataset: in each row the last value is the target (coefficients_sgd
# reads it via row[-1]) and the value before it is the single input feature.
dataset = [[1, 1], [2, 3], [4, 3], [3, 2], [5, 5]]
learning_rate = 0.001
epochs = 50
# Fit the intercept and the slope; coefficients_sgd prints the accumulated
# squared error once per epoch while it runs.
coefs = coefficients_sgd(train=dataset, learning_rate=learning_rate, epochs=epochs)
display(coefs)

epoch=0; lrate=0.001; error=46.23569225016471
epoch=1; lrate=0.001; error=41.305142323835085
epoch=2; lrate=0.001; error=36.92968879875065
epoch=3; lrate=0.001; error=33.046843407651664
epoch=4; lrate=0.001; error=29.601151923716085
epoch=5; lrate=0.001; error=26.543402390484545
epoch=6; lrate=0.001; error=23.82992247422831
epoch=7; lrate=0.001; error=21.421955907121237
epoch=8; lrate=0.001; error=19.285109118735654
epoch=9; lrate=0.001; error=17.38886015544335
epoch=10; lrate=0.001; error=15.706122876572774
epoch=11; lrate=0.001; error=14.212860205347214
epoch=12; lrate=0.001; error=12.88774091297372
epoch=13; lrate=0.001; error=11.7118350357667
epoch=14; lrate=0.001; error=10.668343576747102
epoch=15; lrate=0.001; error=9.742358632631682
epoch=16; lrate=0.001; error=8.920650521505976
epoch=17; lrate=0.001; error=8.191478871959525
epoch=18; lrate=0.001; error=7.544424976557279
epoch=19; lrate=0.001; error=6.970243016110007
epoch=20; lrate=0.001; error=6.4607280306236845
epoch=21; lrat

[0.22998234937311363, 0.8017220304137576]

In [3]:
from csv import reader
from random import seed, randrange
from math import sqrt


# Fix the seed of the `random` module so that the indices drawn with
# randrange in cross_validation_split are the same on every run.
seed(42)


def load_csv(filepath):
  """Read a CSV file into a list of rows, skipping blank lines.

  Args:
    filepath: Path of the CSV file to read.

  Returns:
    A list of rows, each row being the list of string fields produced by
    ``csv.reader``. Empty rows (blank lines) are dropped.

  Raises:
    OSError: If the file cannot be opened.
  """
  # newline="" is what the csv module asks for: it disables universal-newline
  # translation so the reader sees the raw line endings and line breaks
  # embedded in quoted fields are returned unchanged.
  with open(filepath, "r", newline="") as file:
    return [row for row in reader(file) if row]


def str_col_to_float(dataset, col):
  """Convert column ``col`` of every row from string to float, in place.

  Args:
    dataset: List of mutable rows whose ``col`` entry is a string.
    col: Index of the column to convert.

  Surrounding whitespace is stripped before conversion. Nothing is returned;
  the rows of ``dataset`` are modified directly.
  """
  for record in dataset:
    raw_value = record[col]
    record[col] = float(raw_value.strip())


def dataset_minmax(dataset):
  """Return the ``[min, max]`` pair of every column of ``dataset``.

  Args:
    dataset: Non-empty list of rows; the column count is taken from the
      first row.

  Returns:
    A list with one ``[min, max]`` list per column, in column order.
  """
  def column_values(index):
    # Gather one column across all rows.
    return [record[index] for record in dataset]

  column_count = len(dataset[0])

  return [
    [min(values), max(values)]
    for values in map(column_values, range(column_count))
  ]


def normalize_dataset(dataset):
  """Rescale every column of ``dataset`` to the range [0, 1], in place.

  Each value becomes ``(value - column_min) / (column_max - column_min)``,
  where the bounds come from ``dataset_minmax``. All columns are rescaled,
  including the last one.

  Args:
    dataset: List of mutable rows holding numeric values. An empty dataset
      is left untouched.

  A column whose values are all identical has a zero range; its entries are
  set to 0.0 instead of raising ``ZeroDivisionError``.
  """
  if len(dataset) == 0:
    # Nothing to rescale; dataset_minmax would fail on dataset[0].
    return

  minmax = dataset_minmax(dataset)

  for row in dataset:
    for i in range(len(row)):
      low, high = minmax[i]
      span = high - low
      # Constant column: every value equals the minimum, so map it to 0.0.
      row[i] = (row[i] - low) / span if span else 0.0
    

def cross_validation_split(dataset, folds):
  """Randomly partition ``dataset`` into ``folds`` equally sized folds.

  Args:
    dataset: List of rows. It is not modified; rows are drawn from a shallow
      copy, so the folds reference the original row objects.
    folds: Number of folds to build.

  Returns:
    A list of ``folds`` lists, each holding ``int(len(dataset) / folds)``
    rows drawn without replacement using ``randrange``. Rows left over when
    the size does not divide evenly are not placed in any fold.
  """
  remaining = dataset.copy()
  rows_per_fold = int(len(dataset) / folds)

  # One randrange draw per selected row, fold after fold.
  return [
    [remaining.pop(randrange(len(remaining))) for _ in range(rows_per_fold)]
    for _ in range(folds)
  ]


def rmse_metric(actual, predicted):
  """Compute the root mean squared error between two equal-length sequences.

  Args:
    actual: Sequence of true values.
    predicted: Sequence of predicted values, the same length as ``actual``.

  Returns:
    The square root of the mean of the squared differences.

  Raises:
    AssertionError: If the two sequences differ in length.
  """
  count = len(actual)
  assert count == len(predicted)

  squared_diffs = ((guess - truth) ** 2 for truth, guess in zip(actual, predicted))

  return sqrt(sum(squared_diffs) / float(count))


def evaluate_algorithm(dataset, algorithm, folds, *args):
  """Score ``algorithm`` with k-fold cross-validation, using RMSE.

  Args:
    dataset: List of rows whose last element is the target value.
    algorithm: Callable invoked as ``algorithm(train, test, *args)`` that
      returns one prediction per row of ``test``.
    folds: Number of folds passed to ``cross_validation_split``.
    *args: Extra positional arguments forwarded to ``algorithm``.

  Returns:
    A list with one RMSE score per fold, in fold order.
  """
  def hide_target(row):
    # Copy the row and blank its last value so the algorithm cannot see it.
    masked = row.copy()
    masked[-1] = None
    return masked

  partitions = cross_validation_split(dataset=dataset, folds=folds)
  scores = []

  for held_out in partitions:
    # Train on every fold except the held-out one, flattened into one list.
    others = partitions.copy()
    others.remove(held_out)
    train = [row for part in others for row in part]
    test = [hide_target(row) for row in held_out]
    predicted = algorithm(train, test, *args)
    actual = [row[-1] for row in held_out]
    scores.append(rmse_metric(actual=actual, predicted=predicted))

  return scores


def linear_regression_algorithm_with_sdg(train, test, learning_rate, epochs):
  """Fit a linear model on ``train`` with SGD and predict every row of ``test``.

  Args:
    train: Rows used to fit the coefficients via ``coefficients_sgd``.
    test: Rows to predict; each is passed to ``predict``.
    learning_rate: Step size forwarded to ``coefficients_sgd``.
    epochs: Number of passes forwarded to ``coefficients_sgd``.

  Returns:
    A list with one prediction per row of ``test``, in order.
  """
  weights = coefficients_sgd(train=train, learning_rate=learning_rate, epochs=epochs)

  return [predict(row=row, coefs=weights) for row in test]

In [4]:
# Load the CSV as rows of strings, then convert every column to float.
filepath = "../datasets/wine-quality-white.csv"
dataset = load_csv(filepath=filepath)
for i in range(len(dataset[0])):
  str_col_to_float(dataset=dataset, col=i)
# Rescales all columns in place, the last (target) column included, so the
# RMSE scores below are expressed in normalized units.
normalize_dataset(dataset=dataset)
# Positional arguments: 5 folds, then learning_rate=0.01 and epochs=50, which
# evaluate_algorithm forwards to linear_regression_algorithm_with_sdg.
scores = evaluate_algorithm(dataset, linear_regression_algorithm_with_sdg, 5, 0.01, 50)
display(f"scores: {scores}; mean_rsme={sum(scores) / float(len(scores))}")

epoch=0; lrate=0.01; error=78.44145597829196
epoch=1; lrate=0.01; error=67.08865711810463
epoch=2; lrate=0.01; error=65.57214322521925
epoch=3; lrate=0.01; error=64.72768916656149
epoch=4; lrate=0.01; error=64.22769714871525
epoch=5; lrate=0.01; error=63.92180281142388
epoch=6; lrate=0.01; error=63.728334313520534
epoch=7; lrate=0.01; error=63.60188382479972
epoch=8; lrate=0.01; error=63.5165259056815
epoch=9; lrate=0.01; error=63.45702565414449
epoch=10; lrate=0.01; error=63.41417762390333
epoch=11; lrate=0.01; error=63.38227650905153
epoch=12; lrate=0.01; error=63.35770614688372
epoch=13; lrate=0.01; error=63.33812952925595
epoch=14; lrate=0.01; error=63.32201073889288
epoch=15; lrate=0.01; error=63.30832543914003
epoch=16; lrate=0.01; error=63.296381471496304
epoch=17; lrate=0.01; error=63.28570544047876
epoch=18; lrate=0.01; error=63.275969783710586
epoch=19; lrate=0.01; error=63.266945198056895
epoch=20; lrate=0.01; error=63.25846922935466
epoch=21; lrate=0.01; error=63.2504253204

'scores: [0.12353443065407795, 0.1299033492359778, 0.12357271484541896, 0.12802250691031047, 0.12599780793314658]; mean_rsme=0.12620616191578632'