In [None]:
def costfcn_v(y, X, b, w):
  """Evaluate the halved mean-squared error of the line ``b + w * X``.

  Args:
    y: 1D ndarray holding the observed target values.
    X: 1D ndarray holding the single input feature.
    b: bias (intercept) of the line.
    w: weight (slope) of the line.

  Returns:
    Mean of the squared residuals divided by 2.
  """

  # Residual between the model prediction and the observed target.
  residual = (b + w * X) - y

  return np.mean(residual ** 2) / 2

def gradient_v(y, X, b, w):
  """Compute the partial derivatives of the halved mean-squared error.

  Args:
    y: 1D ndarray holding the observed target values.
    X: 1D ndarray holding the single input feature.
    b: bias (intercept) at which the gradient is evaluated.
    w: weight (slope) at which the gradient is evaluated.

  Returns:
    Tuple (dJ/db, dJ/dw): the mean residual and the mean of the residual
    multiplied element-wise by X.
  """

  # The residual is shared by both partial derivatives.
  residual = (b + w * X) - y

  return np.mean(residual), np.mean(residual * X)

def graident_descent_v(y, X, alpha=0.1, tol=1.e-7):
  """Estimate parameters using a gradient descent algorithm.

  Starting from initial values drawn with np.random.uniform(), the bias and
  weight are repeatedly moved against the gradient of the cost function until
  the relative change of the cost is at most tol or the iteration limit is
  reached. The parameters and cost are printed after every update.

  Args:
    y: 1D ndarray of the target variable.
    X: 1D ndarray of the input features.
    alpha: learning rate.
    tol: Tolerance. If the relative change of the cost function is at most tol,
      the cost function is considered to have converged to a minimum.

  Returns:
    Parameter estimates, b and w.
  """

  max_iter = 10000  # maximum iteration
  b = np.random.uniform()  # Random initial value of b.
  w = np.random.uniform()  # Random initial value of w.

  J0 = costfcn_v(y, X, b, w)

  for i in range(max_iter):
    # Update parameters.
    dJdb, dJdw = gradient_v(y, X, b, w)
    b -= alpha * dJdb
    w -= alpha * dJdw

    J = costfcn_v(y, X, b, w)
    print(f'{i}: b = {b}, w = {w}, J = {J}')

    # Check convergence. The relative-change test is written without a
    # division so that a previous cost of exactly 0 cannot produce nan/inf.
    if np.abs(J - J0) <= tol * np.abs(J0):
      break
    J0 = J
  else:
    # The else clause of a for loop runs only when the loop was not left via
    # break, i.e. when all max_iter updates were used without converging.
    print('Maximum iteration reached before convergence.')

  return b, w

In [None]:
import numpy as np

# Define the cost function
def cost_function(X, y, b, w):
    """Return the halved mean squared error of the model ``X.dot(w) + b``.

    Args:
        X: Feature array; combined with ``w`` through ``X.dot(w)``.
        y: Target values; ``len(y)`` is used as the number of samples.
        b: Bias added to every prediction.
        w: Weight(s) applied to ``X``.

    Returns:
        Sum of squared residuals scaled by ``1 / (2 * len(y))``.
    """
    n_samples = len(y)
    residuals = (X.dot(w) + b) - y
    scale = 1 / (2 * n_samples)
    return scale * np.sum(residuals ** 2)

In [None]:
# Implement the gradient descent function
def gradient_descent(X, y, b, w, learning_rate, iterations):
    """Fit the linear model ``X.dot(w) + b`` by batch gradient descent.

    Args:
        X: Feature array; combined with ``w`` through ``X.dot(w)``.
        y: Target values; ``len(y)`` is used as the number of samples.
        b: Initial bias.
        w: Initial weight(s).
        learning_rate: Step size applied to each gradient.
        iterations: Number of update steps to perform.

    Returns:
        Tuple ``(b, w, cost_history)``: the final bias, the final weight(s),
        and an array of length ``iterations`` holding the cost evaluated
        after each update.
    """
    m = len(y)
    cost_history = np.zeros(iterations)

    for i in range(iterations):
        predictions = X.dot(w) + b
        # Gradients of the halved mean squared error with respect to b and w.
        b_gradient = -(1 / m) * np.sum(y - predictions)
        w_gradient = -(1 / m) * X.T.dot(y - predictions)

        # Rebinding (rather than updating in place) leaves the caller's
        # initial w array untouched.
        b = b - learning_rate * b_gradient
        w = w - learning_rate * w_gradient

        cost_history[i] = cost_function(X, y, b, w)

    # Callers unpack three values, so the results must be returned explicitly.
    return b, w, cost_history

In [None]:
# Hyperparameters passed to gradient_descent below.
learning_rate = 1e-2  # step size applied to each gradient
iterations = 1_000  # number of parameter updates to perform

In [None]:
# Run gradient descent from the initial parameters and keep the fitted bias,
# the fitted weight(s) and the per-iteration cost values.
# NOTE(review): X, y, initial_b and initial_w are not defined in the cells
# shown here; they are presumably created in an earlier cell -- confirm.
estimated_b, estimated_w, cost_history = gradient_descent(
    X,
    y,
    b=initial_b,
    w=initial_w,
    learning_rate=learning_rate,
    iterations=iterations,
)