Linear Regression

In [86]:
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression

Implementation

In [87]:
def add_b0_coef(X: np.ndarray) -> np.ndarray:
  '''
  Adds a column for intercept calculation.
  '''
  ones = np.ones((X.shape[0], 1))
  return np.hstack((ones, X))

def linear_regression(X: np.ndarray, Y: np.ndarray) -> np.ndarray:
  '''
  Calculates the coefficients for regression.
  '''
  return np.linalg.inv(X.T @ X) @ X.T @ Y

def predict(X: np.ndarray, B: np.ndarray) -> np.ndarray:
  '''
  Predicts target values.
  '''
  return X @ B

def mean_squared_error(Y_pred: np.ndarray, Y: np.ndarray) -> np.ndarray:
  '''
  Calculates the Mean Squared Error.
  '''
  return sum((Y - Y_pred)**2)/len(Y)

def R_squared(Y: np.ndarray, Y_pred: np.ndarray) -> np.ndarray:
  '''
  Calculates the coefficent of determination.
  '''
  nominator = sum(Y - Y_pred)**2
  denominator = sum(Y - np.mean(Y))

  return 1 - nominator/denominator


Testing implementation on the sklearn diabetes dataset.


In [88]:
data = load_diabetes()

In [89]:
# add a column of ones
X = add_b0_coef(data.data)

# calculate solution
B = linear_regression(X, data.target)

# get predictions
Y_pred = predict(X, B)

# get evaluation metrics
my_mse = mean_squared_error(Y_pred, data.target)
my_r_quared = R_squared(data.target, Y_pred)

Now, testing the regression with the default sklearn LinearRegression, for comparison.

In [90]:
# instantiate model
model = LinearRegression()

# get solution
model.fit(X, data.target)

# get predictions
y_pred = model.predict(X)

# obtain metrics
mse = mean_squared_error(data.target, y_pred )
r_squared = R_squared(data.target, y_pred)

Finally comparing results.

In [91]:
print(f'My R-Squared: {my_r_quared}, sklearn R-Squared: {r_squared}')
print(f'My MSE: {my_mse:.2f}, sklearn MSE: {mse:.2f}')
print(f'My Intercept: {B[0]:.2f}, sklearn Intercept: {model.intercept_:.2f}')
formatted= [f'{c:.2f}' for c in model.coef_[1:]]
my_formatted = [f'{c:.2f}' for c in B[1:]]
print(f'My Coefficients: {my_formatted}, \nsklearn Coefficients: {formatted}')


My R-Squared: 0.9999999998777553, sklearn R-Squared: 0.9999999999992352
My MSE: 2859.70, sklearn MSE: 2859.70
My Intercept: 152.13, sklearn Intercept: 152.13
My Coefficients: ['-10.01', '-239.82', '519.85', '324.38', '-792.18', '476.74', '101.04', '177.06', '751.27', '67.63'], 
sklearn Coefficients: ['-10.01', '-239.82', '519.85', '324.38', '-792.18', '476.74', '101.04', '177.06', '751.27', '67.63']


Accurate implementation.