In [None]:
import pathlib

import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, model_selection, utils

# Linear Regression from Scratch with NumPy

## Prepare the data

In [None]:
# IPython introspection: a trailing `?` shows the docstring of make_regression.
datasets.make_regression?

In [None]:
# Seeded generator; the same object is passed to the later stochastic steps
# (shuffling, weight initialisation, splitting) so a fresh run is repeatable.
prng = np.random.RandomState(seed=42)

# Synthetic regression problem with a single feature. Because coef=True the
# call also returns the coefficient used to generate the targets (`theta`).
X, y, theta = datasets.make_regression(
    random_state=prng, coef=True, noise=10.0, n_informative=1, n_features=1
)

In [None]:
# Scatter the single feature column against the targets.
fig, ax = plt.subplots()
ax.plot(X[:, 0], y, "o")
ax.set_xlabel(r"$X_1$", fontsize=15)
ax.set_ylabel("y", fontsize=15, rotation=0)
ax.grid()

In [None]:
# Display the coefficient that make_regression returned (coef=True).
theta

### Train-test split

In [None]:
# Shuffle features and targets together, then keep the first `n_train` rows
# for training and everything after them for testing.
n_train = 80
shuffled_features, shuffled_targets = utils.shuffle(X, y, random_state=prng)
train_features = shuffled_features[:n_train]
train_targets = shuffled_targets[:n_train]
test_features = shuffled_features[n_train:]
test_targets = shuffled_targets[n_train:]

## Training

In [None]:
def predict(X, theta):
    """Return the linear model output ``X @ theta``.

    No intercept is added here; if one is needed, ``X`` must already
    contain a constant column.

    Parameters
    ----------
    X : array
        Feature array (the notebook passes 2-D arrays with one row per sample).
    theta : array
        Model coefficients, multiplied onto ``X`` from the right.

    Returns
    -------
    array
        The matrix product of ``X`` and ``theta``.
    """
    y_hat = X @ theta
    return y_hat


def mse_loss(y, y_hat):
    """Return half of the mean squared error between ``y`` and ``y_hat``.

    Note the factor 0.5: the value is ``MSE / 2``, not the plain MSE.

    Parameters
    ----------
    y : array
        Observed targets.
    y_hat : array
        Predicted targets, broadcast-compatible with ``y``.

    Returns
    -------
    float
        ``0.5 * mean((y - y_hat) ** 2)``.
    """
    residuals = y - y_hat
    return 0.5 * np.mean(residuals ** 2)


def mse_grad(X, y, y_hat):
    """Gradient of the half-MSE loss with respect to the coefficients.

    For predictions ``y_hat = X @ theta`` this is
    ``-(1 / m) * X.T @ (y - y_hat)``, where ``m`` is the length of the first
    axis of ``y``.

    Parameters
    ----------
    X : array
        Feature array used to produce ``y_hat``.
    y : array
        Observed targets; its leading dimension supplies ``m``.
    y_hat : array
        Predictions for the rows of ``X``.

    Returns
    -------
    array
        The gradient, with one entry per column of ``X``.
    """
    n_samples, *_rest = y.shape
    residuals = y - y_hat
    return -(1 / n_samples) * (X.T @ residuals)



### Analytic Solution

In [None]:
def linear_regression(X, y):
    """Ordinary least-squares estimate of ``theta`` for ``y ~ X @ theta``.

    For a full-column-rank ``X`` this is the normal-equation solution
    ``inv(X.T @ X) @ X.T @ y``. It is computed with ``np.linalg.lstsq``
    rather than by forming the inverse explicitly, because the explicit
    inverse loses precision on ill-conditioned problems and raises
    ``LinAlgError`` when ``X.T @ X`` is singular. For rank-deficient ``X``
    the minimum-norm least-squares solution is returned instead.

    Parameters
    ----------
    X : array
        Feature array with one row per sample. No intercept column is added.
    y : array
        Targets with one entry (or row) per sample.

    Returns
    -------
    array
        Least-squares coefficients, one per column of ``X``.
    """
    # rcond=None selects the machine-precision-based cutoff for small
    # singular values.
    theta_hat, *_ = np.linalg.lstsq(X, y, rcond=None)
    return theta_hat

In [None]:
# Fit the coefficients on the training split with the closed-form solver.
theta_hat = linear_regression(train_features, train_targets)

In [None]:
# Display the fitted coefficients as the cell output.
theta_hat

In [None]:
# Loss of the closed-form fit on the training split; mse_loss returns half the
# MSE, and its value is shown as the cell output.
train_predictions = predict(train_features, theta_hat)
mse_loss(train_targets, train_predictions)

In [None]:
# Same loss (half the MSE) evaluated on the held-out test split.
test_predictions = predict(test_features, theta_hat)
mse_loss(test_targets, test_predictions)

### Stochastic Gradient Descent

In [None]:
def model_fn(X, learned_parameters):
    """Model used by the training loops: delegates to ``predict``.

    Parameters
    ----------
    X : array
        Features to score.
    learned_parameters : array
        Current coefficient estimates.

    Returns
    -------
    array
        Whatever ``predict(X, learned_parameters)`` returns.
    """
    predictions = predict(X, learned_parameters)
    return predictions


def loss_fn(y, y_hat):
    """Loss used by the training loops: delegates to ``mse_loss``.

    Parameters
    ----------
    y : array
        Observed targets.
    y_hat : array
        Predicted targets.

    Returns
    -------
    float
        Whatever ``mse_loss(y, y_hat)`` returns.
    """
    loss_value = mse_loss(y, y_hat)
    return loss_value


def grad_fn(X, y, y_hat):
    """Gradient used by the training loops: delegates to ``mse_grad``.

    Parameters
    ----------
    X : array
        Features that produced ``y_hat``.
    y : array
        Observed targets.
    y_hat : array
        Predicted targets.

    Returns
    -------
    array
        Whatever ``mse_grad(X, y, y_hat)`` returns.
    """
    gradient = mse_grad(X, y, y_hat)
    return gradient


In [None]:
# initialize weights
learned_parameters = prng.normal(loc=0, scale=1, size=(1,))

learning_rate = 0.001
batch_size = 1
epochs = 100
log_epochs = 10

n_train_samples = len(train_targets)

for epoch in range(epochs):

    # Running sum of per-sample losses for this epoch. loss_fn returns the
    # mean over a batch, so each batch loss is weighted by the batch length;
    # dividing by the sample count below is then correct for any batch_size
    # (with batch_size = 1 the weight is 1 and the printed value is unchanged).
    total_loss = 0.0
    for batch_ixs in utils.gen_batches(n_train_samples, batch_size):
        # Batch-specific names so generic globals such as `features` are not
        # rebound to a single mini-batch.
        batch_features = train_features[batch_ixs]
        batch_targets = train_targets[batch_ixs]

        # forward pass
        predictions = model_fn(batch_features, learned_parameters)
        loss = loss_fn(batch_targets, predictions)
        total_loss += loss * len(batch_targets)

        # backward pass: in-place gradient step on the parameter vector
        grad = grad_fn(batch_features, batch_targets, predictions)
        learned_parameters -= grad * learning_rate

    if epoch % log_epochs == 0:
        print(f"Epoch {epoch}  Loss {total_loss / n_train_samples:.4f}")

In [None]:
# Report the parameter vector left in `learned_parameters` by the training loop.
print(f"Learned Parameters:\n {learned_parameters}")

In [None]:
# Average per-sample loss on the held-out split. loss_fn returns a batch mean,
# so each batch is weighted by its length; the final division by the sample
# count is then correct for whatever `batch_size` is currently set.
n_test_samples = len(test_targets)
total_loss = 0.0
for batch_ixs in utils.gen_batches(n_test_samples, batch_size):
    # Batch-specific names so generic globals such as `features` are not
    # rebound to a single mini-batch.
    batch_features = test_features[batch_ixs]
    batch_targets = test_targets[batch_ixs]
    predictions = model_fn(batch_features, learned_parameters)
    loss = loss_fn(batch_targets, predictions)
    total_loss += loss * len(batch_targets)

print(f"Average test loss: {total_loss / n_test_samples}")

In [None]:
# Data points with the learned regression line drawn on top.
fig, ax = plt.subplots()
ax.plot(X[:, 0], y, "o")
ax.set_xlabel(r"$X_1$", fontsize=15)
ax.set_ylabel("y", fontsize=15, rotation=0)

# Evaluate the model on an evenly spaced grid; the added axis turns the grid
# into a single-column 2-D array like the training features.
X_new = np.linspace(-3, 3, 1000)[:, np.newaxis]
y_new = model_fn(X_new, learned_parameters)

ax.plot(X_new[:, 0], y_new)
ax.grid()

## Example using a real data set

In [None]:
# Load the diabetes data as plain arrays (return_X_y=True, as_frame=False).
# NOTE(review): predict() is just X @ theta with no intercept term. If these
# targets are not mean-centred, both fits below will be biased -- confirm, and
# consider adding a constant column to the features.
features, targets = datasets.load_diabetes(
    scaled=True, as_frame=False, return_X_y=True
)

In [None]:
# Display the feature array returned by load_diabetes.
features

In [None]:
# Display the target array returned by load_diabetes.
targets

### Train-test split

In [None]:
# Hold out 10% of the rows for testing; the shared `prng` drives the shuffle.
diabetes_splits = model_selection.train_test_split(
    features, targets, test_size=0.1, random_state=prng
)
train_features, test_features, train_targets, test_targets = diabetes_splits

### Analytic Solution

In [None]:
# Closed-form fit on the training split. This rebinds `learned_parameters`,
# replacing the value left by the earlier training loop.
learned_parameters = linear_regression(train_features, train_targets)

In [None]:
# Training-split loss of the closed-form fit. loss_fn wraps mse_loss, which
# returns half the MSE, so the printed value is sqrt(MSE / 2), not the RMSE.
train_predictions = predict(train_features, learned_parameters)
training_loss = loss_fn(train_targets, train_predictions)
print(f"Training loss: {np.sqrt(training_loss)}")

In [None]:
# Test-split loss of the closed-form fit, reported on the same scale as the
# training loss: the square root of half the MSE.
test_predictions = predict(test_features, learned_parameters)
test_loss = loss_fn(test_targets, test_predictions)
print(f"Test loss: {np.sqrt(test_loss)}")

### Using Stochastic Gradient Descent

In [None]:
# initialize weights: one coefficient per feature column
_, n = train_features.shape
learned_parameters = prng.normal(loc=0, scale=1, size=(n,))

learning_rate = 0.01
batch_size = 32
epochs = 1000
log_epochs = 100

n_train_samples = len(train_targets)

for epoch in range(epochs):

    # Running sum of per-sample losses for this epoch. loss_fn returns the
    # mean over a batch, so each batch loss is weighted by the batch length
    # (the final batch from gen_batches may be shorter than batch_size).
    # Summing unweighted batch means and dividing by the sample count would
    # under-report the loss by roughly a factor of batch_size.
    total_loss = 0.0
    for batch_ixs in utils.gen_batches(n_train_samples, batch_size):
        # Batch-specific names: rebinding `features` here would overwrite the
        # full data set loaded earlier and break a re-run of the split cell.
        batch_features = train_features[batch_ixs]
        batch_targets = train_targets[batch_ixs]

        # forward pass
        predictions = model_fn(batch_features, learned_parameters)
        loss = loss_fn(batch_targets, predictions)
        total_loss += loss * len(batch_targets)

        # backward pass: in-place gradient step on the parameter vector
        grad = grad_fn(batch_features, batch_targets, predictions)
        learned_parameters -= grad * learning_rate

    if epoch % log_epochs == 0:
        print(f'Epoch {epoch}  Loss {total_loss / n_train_samples:.4f}')

In [None]:
# Training-split loss of the gradient-descent fit. loss_fn wraps mse_loss,
# which returns half the MSE, so the printed value is sqrt(MSE / 2).
train_predictions = model_fn(train_features, learned_parameters)
training_loss = loss_fn(train_targets, train_predictions)
print(f"Training loss: {np.sqrt(training_loss)}")

In [None]:
# Test-split loss of the gradient-descent fit, reported on the same scale as
# the training loss: the square root of half the MSE.
test_predictions = model_fn(test_features, learned_parameters)
test_loss = loss_fn(test_targets, test_predictions)
print(f"Test loss: {np.sqrt(test_loss)}")

### Exercise

Compare the training loss and the testing loss. Is the model underfitting or overfitting? How can you tell?