# Linear regression in 4 acts

In [None]:
import numpy as np
import matplotlib.pyplot as plt

### Create data set

In [None]:
# Seed NumPy's global random generator so the synthetic data and the initial
# parameter guess drawn below are the same on every run.
np.random.seed(123)

In [None]:
# Inputs: 25 draws from a standard normal, stored as float32.
xs = np.random.normal(loc=0.0, scale=1.0, size=25).astype(np.float32)
# Targets: the line y = 3x - 1 plus Gaussian noise (standard deviation 0.3),
# drawn with the same shape as the inputs and also stored as float32.
ys = (3 * xs - 1) + np.random.normal(scale=0.3, size=xs.shape).astype(np.float32)

In [None]:
plt.scatter(xs, ys)
pass

#### Initial parameter guess

In [None]:
# Starting point for every gradient-descent run: two uniform draws from [0, 1).
θ0 = np.random.random(size=2)
θ0

### Analytic solution

From linear algebra, we know that the analytic *least squares* solution can be found by projecting $y$ onto the column space of $X$, which gives the normal equations

$$
\hat{\theta} = (X^TX)^{-1}X^T y
$$

where $X$ is the matrix of inputs augmented with a column of $\mathbf{1}$s to represent the intercept or bias.

#### Solving normal equations directly

In [None]:
# Design matrix: a leading column of ones (intercept) next to the inputs.
X = np.column_stack((np.ones_like(xs), xs))
# Solve (XᵀX) θ = Xᵀy as a linear system rather than forming the inverse.
# The solution follows X's column order: intercept first, then slope.
b, w = np.linalg.solve(X.T @ X, X.T @ ys)
print(f'w = {w:.2f}, b = {b:.2f}')

#### Using library function

In [None]:
# lstsq returns (solution, residuals, rank, singular values). The solution is
# ordered like the columns of X, i.e. intercept first, then slope.
# rcond=None selects NumPy's machine-precision-based cutoff for small
# singular values.
(b, w), resid, rank, s = np.linalg.lstsq(X, ys, rcond=None)
print(f'w = {w:.2f}, b = {b:.2f}')

In [None]:
plt.scatter(xs, ys)
plt.plot(xs, w*xs + b, c='red')
pass

### Using `numpy` and `scipy`

In [None]:
from scipy.optimize import approx_fprime

This uses a numerical finite-difference approximation to get the gradient. The other gradient-based solutions (JAX, TensorFlow and PyTorch) use automatic differentiation.

In [None]:
def model(θ, x):
    """Evaluate the straight line ``w*x + b``.

    θ is unpacked as ``(w, b)``: slope first, then intercept. Note this is
    the reverse of the ``(b, w)`` order returned by the analytic solutions.
    x may be a scalar or an array; the arithmetic is applied as-is.
    """
    slope, intercept = θ
    return slope*x + intercept

In [None]:
def loss_numpy(θ, x, y):
    """Mean squared error of the linear model's predictions against y.

    θ holds the (slope, intercept) pair passed on to ``model``; x and y are
    the inputs and targets.
    """
    residual = model(θ, x) - y
    return np.mean(residual**2)

In [None]:
def update_numpy(θ, x, y, lr, eps):
    """Take one gradient-descent step and return the new parameters.

    The gradient of ``loss_numpy`` at θ is estimated by SciPy's
    ``approx_fprime`` with finite-difference step ``eps``; ``x`` and ``y``
    are forwarded to the loss as extra arguments. ``lr`` scales the step.
    """
    grad_estimate = approx_fprime(θ, loss_numpy, eps, x, y)
    return θ - lr*grad_estimate

In [None]:
def train_numpy(θ, x, y, n_iter=100, lr=0.1, eps=0.01):
    """Run ``n_iter`` gradient-descent steps from θ and return the result.

    Each step calls ``update_numpy`` with learning rate ``lr`` and
    finite-difference step ``eps``. The θ passed in is not modified; every
    step produces a new array.
    """
    # The step counter is never used, only the number of repetitions.
    for _ in range(n_iter):
        θ = update_numpy(θ, x, y, lr, eps)
    return θ

In [None]:
%%time

# Fit with the finite-difference gradient, starting from the initial guess θ0.
θ = train_numpy(θ0, x=xs, y=ys)
# θ is ordered (slope, intercept), matching what `model` unpacks.
w, b = θ[0], θ[1]
print(f'w = {w:.2f}, b = {b:.2f}')

In [None]:
plt.scatter(xs, ys)
plt.plot(xs, model(θ, xs), c='red')
pass

### Using `jax`

In [None]:
import jax
import jax.numpy as jnp

In [None]:
def loss_jax(θ, x, y):
    """Mean squared error of the linear model, computed with ``jax.numpy``.

    Same quantity as the NumPy loss, but using ``jnp.mean`` so the function
    can be differentiated by ``jax.grad``.
    """
    residual = model(θ, x) - y
    return jnp.mean(residual**2)

In [None]:
@jax.jit
def update_jax(θ, x, y, lr):
    """Take one gradient-descent step using JAX automatic differentiation.

    ``jax.grad`` differentiates ``loss_jax`` with respect to its first
    argument (θ). The whole step is compiled with ``jax.jit``.
    """
    grad_fn = jax.grad(loss_jax)
    gradient = grad_fn(θ, x, y)
    return θ - lr * gradient

In [None]:
def train_jax(θ, x, y, n_iter=100, lr=0.1):
    """Run ``n_iter`` jitted gradient-descent steps from θ and return the result.

    Each step calls ``update_jax`` with learning rate ``lr``.
    """
    # The step counter is never used, only the number of repetitions.
    for _ in range(n_iter):
        θ = update_jax(θ, x, y, lr)
    return θ

In [None]:
%%time

# Fit with JAX gradients, starting from the same initial guess θ0.
θ = train_jax(θ0, x=xs, y=ys)
# θ is ordered (slope, intercept), matching what `model` unpacks.
w, b = θ[0], θ[1]
print(f'w = {w:.2f}, b = {b:.2f}')

In [None]:
plt.scatter(xs, ys)
plt.plot(xs, model(θ, xs), c='red')
pass

### Using `tensorflow` and `keras`

In [None]:
import tensorflow as tf

In [None]:
def custom_loss(yhat, y):
    """Mean squared error between two tensors, averaged over every element.

    If this is handed to Keras as a loss, Keras calls it as
    ``loss(y_true, y_pred)``, so ``yhat`` would receive the targets and ``y``
    the predictions. The squared difference is symmetric, so the result is
    the same either way.
    """
    squared_error = (yhat - y)**2
    return tf.reduce_mean(squared_error)

In [None]:
# One Dense unit with a linear activation: yhat = kernel*x + bias.
# Sequential expects a *list* of layers. Older tf.keras wrapped a bare layer
# silently, but newer Keras releases iterate over this argument, so a single
# layer must be passed inside a list.
model_keras = tf.keras.Sequential([
    tf.keras.layers.Dense(1, input_shape=[1,], activation='linear'),
])

In [None]:
model_keras.compile(
    # Plain SGD with learning rate 0.1, the same value the NumPy, JAX and
    # PyTorch versions in this notebook use.
    optimizer=tf.optimizers.SGD(learning_rate=0.1),
    # Built-in mean squared error.
    loss='mse'
    # Alternative: pass the hand-written `custom_loss` function instead:
    # loss = custom_loss
)

In [None]:
# Train for 100 epochs with progress output disabled (verbose=0). The 25
# samples fit inside Keras's default batch size of 32, so each epoch is a
# single full-batch gradient step.
history = model_keras.fit(xs, ys, epochs=100, verbose=0)

In [None]:
# Fitted parameters of the Dense layer as plain NumPy arrays: [kernel, bias].
# The kernel has shape (1, 1) and the bias shape (1,), so stacking them with
# np.array() is a ragged conversion that recent NumPy versions reject.
# get_weights() returns them as a list instead.
model_keras.layers[0].get_weights()

In [None]:
plt.scatter(xs, ys)
plt.plot(xs, model_keras.predict(xs), c='red')
pass

### Using PyTorch

In [None]:
import torch
from torch import optim, nn

In [None]:
# A single affine layer mapping one input feature to one output: yhat = w*x + b.
model_torch = nn.Sequential(nn.Linear(1, 1))

In [None]:
# Plain SGD over the model's weight and bias, learning rate 0.1.
optimizer = optim.SGD(params=model_torch.parameters(), lr=0.1)

In [None]:
# Mean squared error, averaged over all elements (the default reduction, spelled out).
loss_torch = nn.MSELoss(reduction='mean')

You might want to code your own loss function that returns a Torch tensor, for example:

```python
def loss_torch(yhat, y): 
    return torch.mean((yhat - y)**2) 
```

In [None]:
# Copy the NumPy data into tensors and append a feature axis, (25,) -> (25, 1),
# so each sample is a row with one feature, as nn.Linear expects.
xs_torch = torch.tensor(xs)[:, None]
ys_torch = torch.tensor(ys)[:, None]

#### Inputs have shape batch_size × num_features

In [None]:
xs_torch.shape, ys_torch.shape

In [None]:
for epoch in range(100):
    # Clear the gradients accumulated by the previous step first; PyTorch
    # adds to .grad on every backward() call.
    optimizer.zero_grad()

    # Forward pass on the full data set and its mean squared error.
    yhat = model_torch(xs_torch)
    loss = loss_torch(yhat, ys_torch)

    # Backpropagate, then apply one SGD update to weight and bias.
    loss.backward()
    optimizer.step()

In [None]:
model_torch.state_dict()

In [None]:
plt.scatter(xs, ys)
plt.plot(xs, model_torch(xs_torch).detach(), c='red')
pass