In [4]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler


### Synthetic Data Generation
Generate 100 data points with `x` drawn uniformly between 0 and 1. Add Gaussian noise to the generated data to mimic real-world scenarios.

In [5]:
# Ground-truth intercept and slope of the underlying line, and the
# number of samples to generate.
true_b, true_w = 1, 2
N = 100

# Seed the global NumPy RNG so the synthetic data set is reproducible.
np.random.seed(0)
# Features: N uniform draws from [0, 1), stored as an (N, 1) column.
x = np.random.rand(N, 1)
# Gaussian noise with standard deviation 0.1, one value per sample.
epsilon = 0.1 * np.random.randn(N, 1)
# Noisy linear targets: y = b + w * x + noise.
y = (true_b + true_w * x) + epsilon


### Train-Validation-Test Split
Splitting is always done before any preprocessing to avoid data leakage.

In [6]:
# Shuffle the sample indices so the split is random. np.random.shuffle
# draws from the global NumPy RNG, so the resulting split depends on the
# RNG state at the time this cell runs.
idx = np.arange(N)
np.random.shuffle(idx)

# Derive the split point from N (80% train / 20% validation) instead of
# hard-coding it, so the ratio still holds if N is changed.
# Integer arithmetic keeps this exact; it evaluates to 80 when N = 100.
n_train = N * 8 // 10

# First 80% of the shuffled indices for the training data
train_idx = idx[:n_train]
# Remaining 20% of the shuffled indices for the validation data
val_idx = idx[n_train:]

# Build the training and validation sets
x_train, y_train = x[train_idx], y[train_idx]
x_val, y_val = x[val_idx], y[val_idx]



### Initialization W, b

In [8]:
# Draw the starting intercept (b) and slope (w) from a standard normal
# distribution. Re-seeding here makes the starting point reproducible;
# b is drawn first, then w.
np.random.seed(42)
b, w = np.random.randn(1), np.random.randn(1)

print("Initial b: ", b)
print("Initial w: ", w)

Initial b:  [0.49671415]
Initial w:  [-0.1382643]


### Compute Model's Predictions

In [9]:
# Linear model: prediction for input(s) x given slope w and intercept b
def predict(x, w, b):
    """Return the linear model output ``b + w * x``.

    Parameters
    ----------
    x : input value(s); anything that supports ``*`` and ``+`` with
        ``w`` and ``b`` (plain numbers or NumPy arrays).
    w : slope (weight) of the line.
    b : intercept (bias) of the line.

    Returns
    -------
    The result of ``b + w * x``.
    """
    return b + w * x

# Predictions on the training set using the current w and b
y_hat = predict(x_train, w, b)

# Residuals: prediction minus target
error = y_hat - y_train

# Mean squared error over all training samples
loss = np.square(error).mean()
print("Initial loss: ", loss)

Initial loss:  2.5521473884714103


### Loss surface

In [12]:
# 101 evenly spaced candidate values spanning +/-3 around the true
# intercept and around the true slope.
b_range, w_range = (
    np.linspace(centre - 3, centre + 3, 101) for centre in (true_b, true_w)
)

# 2-D grids holding every (b, w) combination of the candidates above
bs, ws = np.meshgrid(b_range, w_range)
bs.shape, ws.shape

((101, 101), (101, 101))

In [14]:
# Predictions for every training sample at every (b, w) grid point.
# Viewing x_train as (n_samples, 1, 1) lets it broadcast against the 2-D
# bs/ws grids, yielding one prediction grid per sample with shape
# (n_samples, *bs.shape). This is a single vectorised expression, so there
# is no Python-level loop over rows and no lambda argument shadowing the
# global `x`.
all_predictions = bs + ws * x_train.reshape(-1, 1, 1)
all_predictions.shape

(80, 101, 101)

In [16]:
# Give the training targets two trailing singleton axes, i.e. shape
# (n_samples, 1, 1), so they can broadcast against a stack of 2-D grids.
all_labels = np.reshape(y_train, (-1, 1, 1))
all_labels.shape

(80, 1, 1)

In [None]:
# Residuals for every training sample at every (b, w) grid point; the
# labels broadcast across the two grid axes.
all_errors = all_predictions - all_labels

# Averaging the squared residuals over axis 0 (the sample axis) leaves one
# MSE value per (b, w) grid point: the loss surface.
all_losses = (all_errors ** 2).mean(axis=0)
# Display the shape as the cell output.
all_losses.shape

### Compute Gradient Descent
The gradient descent algorithm iteratively updates the values of w and b until we reach an optimal solution. Each update uses the partial derivatives of the loss function (MSE) with respect to w and b.
$$\text{MSE} = \frac{1}{n}\sum_{i=1}^{n}(\hat{y}_{i} - y_i)^2$$
$$\text{where } \hat{y}_{i} = wx_{i} + b$$
$$\frac{\partial{MSE}}{\partial{b}} =  \frac{2}{n}\sum_{i=1}^{n}(\hat{y}_{i} - y_i)$$

$$\frac{\partial{MSE}}{\partial{w}} = \frac{2}{n}\sum_{i=1}^{n} x_{i}  (\hat{y}_{i} - y_i)$$

In [18]:
# Partial derivatives of the MSE with respect to b and w:
#   dMSE/db = 2 * mean(error)
#   dMSE/dw = 2 * mean(x * error)
b_grad = 2 * np.mean(error)
w_grad = 2 * np.mean(x_train * error)
print("b_grad: ", b_grad)
print("w_grad: ", w_grad)

b_grad:  -2.952298488769403
w_grad:  -1.6541516719074536


### Update Parameters w and b
$$ b = b - \eta \frac{\partial{MSE}}{\partial{b}}$$
$$ w = w - \eta \frac{\partial{MSE}}{\partial{w}}$$
$$\text{where } \eta \text{ is the learning rate.}$$

In [20]:
# Learning rate (step size) for the update
lr = 0.1
print('orginal b: ', b)
print('orginal w: ', w)

# One gradient-descent step: move each parameter against its gradient.
# NOTE: this cell is not idempotent. It overwrites b and w, so running it
# again without first recomputing b_grad / w_grad applies the same (stale)
# gradients a second time. Re-run from the initialisation cell to get a
# reproducible result.
b, w = b - lr * b_grad, w - lr * w_grad

print('Updated b: ', b)
print('Updated w: ', w)

orginal b:  [0.791944]
orginal w:  [0.02715087]
Updated b:  [1.08717385]
Updated w:  [0.19256603]


### Normalize Data

Use the mean and standard deviation computed from the training data to scale both the training and validation data. 
Normalization is done after splitting the data, and the scaler is fit on the training set only, to avoid data leakage. 

In [None]:
# Learn the mean and standard deviation from the training features only
# (fit returns the fitted scaler itself, so the calls can be chained).
scaler = StandardScaler(with_mean=True, with_std=True).fit(x_train)

# Apply the training-set statistics to both the training and validation data
scaled_x_train = scaler.transform(x_train)
scaled_x_val = scaler.transform(x_val)