# Gradient Descent

## 1. Load data

In [1]:
import numpy as np

X_train = np.array([[1, 1], [2, 2], [2, 9], [1, 14]])
X_train.shape #m, n

y_train = np.array([3, 4, 10, 13])
y_train.shape #m, 

X_val = np.array([[1.5, 2], [2, 3], [1, 9], [1, 13]])
X_val.shape #m, n

y_val = np.array([3, 4, 9.5, 13.5])
y_val.shape #m, 

X_test = np.array([[2, 1], [1, 2], [2, 8], [2, 13]])
X_test.shape #m, n

y_test = np.array([2, 5, 9, 14])
y_test.shape #m, 

# assert X_train.shape[1] == X_test.shape[1]


(4,)

## 2. Modeling

Imagine you do:
1. imputation - cleaning
2. scaling

### 2.1 Definition

In [2]:
# for definitions
m_train, n_train = X_train.shape
m_val, n_val     = X_val.shape
m_test, n_test   = X_test.shape
num_epochs       = 50
theta            = np.zeros(  (n_train,   )  )
lr               = 0.001
old_val_loss     = np.infty
tolerance        = 0.001

### 2.2 Model

In [3]:
def mse(yhat, ytrue):
    return ( (yhat - ytrue) ** 2  ).sum() / yhat.shape[0]

In [4]:
#1. loop according to epoch
for i in range(num_epochs):  #0, 1, 2, 3, 4
    
    #2. predict
    yhat_train = X_train @ theta # (m, n) x (n, 1) = (m, 1)  #<---same shape as y 
    
    #3. gradient
    #X^T (h - y)
    grad = (X_train.T @ (yhat_train - y_train) ) / m_train  #(n, m) @ (m, 1) = (n, 1)
    
    #4. update
    theta = theta - lr * grad  # (n, 1) - (1) (n, 1) = (n, 1)
    
    train_loss =  mse(yhat_train, y_train)

    #validation loss
    #1. take the current theta, and do prediction with the validation set
    yhat_val = X_val @ theta
    #2. calculate the loss with y_val
    val_loss = mse(yhat_val, y_val)
    #3. finish
    
    #########early stopping###################
    #1. if new val_loss is very close to old val_loss by 0.001, you stop everything ok!
    if np.abs(old_val_loss - val_loss) < tolerance:
        print(f'Stopped at epoch {i} - :-)')
        break
    #2. otherwise continue
    old_val_loss = val_loss
    
    print(f"Epoch: {i+1}  : {train_loss=:.3f} : {val_loss=:.3f}")

Epoch: 1  : train_loss=73.500 : val_loss=64.652
Epoch: 2  : train_loss=63.614 : val_loss=56.237
Epoch: 3  : train_loss=55.096 : val_loss=48.952
Epoch: 4  : train_loss=47.755 : val_loss=42.644
Epoch: 5  : train_loss=41.428 : val_loss=37.180
Epoch: 6  : train_loss=35.977 : val_loss=32.444
Epoch: 7  : train_loss=31.278 : val_loss=28.338
Epoch: 8  : train_loss=27.229 : val_loss=24.777
Epoch: 9  : train_loss=23.740 : val_loss=21.687
Epoch: 10  : train_loss=20.732 : val_loss=19.004
Epoch: 11  : train_loss=18.140 : val_loss=16.673
Epoch: 12  : train_loss=15.906 : val_loss=14.648
Epoch: 13  : train_loss=13.980 : val_loss=12.886
Epoch: 14  : train_loss=12.320 : val_loss=11.354
Epoch: 15  : train_loss=10.889 : val_loss=10.019
Epoch: 16  : train_loss=9.656 : val_loss=8.856
Epoch: 17  : train_loss=8.592 : val_loss=7.842
Epoch: 18  : train_loss=7.675 : val_loss=6.956
Epoch: 19  : train_loss=6.884 : val_loss=6.183
Epoch: 20  : train_loss=6.202 : val_loss=5.507
Epoch: 21  : train_loss=5.614 : val_los

## 3. Testing

In [5]:
theta

array([0.20266781, 0.95531753])

In [6]:
yhat_test = X_test @ theta
mse       = mse(  yhat_test , y_test  )
print(f"{mse=:.2f}")

assert mse > 100

mse=2.76


AssertionError: 

## 4. Inference

In [None]:
X_train[0], y_train[0], X_train[1], y_train[1]

In [None]:
#1. randomly create a sample
chaky_test_case = np.array([ [1.5, 0.5] ])
assert len(chaky_test_case.shape) == 2  #m, n

#2. predict
predict_chaky_test_case = chaky_test_case @ theta
predict_chaky_test_case_int = float(predict_chaky_test_case)
print(f"Answer:  {predict_chaky_test_case_int: .2f}.  Does it satisfy you?")

#3. check with your instinct / ask expert whether is correct

## 5. Feature importance

In [None]:
theta

In [None]:
#assume you have standardize your feature
#feature 2 is more important
    # 0.30 / (0.04 + 0.30)
    
#feature 1 is less important
    # 0.04 / (0.04 + 0.030)