## Mini-Batch Gradient Descent

Recall the **batch gradient descent** update rule:

$$\theta_j := \theta_j - \alpha * \sum_{i=1}^m(h_\theta(x^{(i)}) - y^{(i)})x_j^{(i)} \tag{for every $j$}$$  

While this uses the exact gradient, computing it over **all samples** for every update can be slow.  Instead, we can estimate the gradient from a **mini-batch** of samples:

$$\frac{\partial J}{\partial \theta_j} = \sum_{i=\text{start}}^{\text{start}+\text{batchsize}-1}(h_\theta(x^{(i)})-y^{(i)})x_j^{(i)}$$

$$\theta_j := \theta_j - \alpha * \sum_{i=\text{start}}^{\text{start}+\text{batchsize}-1}(h_\theta(x^{(i)}) - y^{(i)})x_j^{(i)} \tag{for every $j$}$$

## Previous code

In [1]:
from sklearn.datasets import load_diabetes
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
from time import time

# Load the diabetes regression dataset and show which features it contains.
diabetes = load_diabetes()
print("Features: ", diabetes.feature_names)
X, y = diabetes.data, diabetes.target
m, n = X.shape  # m = number of samples, n = number of features

# Optional polynomial transformation (left disabled):
# X   = PolynomialFeatures(degree = 3, include_bias=False).fit_transform(X)

# Hold out 30% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Learn the scaling statistics on the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Prepend a column of ones so that theta[0] acts as the intercept.
# (np.insert(X, 0, 1, axis=1) would achieve the same thing.)
intercept = np.ones((len(X_train), 1))
X_train = np.hstack((intercept, X_train))
intercept = np.ones((len(X_test), 1))
X_test = np.hstack((intercept, X_test))

Features:  ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']


## Extras:

- Class
- Early stopping
- Cross validation
- Epoch - one epoch is one full pass over all the training data (followed here by one validation pass) for a single model.

## Let's start

In [19]:
from sklearn.model_selection import KFold

#Class
class LinearRegression:
    """Linear regression fitted with batch or mini-batch gradient descent.

    Training runs inside a cross-validation loop: for every fold the
    parameters are re-initialised to zeros, trained on the fold's training
    part and scored (MSE) on its validation part.  After ``fit`` returns,
    ``theta`` holds the parameters learned on the LAST fold and
    ``kfold_scores`` holds the final validation MSE of every fold.

    The design matrix is expected to already contain the intercept column
    as its first column (see ``_bias`` / ``_coef``).
    """

    # Default splitter used when the caller does not supply ``cv``.
    kfold = KFold(n_splits=5)

    def __init__(self, lr=0.001, num_epochs=500, bs=50, method='batch', cv=kfold):
        """Store the hyper-parameters.

        lr         -- learning rate (step size of each update)
        num_epochs -- maximum number of passes over the training data per fold
        bs         -- mini-batch size, only used when ``method == 'mini'``
        method     -- ``'mini'`` for mini-batch updates; any other value
                      performs one full-batch update per epoch
        cv         -- splitter object exposing ``split(X)`` that yields
                      ``(train_idx, val_idx)`` pairs
        """
        self.lr         = lr
        self.num_epochs = num_epochs
        self.bs         = bs
        self.method     = method
        self.cv         = cv

    def mse(self, ytrue, ypred):
        """Return the mean squared error between ``ytrue`` and ``ypred``.

        Both arrays have shape (m, ), where m is the number of samples.
        """
        return ((ypred - ytrue) ** 2).sum() / ytrue.shape[0]

    def fit(self, X_train, y_train):
        """Train with cross-validation and early stopping.

        X_train -- design matrix of shape (m, n)
        y_train -- targets of shape (m, )

        Prints the final validation MSE of each fold and stores them in
        ``self.kfold_scores``.  Returns None.
        """
        # Work on plain arrays so that indexing with the fold indices is
        # always positional.
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)

        # Final validation loss of every fold.
        self.kfold_scores = []

        for fold, (train_idx, val_idx) in enumerate(self.cv.split(X_train)):

            X_cross_train = X_train[train_idx]
            y_cross_train = y_train[train_idx]
            X_cross_val   = X_train[val_idx]
            y_cross_val   = y_train[val_idx]

            # Every fold starts from a fresh model ...
            self.theta = np.zeros(X_cross_train.shape[1])

            # ... and from a fresh early-stopping baseline, so the last loss
            # of the previous fold cannot influence this one.
            # (np.inf instead of np.infty: the latter was removed in NumPy 2.0.)
            self.val_loss_old = np.inf

            # Loss of the untrained model; keeps val_loss_new defined even
            # when num_epochs is 0.
            val_loss_new = self.mse(y_cross_val, self.predict(X_cross_val))

            for _ in range(self.num_epochs):

                # Shuffle so that the sample order does not impact the model.
                perm = np.random.permutation(X_cross_train.shape[0])
                X_cross_train = X_cross_train[perm]
                y_cross_train = y_cross_train[perm]

                if self.method == 'mini':
                    # One parameter update per consecutive slice of bs samples.
                    for batch_idx in range(0, X_cross_train.shape[0], self.bs):
                        batch_end = batch_idx + self.bs
                        self._train(X_cross_train[batch_idx:batch_end, :],
                                    y_cross_train[batch_idx:batch_end])
                else:
                    # One parameter update using all training samples.
                    self._train(X_cross_train, y_cross_train)

                val_loss_new = self.mse(y_cross_val, self.predict(X_cross_val))

                # Early stopping: quit once the validation loss is (within
                # np.allclose's default tolerances) unchanged since the
                # previous epoch.
                if np.allclose(val_loss_new, self.val_loss_old):
                    break
                self.val_loss_old = val_loss_new

            self.kfold_scores.append(val_loss_new)
            print(f"Fold {fold}: {val_loss_new}")

    def _train(self, X, y):
        """Perform one gradient-descent update and return the pre-update MSE.

        X     -- shape (m, n)
        y     -- shape (m, )
        theta -- shape (n, )
        """
        # 1. predict with the current parameters
        yhat = self.predict(X)

        # 2. gradient averaged over the samples: (n, m) @ (m, ) ===> (n, )
        m = X.shape[0]
        grad = (1/m) * X.T @ (yhat - y)

        # 3. update
        self.theta = self.theta - self.lr * grad

        # The loss is computed from the predictions made BEFORE the update.
        return self.mse(y, yhat)

    def predict(self, X):
        """Return the predictions ``X @ theta``: (m, n) @ (n, ) = (m, )."""
        return X @ self.theta

    def _coef(self):
        """Return the feature weights, i.e. theta without its first entry."""
        return self.theta[1:]

    def _bias(self):
        """Return the intercept term, i.e. the first entry of theta."""
        return self.theta[0]

## Testing

In [20]:
# Constructor signature, for reference:
#   __init__(self, lr=0.001, num_epochs=500, bs=50, method='batch', cv=kfold)
# Only `method` is overridden here, selecting mini-batch updates; every other
# argument keeps its default.
model = LinearRegression(method="mini")

In [21]:
# Train using the model's cross-validation splitter; fit() prints the final
# validation MSE of each fold.
model.fit(X_train, y_train)

Fold 0: 3267.5006451756576
Fold 1: 2996.5820136565644
Fold 2: 3426.267245483871
Fold 3: 3496.341573552392
Fold 4: 3705.522536380533


In [22]:
# Predict on the held-out test set (X_test already carries the intercept column).
yhat = model.predict(X_test)

In [23]:
# mse() is declared as mse(ytrue, ypred): pass the ground truth first so the
# call matches the signature.
mse = model.mse(y_test, yhat)

In [25]:
# Report the mean squared error measured on the test set.
print("Test MSE:", mse)

Test MSE: 3147.2398366221414
