# Gradient Descent

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from ipywidgets import interactive, fixed

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error

from sklearn.preprocessing import add_dummy_feature

In [None]:
# Shared, seeded random number generator used throughout the notebook
rng = np.random.RandomState(seed=2)

## Read in dataset

In [None]:
import os

# Choose the base directory according to where the notebook is running
if 'google.colab' not in str(get_ipython()):
    # Not on Colab: paths are relative to the current working directory
    base_dir = "."
else:
    # On Colab: mount Google Drive so that the notebooks folder is reachable
    from google.colab import drive
    drive.mount('/content/drive')
    # You may need to change this, depending on where your notebooks are on Google Drive
    base_dir = "./drive/My Drive/Colab Notebooks/"
dataset_dir = os.path.join(base_dir, "datasets")

In [None]:
# Load the housing data from "housing.csv" inside the datasets directory
df = pd.read_csv(os.path.join(dataset_dir, "housing.csv"))

## Split into training set and test set

In [None]:
# Hold out 20% of the rows as the test set; the shared seeded rng drives the split
train, test = train_test_split(df, test_size=0.2, random_state=rng)

In [None]:
# Columns used as predictors; "SalePrice" is the value being predicted
features = ["BasementArea", "GroundFloorArea", "Bedrooms", "Condition"]

X_train, y_train = train[features], train["SalePrice"]
X_test, y_test = test[features], test["SalePrice"]

## Linear Regression using Scikit-Learn - reminder: this solves the least-squares problem directly (giving the same solution as the Normal Equation) rather than iteratively

In [None]:
# Create an (as yet unfitted) linear regression model
linear_model = LinearRegression()

In [None]:
# Fit the model on the unscaled training features
linear_model.fit(X_train, y_train)

In [None]:
# Display the learned bias (intercept) and the weights (coefficients)
linear_model.intercept_, linear_model.coef_

In [None]:
# Mean absolute error of the fitted model on the test set.
# scikit-learn metrics take (y_true, y_pred) in that order. MAE is symmetric, so the
# value is the same either way, but the conventional order avoids a wrong answer if
# this metric is later replaced by one that is not symmetric.
mean_absolute_error(y_test, linear_model.predict(X_test))

## Gradient Descent - features must be scaled

In [None]:
# Learn the scaling from the training set only, then apply it to both sets
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Add the dummy feature column to each matrix (the loss function J assumes all 1s in the first column)
X_train_scaled, X_test_scaled = (
    add_dummy_feature(X_train_scaled),
    add_dummy_feature(X_test_scaled),
)

### Batch Gradient Descent

In [None]:
def J(X, y, params):
    """Return the OLS regression loss: half the mean squared error of X.dot(params) against y.

    Assumes X contains all 1s in its first column, so that params[0] acts as the bias.
    """
    residuals = X.dot(params) - y
    return np.mean(residuals ** 2) / 2.0

In [None]:
def batch_gradient_descent(X, y, alpha, num_iterations):
    """Fit linear-regression parameters by batch gradient descent.

    The parameters start from random values drawn from the module-level rng.
    Every iteration uses the whole of X and y to take one step with learning
    rate alpha.

    Returns a tuple (params, Jvals), where Jvals[k] is the loss J measured
    straight after update k (counting from 0).
    """
    num_examples, num_features = X.shape
    weights = rng.standard_normal(num_features)
    loss_history = np.zeros(num_iterations)
    step_size = alpha / num_examples

    for step in range(num_iterations):
        errors = X.dot(weights) - y
        weights -= step_size * X.T.dot(errors)
        loss_history[step] = J(X, y, weights)

    return weights, loss_history

In [None]:
# Fit by Batch Gradient Descent on the scaled training data
params, Jvals = batch_gradient_descent(
    X_train_scaled, y_train, alpha=0.03, num_iterations=200
)

# Show the fitted parameters (bias and weights)
params

(Above, we used the Normal Equation on unscaled data, whereas here we ran BGD on scaled data, so the two sets of parameters differ.)

In [None]:
def plot_loss(Jvals):
    """Show a scatter plot of the loss values in Jvals against the iteration number (starting at 1)."""
    _, axes = plt.subplots(figsize=(20,8))
    iteration_numbers = np.linspace(1, Jvals.size, Jvals.size)
    # Draw on the axes created above
    sns.scatterplot(x=iteration_numbers, y=Jvals, ax=axes)
    axes.set_title("J during learning")
    axes.set_ylabel("J")
    axes.set_xlabel("Number of iterations")
    plt.show()

In [None]:
# Plot the BGD loss, with every value capped at 1.75e10
# (presumably so that very large early losses do not squash the rest of the curve - confirm)
plot_loss(np.minimum(Jvals, 1.75e10))

Below is an interactive version - we can see the effect of scaling and we can play with the learning rate. (The crashes/nonsensical answers are deliberate!)

In [None]:
def bgd(X, y, scale=True, alpha=0.03):
    """Run Batch Gradient Descent for 3000 iterations, print the parameters and plot the loss.

    Used as the callback of the interactive widget.

    X, y  -- features and target values
    scale -- if True, standardize the features with StandardScaler first
    alpha -- learning rate passed to batch_gradient_descent
    """
    # Scale the data, if requested
    if scale:
        X = StandardScaler().fit_transform(X)
    # Add the extra column to X
    X = add_dummy_feature(X)
    # Run the Batch Gradient Descent
    params, Jvals = batch_gradient_descent(X, y, alpha, num_iterations=3000)
    # Display bias and weights
    print("Parameters: ", params)
    # Reuse the shared plotting helper instead of repeating its body here
    plot_loss(Jvals)
    
# Build the widget around bgd: X and y are held fixed, scale and alpha become controls
interactive_plot = interactive(
    bgd,
    {'manual': True},
    X=fixed(X_train),
    y=fixed(y_train),
    scale=True,
    alpha=[
        ("0.00009", 0.00009),
        ("0.0009", 0.0009),
        ("0.009", 0.009),
        ("0.09", 0.09),
        ("0.9", 0.9),
    ],
)
interactive_plot

### Stochastic Gradient Descent

In [None]:
def stochastic_gradient_descent(X, y, alpha, num_epochs):
    """Fit linear-regression parameters by stochastic gradient descent.

    Each epoch visits every example once, in a fresh random order, and updates
    the parameters after each single example with learning rate alpha. All
    randomness (initial parameters and the visiting order) comes from the
    module-level rng.

    X          -- feature matrix with all 1s in its first column
    y          -- target values (array or pandas Series)
    alpha      -- learning rate
    num_epochs -- number of passes over the data

    Returns a tuple (params, Jvals), where Jvals[t] is the loss J over the
    whole of X and y straight after update number t (counting from 0).
    """
    # Use a plain array for y so that examples are always selected by position
    y = np.asarray(y)

    m, n = X.shape
    params = rng.standard_normal(n)
    Jvals = np.zeros(num_epochs * m)

    for epoch in range(num_epochs):
        perm = rng.permutation(m)
        for k, i in enumerate(perm):
            x_i = X[i:i+1]
            y_i = y[i:i+1]
            params -= alpha * x_i.T.dot(x_i.dot(params) - y_i)
            # Index by the number of updates made so far (k), not by the example
            # index (i); otherwise the loss history is out of chronological order
            Jvals[epoch * m + k] = J(X, y, params)

    return params, Jvals

In [None]:
# Fit by Stochastic Gradient Descent on the scaled training data
params, Jvals = stochastic_gradient_descent(
    X_train_scaled, y_train, alpha=0.003, num_epochs=200
)

# Show the fitted parameters
params

In [None]:
# Plot at most the first 10000 recorded SGD loss values, each capped at 1.75e10
plot_loss(np.minimum(Jvals[:10000], 1.75e10))

### SGD with Simulated Annealing

In [None]:
def learning_schedule(t):
    """Return the learning rate to use at step t, computed as 5 / (t + 50)."""
    numerator = 5
    offset = 50
    return numerator / (t + offset)

In [None]:
def sgd_simulated_annealing(X, y, alpha, num_epochs):
    """Fit linear-regression parameters by SGD with a decaying learning rate.

    Works like plain stochastic gradient descent, except that the learning rate
    for each update is given by learning_schedule(t), where t is the number of
    updates made so far. All randomness (initial parameters and the visiting
    order) comes from the module-level rng.

    X          -- feature matrix with all 1s in its first column
    y          -- target values (array or pandas Series)
    alpha      -- ignored: the learning rate always comes from learning_schedule
                  (the parameter is kept so that existing calls still work)
    num_epochs -- number of passes over the data

    Returns a tuple (params, Jvals), where Jvals[t] is the loss J over the
    whole of X and y straight after update number t (counting from 0).
    """
    # Use a plain array for y so that examples are always selected by position
    y = np.asarray(y)

    m, n = X.shape
    params = rng.standard_normal(n)
    Jvals = np.zeros(num_epochs * m)

    for epoch in range(num_epochs):
        perm = rng.permutation(m)
        for k, i in enumerate(perm):
            # Number of updates made so far; this (not the example index i) must
            # drive the schedule, so that the learning rate shrinks steadily
            t = epoch * m + k
            step_size = learning_schedule(t)
            x_i = X[i:i+1]
            y_i = y[i:i+1]
            params -= step_size * x_i.T.dot(x_i.dot(params) - y_i)
            Jvals[t] = J(X, y, params)

    return params, Jvals

In [None]:
# Fit by Stochastic Gradient Descent with Simulated Annealing on the scaled training data
# (the function replaces alpha with the value from learning_schedule on every update)
params, Jvals = sgd_simulated_annealing(
    X_train_scaled, y_train, alpha=0.003, num_epochs=200
)

# Show the fitted parameters
params

In [None]:
# Plot at most the first 10000 recorded loss values, each capped at 1.75e10
plot_loss(np.minimum(Jvals[:10000], 1.75e10))

### Mini-Batch Gradient Descent

In [None]:
def mini_batch_gradient_descent(X, y, alpha, num_epochs, batch_size):
    """Fit linear-regression parameters by mini-batch gradient descent.

    Each epoch shuffles the examples and splits them into m // batch_size
    batches of batch_size examples (any leftover examples are skipped for that
    epoch). The parameters are updated once per batch, using the average
    gradient over that batch. All randomness (initial parameters and the
    shuffling) comes from the module-level rng.

    X          -- feature matrix with all 1s in its first column
    y          -- target values (array or pandas Series)
    alpha      -- learning rate
    num_epochs -- number of passes over the data
    batch_size -- number of examples per batch (must be at least 1)

    Returns a tuple (params, Jvals), where Jvals[t] is the loss J over the
    whole of X and y straight after update number t (counting from 0).

    Raises ValueError if batch_size is less than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Use a plain array for y so that examples are always selected by position
    y = np.asarray(y)

    m, n = X.shape
    num_batches = m // batch_size
    params = rng.standard_normal(n)
    Jvals = np.zeros(num_epochs * num_batches)

    for epoch in range(num_epochs):
        perm = rng.permutation(m)
        for b in range(num_batches):
            # Select only this batch's examples (not the whole shuffled dataset)
            indices = perm[b * batch_size:(b + 1) * batch_size]
            X_batch = X[indices]
            y_batch = y[indices]
            # Average the gradient over the batch, hence batch_size rather than m
            params -= (alpha / batch_size) * X_batch.T.dot(X_batch.dot(params) - y_batch)
            Jvals[epoch * num_batches + b] = J(X, y, params)

    return params, Jvals

In [None]:
# Fit by Mini-Batch Gradient Descent on the scaled training data
params, Jvals = mini_batch_gradient_descent(
    X_train_scaled, y_train, alpha=0.003, num_epochs=200, batch_size=32
)

# Show the fitted parameters
params

In [None]:
# Plot at most the first 10000 recorded loss values, each capped at 1.75e10
plot_loss(np.minimum(Jvals[:10000], 1.75e10))