# Linear Regression and Gradient Descent
*Víctor Acevedo Vitvitskaya*

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Synthetic data set: 100 points with x drawn uniformly from [0, 2) and
# y = 4 + 3x plus standard-normal noise.
X = np.random.rand(100, 1) * 2
y = 3 * X + 4 + np.random.randn(100, 1)

Vamos a testear la solución para Beta de MCO (mínimos cuadrados ordinarios).

In [None]:
# Prepend a column of ones so the first parameter acts as the intercept.
# The row count is taken from X instead of being hard-coded.
X_b = np.c_[np.ones((len(X), 1)), X]

In [None]:
# Normal equation for ordinary least squares: (X_b^T X_b) theta = X_b^T y.
# Solving the linear system directly avoids forming the explicit inverse,
# which is slower and less accurate when X_b^T X_b is ill-conditioned.
theta_best = np.linalg.solve(X_b.T.dot(X_b), X_b.T.dot(y))

In [None]:
theta_best  # Expected values are 4 and 3, the intercept and slope used to generate y.

Realizamos una predicción

In [None]:
# Two query points, x = 0 and x = 2, stored as a 2x1 column.
X_new = np.array([0, 2]).reshape(2, 1)

In [None]:
X_new  # notebook display of the two query points

In [None]:
# Add the intercept column of ones to the query points; the row count
# follows X_new instead of being hard-coded.
X_new_b = np.c_[np.ones((len(X_new), 1)), X_new]

In [None]:
# Predictions of the fitted linear model at the query points.
y_predict = np.dot(X_new_b, theta_best)

In [None]:
# Fitted line through the two predicted points (solid red).
plt.plot(X_new, y_predict, "r-")
# Training data as blue dots.
plt.plot(X, y, "b.")
# Fix the view to x in [0, 2] and y in [0, 15].
plt.axis([0, 2, 0, 15])
plt.show()

## Batch Gradient Descent
- It uses the whole batch of training data at every step
- As a result, it is terribly slow on very large training sets
- However, it is much faster than using normal equations with thousands of features 

In [None]:
# Learning rate for batch gradient descent; also try 0.02, 0.1 and 0.5.
eta = 0.3
# 1000 update steps over a training set of 100 samples.
n_iterations, m = 1000, 100

In [None]:
theta = np.random.randn(2,1)  # Random initialization of the 2x1 parameter vector
theta  # notebook display of the starting values

In [None]:
# One column per iteration: row 0 will hold theta[0], row 1 theta[1].
theta_historical = np.zeros((2, n_iterations))

In [None]:
# Batch gradient descent: every step uses the whole training set.
for iteration in range(n_iterations):
    # Gradient of the MSE cost with respect to theta: (2/m) * X_b^T (X_b theta - y).
    gradients = 2/m * X_b.T.dot(X_b.dot(theta) - y)
    theta = theta - eta * gradients
    # Record both parameters for this step. theta is a (2, 1) column, so its
    # single column is stored as a 1-D vector; assigning a one-element array
    # to a scalar slot (theta[0]) is deprecated since NumPy 1.25.
    theta_historical[:, iteration] = theta[:, 0]
print(theta)

In [None]:
# History of theta[0] (blue dots) against the value 4 used to generate y.
plt.plot(theta_historical[0], "b.")
plt.axhline(y=4, color='b', linestyle='-')
# History of theta[1] (green dots) against the value 3 used to generate y.
plt.plot(theta_historical[1], 'g.',)
plt.axhline(y=3, color='g', linestyle='-')
plt.show()

In [None]:
theta_path_bgd = []  # filled by plot_gradient_descent when passed as theta_path

def plot_gradient_descent(theta, eta, theta_path=None, n_iterations=1000):
    """Run batch gradient descent from ``theta`` and plot the first fitted lines.

    Draws the training points (module-level ``X`` and ``y``) on the current
    axes, then performs ``n_iterations`` full-batch updates using the
    module-level ``X_b`` and ``y``. Before each of the first 10 updates the
    current model line over ``X_new`` is drawn: dashed red for the initial
    parameters, solid blue afterwards.

    Args:
        theta: Initial parameter array. The caller's array is not modified;
            each update rebinds a new local array.
        eta: Learning rate; also shown in the plot title.
        theta_path: Optional list. When given, the parameters obtained after
            every update are appended to it.
        n_iterations: Number of gradient steps. Defaults to 1000, the value
            that was previously hard-coded.

    Returns:
        None. The function only draws on the current matplotlib axes and
        optionally fills ``theta_path``.
    """
    m = len(X_b)
    plt.plot(X, y, "b.")
    for iteration in range(n_iterations):
        # Only the first 10 model lines are drawn to keep the figure readable.
        if iteration < 10:
            y_predict = X_new_b.dot(theta)
            style = "b-" if iteration > 0 else "r--"
            plt.plot(X_new, y_predict, style)
        # Gradient of the MSE cost: (2/m) * X_b^T (X_b theta - y).
        gradients = 2/m * X_b.T.dot(X_b.dot(theta) - y)
        theta = theta - eta * gradients
        if theta_path is not None:
            theta_path.append(theta)
    plt.xlabel("$x_1$", fontsize=18)
    plt.axis([0, 2, 0, 15])
    plt.title(r"$\eta = {}$".format(eta), fontsize=16)

# Fixed seed so all three panels start from the same reproducible theta.
np.random.seed(42)
theta = np.random.randn(2, 1)  # random initialization

plt.figure(figsize=(10, 4))

# Left panel: eta = 0.02.
plt.subplot(131)
plot_gradient_descent(theta, eta=0.02)
plt.ylabel("$y$", rotation=0, fontsize=18)

# Middle panel: eta = 0.1; the parameter path is recorded in theta_path_bgd.
plt.subplot(132)
plot_gradient_descent(theta, eta=0.1, theta_path=theta_path_bgd)

# Right panel: eta = 0.5.
plt.subplot(133)
plot_gradient_descent(theta, eta=0.5)

plt.show()
print("The dashed line shows the first iteration")

## Stochastic Gradient Descent
- SGD is faster than BGD because it uses only one observation at each step instead of all data points. A new observation is drawn at random at every step; a round of m such steps is called an epoch.
- When the cost function is irregular, SGD has a better chance of reaching the global optimum than BGD, because its randomness helps it escape from local optima.
- When using SGD, data scientists gradually decrease the learning rate from large values to small values. The function that determines the learning rate at each step is known as the learning schedule.
- The learning schedule function uses the epoch and iteration values as inputs. 

In [None]:
# Seed NumPy's global RNG so the stochastic run is reproducible.
np.random.seed(42)
m = len(X_b)  # number of training samples
theta_path_sgd = []  # receives theta after every SGD step

In [None]:
n_epochs = 50

# Learning-schedule hyperparameters: eta(t) = t0 / (t + t1).
t0 = 5
t1 = 50


def learning_schedule(t):
    """Return the learning rate for step ``t`` as ``t0 / (t + t1)``.

    ``t0`` and ``t1`` are read from module scope at call time.
    """
    return t0 / (t1 + t)

# Random starting point for the 2x1 parameter vector.
theta = np.random.randn(2, 1)

for epoch in range(n_epochs):
    for i in range(m):
        # Draw the current model line for the first 20 steps of the first
        # epoch only; the initial, untrained line is dashed red.
        if epoch == 0 and i < 20:
            y_predict = np.dot(X_new_b, theta)
            style = "r--" if i == 0 else "b-"
            plt.plot(X_new, y_predict, style)
        # Pick one training example at random (with replacement); the
        # length-1 slices keep xi and yi two-dimensional.
        random_index = np.random.randint(m)
        xi = X_b[random_index:random_index + 1]
        yi = y[random_index:random_index + 1]
        # Squared-error gradient for that single example.
        gradients = 2 * np.dot(xi.T, np.dot(xi, theta) - yi)
        # The learning rate decays with the global step count.
        eta = learning_schedule(epoch * m + i)
        theta = theta - eta * gradients
        theta_path_sgd.append(theta)

# Overlay the training data (blue dots) on the model lines drawn during SGD,
# label the axes and fix the view to x in [0, 2], y in [0, 15].
plt.plot(X, y, "b.")
plt.xlabel("$x_1$", fontsize=18)
plt.ylabel("$y$", rotation=0, fontsize=18)
plt.axis([0, 2, 0, 15])

plt.show()

## Mini Batch Gradient Descent
- Instead of computing gradients based on one observation, as SGD does, MBGD uses a small random sample of observations (a mini-batch) at each iteration.

In [None]:
# Mini-batch gradient descent settings.
minibatch_size = 20  # observations per gradient step
n_iterations = 50    # used as the number of passes over the data
theta_path_mgd = []  # receives theta after every mini-batch step

# Reproducible random starting point for the 2x1 parameter vector.
np.random.seed(42)
theta = np.random.randn(2, 1)

# Learning-schedule hyperparameters: eta(t) = t0 / (t + t1).
# t0 is 200 (previously 10) because the gradient below is now averaged over
# the mini-batch instead of summed. With 20-sample batches the update
# eta * gradients is unchanged up to rounding: 200 * (2 / 20) == 10 * 2.
t0, t1 = 200, 1000


def learning_schedule(t):
    """Return the learning rate for step ``t`` as ``t0 / (t + t1)``.

    ``t0`` and ``t1`` are read from module scope at call time.
    """
    return t0 / (t + t1)


t = 0  # global mini-batch step counter fed to the learning schedule
for epoch in range(n_iterations):  # n_iterations counts passes over the data
    # Reshuffle the training set so each pass sees different mini-batches.
    shuffled_indices = np.random.permutation(m)
    X_b_shuffled = X_b[shuffled_indices]
    y_shuffled = y[shuffled_indices]
    for i in range(0, m, minibatch_size):
        t += 1
        xi = X_b_shuffled[i:i+minibatch_size]
        yi = y_shuffled[i:i+minibatch_size]
        # Mean MSE gradient over the mini-batch, consistent with the 2/m
        # factor used for batch gradient descent. len(xi) also covers a
        # shorter final batch when m is not a multiple of minibatch_size.
        gradients = 2 / len(xi) * xi.T.dot(xi.dot(theta) - yi)
        eta = learning_schedule(t)
        theta = theta - eta * gradients
        theta_path_mgd.append(theta)

In [None]:
# Turn the three recorded parameter paths into NumPy arrays so they can be
# sliced per parameter when plotting.
theta_path_bgd, theta_path_sgd, theta_path_mgd = (
    np.array(path)
    for path in (theta_path_bgd, theta_path_sgd, theta_path_mgd)
)

In [None]:
# Compare the three optimizers in parameter space: theta_0 on the x-axis,
# theta_1 on the y-axis, one marker per recorded step.
plt.figure(figsize=(7,4))
plt.plot(theta_path_sgd[:, 0], theta_path_sgd[:, 1], "r-s", linewidth=1, label="Stochastic")
plt.plot(theta_path_mgd[:, 0], theta_path_mgd[:, 1], "g-+", linewidth=2, label="Mini-batch")
plt.plot(theta_path_bgd[:, 0], theta_path_bgd[:, 1], "b-o", linewidth=3, label="Batch")
plt.legend(loc="upper left", fontsize=16)
plt.xlabel(r"$\theta_0$", fontsize=20)
plt.ylabel(r"$\theta_1$   ", fontsize=20, rotation=0)
# Restrict the view to theta_0 in [2.5, 4.5] and theta_1 in [2.3, 3.9];
# this window contains (4, 3), the values used to generate y.
plt.axis([2.5, 4.5, 2.3, 3.9])

plt.show()