Week 41.

In [20]:
import numpy as np

#X: input
#y: target
#epochs: number of runs
#M: used for batchsize only for sgd
#learning rate: step size tuning parameter
#lmbda: if ridge lambda used



def linear_GD(degree, X, y, n_epochs,M, learning_rate,lmbda):
    n = len(X)
    H = (2.0 / n) * X.T @ X
    EigValues, EigVectors = np.linalg.eig(H)
    beta = np.random.randn(degree, 1)

    if learning_rate is None:
        learning_rate = 1.0 / np.max(EigValues)

    for i in range(n_epochs):
        gradient = (2.0 / n) * X.T @ (X @ beta - y)
        beta -= learning_rate * gradient

    #print(beta)
    return beta

def ridge_GD(degree, X, y, n_epochs,M, learning_rate, lmbda):
    n = len(X)
    beta = np.random.randn(degree, 1)

    if learning_rate is None:
        learning_rate = 0.1

    for i in range(n_epochs):
        gradients = 2.0 / n * X.T @ (X @ beta - y) + 2 * lmbda * beta
        beta -= learning_rate * gradients

    #print(beta)
    return beta

def linear_momentum_GD(degree, X, y, n_epochs,M, learning_rate,lmbda):
    n = len(X)
    H = (2.0 / n) * X.T @ X
    EigValues, EigVectors = np.linalg.eig(H)
    theta = np.random.randn(degree, 1)

    if learning_rate is None:
        learning_rate = 1.0 / np.max(EigValues)

    change = 0.0
    delta_momentum = 0.99

    for i in range(n_epochs):
        gradients = (2.0 / n) * X.T @ (X @ theta - y)
        
        new_change = learning_rate * gradients + delta_momentum * change
       
        
        theta -= new_change
        
        change = new_change

    #print(theta)
    return theta

def ridge_momentum_GD(degree, X, y, n_epochs,M, learning_rate,lmbda):
    n = len(X)
    theta = np.random.randn(degree, 1)

    if learning_rate is None:
        learning_rate = 0.1

    change = 0.0
    delta_momentum = 0.3

    for iter in range(n_epochs):
        gradients = 2.0 / n * X.T @ (X @ (theta) - y) + 2 * lmbda * theta
        #rint( lmbda)
        new_change = learning_rate * gradients + delta_momentum * change
        #rint(learning_rate)
        #print(delta_momentum)
        #rint(change)
        theta -= new_change
        change = new_change

    #print(theta)
    return theta

def SGD_linear(degree, X, y, n_epochs, M, t0,t1, lmbda):
    n = len(X)
    beta = np.random.randn(degree, 1)
    def learning_schedule(t):
            return t0 / (t + t1)
       
    for epoch in range(n_epochs):
        indices = np.random.permutation(n)
        for i in range(0, n, M):
            X_mini = X[indices[i:i + M]]
            y_mini = y[indices[i:i + M]]
            learning_rate =learning_schedule(epoch * M + i) 
            gradients = (2.0 / M) * X_mini.T @ ((X_mini @ beta) - y_mini)
            beta -= learning_rate * gradients

    return beta

def SGD_ridge(degree, X, y, n_epochs, M, t0,t1, lmbda):
    n = len(X)
    beta = np.random.randn(degree, 1)
    def learning_schedule(t):
            return t0 / (t + t1)
   
    for epoch in range(n_epochs):
        indices = np.random.permutation(n)
        for i in range(0, n, M):
            X_mini = X[indices[i:i + M]]
            y_mini = y[indices[i:i + M]]
            learning_rate =learning_schedule(epoch * M + i)
            gradients = (2.0 / M) * X_mini.T @ ((X_mini @ beta) - y_mini) + 2 * lmbda * beta
            beta -= learning_rate * gradients

    return beta





def SGD_momentum_linear(degree, X, y, n_epochs, M, t0,t1, lmbda):
    n = len(X)
    theta = np.random.randn(degree, 1)
  
    delta_momentum = 0.3
   
    change = 0.0
    delta_momentum = 0.03

    def learning_schedule(t):
            return t0 / (t + t1)
        
    for epoch in range(n_epochs):
        indices = np.random.permutation(n)
        for i in range(0, n, M):
            X_mini = X[indices[i:i + M]]
            y_mini = y[indices[i:i + M]]
            gradients = (2.0 / M) * X_mini.T @ ((X_mini @ theta) - y_mini)
            eta=learning_schedule(epoch * M + i)
            new_change = eta * gradients + delta_momentum * change
            theta = theta - new_change
            change = new_change

    return theta

def SGD_momentum_ridge(degree, X, y, n_epochs, M, t0,t1, lmbda):
    n = len(X)
    theta = np.random.randn(degree, 1)
  
    delta_momentum = 0.3
    
    change = 0.0
    delta_momentum = 0.03
    delta = 1e-8

    def learning_schedule(t):
            return t0 / (t + t1)
        
    for epoch in range(n_epochs):
        indices = np.random.permutation(n)
        for i in range(0, n, M):
            X_mini = X[indices[i:i + M]]
            y_mini = y[indices[i:i + M]]
            gradients = (2.0 / M) * X_mini.T @ ((X_mini @ theta) - y_mini)+ 2 * lmbda * theta
            eta=learning_schedule(epoch * M + i)
            new_change = eta * gradients + delta_momentum * change
            theta = theta - new_change
            change = new_change

    return theta

def Adagrad_linear_momentum_SD(degree, X, y, n_epochs, M, learning_rate, lmbda):
    n = len(X)
    theta = np.random.randn(degree, 1)
    delta_momentum = 0.03
    change = 0.0
    t0, t1 = 5, 50
    delta = 1e-8
    Giter = 0.0
    
    for epoch in range(n_epochs):
        indices = np.random.permutation(n)
        Giter=0
        for i in range(0, n, M):
            X_mini = X[i:i + M]
            y_mini = y[i:i + M]
            gradients = (2.0 / M) * X_mini.T @ ((X_mini @ theta) - y_mini)
            Giter += gradients * gradients
            eta = learning_rate
            new_change = gradients * eta / (delta + np.sqrt(Giter)) + delta_momentum * change
            theta -= new_change
            change = new_change

    return theta

def Adagrad_ridge_momentum_SD(degree, X, y, n_epochs, M, learning_rate, lmbda):
    n = len(X)
    theta = np.random.randn(degree, 1)
    delta_momentum = 0.03
    change = 0.0
    t0, t1 = 5, 50
    delta = 1e-8
    Giter = 0.0
    
    for epoch in range(n_epochs):
        Giter=0
        indices = np.random.permutation(n)
        for i in range(0, n, M):
            X_mini = X[i:i + M]
            y_mini = y[i:i + M]
            gradients = (2.0 / M) * X_mini.T @ ((X_mini @ theta) - y_mini) + 2 * lmbda * theta
            Giter += gradients * gradients
            eta = learning_rate
            new_change = gradients * eta / (delta + np.sqrt(Giter)) + delta_momentum * change
            theta -= new_change
            change = new_change

    return theta








def Adagrad_ridge_SD(degree,X, y, n_iterations,m,learning_rate, lmbda ):
    m, n = X.shape
    theta = np.random.randn(degree, 1)
    G = 0.0
    epsilon=1e-8
    for iteration in range(n_iterations):
        G=0
        random_index = np.random.randint(m)
        for i in range(m):
            
            xi = X[random_index:random_index+1]
            yi = y[random_index:random_index+1]
            gradients = 2 * xi.T.dot(xi.dot(theta) - yi)+2 * lmbda * theta 
            G += gradients ** 2
            theta -= learning_rate * gradients / (np.sqrt(G) + epsilon)
    
    return theta


def Adagrad_linear_SD(degree,X, y, n_iterations,m,learning_rate, lmbda ):
    m, n = X.shape
    theta = np.random.randn(degree, 1)
    G = 0.0
    epsilon=1e-8
    for iteration in range(n_iterations):
        G=0
        random_index = np.random.randint(m)
        for i in range(m):
            
            xi = X[random_index:random_index+1]
            yi = y[random_index:random_index+1]
            gradients = 2 * xi.T.dot(xi.dot(theta) - yi)
            G += gradients ** 2
            theta -= learning_rate * gradients / (np.sqrt(G) + epsilon)
    
    return theta

def Adagrad_linear_GD(degree, X, y, n_iterations, M, learning_rate, lmbda):
    m, n = X.shape
    theta = np.random.randn(degree, 1)
    G = None
    epsilon=1e-1
    
    for iteration in range(n_iterations):
        gradients = 2/m * X.T.dot(X.dot(theta) - y)
        if G is None:
            G = np.zeros((gradients.shape[0], gradients.shape[0]))
        G += np.diag(gradients @ gradients.T)    
        G_t_inverse = 1 / (
            epsilon + np.sqrt(np.reshape(np.diagonal(G), (G.shape[0], 1)))
        )
        theta = theta - learning_rate * gradients * G_t_inverse
       
    return theta


def Adagrad_ridge_GD(degree, X, y, n_iterations, M, learning_rate, lmbda):
    m, n = X.shape
    theta = np.random.randn(degree, 1)
    G = 0.0
    epsilon=1e-8
    for iteration in range(n_iterations):
        gradients = 2/m * X.T.dot(X.dot(theta) - y)+ 2 * lmbda * theta
        G += gradients ** 2
        theta -= learning_rate * gradients / (np.sqrt(G) + epsilon)
       
    return theta




def Adagrad_linear_momentum_GD(degree, X, y, n_iterations, M, learning_rate, lmbda):
    m, n = X.shape
    theta = np.random.randn(degree, 1)
    G = 0.0
    change = 0.0
    delta = 1e-8
    delta_momentum = 0.6
    
    for iteration in range(n_iterations):
        gradients = 2 / m * X.T.dot(X.dot(theta) - y)
        G += gradients * gradients
        eta = learning_rate
        new_change = eta / (delta + np.sqrt(G)) * gradients + delta_momentum * change
        theta -= new_change
        change = new_change
        
    return theta

def Adagrad_ridge_momentum_GD(degree, X, y, n_iterations, M, learning_rate, lmbda):
    m, n = X.shape
    theta = np.random.randn(degree, 1)
    G = 0.0
    change = 0.0
    delta = 1e-8
    delta_momentum = 0.6
    
    for iteration in range(n_iterations):
        gradients = 2 / m * X.T.dot(X.dot(theta) - y) + 2 * lmbda * theta
        G += gradients * gradients
        eta = learning_rate
        new_change = eta / (delta + np.sqrt(G)) * gradients + delta_momentum * change
        theta -= new_change
        change = new_change
       
    return theta



def RMSprop_linear(degree, X, y, n_epochs, M, learning_rate, lmbda):
    m, n = X.shape
    n = len(X)
    theta = np.random.randn(degree, 1)
    G_squared = 0.0
    delta = 1e-8
    rho = 0.99
    eta = learning_rate
   
    for epoch in range(n_epochs):
        random_index = np.random.randint(m)
        for i in range(0, n, M):
            X_mini = X[i:i + M]
            y_mini = y[i:i + M]
            gradients = (2.0 / M) * X_mini.T @ ((X_mini @ theta) - y_mini)
            G_squared = rho * G_squared + (1 - rho) * gradients * gradients
            update = gradients * learning_rate / (np.sqrt(G_squared) + delta)
            theta -= update

    return theta

def RMSprop_ridge(degree, X, y, n_epochs, M, learning_rate, lmbda):
    m, n = X.shape
    n = len(X)
    theta = np.random.randn(degree, 1)
    G_squared = 0.0
    delta = 1e-8
    rho = 0.99
    eta = learning_rate
    
    for epoch in range(n_epochs):
        random_index = np.random.randint(m)
        for i in range(0, n, M):
            X_mini = X[i:i + M]
            y_mini = y[i:i + M]
            gradients = (2.0 / M) * X_mini.T @ ((X_mini @ theta) - y_mini) + 2 * lmbda * theta
            G_squared = rho * G_squared + (1 - rho) * gradients * gradients
            update = gradients * learning_rate / (np.sqrt(G_squared) + delta)
            theta -= update

    return theta

def Adam_ridge(degree, X, y, n_epochs, M, learning_rate, lmbda):
    m, n = X.shape
    n = len(X)
    theta = np.random.randn(degree, 1)
    iter=0.0
    delta = 1e-7
    beta1 = 0.9
    beta2 = 0.999

    for epoch in range(n_epochs):
        first_moment = 0.0
        second_moment = 0.0
        iter+=1
        random_index = np.random.randint(m)
        for i in range(0, n, M):
            X_mini = X[i:i + M]
            y_mini = y[i:i + M]
            gradients = (2.0 / M) * X_mini.T @ ((X_mini @ theta) - y_mini) + 2 * lmbda * theta
            first_moment = beta1 * first_moment + (1 - beta1) * gradients
            second_moment = beta2 * second_moment + (1 - beta2) * gradients * gradients
            first_bias_corrected = first_moment / (1 - beta1 ** (iter))
            second_bias_corrected = second_moment / (1 - beta2 ** (iter))
            update = first_bias_corrected * learning_rate / (np.sqrt(second_bias_corrected) + delta)
            theta -= update

    return theta

def Adam_linear(degree, X, y, n_epochs, M, learning_rate, lmbda):
    m, n = X.shape
    n = len(X)
    theta = np.random.randn(degree, 1)
    iter=0.0
    delta = 1e-7
    beta1 = 0.9
    beta2 = 0.999

    for epoch in range(n_epochs):
        first_moment = 0.0
        second_moment = 0.0
        iter+=1
        random_index = np.random.randint(m)
        for i in range(0, n, M):
            X_mini = X[i:i + M]
            y_mini = y[i:i + M]
            gradients = (2.0 / M) * X_mini.T @ ((X_mini @ theta) - y_mini) 
            first_moment = beta1 * first_moment + (1 - beta1) * gradients
            second_moment = beta2 * second_moment + (1 - beta2) * gradients * gradients
            first_bias_corrected = first_moment / (1 - beta1 ** (iter))
            second_bias_corrected = second_moment / (1 - beta2 ** (iter))
            update = first_bias_corrected * learning_rate / (np.sqrt(second_bias_corrected) + delta)
            theta -= update

    return theta



In [13]:
import pandas as pd

np.random.seed(0)
n = 2000
x = 2 * np.random.rand(n, 1)
y = 4 + 3 * x + np.random.randn(n, 1)

X = np.c_[np.ones((n, 1)), x]

t_1_values = [50, 70]  # t_1 values to be tested
t_0_values = [5, 10]  # t_0 values to be tested

M = [5, 10]
num_epochs = [ 60, 200]
ridge_hyperparameters = [ 0.02, 0.03]
results_sgd_linear = []
results_sgd_ridge = []
results_sgd_momentum_linear = []
results_sgd_momentum_ridge = []

# Custom Optimization Functions
optimization_functions = [
    SGD_linear, SGD_ridge, SGD_momentum_linear, SGD_momentum_ridge
]

for func in optimization_functions:
    for t_1 in t_1_values:
        for t_0 in t_0_values:
            for batch_size in M:
                for n_epochs in num_epochs:
                    #print("here")
                    if "SGD_linear" in func.__name__:
                        # Linear functions
                        degree = 2
                        for lmbda in ridge_hyperparameters:
                            beta = func(degree, X, y, n_epochs,batch_size, t_1, t_0, lmbda)
                            results_sgd_linear.append((func.__name__, t_1, t_0,batch_size, n_epochs, lmbda, beta))
                    elif "SGD_ridge" in func.__name__:
                        # Ridge functions
                        degree = 2
                        for lmbda in ridge_hyperparameters:
                            beta = func(degree, X, y, n_epochs,batch_size, t_1, t_0, lmbda)
                            results_sgd_ridge.append((func.__name__, t_1, t_0, batch_size,n_epochs, lmbda, beta))
                    elif "SGD_momentum_linear" in func.__name__:
                        # Momentum Linear functions
                        degree = 2
                        for lmbda in ridge_hyperparameters:
                            beta = func(degree, X, y, n_epochs,batch_size, t_1, t_0, lmbda)
                            results_sgd_momentum_linear.append((func.__name__, t_1, t_0, batch_size,n_epochs, lmbda, beta))
                    elif "SGD_momentum_ridge" in func.__name__:
                        # Momentum Ridge functions
                        degree = 2
                        for lmbda in ridge_hyperparameters:
                            beta = func(degree, X, y, n_epochs,batch_size, t_1, t_0, lmbda)
                            results_sgd_momentum_ridge.append((func.__name__, t_1, t_0, batch_size,n_epochs, lmbda, beta))

# Convert results to dataframes
columns = ["Method", "t_1", "t_0", "Batch Size", "Epochs", "Lambda", "Beta"]
df_sgd_linear = pd.DataFrame(results_sgd_linear, columns=columns)
df_sgd_ridge = pd.DataFrame(results_sgd_ridge, columns=columns)
df_sgd_momentum_linear = pd.DataFrame(results_sgd_momentum_linear, columns=columns)
df_sgd_momentum_ridge = pd.DataFrame(results_sgd_momentum_ridge, columns=columns)


df_sgd_linear = df_sgd_linear.drop_duplicates(subset=["Method", "t_1", "t_0", "Batch Size", "Epochs"], keep="first")
df_sgd_ridge = df_sgd_ridge.drop_duplicates(subset=["Method", "t_1", "t_0", "Batch Size", "Epochs"], keep="first")
df_sgd_momentum_linear= df_sgd_momentum_linear.drop_duplicates(subset=["Method", "t_1", "t_0", "Batch Size", "Epochs"], keep="first")
df_sgd_momentum_ridge = df_sgd_momentum_ridge.drop_duplicates(subset=["Method", "t_1", "t_0", "Batch Size", "Epochs"], keep="first")


df_sgd_linear = df_sgd_linear.drop(columns=[ "Lambda"])

# Drop the second column ('Learning Rate') from 'results_ridge'
df_sgd_momentum_linear = df_sgd_momentum_linear.drop(columns=["Lambda"])

pd.set_option('display.max_rows', None)

from IPython.display import display
display(df_sgd_linear)
display(df_sgd_ridge)
display(df_sgd_momentum_linear)
display(df_sgd_momentum_ridge)







Unnamed: 0,Method,t_1,t_0,Batch Size,Epochs,Beta
0,SGD_linear,50,5,5,60,"[[3.9633033118263667], [2.967630028118333]]"
2,SGD_linear,50,5,5,200,"[[4.017710805421551], [2.992404373035771]]"
4,SGD_linear,50,5,10,60,"[[3.979301179372519], [3.0202331054570206]]"
6,SGD_linear,50,5,10,200,"[[3.9857920037135344], [3.023026354309524]]"
8,SGD_linear,50,10,5,60,"[[4.013787691472411], [3.029523042386755]]"
10,SGD_linear,50,10,5,200,"[[3.990894779710191], [3.0399703250408483]]"
12,SGD_linear,50,10,10,60,"[[3.906175487531253], [2.9481374032515046]]"
14,SGD_linear,50,10,10,200,"[[3.9482095808607434], [2.9978913569813206]]"
16,SGD_linear,70,5,5,60,"[[3.9225581193914496], [3.0125171774953925]]"
18,SGD_linear,70,5,5,200,"[[3.9304844168643647], [3.0415795887905]]"


Unnamed: 0,Method,t_1,t_0,Batch Size,Epochs,Lambda,Beta
0,SGD_ridge,50,5,5,60,0.02,"[[3.8449707732844094], [2.972431281162258]]"
2,SGD_ridge,50,5,5,200,0.02,"[[3.842603372009857], [3.045239525197075]]"
4,SGD_ridge,50,5,10,60,0.02,"[[3.805195577765326], [3.0145903289098803]]"
6,SGD_ridge,50,5,10,200,0.02,"[[3.869132312423813], [3.0732005356572873]]"
8,SGD_ridge,50,10,5,60,0.02,"[[3.930269756225025], [3.0860276942989504]]"
10,SGD_ridge,50,10,5,200,0.02,"[[3.819699506126235], [3.0617671287609354]]"
12,SGD_ridge,50,10,10,60,0.02,"[[3.8128008162985036], [2.997885203089992]]"
14,SGD_ridge,50,10,10,200,0.02,"[[3.8605752895554697], [3.0827026375033095]]"
16,SGD_ridge,70,5,5,60,0.02,"[[3.8729075946924265], [3.0897772556999814]]"
18,SGD_ridge,70,5,5,200,0.02,"[[3.832125959700996], [3.0777375262471143]]"


Unnamed: 0,Method,t_1,t_0,Batch Size,Epochs,Beta
0,SGD_momentum_linear,50,5,5,60,"[[3.9862784802547697], [3.1090812679200335]]"
2,SGD_momentum_linear,50,5,5,200,"[[3.932252190363189], [3.001880384701028]]"
4,SGD_momentum_linear,50,5,10,60,"[[3.9904642429894848], [3.003355724862789]]"
6,SGD_momentum_linear,50,5,10,200,"[[3.94802500738118], [2.997388028020784]]"
8,SGD_momentum_linear,50,10,5,60,"[[3.952601812463265], [2.9781625475727287]]"
10,SGD_momentum_linear,50,10,5,200,"[[3.8657881119991617], [2.8526894163154073]]"
12,SGD_momentum_linear,50,10,10,60,"[[3.971136876564071], [3.014700218133625]]"
14,SGD_momentum_linear,50,10,10,200,"[[3.954326300889665], [2.9819398846726575]]"
16,SGD_momentum_linear,70,5,5,60,"[[4.066118245483852], [3.040655671479316]]"
18,SGD_momentum_linear,70,5,5,200,"[[3.9677829807520903], [3.0233134662978007]]"


Unnamed: 0,Method,t_1,t_0,Batch Size,Epochs,Lambda,Beta
0,SGD_momentum_ridge,50,5,5,60,0.02,"[[3.8001749717783073], [3.057821261764638]]"
2,SGD_momentum_ridge,50,5,5,200,0.02,"[[3.958981517105635], [3.1223402627322345]]"
4,SGD_momentum_ridge,50,5,10,60,0.02,"[[3.7970761367706682], [3.017886281882475]]"
6,SGD_momentum_ridge,50,5,10,200,0.02,"[[3.842059797548236], [3.0696811497290253]]"
8,SGD_momentum_ridge,50,10,5,60,0.02,"[[3.8474010734979345], [3.0472610681695023]]"
10,SGD_momentum_ridge,50,10,5,200,0.02,"[[3.857852510918372], [3.0969891290211504]]"
12,SGD_momentum_ridge,50,10,10,60,0.02,"[[3.8033450265866997], [3.0477587794577317]]"
14,SGD_momentum_ridge,50,10,10,200,0.02,"[[3.829162770222205], [3.046147878224629]]"
16,SGD_momentum_ridge,70,5,5,60,0.02,"[[3.8362739947083844], [3.091950018734409]]"
18,SGD_momentum_ridge,70,5,5,200,0.02,"[[3.7726271043260082], [3.0297562948080117]]"


In [55]:
import pandas as pd

np.random.seed(1)
n = 2000
x = 2*np.random.rand(n,1)
y = 4+3*x+np.random.randn(n,1)

X = np.c_[np.ones((n,1)), x]

# Simulation Setup
learning_rates = [0.01, 0.02, 0.05]
M = [5, 10, 20]
num_epochs = [30,60,200]
ridge_hyperparameters = [0.01, 0.02, 0.03]  # Ridge hyperparameters to be tested

results_linear = []
results_ridge = []
results_sgd_linear = []
results_sgd_ridge = []
results_RMS_linear=[]
results_RMS_ridge=[]
Adam_linear_list=[]
Adam_ridge_list=[]
# Custom Optimization Functions
optimization_functions = [
    linear_GD, ridge_GD, linear_momentum_GD, ridge_momentum_GD, Adagrad_linear_momentum_SD, Adagrad_ridge_momentum_SD,
    Adagrad_linear_GD, Adagrad_ridge_GD, Adagrad_ridge_momentum_GD, Adagrad_linear_momentum_GD,
    Adagrad_linear_SD, Adagrad_ridge_SD, RMSprop_linear, RMSprop_ridge, Adam_ridge, Adam_linear
]

for func in optimization_functions:
    for learning_rate in learning_rates:
        for batch_size in M:
            for n_epochs in num_epochs:
                if "linear" in func.__name__.lower():
                    # Linear functions
                    degree = 2
                    for lmbda in ridge_hyperparameters:
                        beta = func(degree, X, y, n_epochs, batch_size, learning_rate, lmbda)
                        if "GD" in func.__name__.upper():
                            results_linear.append((func.__name__, learning_rate, batch_size, n_epochs, lmbda, beta))
                        
                        # Check for SGD and split based on linear or ridge
                        if "SD" in func.__name__.upper():
                            results_sgd_linear.append((func.__name__, learning_rate, batch_size, n_epochs, lmbda, beta))
                        if "RMSprop_linear" in func.__name__:
                            results_RMS_linear.append((func.__name__, learning_rate, batch_size, n_epochs, lmbda, beta))
                        if "dam" in func.__name__:
                            Adam_linear_list.append((func.__name__, learning_rate, batch_size, n_epochs, lmbda, beta))    
                elif "ridge" in func.__name__.lower():
                    # Ridge functions
                    degree = 2
                    for lmbda in ridge_hyperparameters:
                        beta = func(degree, X, y, n_epochs, batch_size, learning_rate, lmbda)
                        if "GD" in func.__name__.upper():
                            results_ridge.append((func.__name__, learning_rate, batch_size, n_epochs, lmbda, beta))
                        
                        # Check for SGD and split based on linear or ridge
                        if "SD" in func.__name__.upper():
                            results_sgd_ridge.append((func.__name__, learning_rate, batch_size, n_epochs, lmbda, beta))
                        if "RMSprop_ridge" in func.__name__:
                            results_RMS_ridge.append((func.__name__, learning_rate, batch_size, n_epochs, lmbda, beta))
                        if "dam" in func.__name__:
                            Adam_ridge_list.append((func.__name__, learning_rate, batch_size, n_epochs, lmbda, beta))  

In [56]:


# Drop duplicates for each DataFrame
columns = ["Method", "Learning Rate", "Batch Size", "Epochs", "Lambda", "Beta"]
columns2 = ["Method", "Learning Rate", "Batch Size", "N_iter", "Lambda", "Beta"]
results_linear = pd.DataFrame(results_linear, columns=columns2)
results_ridge = pd.DataFrame(results_ridge, columns=columns2)
results_sgd_linear = pd.DataFrame(results_sgd_linear, columns=columns)
results_sgd_ridge = pd.DataFrame(results_sgd_ridge, columns=columns)

Adam_linear_list=pd.DataFrame(Adam_linear_list, columns=columns)
Adam_ridge_list=pd.DataFrame(Adam_ridge_list, columns=columns)
results_RMS_ridge=pd.DataFrame(results_RMS_ridge, columns=columns)
results_RMS_linear= pd.DataFrame(results_RMS_linear, columns=columns)


#results_sgd_ridge = results_sgd_ridge.drop_duplicates(subset=["Method", "Learning Rate", "Batch Size", "Epochs"], keep="first")
results_ridge = results_ridge.drop_duplicates(subset=["Method", "Learning Rate", "Batch Size", "N_iter"], keep="first")
results_sgd_linear = results_sgd_linear.drop_duplicates(subset=["Method", "Learning Rate", "Batch Size", "Epochs"], keep="first")
results_linear = results_linear.drop_duplicates(subset=["Method", "Learning Rate", "Batch Size", "N_iter"], keep="first")

Adam_linear_list = Adam_linear_list.drop_duplicates(subset=["Method", "Learning Rate", "Batch Size", "Epochs"], keep="first")
Adam_ridge_list = Adam_ridge_list.drop_duplicates(subset=["Method", "Learning Rate", "Batch Size", "Epochs"], keep="first")
#results_RMS_ridge = results_RMS_ridge_linear.drop_duplicates(subset=["Method", "Learning Rate", "Batch Size", "Epochs"], keep="first")
results_RMS_linear = results_RMS_linear.drop_duplicates(subset=["Method", "Learning Rate", "Batch Size", "Epochs"], keep="first")


# Assuming 'results_linear' and 'results_ridge' are your DataFrames

# Drop the second column ('Learning Rate') from 'results_linear'
results_linear = results_linear.drop(columns=["Batch Size"])

# Drop the second column ('Learning Rate') from 'results_ridge'
results_ridge = results_ridge.drop(columns=["Batch Size"])

results_linear = results_linear.drop(columns=[ "Lambda"])

Adam_linear_list = Adam_linear_list.drop(columns=[ "Lambda"])
results_RMS_linear = results_RMS_linear.drop(columns=[ "Lambda"])

# Drop the second column ('Learning Rate') from 'results_ridge'
results_sgd_linear = results_sgd_linear.drop(columns=["Lambda"])
#pd.set_option('display.max_rows', None)
# Display unique DataFrames



In [57]:
from IPython.display import display
pd.set_option('display.max_rows', None)
display(results_sgd_ridge)


Unnamed: 0,Method,Learning Rate,Batch Size,Epochs,Lambda,Beta
0,Adagrad_ridge_momentum_SD,0.01,5,30,0.01,"[[3.7243027371496042], [3.2048344070862935]]"
1,Adagrad_ridge_momentum_SD,0.01,5,30,0.02,"[[3.8000287132897737], [3.1123102053638885]]"
2,Adagrad_ridge_momentum_SD,0.01,5,30,0.03,"[[3.664012011092439], [3.198563973362539]]"
3,Adagrad_ridge_momentum_SD,0.01,5,60,0.01,"[[3.7949317899324106], [3.14531660843942]]"
4,Adagrad_ridge_momentum_SD,0.01,5,60,0.02,"[[3.7400843788838416], [3.1628298864643365]]"
5,Adagrad_ridge_momentum_SD,0.01,5,60,0.03,"[[3.6917358859233995], [3.1751637442340392]]"
6,Adagrad_ridge_momentum_SD,0.01,5,200,0.01,"[[3.793349100902964], [3.1466497196068244]]"
7,Adagrad_ridge_momentum_SD,0.01,5,200,0.02,"[[3.740578608173491], [3.1624131975067358]]"
8,Adagrad_ridge_momentum_SD,0.01,5,200,0.03,"[[3.6923114084586213], [3.174678063905463]]"
9,Adagrad_ridge_momentum_SD,0.01,10,30,0.01,"[[3.739284756381836], [3.1851190640081413]]"


In [58]:
from IPython.display import display
pd.set_option('display.max_rows', None)
display(results_ridge)


Unnamed: 0,Method,Learning Rate,N_iter,Lambda,Beta
0,ridge_GD,0.01,30,0.01,"[[3.6039467790772566], [1.1802522463590552]]"
3,ridge_GD,0.01,60,0.01,"[[3.5511720406492233], [2.9394746525256115]]"
6,ridge_GD,0.01,200,0.01,"[[3.0973770913529304], [3.7281157447034734]]"
9,ridge_GD,0.01,30,0.01,"[[2.6076857752955767], [2.675161317859806]]"
12,ridge_GD,0.01,60,0.01,"[[3.8301126532024745], [2.725701161142399]]"
15,ridge_GD,0.01,200,0.01,"[[3.2549830145673586], [3.5971138221979704]]"
18,ridge_GD,0.01,30,0.01,"[[3.0111299547511905], [1.8622137013427065]]"
21,ridge_GD,0.01,60,0.01,"[[3.703254995726009], [2.7238793411436983]]"
24,ridge_GD,0.01,200,0.01,"[[3.161636545863621], [3.674759377877681]]"
27,ridge_GD,0.02,30,0.01,"[[2.3634210894380314], [3.965794009856616]]"


In [59]:
from IPython.display import display
pd.set_option('display.max_rows', None)
display(results_sgd_linear)


Unnamed: 0,Method,Learning Rate,Batch Size,Epochs,Beta
0,Adagrad_linear_momentum_SD,0.01,5,30,"[[3.7850897098630556], [3.1825721292038165]]"
3,Adagrad_linear_momentum_SD,0.01,5,60,"[[3.848593365513687], [3.129108857321472]]"
6,Adagrad_linear_momentum_SD,0.01,5,200,"[[3.8514998234689912], [3.1266629737461225]]"
9,Adagrad_linear_momentum_SD,0.01,10,30,"[[3.7623892646061448], [3.1940778981907973]]"
12,Adagrad_linear_momentum_SD,0.01,10,60,"[[3.7051868361610625], [3.2410814706345095]]"
15,Adagrad_linear_momentum_SD,0.01,10,200,"[[3.7051405001426865], [3.2411194482227534]]"
18,Adagrad_linear_momentum_SD,0.01,20,30,"[[3.9933668724580595], [3.005197312376201]]"
21,Adagrad_linear_momentum_SD,0.01,20,60,"[[3.8340561373719484], [3.1375402563750856]]"
24,Adagrad_linear_momentum_SD,0.01,20,200,"[[3.850380985019992], [3.1240020274285856]]"
27,Adagrad_linear_momentum_SD,0.02,5,30,"[[3.8482841626898843], [3.1284178924060737]]"


In [60]:
from IPython.display import display
pd.set_option('display.max_rows', None)
display(results_linear)

Unnamed: 0,Method,Learning Rate,N_iter,Beta
0,linear_GD,0.01,30,"[[1.4967084655883012], [3.4002864094999143]]"
3,linear_GD,0.01,60,"[[1.9983564858622405], [4.171906912733179]]"
6,linear_GD,0.01,200,"[[3.8953293104080977], [3.092270341824712]]"
9,linear_GD,0.01,30,"[[1.7661135450663128], [2.8975952245756567]]"
12,linear_GD,0.01,60,"[[3.914192510598586], [2.6810572972580777]]"
15,linear_GD,0.01,200,"[[3.578019543160852], [3.356310327416904]]"
18,linear_GD,0.01,30,"[[2.409068151891326], [2.6223434535091985]]"
21,linear_GD,0.01,60,"[[3.345014241907303], [3.393807000814135]]"
24,linear_GD,0.01,200,"[[3.8953152887418354], [3.092266647234594]]"
27,linear_GD,0.02,30,"[[3.522841318597587], [2.9302745873149507]]"


In [61]:

from IPython.display import display
pd.set_option('display.max_rows', None)
display(Adam_linear_list)


Unnamed: 0,Method,Learning Rate,Batch Size,Epochs,Beta
0,Adam_linear,0.01,5,30,"[[3.855482044825529], [3.0949414194912257]]"
3,Adam_linear,0.01,5,60,"[[3.8420603888445], [3.0829990608705837]]"
6,Adam_linear,0.01,5,200,"[[3.807815252683023], [3.0513875736214877]]"
9,Adam_linear,0.01,10,30,"[[3.8532480790620807], [3.1213809538449304]]"
12,Adam_linear,0.01,10,60,"[[3.8434615967522814], [3.1127653229677317]]"
15,Adam_linear,0.01,10,200,"[[3.815822679695934], [3.077680770965496]]"
18,Adam_linear,0.01,20,30,"[[3.892222477719393], [3.1127785123865355]]"
21,Adam_linear,0.01,20,60,"[[3.88451838805774], [3.1123211897957472]]"
24,Adam_linear,0.01,20,200,"[[3.856997986956328], [3.0968416791373365]]"
27,Adam_linear,0.02,5,30,"[[3.8189622498437332], [3.0614379107730554]]"


In [62]:
from IPython.display import display
pd.set_option('display.max_rows', None)
display(Adam_ridge_list)


Unnamed: 0,Method,Learning Rate,Batch Size,Epochs,Lambda,Beta
0,Adam_ridge,0.01,5,30,0.01,"[[3.7955738417531792], [3.1162049216597874]]"
3,Adam_ridge,0.01,5,60,0.01,"[[3.7820351739298688], [3.10436626576662]]"
6,Adam_ridge,0.01,5,200,0.01,"[[3.747288832196175], [3.07340357352215]]"
9,Adam_ridge,0.01,10,30,0.01,"[[3.794064114939862], [3.1420347385394125]]"
12,Adam_ridge,0.01,10,60,0.01,"[[3.784304071419007], [3.133351142322772]]"
15,Adam_ridge,0.01,10,200,0.01,"[[3.7565085968138963], [3.0984496995853656]]"
18,Adam_ridge,0.01,20,30,0.01,"[[3.831318254003926], [3.1347869560388384]]"
21,Adam_ridge,0.01,20,60,0.01,"[[3.823731504273365], [3.1341342180360625]]"
24,Adam_ridge,0.01,20,200,0.01,"[[3.7963961093213086], [3.1183656766338927]]"
27,Adam_ridge,0.02,5,30,0.01,"[[3.758633253307296], [3.083182783910883]]"


In [63]:
from IPython.display import display
pd.set_option('display.max_rows', None)
display(results_RMS_linear)


Unnamed: 0,Method,Learning Rate,Batch Size,Epochs,Beta
0,RMSprop_linear,0.01,5,30,"[[3.802085838032411], [3.0636220823621705]]"
3,RMSprop_linear,0.01,5,60,"[[3.8020858380324123], [3.0636220823621696]]"
6,RMSprop_linear,0.01,5,200,"[[3.802085838032411], [3.0636220823621705]]"
9,RMSprop_linear,0.01,10,30,"[[3.817693963351405], [3.07213861810337]]"
12,RMSprop_linear,0.01,10,60,"[[3.8176939633899534], [3.0721386180719064]]"
15,RMSprop_linear,0.01,10,200,"[[3.8176939633899534], [3.0721386180719064]]"
18,RMSprop_linear,0.01,20,30,"[[3.830344704839098], [3.0846423628904005]]"
21,RMSprop_linear,0.01,20,60,"[[3.8303481948090896], [3.0846395402483986]]"
24,RMSprop_linear,0.01,20,200,"[[3.8303481948090963], [3.0846395402483933]]"
27,RMSprop_linear,0.02,5,30,"[[3.7895312885063746], [3.0704885410534075]]"


In [64]:
from IPython.display import display
pd.set_option('display.max_rows', None)
display(results_RMS_ridge)

Unnamed: 0,Method,Learning Rate,Batch Size,Epochs,Lambda,Beta
0,RMSprop_ridge,0.01,5,30,0.01,"[[3.7408366655053795], [3.086486065310873]]"
1,RMSprop_ridge,0.01,5,30,0.02,"[[3.685198513888076], [3.105026523552427]]"
2,RMSprop_ridge,0.01,5,30,0.03,"[[3.634264311698274], [3.1199764569762225]]"
3,RMSprop_ridge,0.01,5,60,0.01,"[[3.7408366655053795], [3.086486065310873]]"
4,RMSprop_ridge,0.01,5,60,0.02,"[[3.6851985138880754], [3.1050265235524273]]"
5,RMSprop_ridge,0.01,5,60,0.03,"[[3.634264311698274], [3.1199764569762225]]"
6,RMSprop_ridge,0.01,5,200,0.01,"[[3.7408366655053795], [3.086486065310873]]"
7,RMSprop_ridge,0.01,5,200,0.02,"[[3.6851985138880754], [3.1050265235524273]]"
8,RMSprop_ridge,0.01,5,200,0.03,"[[3.634264311698274], [3.1199764569762225]]"
9,RMSprop_ridge,0.01,10,30,0.01,"[[3.7565511301165277], [3.0947031988887574]]"


Same calculation with autograd

In [1]:

from random import random, seed

import matplotlib.pyplot as plt
import numpy as np
import autograd.numpy as np

from autograd import grad


def CostOLS(X,y,theta):
    n=100
    return np.sum((y-X @ theta)**2)

def CostRidge(X,y,theta,lmbda):
    n=100
    return np.sum((y-X @ theta)**2)+lmbda*np.sum(beta**2)

training_gradient_OLS = grad(CostOLS,2)

training_gradient_ridge = grad(CostRidge,2)




def GD_linear(degree, X, y, n_epochs,M, learning_rate,lmbda):
    n = len(X)
    H = (2.0 / n) * X.T @ X
    EigValues, EigVectors = np.linalg.eig(H)
    theta = np.random.randn(degree, 1)

    if learning_rate is None:
        learning_rate = 1.0 / np.max(EigValues)

    for i in range(n_epochs):
        gradient = (1/n)*training_gradient_OLS(X,y,theta)
        theta -= learning_rate * gradient

    #print(beta)
    return theta



def GD_linear(degree, X, y, n_epochs,M, learning_rate,lmbda):
    n = len(X)
    H = (2.0 / n) * X.T @ X
    EigValues, EigVectors = np.linalg.eig(H)
    theta = np.random.randn(degree, 1)

    if learning_rate is None:
        learning_rate = 1.0 / np.max(EigValues)

    for i in range(n_epochs):
        gradient = (1/n)*training_gradient_OLS(X,y,theta)
        theta -= learning_rate * gradient

    #print(beta)
    return theta


def GD_ridge(degree, X, y, n_epochs,M, learning_rate, lmbda):
    n = len(X)
    theta = np.random.randn(degree, 1)

    if learning_rate is None:
        learning_rate = 0.1

    for i in range(n_epochs):
        gradients = (1/n)*training_gradient_ridge(X,y,theta,lmbda)
        theta -= learning_rate * gradients

    #print(beta)
    return theta

def GD_linear_momentum(degree, X, y, n_epochs,M, learning_rate,lmbda):
    n = len(X)
    H = (2.0 / n) * X.T @ X
    EigValues, EigVectors = np.linalg.eig(H)
    theta = np.random.randn(degree, 1)

    if learning_rate is None:
        learning_rate = 1.0 / np.max(EigValues)

    change = 0.0
    delta_momentum = 0.3

    for i in range(n_epochs):
        gradients = (1/n)*training_gradient_OLS(X,y,theta)
        
        new_change = learning_rate * gradients + delta_momentum * change
       
        
        theta -= new_change
        
        change = new_change

    #print(theta)
    return theta

def GD_ridge_momentum(degree, X, y, n_epochs,M, learning_rate,lmbda):
    n = len(X)
    theta = np.random.randn(degree, 1)

    if learning_rate is None:
        learning_rate = 0.1

    change = 0.0
    delta_momentum = 0.3

    for iter in range(n_epochs):
        gradients = (1/n)*training_gradient_ridge(X,y,theta,lmbda)
        #rint( lmbda)
        new_change = learning_rate * gradients + delta_momentum * change
        #rint(learning_rate)
        #print(delta_momentum)
        #rint(change)
        theta -= new_change
        change = new_change

    #print(theta)
    return theta



def SGD_linear(degree, X, y, n_epochs, M,  t0,t1, lmbda):
    n = len(X)
    theta = np.random.randn(degree, 1)
    def learning_schedule(t):
            return t0 / (t + t1)
    for epoch in range(n_epochs):
        indices = np.random.permutation(n)
        for i in range(0, n, M):
            X_mini = X[indices[i:i + M]]
            y_mini = y[indices[i:i + M]]
            learning_rate =learning_schedule(epoch * M + i) 
            gradients = (1/n)*training_gradient_OLS(X_mini,y_mini,theta)
            theta -= learning_rate * gradients

    return theta

def SGD_ridge(degree, X, y, n_epochs, M,  t0,t1, lmbda):
    n = len(X)
    theta = np.random.randn(degree, 1)
    def learning_schedule(t):
            return t0 / (t + t1)
    for epoch in range(n_epochs):
        indices = np.random.permutation(n)
        for i in range(0, n, M):
            X_mini = X[indices[i:i + M]]
            y_mini = y[indices[i:i + M]]
            learning_rate =learning_schedule(epoch * M + i) 
            gradients = (1/n)*training_gradient_ridge(X_mini,y_mini,theta,lmbda)
            theta -= learning_rate * gradients

    return theta





def SGD_momentum_linear(degree, X, y, n_epochs, M, t0,t1, lmbda):
    n = len(X)
    theta = np.random.randn(degree, 1)
    def learning_schedule(t):
        return t0 / (t + t1)
    

    change = 0.0
    delta_momentum = 0.03

   
        
    for epoch in range(n_epochs):
        indices = np.random.permutation(n)
        for i in range(0, n, M):
            X_mini = X[indices[i:i + M]]
            y_mini = y[indices[i:i + M]]
            gradients = (1/n)*training_gradient_OLS(X_mini,y_mini,theta)
            eta=learning_schedule(epoch * M + i) 
            new_change = eta * gradients + delta_momentum * change
            theta = theta - new_change
            change = new_change

    return theta




def SGD_momentum_ridge(degree, X, y, n_epochs, M, t0,t1, lmbda):
    n = len(X)
    theta = np.random.randn(degree, 1)
  
    delta_momentum = 0.3
    
    change = 0.0
    delta_momentum = 0.03
    delta = 1e-8

    def learning_schedule(t):
            return t0 / (t + t1)
        
    for epoch in range(n_epochs):
        indices = np.random.permutation(n)
        for i in range(0, n, M):
            X_mini = X[indices[i:i + M]]
            y_mini = y[indices[i:i + M]]
            gradients = (1/n)*training_gradient_ridge(X_mini,y_mini,theta,lmbda)
            eta=learning_schedule(epoch * M + i)
            new_change = eta * gradients + delta_momentum * change
            theta = theta - new_change
            change = new_change

    return theta



def Adagrad_linear_momentum_SD(degree, X, y, n_epochs, M, learning_rate, lmbda):
    n = len(X)
    theta = np.random.randn(degree, 1)
    delta_momentum = 0.03
    change = 0.0
    t0, t1 = 5, 50
    delta = 1e-8
    Giter = 0.0
    
    for epoch in range(n_epochs):
        indices = np.random.permutation(n)
        Giter=0
        for i in range(0, n, M):
            X_mini = X[i:i + M]
            y_mini = y[i:i + M]
            gradients = (1/n)*training_gradient_OLS(X_mini,y_mini,theta)
            Giter += gradients * gradients
            eta = learning_rate
            new_change = gradients * eta / (delta + np.sqrt(Giter)) + delta_momentum * change
            theta -= new_change
            change = new_change

    return theta

def Adagrad_ridge_momentum_SD(degree, X, y, n_epochs, M, learning_rate, lmbda):
    n = len(X)
    theta = np.random.randn(degree, 1)
    delta_momentum = 0.03
    change = 0.0
    t0, t1 = 5, 50
    delta = 1e-8
    Giter = 0.0
    
    for epoch in range(n_epochs):
        Giter=0
        indices = np.random.permutation(n)
        for i in range(0, n, M):
            X_mini = X[i:i + M]
            y_mini = y[i:i + M]
            gradients = (1/n)*training_gradient_ridge(X_mini,y_mini,theta,lmbda)
            Giter += gradients * gradients
            eta = learning_rate
            new_change = gradients * eta / (delta + np.sqrt(Giter)) + delta_momentum * change
            theta -= new_change
            change = new_change

    return theta








def Adagrad_ridge_SD(degree,X, y, n_iterations,m,learning_rate, lmbda ):
    n, m = X.shape
    theta = np.random.randn(degree, 1)
    G = 0.0
    epsilon=1e-8
    for iteration in range(n_iterations):
        G=0
        for i in range(m):
            random_index = np.random.randint(m)
            xi = X[random_index:random_index+1]
            yi = y[random_index:random_index+1]
            gradients = (1/n)*training_gradient_ridge(xi,yi,theta,lmbda)
            G += gradients ** 2
            theta -= learning_rate * gradients / (np.sqrt(G) + epsilon)
    
    return theta


def Adagrad_linear_SD(degree,X, y, n_iterations,m,learning_rate, lmbda ):
    n, m = X.shape
    theta = np.random.randn(degree, 1)
    G = 0.0
    epsilon=1e-8
    for iteration in range(n_iterations):
        G=0
        for i in range(m):
            random_index = np.random.randint(m)
            xi = X[random_index:random_index+1]
            yi = y[random_index:random_index+1]
            gradients = (1/n)*training_gradient_OLS(xi,yi,theta)
            G += gradients ** 2
            theta -= learning_rate * gradients / (np.sqrt(G) + epsilon)
    
    return theta

def Adagrad_linear_GD(degree, X, y, n_iterations, M, learning_rate, lmbda):
    n, m = X.shape
    theta = np.random.randn(degree, 1)
    G = 0.0
    epsilon=1e-8
    
    for iteration in range(n_iterations):
        gradients = (1/n)*training_gradient_OLS(X,y,theta)
        G += gradients ** 2
        theta -= learning_rate * gradients / (np.sqrt(G) + epsilon)
        G=0.0
    return theta


def Adagrad_ridge_GD(degree, X, y, n_iterations, M, learning_rate, lmbda):
    n, m = X.shape
    theta = np.random.randn(degree, 1)
    G = 0.0
    epsilon=1e-8
    for iteration in range(n_iterations):
        gradients = (1/n)*training_gradient_ridge(X,y,theta,lmbda)
        G += gradients ** 2
        theta -= learning_rate * gradients / (np.sqrt(G) + epsilon)
        G=0
    return theta




def Adagrad_linear_momentum_GD(degree, X, y, n_iterations, M, learning_rate, lmbda):
    n, m = X.shape
    theta = np.random.randn(degree, 1)
    G = 0.0
    change = 0.0
    delta = 1e-8
    delta_momentum = 0.03
    
    for iteration in range(n_iterations):
        gradients = (1/n)*training_gradient_OLS(X,y,theta)
        G += gradients * gradients
        eta = learning_rate
        new_change = eta / (delta + np.sqrt(G)) * gradients + delta_momentum * change
        theta -= new_change
        change = new_change
        G=0
    return theta

def Adagrad_ridge_momentum_GD(degree, X, y, n_iterations, M, learning_rate, lmbda):
    n, m = X.shape
    theta = np.random.randn(degree, 1)
    G = 0.0
    change = 0.0
    delta = 1e-8
    delta_momentum = 0.03
    
    for iteration in range(n_iterations):
        gradients = (1/n)*training_gradient_ridge(X,y,theta,lmbda)
        G += gradients * gradients
        eta = learning_rate
        new_change = eta / (delta + np.sqrt(G)) * gradients + delta_momentum * change
        theta -= new_change
        change = new_change
        G=0
    return theta



def RMSprop_linear(degree, X, y, n_epochs, M, learning_rate, lmbda):
    n = len(X)
    theta = np.random.randn(degree, 1)
    G_squared = 0.0
    delta = 1e-8
    rho = 0.99
    eta = learning_rate
    for epoch in range(n_epochs):
        for i in range(0, n, M):
            X_mini = X[i:i + M]
            y_mini = y[i:i + M]
            gradients = (1/n)*training_gradient_OLS(X_mini,y_mini,theta)
            G_squared = rho * G_squared + (1 - rho) * gradients * gradients
            update = gradients * learning_rate / (np.sqrt(G_squared) + delta)
            theta -= update

    return theta

def RMSprop_ridge(degree, X, y, n_epochs, M, learning_rate, lmbda):
    n = len(X)
    theta = np.random.randn(degree, 1)
    G_squared = 0.0
    delta = 1e-8
    rho = 0.99
    eta = learning_rate
    for epoch in range(n_epochs):
        for i in range(0, n, M):
            X_mini = X[i:i + M]
            y_mini = y[i:i + M]
            gradients = (1/n)*training_gradient_ridge(X_mini,y_mini,theta,lmbda)
            G_squared = rho * G_squared + (1 - rho) * gradients * gradients
            update = gradients * learning_rate / (np.sqrt(G_squared) + delta)
            theta -= update

    return theta

def Adam_ridge(degree, X, y, n_epochs, M, learning_rate, lmbda):
    n = len(X)
    theta = np.random.randn(degree, 1)
    iter=0.0
    delta = 1e-7
    beta1 = 0.9
    beta2 = 0.999

    for epoch in range(n_epochs):
        first_moment = 0.0
        second_moment = 0.0
        iter+=1
        
        for i in range(0, n, M):
            X_mini = X[i:i + M]
            y_mini = y[i:i + M]
            gradients = (1/n)*training_gradient_ridge(X_mini,y_mini,theta,lmbda)
            first_moment = beta1 * first_moment + (1 - beta1) * gradients
            second_moment = beta2 * second_moment + (1 - beta2) * gradients * gradients
            first_bias_corrected = first_moment / (1 - beta1 ** (iter))
            second_bias_corrected = second_moment / (1 - beta2 ** (iter))
            update = first_bias_corrected * learning_rate / (np.sqrt(second_bias_corrected) + delta)
            theta -= update

    return theta

def Adam_linear(degree, X, y, n_epochs, M, learning_rate, lmbda):
    n = len(X)
    theta = np.random.randn(degree, 1)
    iter=0.0
    delta = 1e-7
    beta1 = 0.9
    beta2 = 0.999

    for epoch in range(n_epochs):
        first_moment = 0.0
        second_moment = 0.0
        iter+=1
        
        for i in range(0, n, M):
            X_mini = X[i:i + M]
            y_mini = y[i:i + M]
            gradients = (1/n)*training_gradient_OLS(X_mini,y_mini,theta)
            first_moment = beta1 * first_moment + (1 - beta1) * gradients
            second_moment = beta2 * second_moment + (1 - beta2) * gradients * gradients
            first_bias_corrected = first_moment / (1 - beta1 ** (iter))
            second_bias_corrected = second_moment / (1 - beta2 ** (iter))
            update = first_bias_corrected * learning_rate / (np.sqrt(second_bias_corrected) + delta)
            theta -= update

    return theta




In [2]:
import pandas as pd

np.random.seed(0)
n = 2000
x = 2 * np.random.rand(n, 1)
y = 4 + 3 * x + np.random.randn(n, 1)

X = np.c_[np.ones((n, 1)), x]

t_1_values = [50, 70]  # t_1 values to be tested
t_0_values = [5, 10]  # t_0 values to be tested

M = [5, 10]
num_epochs = [ 60, 200]
ridge_hyperparameters = [ 0.02, 0.03]
results_sgd_linear = []
results_sgd_ridge = []
results_sgd_momentum_linear = []
results_sgd_momentum_ridge = []

# Custom Optimization Functions
optimization_functions = [
    SGD_linear, SGD_ridge, SGD_momentum_linear, SGD_momentum_ridge
]

for func in optimization_functions:
    for t_1 in t_1_values:
        for t_0 in t_0_values:
            for batch_size in M:
                for n_epochs in num_epochs:
                    print("here")
                    if "SGD_linear" in func.__name__:
                        # Linear functions
                        degree = 2
                        for lmbda in ridge_hyperparameters:
                            beta = func(degree, X, y, n_epochs,batch_size, t_1, t_0, lmbda)
                            results_sgd_linear.append((func.__name__, t_1, t_0,batch_size, n_epochs, lmbda, beta))
                    elif "SGD_ridge" in func.__name__:
                        # Ridge functions
                        degree = 2
                        for lmbda in ridge_hyperparameters:
                            beta = func(degree, X, y, n_epochs,batch_size, t_1, t_0, lmbda)
                            results_sgd_ridge.append((func.__name__, t_1, t_0, batch_size,n_epochs, lmbda, beta))
                    elif "SGD_momentum_linear" in func.__name__:
                        # Momentum Linear functions
                        degree = 2
                        for lmbda in ridge_hyperparameters:
                            beta = func(degree, X, y, n_epochs,batch_size, t_1, t_0, lmbda)
                            results_sgd_momentum_linear.append((func.__name__, t_1, t_0, batch_size,n_epochs, lmbda, beta))
                    elif "SGD_momentum_ridge" in func.__name__:
                        # Momentum Ridge functions
                        degree = 2
                        for lmbda in ridge_hyperparameters:
                            beta = func(degree, X, y, n_epochs,batch_size, t_1, t_0, lmbda)
                            results_sgd_momentum_ridge.append((func.__name__, t_1, t_0, batch_size,n_epochs, lmbda, beta))

# Convert results to dataframes
columns = ["Method", "t_1", "t_0", "Batch Size", "Epochs", "Lambda", "Beta"]
df_sgd_linear = pd.DataFrame(results_sgd_linear, columns=columns)
df_sgd_ridge = pd.DataFrame(results_sgd_ridge, columns=columns)
df_sgd_momentum_linear = pd.DataFrame(results_sgd_momentum_linear, columns=columns)
df_sgd_momentum_ridge = pd.DataFrame(results_sgd_momentum_ridge, columns=columns)


df_sgd_linear = df_sgd_linear.drop_duplicates(subset=["Method", "t_1", "t_0", "Batch Size", "Epochs"], keep="first")
df_sgd_ridge = df_sgd_ridge.drop_duplicates(subset=["Method", "t_1", "t_0", "Batch Size", "Epochs"], keep="first")
df_sgd_momentum_linear= df_sgd_momentum_linear.drop_duplicates(subset=["Method", "t_1", "t_0", "Batch Size", "Epochs"], keep="first")
df_sgd_momentum_ridge = df_sgd_momentum_ridge.drop_duplicates(subset=["Method", "t_1", "t_0", "Batch Size", "Epochs"], keep="first")


df_sgd_linear = df_sgd_linear.drop(columns=[ "Lambda"])

# Drop the second column ('Learning Rate') from 'results_ridge'
df_sgd_momentum_linear = df_sgd_momentum_linear.drop(columns=["Lambda"])

pd.set_option('display.max_rows', None)

from IPython.display import display
display(df_sgd_linear)
display(df_sgd_ridge)
display(df_sgd_momentum_linear)
display(df_sgd_momentum_ridge)



here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here


Unnamed: 0,Method,t_1,t_0,Batch Size,Epochs,Beta
0,SGD_linear,50,5,5,60,"[[3.5286267150244304], [3.3645893991048488]]"
2,SGD_linear,50,5,5,200,"[[3.9361186780164044], [3.025560182429481]]"
4,SGD_linear,50,5,10,60,"[[3.6755696930846233], [3.242013863127834]]"
6,SGD_linear,50,5,10,200,"[[4.027768582462888], [2.947843020655001]]"
8,SGD_linear,50,10,5,60,"[[3.735982088104996], [3.1916941971946935]]"
10,SGD_linear,50,10,5,200,"[[3.9816214281069877], [2.9877925353091066]]"
12,SGD_linear,50,10,10,60,"[[3.4112410588182116], [3.4610769495076146]]"
14,SGD_linear,50,10,10,200,"[[3.917172287194556], [3.0412603252431105]]"
16,SGD_linear,70,5,5,60,"[[3.80501172923558], [3.136364817788343]]"
18,SGD_linear,70,5,5,200,"[[3.9506755326086185], [3.0133687795106097]]"


Unnamed: 0,Method,t_1,t_0,Batch Size,Epochs,Lambda,Beta
0,SGD_ridge,50,5,5,60,0.02,"[[3.676522929089851], [3.2401828037158165]]"
2,SGD_ridge,50,5,5,200,0.02,"[[3.9838058989903593], [2.985783198970678]]"
4,SGD_ridge,50,5,10,60,0.02,"[[3.7484273985012058], [3.1831723948023063]]"
6,SGD_ridge,50,5,10,200,0.02,"[[3.863259331519997], [3.084939173175114]]"
8,SGD_ridge,50,10,5,60,0.02,"[[3.805353524107013], [3.140674875547053]]"
10,SGD_ridge,50,10,5,200,0.02,"[[3.8537573826378018], [3.0938090268425236]]"
12,SGD_ridge,50,10,10,60,0.02,"[[3.891838691871], [3.06312272417665]]"
14,SGD_ridge,50,10,10,200,0.02,"[[3.792043318395721], [3.145851296342041]]"
16,SGD_ridge,70,5,5,60,0.02,"[[4.071144203920451], [2.9129672614412003]]"
18,SGD_ridge,70,5,5,200,0.02,"[[3.9429682317872143], [3.020151198504215]]"


Unnamed: 0,Method,t_1,t_0,Batch Size,Epochs,Beta
0,SGD_momentum_linear,50,5,5,60,"[[3.6940814032481493], [3.2249768404735994]]"
2,SGD_momentum_linear,50,5,5,200,"[[3.851757915735525], [3.0953486978625278]]"
4,SGD_momentum_linear,50,5,10,60,"[[3.561865262509753], [3.338861218708879]]"
6,SGD_momentum_linear,50,5,10,200,"[[3.922643654432542], [3.0349126430589797]]"
8,SGD_momentum_linear,50,10,5,60,"[[3.463185385572309], [3.425012802440442]]"
10,SGD_momentum_linear,50,10,5,200,"[[3.9533749877757822], [3.0089582328716618]]"
12,SGD_momentum_linear,50,10,10,60,"[[3.4403375045510125], [3.4387994655534477]]"
14,SGD_momentum_linear,50,10,10,200,"[[3.858786922099457], [3.089614035497538]]"
16,SGD_momentum_linear,70,5,5,60,"[[3.790309650487832], [3.141349567164859]]"
18,SGD_momentum_linear,70,5,5,200,"[[3.946215375619241], [3.0141205971482647]]"


Unnamed: 0,Method,t_1,t_0,Batch Size,Epochs,Lambda,Beta
0,SGD_momentum_ridge,50,5,5,60,0.02,"[[3.9829273858191225], [2.986670794332172]]"
2,SGD_momentum_ridge,50,5,5,200,0.02,"[[3.9327409622510863], [3.0254735224846248]]"
4,SGD_momentum_ridge,50,5,10,60,0.02,"[[4.086391258393624], [2.9008301180531353]]"
6,SGD_momentum_ridge,50,5,10,200,0.02,"[[3.8480354962096515], [3.0979120765624657]]"
8,SGD_momentum_ridge,50,10,5,60,0.02,"[[3.896133839545473], [3.0596930315919213]]"
10,SGD_momentum_ridge,50,10,5,200,0.02,"[[3.8909332342076377], [3.0610852620926874]]"
12,SGD_momentum_ridge,50,10,10,60,0.02,"[[3.793445695211112], [3.14820278617518]]"
14,SGD_momentum_ridge,50,10,10,200,0.02,"[[3.8338454869129497], [3.1116193136579984]]"
16,SGD_momentum_ridge,70,5,5,60,0.02,"[[3.8816631974101283], [3.071624591103751]]"
18,SGD_momentum_ridge,70,5,5,200,0.02,"[[3.9475781352052275], [3.015958262923281]]"


In [None]:
import pandas as pd

np.random.seed(1)
n = 2000
x = 2*np.random.rand(n,1)
y = 4+3*x+np.random.randn(n,1)

X = np.c_[np.ones((n,1)), x]


learning_rates = [0.01, 0.02, 0.05]
M = [5, 10, 20]
num_epochs = [30,60,200]
ridge_hyperparameters = [0.01, 0.02, 0.03]  # Ridge hyperparameters to be tested

results_linear = []
results_ridge = []
results_sgd_linear = []
results_sgd_ridge = []
results_RMS_linear=[]
results_RMS_ridge=[]
Adam_linear_list=[]
Adam_ridge_list=[]
# Custom Optimization Functions
optimization_functions = [
    linear_GD, ridge_GD, linear_momentum_GD, ridge_momentum_GD, Adagrad_linear_momentum_SD, Adagrad_ridge_momentum_SD,
    Adagrad_linear_GD, Adagrad_ridge_GD, Adagrad_ridge_momentum_GD, Adagrad_linear_momentum_GD,
    Adagrad_linear_SD, Adagrad_ridge_SD, RMSprop_linear, RMSprop_ridge, Adam_ridge, Adam_linear
]

for func in optimization_functions:
    for learning_rate in learning_rates:
        for batch_size in M:
            for n_epochs in num_epochs:
                if "linear" in func.__name__.lower():
                    # Linear functions
                    degree = 2
                    for lmbda in ridge_hyperparameters:
                        beta = func(degree, X, y, n_epochs, batch_size, learning_rate, lmbda)
                        if "GD" in func.__name__.upper():
                            results_linear.append((func.__name__, learning_rate, batch_size, n_epochs, lmbda, beta))
                        
                        # Check for SGD and split based on linear or ridge
                        if "SD" in func.__name__.upper():
                            results_sgd_linear.append((func.__name__, learning_rate, batch_size, n_epochs, lmbda, beta))
                        if "RMSprop_linear" in func.__name__:
                            results_RMS_linear.append((func.__name__, learning_rate, batch_size, n_epochs, lmbda, beta))
                        if "dam" in func.__name__:
                            Adam_linear_list.append((func.__name__, learning_rate, batch_size, n_epochs, lmbda, beta))    
                elif "ridge" in func.__name__.lower():
                    # Ridge functions
                    degree = 2
                    for lmbda in ridge_hyperparameters:
                        beta = func(degree, X, y, n_epochs, batch_size, learning_rate, lmbda)
                        if "GD" in func.__name__.upper():
                            results_ridge.append((func.__name__, learning_rate, batch_size, n_epochs, lmbda, beta))
                        
                        # Check for SGD and split based on linear or ridge
                        if "SD" in func.__name__.upper():
                            results_sgd_ridge.append((func.__name__, learning_rate, batch_size, n_epochs, lmbda, beta))
                        if "RMSprop_ridge" in func.__name__:
                            results_RMS_ridge.append((func.__name__, learning_rate, batch_size, n_epochs, lmbda, beta))
                        if "dam" in func.__name__:
                            Adam_ridge_list.append((func.__name__, learning_rate, batch_size, n_epochs, lmbda, beta))  

In [None]:
# Drop duplicates for each DataFrame
columns = ["Method", "Learning Rate", "Batch Size", "Epochs", "Lambda", "Beta"]
columns2 = ["Method", "Learning Rate", "Batch Size", "N_iter", "Lambda", "Beta"]
results_linear = pd.DataFrame(results_linear, columns=columns2)
results_ridge = pd.DataFrame(results_ridge, columns=columns2)
results_sgd_linear = pd.DataFrame(results_sgd_linear, columns=columns)
results_sgd_ridge = pd.DataFrame(results_sgd_ridge, columns=columns)

Adam_linear_list=pd.DataFrame(Adam_linear_list, columns=columns)
Adam_ridge_list=pd.DataFrame(Adam_ridge_list, columns=columns)
results_RMS_ridge=pd.DataFrame(results_RMS_ridge, columns=columns)
results_RMS_linear= pd.DataFrame(results_RMS_linear, columns=columns)


#results_sgd_ridge = results_sgd_ridge.drop_duplicates(subset=["Method", "Learning Rate", "Batch Size", "Epochs"], keep="first")
results_ridge = results_ridge.drop_duplicates(subset=["Method", "Learning Rate", "Batch Size", "N_iter"], keep="first")
results_sgd_linear = results_sgd_linear.drop_duplicates(subset=["Method", "Learning Rate", "Batch Size", "Epochs"], keep="first")
results_linear = results_linear.drop_duplicates(subset=["Method", "Learning Rate", "Batch Size", "N_iter"], keep="first")

Adam_linear_list = Adam_linear_list.drop_duplicates(subset=["Method", "Learning Rate", "Batch Size", "Epochs"], keep="first")
Adam_ridge_list = Adam_ridge_list.drop_duplicates(subset=["Method", "Learning Rate", "Batch Size", "Epochs"], keep="first")
#results_RMS_ridge = results_RMS_ridge_linear.drop_duplicates(subset=["Method", "Learning Rate", "Batch Size", "Epochs"], keep="first")
results_RMS_linear = results_RMS_linear.drop_duplicates(subset=["Method", "Learning Rate", "Batch Size", "Epochs"], keep="first")


# Assuming 'results_linear' and 'results_ridge' are your DataFrames

# Drop the second column ('Learning Rate') from 'results_linear'
results_linear = results_linear.drop(columns=["Batch Size"])

# Drop the second column ('Learning Rate') from 'results_ridge'
results_ridge = results_ridge.drop(columns=["Batch Size"])

results_linear = results_linear.drop(columns=[ "Lambda"])

Adam_linear_list = Adam_linear_list.drop(columns=[ "Lambda"])
results_RMS_linear = results_RMS_linear.drop(columns=[ "Lambda"])

# Drop the second column ('Learning Rate') from 'results_ridge'
results_sgd_linear = results_sgd_linear.drop(columns=["Lambda"])
#pd.set_option('display.max_rows', None)
# Display unique DataFrames

