# In [None]:
import numpy as np
from sklearn.datasets import load_svmlight_file
import matplotlib.pyplot as plt
import random


# logistic (sigmoid) function used to turn linear scores into probabilities
def sigmoid_function(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` element-wise.

    The textbook formula overflows inside ``np.exp`` when ``x`` is a large
    negative number.  This version only ever exponentiates a non-positive
    value, so it cannot overflow, while returning the same result.

    Args:
        x: Real scalar or array-like.

    Returns:
        A NumPy scalar for scalar input, otherwise an array with the same
        shape as ``x``; every value lies in ``[0, 1]``.
    """
    x = np.asarray(x)
    # exp(-|x|) lies in (0, 1], so it can underflow to 0 but never overflow.
    e = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x)      x < 0: e^x / (1 + e^x)   (same value)
    result = np.where(x >= 0, 1.0, e) / (1.0 + e)
    # Indexing with () unwraps a 0-d array to a scalar and leaves n-d arrays
    # untouched.
    return result[()]


# gradient of the averaged logistic loss with respect to the weights
def calculate_gradient(given_x, given_y, omega):
    """Return ``X^T (sigmoid(X @ omega) - y) / y.size``.

    Args:
        given_x: Sample matrix, one row per sample.
        given_y: Target vector with one entry per row of ``given_x``.
        omega: Weight vector at which the gradient is evaluated.

    Returns:
        The gradient, with one entry per column of ``given_x``.
    """
    # Difference between the predicted probabilities and the targets.
    residual = sigmoid_function(np.dot(given_x, omega)) - given_y
    n_samples = given_y.size
    return np.dot(given_x.T, residual) / n_samples


# mean binary cross-entropy (logistic) loss
def loss_function(given_x, given_y, omega):
    """Return the mean binary cross-entropy of the logistic model.

    With logits ``z = given_x @ omega`` and ``s = sigmoid(z)``, the
    per-sample loss ``-(y * log(s) + (1 - y) * log(1 - s))`` is algebraically
    equal to ``log(1 + exp(z)) - y * z``.  The second form is evaluated with
    ``np.logaddexp`` so that saturated predictions (``s`` rounding to 0 or 1)
    no longer produce ``log(0)`` and an infinite or NaN loss.

    Args:
        given_x: Sample matrix, one row per sample.
        given_y: Target vector with one entry per row of ``given_x``.
        omega: Weight vector of the model.

    Returns:
        The loss averaged over all samples, as a float.
    """
    logits = np.dot(given_x, omega)
    # logaddexp(0, z) == log(exp(0) + exp(z)) == log(1 + exp(z)), computed
    # without overflowing for large |z|.
    return np.mean(np.logaddexp(0, logits) - given_y * logits)


# load training data set
X_train, y_train = load_svmlight_file("a9a_train")
X_train = X_train.toarray()
# Remember the training width so the validation set can be loaded with the
# same number of feature columns.
n_features = X_train.shape[1]
# Append a constant column of ones so the last weight acts as the bias term.
X_train = np.column_stack((X_train, np.ones(X_train.shape[0])))
# Shift and halve the labels: y -> (y + 1) / 2.
# NOTE(review): assumes the file stores labels as -1/+1, giving 0/1 — confirm.
y_train = (y_train + 1) / 2

# load validation data set
# Without n_features the width is inferred from the file, and comes out
# smaller than the training width when the highest-numbered feature never
# occurs in it.  Pinning the width keeps both matrices aligned; scikit-learn
# raises ValueError if the file holds more features than requested.
X_test, y_test = load_svmlight_file("a9a_test.t", n_features=n_features)
X_test = X_test.toarray()
X_test = np.column_stack((X_test, np.ones(X_test.shape[0])))
y_test = (y_test + 1) / 2

# parameters initialize - zeros, one weight per column of X_train
# (the appended bias column included), one independent vector per optimizer
n_params = X_train.shape[1]
SGD_theta = np.zeros(n_params)
NAG_theta = np.zeros(n_params)
RMSProp_theta = np.zeros(n_params)
AdaDelta_theta = np.zeros(n_params)
Adam_theta = np.zeros(n_params)

# define learning rate
learning_rate = 0.01

# define iteration number
iteration_num = 1000
# define batch size (number of consecutive samples drawn per iteration)
batch_number = 100

# loss history, one value appended per iteration for each optimizer
SGD_loss = []
NAG_loss = []
RMSProp_loss = []
Adadelta_loss = []
Adam_loss = []

# The scalar zeros below broadcast to full vectors on the first update.

# parameters for NAG
Alpha = 0.9   # momentum coefficient
v_theta = 0   # velocity
# parameters for RMSProp
r_theta = 0        # running average of squared gradients
decay_rate = 0.9   # also reused by AdaDelta and as Adam's first-moment decay
delta = 1e-8       # small constant keeping the square roots away from zero
# parameters for Adadelta
o_theta = 0              # running average of squared gradients
# Running average of squared updates.  Its square root scales the step with
# no epsilon added, so it must start non-zero or AdaDelta would never move.
s_theta = learning_rate
Delta_theta = 0          # most recent update
# parameters for Adam
decay_rate2 = 0.999   # second-moment decay
rho_1 = 0             # first-moment (mean) estimate
rho_2 = 0             # second-moment (uncentered variance) estimate
for i in range(iteration_num):
    # Draw one contiguous mini-batch; every optimizer below trains on this
    # same batch so their loss curves are directly comparable.
    index = random.randint(0, y_train.size - batch_number)
    batch_x = X_train[index:index + batch_number]
    batch_y = y_train[index:index + batch_number]

    # ---- SGD: plain step against the gradient ----
    SGD_gradient = calculate_gradient(batch_x, batch_y, SGD_theta)
    SGD_theta = SGD_theta - learning_rate * SGD_gradient
    SGD_loss.append(loss_function(X_test, y_test, SGD_theta))

    # ---- NAG: gradient is taken at the look-ahead point theta + Alpha * v ----
    lookahead = NAG_theta + Alpha * v_theta
    NAG_gradient = calculate_gradient(batch_x, batch_y, lookahead)
    v_theta = Alpha * v_theta - learning_rate * NAG_gradient
    NAG_theta += v_theta
    NAG_loss.append(loss_function(X_test, y_test, NAG_theta))

    # ---- RMSProp: scale the step by a running RMS of the gradient ----
    RMSProp_gradient = calculate_gradient(batch_x, batch_y, RMSProp_theta)
    r_theta = decay_rate * r_theta + (1 - decay_rate) * (RMSProp_gradient ** 2)
    RMSProp_theta = (
        RMSProp_theta
        - learning_rate * RMSProp_gradient / np.sqrt(r_theta + delta)
    )
    RMSProp_loss.append(loss_function(X_test, y_test, RMSProp_theta))

    # ---- Adadelta: step size is RMS(previous updates) / RMS(gradients) ----
    Adadelta_gradient = calculate_gradient(batch_x, batch_y, AdaDelta_theta)
    o_theta = decay_rate * o_theta + (1 - decay_rate) * (Adadelta_gradient ** 2)
    if i > 0:
        # There is no previous update on the first pass, so s_theta keeps its
        # initial value there.
        s_theta = decay_rate * s_theta + (1 - decay_rate) * Delta_theta ** 2
    Delta_theta = np.sqrt(s_theta) / np.sqrt(o_theta + delta) * Adadelta_gradient
    AdaDelta_theta -= Delta_theta
    Adadelta_loss.append(loss_function(X_test, y_test, AdaDelta_theta))

    # ---- Adam: bias-corrected first and second moment estimates ----
    Adam_gradient = calculate_gradient(batch_x, batch_y, Adam_theta)
    rho_1 = decay_rate * rho_1 + (1 - decay_rate) * Adam_gradient
    rho_2 = decay_rate2 * rho_2 + (1 - decay_rate2) * (Adam_gradient ** 2)
    # Dividing by 1 - decay**(i + 1) undoes the bias toward the zero start.
    rho_1_ = rho_1 / (1 - decay_rate ** (i + 1))
    rho_2_ = rho_2 / (1 - decay_rate2 ** (i + 1))
    Adam_theta = Adam_theta - rho_1_ * learning_rate / np.sqrt(rho_2_ + delta)
    Adam_loss.append(loss_function(X_test, y_test, Adam_theta))
# plot how the validation loss of each optimizer changes with iteration
# (one curve per optimizer; the plotting order fixes the color assignment)
for losses, optimizer_name in (
    (Adam_loss, "Adam"),
    (Adadelta_loss, "AdaDelta"),
    (SGD_loss, "SGD"),
    (NAG_loss, "NAG"),
    (RMSProp_loss, "RMSProp"),
):
    plt.plot(losses, label=optimizer_name)
plt.legend(loc='upper right')
plt.xlabel("Iteration number")
plt.ylabel("Loss")
# The model is a sigmoid trained with cross-entropy, i.e. logistic regression.
plt.title("Logistic Regression and gradient descent")
# A log scale keeps the late, small loss differences visible.
plt.yscale('log')
plt.show()