In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [2]:
iris = load_iris()
X, y = iris.data, iris.target

In [3]:
X = X / np.max(X)

In [4]:
encoder = OneHotEncoder()
y = encoder.fit_transform(y.reshape(-1, 1)).toarray()

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [9]:
def categorical_cross_entropy_loss(y_true, y_pred):
 # avoid numerical instability by adding a small constant to log
 eps = 1e-15
 y_pred = np.clip(y_pred, eps, 1 - eps)
 loss = -np.sum(y_true * np.log(y_pred)) / y_true.shape[0]
 return loss

In [10]:
def derivative_categorical_cross_entropy_loss(y_true, y_pred):
 return (y_pred - y_true) / y_true.shape[0]

In [165]:
def softmax(x):
 return np.exp(x) / np.sum(np.exp(x))

In [163]:
def derivative_softmax(x):
 return x * (1 - x)

In [152]:
def relu(x):
 return np.maximum(0, x)

In [159]:
def derivative_relu(x):
  x_copy = np.copy(x)
  for index in range(len(x_copy[0])):
    if x_copy[0][index] > 0:
      x_copy[0][index] = 1
  return x_copy

In [15]:
def L1_reg(lambda_, W1, W2):
 return lambda_ * (np.sum(np.abs(W1)) + np.sum(np.abs(W2)))

In [16]:
def derivative_L1_reg(lambda_, W):
 return lambda_ * np.sign(W)

In [17]:
def L2_reg(lambda_, W1, W2):
 return lambda_ * (np.sum(np.square(W1)) + np.sum(np.square(W2)))

In [18]:
def derivative_L2_reg(lambda_, W):
 return lambda_ * 2 * W

In [167]:
def stochastic_gradient_descent(X, y, W1, b1, W2, b2, learning_rate, batch_size, l1_lambda, l2_lambda):
  loss = 0.0
  for x_el, y_el in zip(X, y):
    sum1 = np.dot(x_el, W1) + b1
    relu1 = relu(sum1)
    sum2 = np.dot(relu1, W2) + b2
    y_pred = softmax(sum2)

    L1 = np.sum(np.square(W2))
    loss += categorical_cross_entropy_loss(y_el, y_pred) + L1_reg(l1_lambda, W1, W2) + L2_reg(l2_lambda, W1, W2)

    loss_w2 = relu1.T * (derivative_categorical_cross_entropy_loss(y_el, y_pred) * derivative_softmax(y_pred)) + derivative_L1_reg(l2_lambda, W2) + derivative_L2_reg(l2_lambda, W2)
    loss_b2 = derivative_categorical_cross_entropy_loss(y_el, y_pred) * derivative_softmax(y_pred)

    W2 -= learning_rate * loss_w2
    b2 -= learning_rate * loss_b2 # without regularization

    loss_w1 = x_el.reshape(4, 1) * np.sum(W2 * (derivative_categorical_cross_entropy_loss(y_el, y_pred) * derivative_softmax(y_pred)), axis=1).reshape(1, 32) * derivative_relu(relu1) + derivative_L1_reg(l1_lambda, W1) +  + derivative_L2_reg(l1_lambda, W1)
    loss_b1 = np.sum(W2 * (derivative_categorical_cross_entropy_loss(y_el, y_pred) * derivative_softmax(y_pred)), axis=1).reshape(1, 32) * derivative_relu(relu1)

    W1 -= learning_rate * loss_w1
    b1 -= learning_rate * loss_b1 # without regularization

  loss /= len(y)
  return W1, b1, W2, b2, loss

In [144]:
def train(X, y, num_epochs, learning_rate, batch_size, l1_lambda, l2_lambda):
  input_size = X_train.shape[1] # input layer size
  hidden_size = 32 # hidden layer size
  output_size = y_train.shape[1] # output layer size

  W1 = np.random.randn(input_size, hidden_size) * 0.01
  b1 = np.zeros((1, hidden_size))
  W2 = np.random.randn(hidden_size, output_size) * 0.01
  b2 = np.zeros((1, output_size))

  for i in range(num_epochs):
    W1, b1, W2, b2, loss = stochastic_gradient_descent(X, y, W1, b1, W2, b2, learning_rate, batch_size, l1_lambda, l2_lambda)
    print('Epoch ' + str(i + 1) + ' Loss: ' + str(loss))
  return W1, b1, W2, b2

In [168]:
learning_rate = 0.1
num_epochs = 1000
batch_size = 16
l1_lambda = 0.001
l2_lambda = 0.001

W1, b1, W2, b2 = train(X_train, y_train, num_epochs, learning_rate, batch_size, l1_lambda, l2_lambda)

Epoch 1 Loss: 0.36809905026910344
Epoch 2 Loss: 0.367112086662052
Epoch 3 Loss: 0.36682341607697055
Epoch 4 Loss: 0.366751811272483
Epoch 5 Loss: 0.3667206449281527
Epoch 6 Loss: 0.3667007996019491
Epoch 7 Loss: 0.3666871500129614
Epoch 8 Loss: 0.3666776631081533
Epoch 9 Loss: 0.3666709964718058
Epoch 10 Loss: 0.36666625838510314
Epoch 11 Loss: 0.36666285107050944
Epoch 12 Loss: 0.36666037118740535
Epoch 13 Loss: 0.36665854645223006
Epoch 14 Loss: 0.36665718804856223
Epoch 15 Loss: 0.36665616525632677
Epoch 16 Loss: 0.3666553876120174
Epoch 17 Loss: 0.36665479132246254
Epoch 18 Loss: 0.36665432991214203
Epoch 19 Loss: 0.36665397043087433
Epoch 20 Loss: 0.36665368835432705
Epoch 21 Loss: 0.3666534660084892
Epoch 22 Loss: 0.36665328967270844
Epoch 23 Loss: 0.36665314930349313
Epoch 24 Loss: 0.3666530369223604
Epoch 25 Loss: 0.3666529471847328
Epoch 26 Loss: 0.36665287595347823
Epoch 27 Loss: 0.3666528180731596
Epoch 28 Loss: 0.3666527714534747
Epoch 29 Loss: 0.3666527338362996
Epoch 30 L

In [171]:
error = 0.0
for x_el, y_el in zip(X_test, y_test):
  sum1 = np.dot(x_el, W1) + b1
  relu1 = relu(sum1)
  sum2 = np.dot(relu1, W2) + b2
  y_pred = softmax(sum2)
  error += categorical_cross_entropy_loss(y_el, y_pred)
error /= len(np.array(y_test))
print('Error over test set: ' + str(error))

Error over test set: 0.36554019338627547
