In [19]:
#implemente um gradiente descendente em batch com uma early stopping para a regressao softmax


In [20]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

In [21]:
X, y = load_iris(return_X_y=True)
X = np.c_[np.ones(X.shape[0]), X] #adiciona o bias

In [22]:
print(X.shape, y.shape)

(150, 5) (150,)


Dividindo o dataset


In [23]:
test_ratio = 0.2
validation_ratio = 0.2
total_size = len(X)

In [24]:
test_size = int(total_size * test_ratio)
validation_size = int(total_size * validation_ratio)
train_size = total_size - test_size - validation_size

In [25]:
rnd_indices = np.random.permutation(total_size)

In [26]:
X_train = X[rnd_indices[:train_size]]
y_train = y[rnd_indices[:train_size]]
X_valid = X[rnd_indices[train_size:-test_size]]
y_valid = y[rnd_indices[train_size:-test_size]]
X_test = X[rnd_indices[-test_size:]]
y_test = y[rnd_indices[-test_size:]]    

Codificando classes com one hot

In [27]:
def to_one_hot(y):
    n_classes = y.max() + 1
    m = len(y)
    Y_one_hot = np.zeros((m, n_classes))
    Y_one_hot[np.arange(m), y] = 1
    return Y_one_hot


Softmax

In [28]:
def softmax(logits):
    exps = np.exp(logits)
    exp_sums = np.sum(exps, axis=1, keepdims=True)
    return exps / exp_sums


In [29]:
to_one_hot(y_train[:10])

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [30]:
Y_train_one_hot = to_one_hot(y_train)
Y_valid_one_hot = to_one_hot(y_valid)
Y_test_one_hot = to_one_hot(y_test)

In [31]:
n_inputs = X_train.shape[1] # == 3 (2 features plus the bias term)
n_outputs = len(np.unique(y_train))   # == 3 (3 iris classes)

Treinamento do modelo em gradiente descendente em batch 

In [32]:
eta = 0.01
n_iterations = 5001
m = len(X_train)
epsilon = 1e-7

Theta = np.random.randn(n_inputs, n_outputs)

for iteration in range(n_iterations):
    logits = X_train.dot(Theta)
    Y_proba = softmax(logits)
    if iteration % 500 == 0:
        loss = -np.mean(np.sum(Y_train_one_hot * np.log(Y_proba + epsilon), axis=1)) #cross entropy
        print(iteration, loss)
    error = Y_proba - Y_train_one_hot
    gradients = 1/m * X_train.T.dot(error)
    Theta = Theta - eta * gradients


0 2.431519476803927
500 0.44837749137687255
1000 0.3705941703054912
1500 0.32266942552781624
2000 0.2882508891858654
2500 0.26204946829155523
3000 0.2413867984913167
3500 0.22466210843504622
4000 0.21084076414491182
4500 0.1992209891361939
5000 0.18930961738835633


Regularização L2

In [33]:
eta = 0.1
alpha = 0.1

for iteration in range(n_iterations):
    logits = X_train.dot(Theta)
    Y_proba = softmax(logits)
    xentropy_loss = -np.mean(np.sum(Y_train_one_hot * np.log(Y_proba + epsilon), axis=1))
    l2_loss = 1/2 * np.sum(np.square(Theta[1:]))
    loss = xentropy_loss + alpha * l2_loss
    if iteration % 500 == 0:
        print(iteration, loss)
    gradients = 1/m * X_train.T.dot(Y_proba - Y_train_one_hot) + np.r_[np.zeros([1, n_outputs]), alpha * Theta[1:]]
    Theta = Theta - eta * gradients


0 1.5506890998806169
500 0.6921853517031169
1000 0.6532774745080747
1500 0.6214677719864803
2000 0.595345550173652
2500 0.5738302753367684
3000 0.5560934126388386
3500 0.5414832958027516
4000 0.5294701145387375
4500 0.5196122688325698
5000 0.5115373804574674


Early Stopping

In [34]:
best_loss = np.infty

for iteration in range(n_iterations):
    logits = X_train.dot(Theta)
    Y_proba = softmax(logits)
    gradients = 1/m * X_train.T.dot(Y_proba - Y_train_one_hot) + np.r_[np.zeros([1, n_outputs]), alpha * Theta[1:]]
    Theta = Theta - eta * gradients

    logits = X_valid.dot(Theta)
    Y_proba = softmax(logits)
    xentropy_loss = -np.mean(np.sum(Y_valid_one_hot * np.log(Y_proba + epsilon), axis=1))
    l2_loss = 1/2 * np.sum(np.square(Theta[1:]))
    loss = xentropy_loss + alpha * l2_loss
    if loss < best_loss:
        best_loss = loss
    else:
        print(iteration - 1, best_loss)
        print(iteration, loss, "early stopping!")
        break


1 0.4383268543162813
2 0.5474882081146519 early stopping!


Avaliando modelo

In [35]:
logits = X_valid.dot(Theta)
Y_proba = softmax(logits)
y_predict = np.argmax(Y_proba, axis=1)

accuracy_score = np.mean(y_predict == y_valid)
print(accuracy_score)


0.7333333333333333
