In [1]:
import numpy as np
from pydataset import data
from pandas import get_dummies
from sklearn.model_selection import train_test_split

In [2]:
iris = get_dummies(data("iris"))
iris.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species_setosa,Species_versicolor,Species_virginica
1,5.1,3.5,1.4,0.2,1,0,0
2,4.9,3.0,1.4,0.2,1,0,0
3,4.7,3.2,1.3,0.2,1,0,0
4,4.6,3.1,1.5,0.2,1,0,0
5,5.0,3.6,1.4,0.2,1,0,0


In [3]:
iris.shape

(150, 7)

In [4]:
X, y = iris.iloc[:,:4].values, iris.iloc[:,4:].values
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2)
X_train = X_train.T
y_train = y_train.T
X_test = X_test.T
y_test = y_test.T

In [10]:
# Matrix of size nfeaturesXnclasses to hold the parameters
# of each class
nfeatures = X_train.shape[0]
nclasses = y_train.shape[0]
theta = np.zeros((nfeatures, nclasses))

In [14]:
X_train.shape

(4, 120)

In [8]:
y_train.shape

(3, 120)

In [9]:
theta.shape

(4, 3)

In [36]:
# Matrix of size nfeatures X nexamples containing the
# numerator of sigmoid-softmax function
Svals = np.exp(theta.T @ X_train)
# Column vector with cnexamples elements containing the normalization
# factor for the sigmoid-softmax activation function
onesvect = np.ones((1, nclasses))
Svalstot = onesvect @ Svals
Svalstot.shape

(1, 120)

In [41]:
X_train.shape

(4, 120)

In [37]:
# The probability for each example to belong to a class 'k'.
# Each row corresponds to a class and each column to a given
# training example
sigsoftk = Svals / Svalstot

In [46]:
sigsoftk.shape

(3, 120)

In [70]:
-(y_train * np.log(sigsoftk)).sum() / X_train.shape[1]

1.0986122886681098

In [72]:
y_train.shape

(3, 120)

In [77]:
X_train.shape

(4, 120)

In [95]:
- (y_train * (1 - sigsoftk))[0,0] * X_train[:,0] 

array([-3.33333333, -2.26666667, -1.        , -0.13333333])

### Grad Test 

In [176]:
for k in range(y_train.shape[0]):
    grad = ((sigsoftk - y_train)[k,:] * X_train).sum(axis=1) / X_train.shape[1]
    print(f"Grad for class {k}: {grad}")

Grad for class 0: [ 0.31222222 -0.09805556  0.77111111  0.31555556]
Grad for class 1: [-0.21611111  0.00611111 -0.30555556 -0.08527778]
Grad for class 2: [-0.09611111  0.09194444 -0.46555556 -0.23027778]


In [179]:
epsilon = 1e-10
onesvect = np.ones((1, nclasses))

for k in range(y_train.shape[0]):
    theta_plus = np.copy(theta)
    theta_plus[:,k] = theta_plus[:,k] + epsilon
    theta_minus = np.copy(theta)
    theta_minus[:,k] = theta_minus[:,k] - epsilon

    Svals_plus = np.exp(theta_plus.T @ X_train)
    sigsoftk_plus = Svals_plus / (onesvect @ Svals_plus)

    Svals_minus = np.exp(theta_minus.T @ X_train)
    sigsoftk_minus = Svals_minus / (onesvect @ Svals_minus)
    
    cost_plus = -(y_train * np.log(sigsoftk_plus)).sum(axis=1) / X_train.shape[1]
    cost_minus = -(y_train * np.log(sigsoftk_minus)).sum(axis=1) / X_train.shape[1]

    grad = (cost_plus - cost_minus) / (2 * epsilon)
    print(f"Numerical grad for class {k}: {grad}")

Numerical grad for class 0: [-2.20888863  1.73833364  1.77138887]
Numerical grad for class 1: [ 1.10444487 -3.47666618  1.77138887]
Numerical grad for class 2: [ 1.10444487  1.73833364 -3.54277746]


In [186]:
alpha = 0.1
for i in range(100):
    grads = np.zeros_like(theta)
    for k in range(y_train.shape[0]):
        grad = ((sigsoftk - y_train)[k,:] * X_train).sum(axis=1) / X_train.shape[1]
        grads[:,k] = grad
    
    theta = theta - alpha * grads
    Svals = np.exp(theta.T @ X_train)
    Svalstot = onesvect @ Svals
    sigsoftk = Svals / Svalstot
    print(f"Cost at iteration {i}: {-(y_train * np.log(sigsoftk)).sum() / X_train.shape[1]}")

Cost at iteration 0: 0.7071795173842598
Cost at iteration 1: 0.6904500363239826
Cost at iteration 2: 0.6752485443221061
Cost at iteration 3: 0.6613736049416623
Cost at iteration 4: 0.648655121082047
Cost at iteration 5: 0.6369489451929752
Cost at iteration 6: 0.6261327153049528
Cost at iteration 7: 0.6161019880998508
Cost at iteration 8: 0.6067675660031928
Cost at iteration 9: 0.5980526807906635
Cost at iteration 10: 0.5898914875394001
Cost at iteration 11: 0.5822269088306579
Cost at iteration 12: 0.5750102142254339
Cost at iteration 13: 0.568199359748727
Cost at iteration 14: 0.5617601546213253
Cost at iteration 15: 0.5556655733889159
Cost at iteration 16: 0.5499010299914993
Cost at iteration 17: 0.5444672008947435
Cost at iteration 18: 0.5394007277383752
Cost at iteration 19: 0.5347874148170592
Cost at iteration 20: 0.5308532417335939
Cost at iteration 21: 0.5279749764032333
Cost at iteration 22: 0.5271275381233116
Cost at iteration 23: 0.5294553888790171
Cost at iteration 24: 0.5385

In [174]:
theta

array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])