In [82]:
from sklearn.datasets import load_iris
import numpy as np

In [83]:
iris=load_iris(as_frame=True)

In [84]:
iris.data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [85]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [86]:
# .values return a Numpy representation of the df.
X=iris.data[['petal length (cm)','petal width (cm)']].values
Y=iris["target"].values

Preprocessing

In [87]:
#adding bias to each instances
X_w_bias=np.c_[np.ones(len(X)),X]
len(X_w_bias)

150

In [88]:
# Test Ratio: 0.2
# Validation Ratio: 0.2
# Need to convert to int else cannot slice
np.random.seed(42)
total_size=len(X_w_bias)
test_size=int(total_size*0.2)
validation_size=int(total_size*0.2)
train_size=int(total_size-(validation_size+test_size))

random_index=np.random.permutation(total_size)

X_train = X_w_bias[random_index[:train_size]]
Y_train=Y[random_index[:train_size]]

X_valid=X_w_bias[random_index[train_size:-test_size]]
Y_valid=Y[random_index[train_size:-test_size]]

X_test=X_w_bias[random_index[-test_size:]]
Y_test=Y[random_index[-test_size:]]


In [89]:
#numpy way of ohe
# if a is a NumPy array, then a[[1, 3, 2]] 
# returns an array with 3 rows equal to a[1], a[3] and a[2] 
def to_one_hot(Y):
    return np.diag(np.ones(Y.max() + 1))[Y]

In [90]:
Y_train_ohe=to_one_hot(Y_train)
Y_valid_ohe=to_one_hot(Y_valid)
Y_test_ohe=to_one_hot(Y_test)


In [91]:
#Scaling data Excluding Bias
s=X_train[:,1:].std(axis=0)
m=X_train[:,1:].mean(axis=0)

X_train[:,1:]=(X_train[:,1:]-m)/s
X_test[:,1:]=(X_test[:,1:]-m)/s
X_valid[:,1:]=(X_valid[:,1:]-m)/s

In [92]:
n_inputs = X_train.shape[1]  # == 3 (2 features plus the bias term)
n_outputs = len(np.unique(Y))  # == 3 (there are 3 iris classes)


In [93]:
#softmax function
def softmax(logits):
    exps = np.exp(logits)
    exp_sums = exps.sum(axis=1, keepdims=True)
    return exps / exp_sums
    

In [94]:
def cross_entropy_loss(Y_true, Y_proba, epsilon=1e-5):
    #epsilon is a small constant added to avoid log(0), set to 1e-5.
    m = Y_true.shape[0]

    xentropy_losses = -(Y_true * np.log(Y_proba + epsilon))
    return xentropy_losses.sum()/m

In [95]:
#debugging
print(Y_valid_ohe.shape)
print(X_valid.shape)

(30, 3)
(30, 3)


Training with early stopping

In [96]:
eta = 0.4 # learning rate
n_epochs = 30000

epsilon = 1e-5

np.random.seed(42)

patience_counter=0

max_patience=5

best_loss = np.inf

#initializes the weights (Theta) randomly
Theta = np.random.randn(n_inputs, n_outputs)

for epoch in range(n_epochs):
    #softmax score formula
    #training X_train
    logits = X_train @ Theta
    
    Y_proba = softmax(logits)

    #compute loss for every 1000 epochs
    if epoch % 1000 == 0:
        #cross entropy on validation set
        Y_proba_valid = softmax(X_valid @ Theta)
        xentropy_losses = cross_entropy_loss(Y_valid_ohe,Y_proba_valid)
        print(f'current epoch: {epoch}, xentropy_losses: {xentropy_losses}')

        if xentropy_losses<best_loss:
            best_loss=xentropy_losses
            patience_counter=0
        else:
            patience_counter+=1

        if patience_counter>max_patience:
            print('Early Stopping NOW!')
            break


    error = Y_proba - Y_train_ohe
    
    
    gradients = 1 / len(X_train) * X_train.T @ error
    
    Theta = Theta - eta * gradients

Y_proba_valid = softmax(X_valid @ Theta)
final_loss = cross_entropy_loss(Y_valid_ohe, Y_proba_valid)
print(f'Final validation loss: {final_loss}')

current epoch: 0, xentropy_losses: 3.7085808486476917
current epoch: 1000, xentropy_losses: 0.1495820404906828
current epoch: 2000, xentropy_losses: 0.13540154847307986
current epoch: 3000, xentropy_losses: 0.12563653832371222
current epoch: 4000, xentropy_losses: 0.11856991608848257
current epoch: 5000, xentropy_losses: 0.11373607223021413
current epoch: 6000, xentropy_losses: 0.11060518361142271
current epoch: 7000, xentropy_losses: 0.1086800041113589
current epoch: 8000, xentropy_losses: 0.10757166633306339
current epoch: 9000, xentropy_losses: 0.10700450245408859
current epoch: 10000, xentropy_losses: 0.10679240635369075
current epoch: 11000, xentropy_losses: 0.10681236180759407
current epoch: 12000, xentropy_losses: 0.10698352035063056
current epoch: 13000, xentropy_losses: 0.10725261265613953
current epoch: 14000, xentropy_losses: 0.10758430809572092
current epoch: 15000, xentropy_losses: 0.10795498173090191
current epoch: 16000, xentropy_losses: 0.10834870723473895
Early Stoppin

In [97]:
#current weights
Theta

array([[ 0.83952886,  7.64029406, -7.47368453],
       [-8.20116368, -2.31041246, 11.56631566],
       [-6.5302452 ,  0.93794597,  7.46947239]])

Predictions for validation set

In [98]:
logits=X_valid @ Theta
predicted_proba=softmax(logits)
predicted_labels=np.argmax(predicted_proba,axis=1)
acc_score=(predicted_labels==Y_valid).mean()
print(f'accuracy score for validation set: {acc_score}')

accuracy score for validation set: 0.9333333333333333


Predictions for test set

In [99]:
logits=X_test@Theta
predicted_proba=softmax(logits)
predicted_labels=np.argmax(predicted_proba,axis=1)
acc_score=(predicted_labels==Y_test).mean()
print(f'accuracy score for test set: {acc_score}')

accuracy score for test set: 0.9666666666666667
