In [1]:
import numpy as np
from copy import deepcopy

In [84]:
class Softmax:
    """
    Softmax-Regression with ridge-regulation, early stopping and mini-batch option.
    """
    def __init__(self, eta: float=0.1, max_iter: int=5000, epsilon: float=1e-1, alpha: float=0.1, stop_at_first_minimum: bool=False, validation_size: float=0.2, batch_size: int=64):
        """
        Instantiate softmax regression
        Args:
            eta: learning rate (step-size). Default 0.1
            max_iter: the maximum number of training epochs. Default 5000
            epsilon: tiny value which is added to the result of the softmax function while calculating the loss to avoid nan (Not a Number) values
            alpha: regulatoin-weight for the ridge-regulation (l2-norm)
            validation_size: size of the validation set. Default 0.2
            batch_size: Size of a batch (mini-batch learning). Default 512. If None normal batch-learning is used.
        """
        self.eta = eta
        self.max_iter = max_iter
        self.epsilon = epsilon
        self.alpha = alpha
        self.validation_size = validation_size
        self.stop_at_first_minimum = stop_at_first_minimum
        self.batch_size = batch_size
        pass
    
    def _calc_score(self, X: np.ndarray) -> np.ndarray:
        """
        Calculate the softmax-scores
        
        Args:
            X: Dataset with bias term
        """
        return X.dot(self.Theta)
    
    def _calc_softmax(self, scores: np.ndarray) -> np.ndarray:
        """
        Estimate the probabilities for each point and category using the softmax function.
        
        Args:
            scores: softmax-scores
        """
        exps = np.exp(scores)
        exps_sums = np.sum(exps, axis=1, keepdims=True)
        return exps / exps_sums

    def _calc_loss(self, X: np.ndarray, y: np.ndarray, y_proba: np.ndarray) -> float:
        """
        Calculates the loss with the cross entropy as loss function. A l2-regulation is added.
        
        Args:
            X: Dataset with bias term
            y: labels (one-hot encoded)
            y_proba: probabilities for each point and category (from self._calc_softmax)

        Returns:
            Loss
        """
        xentropy_loss = -np.mean(np.sum(y * np.log(y_proba + self.epsilon), axis=1))
        l2_loss = 0.5 * np.sum(np.square(self.Theta[1:]))
        return xentropy_loss + self.alpha * l2_loss
    
    def _calc_gradients(self, X: np.ndarray, y: np.ndarray, y_proba: np.ndarray) -> np.ndarray:
        """
        Calculates the gradients.
        
        Args:
            X: Dataset with bias term
            y: labels (one-hot encoded)
            y_proba: probabilities for each point and category (from self._calc_softmax)
            
        Returns:
            The gradients
        """
        error = y_proba - y
        return 1/self.m * X.T.dot(error) + np.r_[np.zeros([1, self.n_outputs]), self.alpha * self.Theta[1:]]
    
    def split(X: np.ndarray, y: np.ndarray, ratio: float=0.2) -> tuple:
        """
        Splits
        Args:
            X: Dataset
            y: labels
            ratio: size of the second set (the smaller one). Default 0.2
        
        Returns:
            tuple with X_1, X_2, y_1, y_2
        """
        total_size = len(X)
        
        size_two = int(total_size * ratio)
        size_one = total_size - size_two
        
        if size_two == 0:
            raise ValueError(F'More entries are required, currently: {total_size}')
        
        random_indices = np.random.permutation(total_size)
        
        X_1 = X[random_indices[:size_one]]
        y_1 = y[random_indices[:size_one]]
        
        X_2 = X[random_indices[size_one:]]
        y_2 = y[random_indices[size_one:]]
        return X_1, X_2, y_1, y_2
    
    def transform(self, X: np.ndarray, y=None):
        """
        Encode the labels with one-hot and adds bias term to the X
        
        Args:
            X: Dataset (without bias term)
            y: labels (NOT one-hot encoded). Default None
        
        Returns:
            tuple with normalized X with bias term and a one-hot encoded y or only X if y is None
        """
        X = np.c_[np.ones([len(X), 1]), X]
        
        # normalize all features
        X /= X.max()
        
        # one-hot encoding
        if y is not None:
            cats = [y==cat for cat in range(y.max() + 1)]
            y = np.array(cats).astype(np.int).T
            return X, y
        else:
            return X
    
    def predict(self, X: np.ndarray) -> np.ndarray:
        """
        Predicts categories for dataset
        
        Args:
            X: Dataset with bias term
        """
        return self._calc_softmax(self._calc_score(X))
    
    def fit_once(self, X: np.ndarray, y: np.ndarray) -> float:
        """
        Args:
            X: Dataset with bias term
            y: Labels (one-hot encoded)
            
        Returns:
            Loss
        """
        self.n_inputs = X.shape[1]
        self.n_outputs = y.shape[1]
        self.m = len(X)
        
        if self.batch_size is not None and self.batch_size < self.m:
            batch_indices = np.random.randint(0, self.m-1, self.batch_size)
            X = X[batch_indices]
            y = y[batch_indices]
        
        self.Theta = np.random.randn(self.n_inputs, self.n_outputs)
        y_proba = self.predict(X)
        loss = self._calc_loss(X, y, y_proba)
        gradients = self._calc_gradients(X, y, y_proba)
        
        self.Theta = self.Theta - self.eta * gradients
        return loss
    
    def fit(self, X: np.ndarray, y: np.ndarray):
        """
        Fits the model with early stopping and l2-regulation
        
        Args:
            X: Dataset with bias term
            y: Labels (one-hot encoded)
        """
        X_train, X_val, y_train, y_val = Softmax.split(X, y, self.validation_size)
        
        self.best_loss = np.infty
        self.best_Theta = None
        self.best_epoch = None
        
        for epoch in range(self.max_iter):
            loss = self.fit_once(X_train, y_train)
            y_val_proba = self.predict(X_val)
            validation_loss = self._calc_loss(X_val, y_val, y_val_proba)
            if epoch % int(self.max_iter / 10) == 0:
                print(F'Train Lss: {loss}')
                print(F'Val. Loss: {validation_loss}')
                print(F'Best Loss: {self.best_loss}')
            if validation_loss < self.best_loss:
                self.best_loss = validation_loss
                self.best_Theta = deepcopy(self.Theta)
                self.best_epoch = epoch
            else:
                if self.stop_at_first_minimum and self.best_loss is not np.infty:
                    break
        if self.best_Theta is not None:
            self.Theta = deepcopy(self.best_Theta)
            print(F'Best Loss: {self.best_loss}')

    
    def fit_transform(self, X: np.ndarray, y: np.ndarray):
        """
        Encodes the labels with one-hot, normalize X and fits the model.
        
        Args:
            X: Dataset (without bias term)
            y: labels (NOT one-hot encoded)
        """
        X, y = self.transform(X, y)
        self.fit(X, y)

In [159]:
from tensorflow.keras.datasets import mnist
(X_train, y_train), (X_test, y_test) = mnist.load_data()

In [160]:
X_train = X_train.reshape((X_train.shape[0], -1))
X_test = X_test.reshape((X_test.shape[0], -1))

In [161]:
clf = Softmax(max_iter=2500, stop_at_first_minimum=False, eta=10, alpha=0.1, batch_size=50)
clf.fit_transform(X_train, y_train)

Train Lss: 390.5148243820718
Val. Loss: 1.6060772909554093
Best Loss: inf
Train Lss: 393.16511742411495
Val. Loss: 1.6046119646228836
Best Loss: 1.598465210164543
Train Lss: 395.17991401953446
Val. Loss: 1.603690934060315
Best Loss: 1.598465210164543
Train Lss: 386.72892016795015
Val. Loss: 1.6036743063424879
Best Loss: 1.598465210164543
Train Lss: 396.3813853231682
Val. Loss: 1.6052236419955723
Best Loss: 1.598465210164543
Train Lss: 397.10733081122726
Val. Loss: 1.603599164524404
Best Loss: 1.598465210164543
Train Lss: 402.7100451375413
Val. Loss: 1.605833758859314
Best Loss: 1.598465210164543
Train Lss: 386.27641615735575
Val. Loss: 1.6041308580606866
Best Loss: 1.598465210164543
Train Lss: 397.73796947656444
Val. Loss: 1.6066126036752406
Best Loss: 1.598465210164543
Train Lss: 400.93564322661547
Val. Loss: 1.6036931815929996
Best Loss: 1.598465210164543
Best Loss: 1.598465210164543


In [162]:
y_predict = np.argmax(clf.predict(clf.transform(X_test[:100])), axis=1)

accuracy_score = np.mean(y_predict == y_test[:100])
accuracy_score

0.08

In [163]:
clf.best_epoch

85

In [164]:
clf.best_loss

1.598465210164543

# Mit iris - schon besser

In [91]:
from sklearn import datasets
iris = datasets.load_iris()

In [92]:
X = iris["data"][:, 3:]  # petal width
y = (iris["target"] == 2).astype(np.int)  # 1 if Iris-Virginica, else 0

In [153]:
X_train, X_test, y_train, y_test = Softmax.split(X, y)

In [154]:
clf = Softmax(max_iter=5000, stop_at_first_minimum=True, batch_size=None)
clf.fit_transform(X_train, y_train)

Train Lss: 0.5298486035576471
Val. Loss: 0.5089781651458902
Best Loss: inf
Best Loss: 0.5089781651458902


In [155]:
y_predict = np.argmax(clf.predict(clf.transform(X_test)), axis=1)

accuracy_score = np.mean(y_predict == y_test)
accuracy_score

0.9666666666666667

In [156]:
clf.best_epoch

0

In [157]:
clf.best_loss

0.5089781651458902

In [158]:
assert clf.Theta.all() == clf.best_Theta.all()