In [1]:
import numpy as np
from copy import deepcopy

In [60]:
class Softmax:
    """
    Softmax-Regression with ridge-regulation and early stopping.
    """
    def __init__(self, eta: float=0.1, max_iter: int=5000, epsilon: float=1e-7, alpha: float=0.1, stop_at_first_minimum: bool=False, validation_size: float=0.2):
        """
        Instantiate softmax regression
        Args:
            eta: learning rate (step-size). Default 0.1
            max_iter: the maximum number of training epochs. Default 5000
            epsilon: tiny value which is added to the result of the softmax function while calculating the loss to avoid nan (Not a Number) values
            alpha: regulatoin-weight for the ridge-regulation (l2-norm)
            validation_size: size of the validation set. Default 0.2
        """
        self.eta = eta
        self.max_iter = max_iter
        self.epsilon = epsilon
        self.alpha = alpha
        self.validation_size = validation_size
        self.stop_at_first_minimum = stop_at_first_minimum
        pass
    
    def _calc_score(self, X: np.ndarray) -> np.ndarray:
        """
        Calculate the softmax-scores
        
        Args:
            X: Dataset with bias term
        """
        return X.dot(self.Theta)
    
    def _calc_softmax(self, scores: np.ndarray) -> np.ndarray:
        """
        Estimate the probabilities for each point and category using the softmax function.
        
        Args:
            scores: softmax-scores
        """
        exps = np.exp(scores)
        exps_sums = np.sum(exps, axis=1, keepdims=True)
        return exps / exps_sums

    def _calc_loss(self, X: np.ndarray, y: np.ndarray, y_proba: np.ndarray) -> float:
        """
        Calculates the loss with the cross entropy as loss function. A l2-regulation is added.
        
        Args:
            X: Dataset with bias term
            y: labels (one-hot encoded)
            y_proba: probabilities for each point and category (from self._calc_softmax)

        Returns:
            Loss
        """
        xentropy_loss = -np.mean(np.sum(y * np.log(y_proba + self.epsilon), axis=1))
        l2_loss = 0.5 * np.sum(np.square(self.Theta[1:]))
        return xentropy_loss + self.alpha * l2_loss
    
    def _calc_gradients(self, X: np.ndarray, y: np.ndarray, y_proba: np.ndarray) -> np.ndarray:
        """
        Calculates the gradients.
        
        Args:
            X: Dataset with bias term
            y: labels (one-hot encoded)
            y_proba: probabilities for each point and category (from self._calc_softmax)
            
        Returns:
            The gradients
        """
        error = y_proba - y
        return 1/self.m * X.T.dot(error) + np.r_[np.zeros([1, self.n_outputs]), self.alpha * self.Theta[1:]]
    
    def split(X: np.ndarray, y: np.ndarray, ratio: float=0.2) -> tuple:
        """
        Splits
        Args:
            X: Dataset
            y: labels
            ratio: size of the second set (the smaller one). Default 0.2
        
        Returns:
            tuple with X_1, X_2, y_1, y_2
        """
        total_size = len(X)
        
        size_two = int(total_size * ratio)
        size_one = total_size - size_two
        
        random_indices = np.random.permutation(total_size)
        
        X_1 = X[random_indices[:size_one]]
        y_1 = y[random_indices[:size_one]]
        
        X_2 = X[random_indices[size_one:]]
        y_2 = y[random_indices[size_one:]]
        return X_1, X_2, y_1, y_2
    
    def transform(self, X: np.ndarray, y=None):
        """
        Encode the labels with one-hot and adds bias term to the X
        
        Args:
            X: Dataset (without bias term)
            y: labels (NOT one-hot encoded). Default None
        
        Returns:
            tuple with X with bias term and a one-hot encoded y or only X if y is None
        """
        X = np.c_[np.ones([len(X), 1]), X]
        if y is not None:
            cats = [y==cat for cat in range(y.max() + 1)]
            y = np.array(cats).astype(np.int).T
            return X, y
        else:
            return X
    
    def predict(self, X: np.ndarray) -> np.ndarray:
        """
        Predicts categories for dataset
        
        Args:
            X: Dataset with bias term
        """
        return self._calc_softmax(self._calc_score(X))
    
    def fit_once(self, X: np.ndarray, y: np.ndarray) -> float:
        """
        Args:
            X: Dataset with bias term
            y: Labels (one-hot encoded)
            
        Returns:
            Loss
        """
        self.n_inputs = X.shape[1]
        self.n_outputs = y.shape[1]
        self.m = len(X)
        
        self.Theta = np.random.randn(self.n_inputs, self.n_outputs)
        y_proba = self.predict(X)
        loss = self._calc_loss(X, y, y_proba)
        gradients = self._calc_gradients(X, y, y_proba)
        
        self.Theta = self.Theta - self.eta * gradients
        return loss
    
    def fit(self, X: np.ndarray, y: np.ndarray):
        """
        Fits the model with early stopping and l2-regulation
        
        Args:
            X: Dataset with bias term
            y: Labels (one-hot encoded)
        """
        X_train, X_val, y_train, y_val = Softmax.split(X, y, self.validation_size)
        
        self.best_loss = np.infty
        self.best_Theta = None
        self.best_epoch = None
        
        for epoch in range(self.max_iter):
            self.fit_once(X_train, y_train)
            validation_loss = self._calc_loss(X_val, y_val, self.predict(X_val))
            if validation_loss < self.best_loss:
                self.best_loss = validation_loss
                self.best_Theta = deepcopy(self.Theta)
                self.best_epoch = epoch
            else:
                if self.stop_at_first_minimum:
                    break
        if self.best_Theta is not None:
            self.Theta = deepcopy(self.best_Theta)
    
    def fit_transform(self, X: np.ndarray, y: np.ndarray):
        """
        Encodes the labels with one-hot and fits the model.
        
        Args:
            X: Dataset (without bias term)
            y: labels (NOT one-hot encoded)
        """
        X, y = self.transform(X, y)
        self.fit(X, y)

In [3]:
from sklearn.datasets import load_iris

iris = load_iris()
a = iris['data'][:, (2,3)]
b = iris['target']

In [61]:
X_train, X_test, y_train, y_test = Softmax.split(a, b)

In [62]:
clf = Softmax(max_iter=10000, stop_at_first_minimum=False)
clf.fit_transform(X_train, y_train)

In [63]:
y_predict = np.argmax(clf.predict(clf.transform(X_test)), axis=1)

accuracy_score = np.mean(y_predict == y_test)
accuracy_score

0.6

In [64]:
clf.best_epoch

3012

In [65]:
clf.best_loss

0.7172113734524296

In [66]:
clf.best_Theta

array([[ 1.3644425 , -0.83455237, -0.7295832 ],
       [-0.44403057,  0.51176669,  0.19469608],
       [-0.21475755, -0.20064712,  0.46387196]])