# Dieses Notebook war nur zum ausprobieren!!

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.datasets import load_iris

In [2]:
iris = load_iris()
X = iris['data'][:, (2,3)]
y = iris['target']

Dem Input X muss ein Achsenabschnitt (Bias-Term) hinzugefügt werden: $\theta_0$  
* X (150, 2)  
* X_with_bias (150, 3)

In [9]:
# np.ones erstellt ein (150, 1) array und np.c_ fügt dieses und X zusammen
X_with_bias = np.c_[np.ones([len(X), 1]), X]

Erstelle trainings, validation und test sets

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_with_bias, y, test_size=0.2, random_state=42)

In [17]:
X_train, X_val, y_train, y_val = X_train[:100], X_train[100:], y_train[:100], y_train[100:]

In [29]:
def to_one_hot(y):
    """Die labels werden mithilfe von one-hot umgewandelt"""
    cats = [y==cat for cat in range(y.max() + 1)]
    return np.array(cats).astype(np.int).T

In [30]:
Y_train_one_hot = to_one_hot(y_train)
Y_val_one_hot = to_one_hot(y_val)
Y_test_one_hot = to_one_hot(y_test)

Softmax funktion:
$$\hat p_k = \sigma(s(x())_k = \frac{exp(s_k(x))}{\sum_{j=1}^Kexp(s_j(x))}$$


In [32]:
def softmax(logits):
    """Wendet die Softmax-Funktion auf berechnete Softmax-Score für eine Kategorie an."""
    exps = np.exp(logits)
    exp_sums = np.sum(exps, axis=1, keepdims=True)
    return exps / exp_sums

Wir brauchen die Anzahl der Merkmale und der Kategorien

In [35]:
n_inputs = X_train.shape[1] # => 3 (2 Merkmale plus den Achsenabschnitt)
n_outputs = Y_test_one_hot.shape[1] # => 3 (drei verschiedene Kategorien)

Kostenfunktion:  
$J(\mathbf{\Theta}) =
- \dfrac{1}{m}\sum\limits_{i=1}^{m}\sum\limits_{k=1}^{K}{y_k^{(i)}\log\left(\hat{p}_k^{(i)}\right)}$

Gradientenvektor:  
$\nabla_{\mathbf{\theta}^{(k)}} \, J(\mathbf{\Theta}) = \dfrac{1}{m} \sum\limits_{i=1}^{m}{ \left ( \hat{p}^{(i)}_k - y_k^{(i)} \right ) \mathbf{x}^{(i)}}$

Note that $\log\left(\hat{p}_k^{(i)}\right)$ may not be computable if $\hat{p}_k^{(i)} = 0$. So we will add a tiny value $\epsilon$ to $\log\left(\hat{p}_k^{(i)}\right)$ to avoid getting `nan` values.

In [65]:
eta = 0.01 # Lernrate
max_iter = 5001
m = len(X_train) # Anzahl Datenpunkte
epsilon = 1e-7 # Verhindert nan Werte beim log-rechnen

Theta = np.random.randn(n_inputs, n_outputs)

for epoch in range(max_iter):
    scores = X_train.dot(Theta)
    Y_proba = softmax(scores)
    loss = -np.mean(np.sum(
            Y_train_one_hot * np.log(Y_proba + epsilon), axis=1
    ))
    error = Y_proba - Y_train_one_hot
    if epoch % 500 == 0:
        print(F'Epoch: {epoch}, Loss: {loss}')
        pass
    # TODO: ich habe in meiner eigenen Version zu viel erledigt (np.sum wird gar nicht benötigt...)
    gradients = 1 / m * X_train.T.dot(error)
    Theta = Theta - eta * gradients

Epoch: 0, Loss: 1.2191585568450913
Epoch: 500, Loss: 0.7360576290888219
Epoch: 1000, Loss: 0.6260262212306025
Epoch: 1500, Loss: 0.5577668297803011
Epoch: 2000, Loss: 0.5113008189214988
Epoch: 2500, Loss: 0.47721551030290577
Epoch: 3000, Loss: 0.45078170068234436
Epoch: 3500, Loss: 0.42942239728259474
Epoch: 4000, Loss: 0.4116245988525047
Epoch: 4500, Loss: 0.3964417215267641
Epoch: 5000, Loss: 0.38324935700511437


In [66]:
Theta

array([[ 3.72665469,  0.08499043, -2.26622633],
       [-0.56698803,  0.26471754,  0.03922486],
       [-2.91250873, -0.81754659,  1.30250599]])

In [67]:
# Validierung
scores = X_val.dot(Theta)
Y_proba = softmax(scores)
y_predict = np.argmax(Y_proba, axis=1)
accuracy_score = np.mean(y_predict == y_val)
accuracy_score

0.95

In [83]:
eta = 0.1
max_iterations = 5001
m = len(X_train)
epsilon = 1e-7
alpha = 0.1 # Regularisierungs hyperparameter

Theta = np.random.randn(n_inputs, n_outputs)

for epoch in range(max_iter):
    scores = X_train.dot(Theta)
    Y_proba = softmax(scores)
    xentropy_loss = -np.mean(np.sum(Y_train_one_hot * np.log(Y_proba + epsilon), axis=1))
    l2_loss = 0.5 * np.sum(np.square(Theta[1:]))
    loss = xentropy_loss + alpha * l2_loss
    error = Y_proba - Y_train_one_hot
    if epoch % 500 == 0:
        print(F'Itertion: {epoch}, loss: {loss}')
    # Gradienten mit der l2-regulation berechnen
    gradients = 1/m * X_train.T.dot(error) + np.r_[np.zeros([1, n_outputs]), alpha * Theta[1:]]

Itertion: 0, loss: 2.9050797476581396
Itertion: 500, loss: 2.9050797476581396
Itertion: 1000, loss: 2.9050797476581396
Itertion: 1500, loss: 2.9050797476581396
Itertion: 2000, loss: 2.9050797476581396
Itertion: 2500, loss: 2.9050797476581396
Itertion: 3000, loss: 2.9050797476581396
Itertion: 3500, loss: 2.9050797476581396
Itertion: 4000, loss: 2.9050797476581396
Itertion: 4500, loss: 2.9050797476581396
Itertion: 5000, loss: 2.9050797476581396


In [89]:
logits = X_val.dot(Theta)
Y_proba = softmax(logits)
y_predict = np.argmax(Y_proba, axis=1)

accuracy_score = np.mean(y_predict == y_val)
accuracy_score

0.3

In [142]:
# mit regularisierung und early-stopping
# Man könnte zusätzlich noch die lernrate senken
eta = 0.1
max_iter = 5001
m = len(X_train)
epsilon = 1e-7
alpha = 0.1
best_loss = np.infty

Theta = np.random.randn(n_inputs, n_outputs)

for epoch in range(max_iter):
    scores = X_train.dot(Theta)
    Y_proba = softmax(scores)
    xentropy_loss = -np.mean(np.sum(Y_train_one_hot * np.log(Y_proba + epsilon), axis=1))
    l2_loss = 0.5 * np.sum(np.square(Theta[1:]))
    loss = xentropy_loss + alpha * l2_loss
    error = Y_proba - Y_train_one_hot
    gradients = 1/m * X_train.T.dot(error) + np.r_[np.zeros([1, n_outputs]), alpha * Theta[1:]]
    Theta = Theta - eta * gradients
    
    # validation
    scores = X_val.dot(Theta)
    Y_proba = softmax(scores)
    xentropy_loss = -np.mean(np.sum(Y_val_one_hot * np.log(Y_proba + epsilon), axis=1))
    l2_loss = 0.5 * np.sum(np.square(Theta[1:]))
    loss = xentropy_loss + alpha * l2_loss
    if epoch % 500 == 0:
        print(F'Itertion: {epoch}, loss: {loss}')
    if loss < best_loss:
        best_loss = loss
    else:
        print(F'Itertion: {epoch - 1}, loss: {best_loss} best')
        print(F'Itertion: {epoch}, loss: {loss} stopping!')
        break

Itertion: 0, loss: 1.3401825392701279
Itertion: 500, loss: 0.5814692580768537
Itertion: 1000, loss: 0.5594655198166198
Itertion: 1500, loss: 0.5514292443728923
Itertion: 2000, loss: 0.5479757638386404
Itertion: 2500, loss: 0.5463687674015348
Itertion: 3000, loss: 0.545569548311599
Itertion: 3500, loss: 0.54514768614606
Itertion: 4000, loss: 0.5449129731318272
Itertion: 4500, loss: 0.5447763919251096
Itertion: 5000, loss: 0.5446939580232468


In [139]:
eta

0.1

In [146]:
type(Theta)

numpy.ndarray

In [147]:
Theta.shape

(3, 3)

In [148]:
Theta

array([[ 4.194398  ,  0.3001704 , -4.84803457],
       [-1.11015333,  0.15069875,  0.95945458],
       [-0.44216325, -0.1191537 ,  0.56131696]])

In [143]:
logits = X_val.dot(Theta)
Y_proba = softmax(logits)
y_predict = np.argmax(Y_proba, axis=1)

accuracy_score = np.mean(y_predict == y_val)
accuracy_score

0.95

In [144]:
logits = X_test.dot(Theta)
Y_proba = softmax(logits)
y_predict = np.argmax(Y_proba, axis=1)

accuracy_score = np.mean(y_predict == y_test)
accuracy_score

1.0