In [6]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
#from IPython.display import display, Markdown, Latex
#from sklearn.datasets import make_blobs
#from matplotlib.widgets import Slider

In [7]:
def my_softmax(z):
    ez = np.exp(z)              #element-wise exponenial
    sm = ez/np.sum(ez)
    return(sm)

In [53]:
# make  dataset for example
centers = [[-5, 2], [-2, -2], [1, 2], [5, -2]]
#X_train, y_train = make_blobs(n_samples=2000, centers=centers, cluster_std=1.0,random_state=30)
X_train=np.array([[-5, 2], [-2, -2], [1, 2], [5, -2]])
y_train=np.array([0,1,2,3]).reshape(-1,1)     # remember label shoul start from 0 as it must be < size of len of it

In [54]:
X=np.tile(X_train,(100,1))
#X=X_train
y=np.tile(y_train,(100,1))
y=y[:,0]

In [55]:
"""The Obvious organization¶
The model below is implemented with the softmax as an activation in the final Dense layer. The loss function is separately specified in the compile directive.

The loss function is SparseCategoricalCrossentropy. This loss is described in (3) above. In this model, the softmax takes place in the last layer. 
The loss function takes in the softmax output which is a vector of probabilities."""

print(X[:9])
print(y[:9])
print(len(X),len(y))

[[-5  2]
 [-2 -2]
 [ 1  2]
 [ 5 -2]
 [-5  2]
 [-2 -2]
 [ 1  2]
 [ 5 -2]
 [-5  2]]
[0 1 2 3 0 1 2 3 0]
400 400


In [56]:
model = Sequential(
    [ 
        Dense(25, activation = 'relu'),
        Dense(15, activation = 'relu'),
        Dense(4, activation = 'softmax')    # < softmax activation here
    ]
)
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(0.001),
)

model.fit(
    X,y,epochs=10
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f6b0583ff10>

In [57]:
#Because the softmax is integrated into the output layer, the output is a vector of probabilities.

In [58]:
p_nonpreferred = model.predict(X_train)
print(p_nonpreferred [:5])
print("largest value", np.max(p_nonpreferred), "smallest value", np.min(p_nonpreferred))

[[7.6743430e-01 1.8939564e-01 2.7855283e-02 1.5314807e-02]
 [3.2058734e-02 9.0862471e-01 4.1922186e-02 1.7394384e-02]
 [3.1962457e-01 2.5320107e-01 2.8336248e-01 1.4381191e-01]
 [7.8009553e-03 3.6172342e-02 4.0040983e-04 9.5562631e-01]]
largest value 0.9556263 smallest value 0.00040040983


In [65]:
p=np.arange(1,21).reshape(5,-1)
print(p)
print(np.max(p[0,:]))

[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]
 [13 14 15 16]
 [17 18 19 20]]
4


In [66]:
"""Preferred 
Recall from lecture, more stable and accurate results can be obtained if the softmax and loss are combined during training.
 This is enabled by the 'preferred' organization shown here.

In the preferred organization the final layer has a linear activation. For historical reasons, the outputs in this form are referred to as logits. 
The loss function has an additional argument: from_logits = True. This informs the loss function that the softmax operation should be included in the loss calculation. 
This allows for an optimized implementation."""

"Preferred \nRecall from lecture, more stable and accurate results can be obtained if the softmax and loss are combined during training.\n This is enabled by the 'preferred' organization shown here.\n\nIn the preferred organization the final layer has a linear activation. For historical reasons, the outputs in this form are referred to as logits. \nThe loss function has an additional argument: from_logits = True. This informs the loss function that the softmax operation should be included in the loss calculation. \nThis allows for an optimized implementation."

In [67]:
preferred_model = Sequential(
    [ 
        Dense(25, activation = 'relu'),
        Dense(15, activation = 'relu'),
        Dense(4, activation = 'linear')   #<-- Note
    ]
)
preferred_model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),  #<-- Note
    optimizer=tf.keras.optimizers.Adam(0.001),
)

preferred_model.fit(
    X_train,y_train,
    epochs=10
)
        

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f6b0bfca800>

In [68]:
#Notice that in the preferred model, the outputs are not probabilities, but can range from large negative numbers to large positive numbers. The output must be sent through a softmax when performing a prediction that expects a probability.
# Let's look at the preferred model outputs:

In [97]:
p_preferred = preferred_model.predict(X)
print(f"two example output vectors:\n {p_preferred[:9]}")
print("largest value", np.max(p_preferred), "smallest value", np.min(p_preferred))

two example output vectors:
 [[ 0.50100267  0.16643001 -0.3375221  -0.38199842]
 [-0.2661833  -0.94846386 -0.58020353  0.37000045]
 [ 0.01260066  0.10695031  0.16173512 -0.2072627 ]
 [ 0.9196835  -0.12825637 -0.29331413  0.5713747 ]
 [ 0.50100267  0.16643001 -0.3375221  -0.38199842]
 [-0.2661833  -0.94846386 -0.58020353  0.37000045]
 [ 0.01260066  0.10695031  0.16173512 -0.2072627 ]
 [ 0.9196835  -0.12825637 -0.29331413  0.5713747 ]
 [ 0.50100267  0.16643001 -0.3375221  -0.38199842]]
largest value 0.91968364 smallest value -0.94846386


In [98]:
sm_preferred = tf.nn.softmax(p_preferred).numpy() #.numpy() for converting tensorflow rep to numpy rep
print(f"two example output vectors:\n {sm_preferred[:2]}")
print("largest value", np.max(sm_preferred), "smallest value", np.min(sm_preferred))


two example output vectors:
 [[0.39039144 0.27938122 0.1687849  0.16144247]
 [0.24241106 0.1225298  0.1770823  0.45797685]]
largest value 0.45797685 smallest value 0.122529805


In [99]:
sm_preferred = list()
for i in p_preferred:
  sm_preferred.append(my_softmax(i))
sm_preferred=np.array(sm_preferred)
print(f"two example output vectors:\n {sm_preferred[:2]}")
print("largest value", np.max(sm_preferred), "smallest value", np.min(sm_preferred))



two example output vectors:
 [[0.3903914  0.27938125 0.16878492 0.16144247]
 [0.24241108 0.1225298  0.17708232 0.4579768 ]]
largest value 0.4579768 smallest value 0.1225298


In [100]:
#To select the most likely category, the softmax is not required. One can find the index of the largest output using np.argmax().

p=np.random.rand(5)
print(p)
print(np.argmax(p))

[0.21759378 0.78541515 0.95661636 0.17922873 0.82137044]
2


In [101]:
for i in range(5):
    print( f"{p_preferred[i]}, category: {np.argmax(p_preferred[i])}")

[ 0.50100267  0.16643001 -0.3375221  -0.38199842], category: 0
[-0.2661833  -0.94846386 -0.58020353  0.37000045], category: 3
[ 0.01260066  0.10695031  0.16173512 -0.2072627 ], category: 2
[ 0.9196835  -0.12825637 -0.29331413  0.5713747 ], category: 0
[ 0.50100267  0.16643001 -0.3375221  -0.38199842], category: 0


In [102]:
"""SparseCategorialCrossentropy or CategoricalCrossEntropy¶
Tensorflow has two potential formats for target values and the selection of the loss defines which is expected.

SparseCategorialCrossentropy: expects the target to be an integer corresponding to the index. For example, if there are 10 potential target values, y would be between 0 and 9.
CategoricalCrossEntropy: Expects the target value of an example to be one-hot encoded where the value at the target index is 1 while the other N-1 entries are zero.
 An example with 10 potential target values, where the target is 2 would be [0,0,1,0,0,0,0,0,0,0]."""

'SparseCategorialCrossentropy or CategoricalCrossEntropy¶\nTensorflow has two potential formats for target values and the selection of the loss defines which is expected.\n\nSparseCategorialCrossentropy: expects the target to be an integer corresponding to the index. For example, if there are 10 potential target values, y would be between 0 and 9.\nCategoricalCrossEntropy: Expects the target value of an example to be one-hot encoded where the value at the target index is 1 while the other N-1 entries are zero.\n An example with 10 potential target values, where the target is 2 would be [0,0,1,0,0,0,0,0,0,0].'