# ソフトマックス関数

## モデル化
### ネットワーク構造



### 活性化関数 : ソフトマックス関数(softmax function)
$$
y_{k} \equiv p\left( {C = k} | {\boldsymbol {\mathrm {x}}} \right) = \frac {\mathrm {exp}\left( u_{k} \right)}{\sum _{j=1}^{K}{\mathrm {exp}\left( u_{j} \right)}}\\
u_{k} = \boldsymbol {\mathrm {w}}_{k}^{T} \boldsymbol {\mathrm {x}} + b_{k}
$$
$$
\sum _{k=1}^{K}{y_{k}} = 1
$$

### 誤差関数 : 交差エントロピー
訓練データ : 
$$
y_{ i }=\left( \begin{matrix} { y }_{ i1 } & { y }_{ i2 } & \dots  & { y }_{ iK-1 } & { y }_{ iK } \end{matrix} \right) ^{ T }
$$
$$
y_{ i }=\left( \begin{matrix} 0 & 0 & \dots  & 1 & 0 \end{matrix} \right) ^{ T }
$$
尤度関数:
$$
L\left( \boldsymbol {\mathrm {W}} \right) =\prod _{ i=1 }^{ N }{ p\left( { y_{ i } }|{ { \boldsymbol {\mathrm {x}} }_{ i } }; \boldsymbol {\mathrm {W}} \right)  } =\prod _{ i=1 }^{ N }{ \prod _{ k=1 }^{ K }{ { p\left( { C = k }|{ { \boldsymbol {\mathrm {x}} }_{ i } }; \boldsymbol {\mathrm {W}} \right)  }^{ { y }_{ ik } } }  } =\prod _{ i=1 }^{ N }{ \prod _{ k=1 }^{ K }{ { \left( { y }_{ k }\left( { \boldsymbol {\mathrm {x}} }_{ i }; \boldsymbol {\mathrm {W}} \right) \right) }^{ { y }_{ ik } } }  }
$$
負の対数尤度関数(交差エントロピー誤差関数):
$$
E\left( \boldsymbol {\mathrm {W}} \right) = -\log L\left( \boldsymbol {\mathrm {W}} \right) = -\sum _{ i=1 }^{ N }{ \sum _{ k=1 }^{ K }{ { y }_{ ik }\log { { y }_{ k }\left( { \boldsymbol {\mathrm {x}} }_{ i }; \boldsymbol {\mathrm {W}} \right)  }  }  }
$$

$$
\hat {\boldsymbol {\mathrm {W}}} = \mathop {\mathrm {arg\,min}} _{ \boldsymbol {\mathrm {W}} }{E\left( \boldsymbol {\mathrm {W}} \right)}
$$

### 学習法 : 勾配降下法(gradient descent)
$$
\nabla E \equiv \frac {\partial E}{\partial \boldsymbol {W}} = \left( \frac {\partial E}{\partial \boldsymbol {\mathrm {w}}_{1}}, \dots, \frac {\partial E}{\partial \boldsymbol {\mathrm {w}}_{p}} \right)^{T}\\
\boldsymbol {\mathrm {w}}^{(t+1)} = \boldsymbol {\mathrm {w}}^{(t)} - \epsilon \nabla E
$$


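ここで $f_{j}\left( \boldsymbol {\mathrm {x}}_{i} ; \boldsymbol {\mathrm {W}} \right)$ は入力 $\boldsymbol {\mathrm {x}}_{i}$ に対するクラス $j$ のソフトマックス出力、$y_{ij}$ は正解ラベル(one-hot 表現)の第 $j$ 成分を表す。

$$
\frac {\partial E}{\partial \boldsymbol {\mathrm {w}}_{j}} = - \sum _{i=1}^{N}{\left[ y_{ij} - f_{j}\left( \boldsymbol {\mathrm {x}}_{i} ; \boldsymbol {\mathrm {W}} \right) \right] \boldsymbol {\mathrm {x}}_{i}}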
$$

$$
\frac {\partial E}{\partial \boldsymbol {\mathrm {W}}} = - \sum _{i=1}^{N}{\left( \boldsymbol {y}_{i} - \boldsymbol {f}\left( \boldsymbol {\mathrm {x}}_{i} ; \boldsymbol {\mathrm {W}} \right) \right) \boldsymbol {\mathrm {x}}_{i}^{T}}
$$

#### (バッチ)勾配降下法(gradient descent)
WIP

#### 確率的勾配降下法(stochastic gradient descent)
WIP

#### ミニバッチ勾配降下法(minibatch gradient descent)
WIP

# 実装
## TensorFlowによる実装

In [4]:
import numpy as np
import tensorflow as tf

### モデルの定義

 - パラメータの定義
 - 出力層の定義
 - 誤差関数の定義
 - 最適化手法の定義

In [17]:
# Softmax-regression graph: sizes, parameters, output layer, loss, optimizer
# and a per-sample correctness op.

M = 2        # dimensionality of one input vector
K = 3        # number of classes
n = 100      # number of samples per class
N = n * K    # total number of samples


# Weight matrix (M x K) and bias vector (K), both initialised to zero.
W = tf.Variable(tf.zeros([M, K]))
b = tf.Variable(tf.zeros([K]))

# Output layer.
# x receives a batch of inputs and y the matching one-hot targets; the
# leading dimension is None so any batch size can be fed.
x = tf.placeholder(tf.float32, shape=[None, M])
y = tf.placeholder(tf.float32, shape=[None, K])
f = tf.nn.softmax(tf.matmul(x, W) + b)

# Loss: cross entropy summed over the class axis, averaged over the batch.
# The probabilities are clipped away from zero before the log so that a
# saturated softmax output cannot produce log(0) = -inf (and 0 * -inf = NaN).
cross_entropy = tf.reduce_mean(
    -tf.reduce_sum(y * tf.log(tf.clip_by_value(f, 1e-10, 1.0)), axis=1)
)

# Optimizer: plain gradient descent on the cross-entropy loss.
LEARNING_RATE = 0.1
train_step = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(cross_entropy)

# Prediction check: True where the most probable class matches the target.
correct_prediction = tf.equal(tf.argmax(f, 1), tf.argmax(y, 1))

 - `tf.reduce_mean` : ミニバッチごとの平均値
 - `tf.reduce_sum` : 合計

### データセットの用意

In [18]:
# 学習データの用意
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

# Inputs: one unit-variance Gaussian cloud per class, each shifted to its
# own centre. The draws are made in class order (1, 2, 3).
X1 = np.array([0, 10]) + np.random.randn(n, M)
X2 = np.array([5, 5]) + np.random.randn(n, M)
X3 = np.array([10, 0]) + np.random.randn(n, M)

# Targets: the one-hot row of each class repeated n times.
Y1 = np.tile([1, 0, 0], (n, 1))
Y2 = np.tile([0, 1, 0], (n, 1))
Y3 = np.tile([0, 0, 1], (n, 1))

# Stack the per-class pieces row-wise into the full data set.
X = np.vstack((X1, X2, X3))
Y = np.vstack((Y1, Y2, Y3))

### セッションの初期化

In [19]:
# --- Session ---

# Open the session, then build and run the initializer op. The variables
# declared in the model definition only receive their initial values once
# this op is executed.
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

### 学習

In [20]:
batch_size = 50    # mini-batch size
# Number of mini-batches per epoch, rounded up so a trailing partial batch
# is still used. Using N itself here would make almost every iteration feed
# an empty slice, whose mean loss is NaN.
n_batches = (N + batch_size - 1) // batch_size

for epoch in range(20):
    # Reshuffle inputs and targets together at the start of every epoch.
    X_, Y_ = shuffle(X, Y)

    for i in range(n_batches):
        start = i * batch_size
        end = start + batch_size

        # One gradient-descent step on the current mini-batch.
        sess.run(train_step, feed_dict={x: X_[start:end], y: Y_[start:end]})

### 評価

In [22]:
# Take a freshly shuffled copy of the data and inspect its first ten samples.
X_, Y_ = shuffle(X, Y)

# Fetch the per-sample correctness flags and the class probabilities in a
# single run of the graph.
classified, prob = sess.run(
    [correct_prediction, f],
    feed_dict={x: X_[0:10], y: Y_[0:10]},
)

print('===[classified]===')
print('{0}'.format(classified))
print('===[prob]=== ')
print('{0}'.format(prob))

===[classified]===
[ True  True  True  True  True  True  True  True  True  True]
===[prob]=== 
[[9.1983430e-04 9.6831191e-01 3.0768214e-02]
 [3.3742201e-02 9.6437895e-01 1.8788164e-03]
 [1.4817037e-07 4.0019002e-02 9.5998079e-01]
 [9.9885333e-01 1.1466896e-03 2.7971214e-10]
 [9.9841142e-01 1.5885374e-03 2.0899114e-09]
 [2.4606669e-02 9.6900392e-01 6.3893991e-03]
 [1.7782869e-03 9.7434843e-01 2.3873348e-02]
 [1.1031040e-01 8.8553250e-01 4.1571436e-03]
 [2.4705239e-08 1.2057335e-02 9.8794264e-01]
 [9.0052523e-03 9.8831844e-01 2.6762991e-03]]


## Kerasによる実装

In [25]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import SGD    # 確率的勾配降下法

# Output layer: one dense layer (M inputs -> K units) followed by softmax.
model = Sequential()
model.add(Dense(input_dim=M, units=K))    # layer
model.add(Activation('softmax'))    # activation function

# Optimizer: stochastic gradient descent on the categorical cross entropy.
# NOTE(review): newer Keras releases appear to prefer `learning_rate` over
# `lr`; confirm against the installed version before changing it.
LEARNING_RATE = 0.1
model.compile(loss='categorical_crossentropy', optimizer=SGD(lr=LEARNING_RATE))

# Training
minibatch_size = 50
model.fit(X, Y, epochs=20, batch_size=minibatch_size)

# Evaluation on the first ten samples of a freshly shuffled copy.
X_, Y_ = shuffle(X, Y)
# Class probabilities come straight from predict(); the predicted class is
# the index of the largest probability in each row.
prob = model.predict(X_[0:10], batch_size=minibatch_size)
classes = np.argmax(prob, axis=1)
# Compare predictions with the true one-hot labels. Comparing the model's
# output with itself would print True for every sample regardless of fit.
classified = classes == np.argmax(Y_[0:10], axis=1)

print('===[classified]===')
print('{0}'.format(classified))
print('===[prob]=== ')
print('{0}'.format(prob))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
===[classified]===
[ True  True  True  True  True  True  True  True  True  True]
===[prob]=== 
[[1.3822981e-08 1.0878920e-02 9.8912102e-01]
 [7.0527710e-02 9.2783177e-01 1.6405042e-03]
 [1.3877099e-09 4.5900554e-03 9.9540991e-01]
 [9.8722458e-01 1.2775427e-02 2.2502391e-08]
 [6.4101329e-08 3.5402484e-02 9.6459740e-01]
 [1.1571991e-03 9.8733205e-01 1.1510800e-02]
 [2.4606420e-03 9.8484749e-01 1.2691880e-02]
 [8.2469243e-01 1.7530732e-01 2.5589912e-07]
 [1.1482461e-08 8.4868800e-03 9.9151307e-01]
 [1.4993114e-08 3.7999137e-03 9.9620003e-01]]
