In [1]:
import numpy as np
from matplotlib import pyplot as plt
from tensorflow.keras.datasets.mnist import load_data

In [2]:
(x, y), (tx, ty) = load_data()

In [3]:
x, tx = x / 255., tx / 255.

In [4]:
x, tx = x.reshape(-1, 28 * 28), tx.reshape(-1, 28 * 28)

In [5]:
y, ty = y.reshape(-1, 1), ty.reshape(-1, 1)

In [6]:
x.shape, tx.shape, y.shape, ty.shape

((60000, 784), (10000, 784), (60000, 1), (10000, 1))

In [7]:
# 定义模型常量 
n_featrues = 28 * 28   # 每个图片有 28 * 28 个像素，转成一维的特征数量
n_class = 10    # 总共0-9个数字，共计10个label

In [8]:
# 定义模型

# 前向计算
def softmax_regression(x, w, b):
    # x: 256 x 784  w: 784 x 10
    # 输出是 256 x 10 的矩阵
    # 每一行有10和元素，分别代表模型输出0-9的概率
    logits = np.dot(x, w.T) + b
    exp = np.exp(logits)
    return  exp / np.sum(exp, axis=1, keepdims=True)

# 精度计算函数，计算预测函数的精准度
def accuracy(y_hat, y):
    # 265 x 10
    y_hat = np.argmax(y_hat, axis=1, keepdims=True).astype(y.dtype)
    return (y_hat == y).sum() / len(y)
    

In [9]:
# 测试softmax
test_softmax_w = np.random.randn(n_class, n_featrues) # 模型共 784 x 10 个 权重
test_softmax_b = np.random.randn(1, n_class) # 共 1 x 10个偏置
# 理论上随便预测一次，模型大概有10%的精度
test_softmax_y_hat = softmax_regression(tx, test_softmax_w, test_softmax_b)
# 计算精度
accuracy(test_softmax_y_hat, ty)

0.1004

In [10]:
# one-hot编码
def one_hot(y, n_):
    # 256 x 1
    assert y.shape[1] == 1
    yy = None
    for uni in range(n_):
        if yy is None:
            yy = y == uni
        else:
            yy = np.c_[yy, y == uni]
    return yy.astype('uint8')

#测试one_hot编码
(np.argmax(one_hot(ty, n_=n_class), axis=1) == ty.T[0]).sum() == len(ty)

True

In [11]:
# 定义损失函数---交叉熵
def cross_entropy(y_hat, y):
    # 处理一下y_hat，防止数值上溢和下溢
    y_hat = np.clip(y_hat, 1e-9, 1.)
    # y进行one-hot编码
    y = one_hot(y, n_=n_class)
    # 对y进行one-hot编码之后，非真值列y = 0，不参与计算 即: 1{y_i = j} else 0{y_i != j}
    return -np.mean(np.sum(y * np.log(y_hat), axis=1))

#测试损失计算
cross_entropy(test_softmax_y_hat, ty)

12.919749171071098

In [12]:
# 梯度计算，反向传播
def gradients(y_hat, y, x):
    # 注意，只有y_i = j 才参与计算
    # y进行one-hot编码
    y = one_hot(y, n_=n_class)
    # y: 256 x 10   x: 256 x 784
    # 先计算梯度内部表达式
    # 过滤掉通过*y过滤掉y = 0 的情况，此时y_j.shape: 265 x 1
    y_j = np.sum((y - y_hat), axis=1, keepdims=True)
    # w和b的梯度分开计算
    # 在原有公式上直接乘以y可以消除y != 1的数值，
    # 因为我们的y进行了one-hot编码，只有对应的位置元素为1，其余都为0
    return -((y - y_hat) * y).T.dot(x), -np.mean((y - y_hat) * y)

# 测试计算梯度
gradients(test_softmax_y_hat, ty, tx)

(array([[-0., -0., -0., ..., -0., -0., -0.],
        [-0., -0., -0., ..., -0., -0., -0.],
        [-0., -0., -0., ..., -0., -0., -0.],
        ...,
        [-0., -0., -0., ..., -0., -0., -0.],
        [-0., -0., -0., ..., -0., -0., -0.],
        [-0., -0., -0., ..., -0., -0., -0.]]),
 -0.0898187871206671)

In [13]:
# 开始训练

w = np.random.randn(n_class, n_featrues)
b = np.random.randn(1, n_class)

eta = 1e-5  # 学习率
for i in range(200):
    y_hat = softmax_regression(x, w, b)
    # 直接计算梯度
    grad_w, grad_b = gradients(y_hat, y, x)
    # 更新参数
    w = w - eta * grad_w
    b = b - eta * grad_b
    loss = cross_entropy(y_hat, y)
    print(f'iter[{i}, loss: {loss}, acc: {accuracy(y_hat, y)}')

iter[0, loss: 11.492013268349108, acc: 0.12626666666666667
iter[1, loss: 10.749944114915436, acc: 0.13391666666666666
iter[2, loss: 10.040025392983953, acc: 0.14461666666666667
iter[3, loss: 9.381399693241073, acc: 0.15705
iter[4, loss: 8.782411371849244, acc: 0.16991666666666666
iter[5, loss: 8.241672269580562, acc: 0.18311666666666668
iter[6, loss: 7.75479109970336, acc: 0.1978
iter[7, loss: 7.318364922720481, acc: 0.21271666666666667
iter[8, loss: 6.928432662703104, acc: 0.2276
iter[9, loss: 6.5802741118942185, acc: 0.24418333333333334
iter[10, loss: 6.269860285963929, acc: 0.2597
iter[11, loss: 5.991767733145936, acc: 0.27513333333333334
iter[12, loss: 5.741157497255152, acc: 0.29001666666666664
iter[13, loss: 5.514126910985687, acc: 0.30585
iter[14, loss: 5.307547321997527, acc: 0.32125
iter[15, loss: 5.118621322068707, acc: 0.33541666666666664
iter[16, loss: 4.945057263292816, acc: 0.34958333333333336
iter[17, loss: 4.785120806773232, acc: 0.36346666666666666
iter[18, loss: 4.637