In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Draw the 3x3 maze: walls, one text label per cell, and a marker on each base.
fig = plt.figure(figsize=(5, 5))
ax = plt.gca()

# Each wall is one straight blue segment given as (x coordinates, y coordinates).
maze_walls = [
    ([1, 1], [0, 1]),
    ([1, 2], [2, 2]),
    ([2, 2], [2, 1]),
    ([2, 3], [1, 1]),
]
for wall_xs, wall_ys in maze_walls:
    plt.plot(wall_xs, wall_ys, color='blue', linewidth=2)

# Cell labels listed row by row, starting at the top-left cell.
cell_labels = ['Our base', 'S1', 'S2',
               'S3', 'S4', 'S5',
               'S6', 'S7', 'enemy base']
for cell_index, cell_label in enumerate(cell_labels):
    # Centre of the cell: columns run left to right, rows run top to bottom.
    plt.text(cell_index % 3 + 0.5, 2.5 - cell_index // 3, cell_label,
             size=14, ha='center')

ax.set_xlim(0, 3)
ax.set_ylim(0, 3)
# Hide every tick mark and tick label. These options are booleans; the string
# 'off' is a non-empty (truthy) value and leaves the ticks visible on
# Matplotlib 3.x, so real False values are passed instead.
plt.tick_params(axis='both', which='both', bottom=False, top=False,
                labelbottom=False, right=False, left=False, labelleft=False)

# Green marker on the top-left cell ('Our base').
ax.plot([0.5], [2.5], marker="o", color='g', markersize=60)
# Red marker on the bottom-right cell ('enemy base'); `line` keeps this handle.
line, = ax.plot([2.5], [0.5], marker="o", color='red', markersize=60)
plt.show()

# Initial policy parameters theta_0.
#
# Rows: one per state s0..s7 (one maze cell each). The learning code in this
# file treats state 8 as the goal, and it has no row here.
# Columns: the four actions in the order up, right, down, left.
# An entry of 1 marks a move that is allowed; _WALL (nan) marks a blocked move.
_WALL = np.nan
theta_0 = np.array([
    [_WALL, 1, 1, _WALL],          # s0
    [_WALL, 1, _WALL, 1],          # s1
    [_WALL, _WALL, 1, 1],          # s2
    [1, 1, 1, _WALL],              # s3
    [_WALL, _WALL, 1, 1],          # s4
    [1, _WALL, _WALL, _WALL],      # s5
    [1, _WALL, _WALL, _WALL],      # s6
    [1, 1, _WALL, _WALL],          # s7 (last row; there is no row for s8)
])

def simple_convert_into_pi_from_theta(theta):
    """Turn a parameter table into per-row action probabilities.

    Each row of ``theta`` is divided by the sum of its non-nan entries, and
    any nan left in the result is replaced by 0.

    Args:
        theta: 2-D numpy array; nan marks entries to be excluded.

    Returns:
        A new float array with the same shape as ``theta``. ``theta`` itself
        is not modified.
    """
    n_rows, n_cols = theta.shape
    # Sum of the non-nan entries of every row, kept as a column for broadcasting.
    row_totals = np.nansum(theta, axis=1, keepdims=True)

    policy = np.zeros((n_rows, n_cols))
    policy[:, :] = theta / row_totals  # share of each entry within its row

    # nan entries (excluded moves) become probability 0.
    return np.nan_to_num(policy)

# Initial policy derived from theta_0.
pi_0 = simple_convert_into_pi_from_theta(theta_0)

# Initial action-value table Q: random values scaled by 0.1. Multiplying by
# theta_0 leaves nan wherever theta_0 is nan.
a, b = theta_0.shape
random_init = np.random.rand(a, b)
Q = random_init * theta_0 * 0.1


def get_action(s, Q, epsilon, pi_0):
    """Choose an action for state ``s`` with an epsilon-greedy rule.

    With probability ``epsilon`` the action is sampled from row ``s`` of
    ``pi_0``; otherwise the action with the largest non-nan value in row
    ``s`` of ``Q`` is taken.

    Args:
        s: Row index into ``Q`` and ``pi_0``.
        Q: 2-D array of action values; nan entries are ignored.
        epsilon: Threshold compared against a uniform random draw in [0, 1).
        pi_0: 2-D array whose row ``s`` holds the sampling probabilities.

    Returns:
        The action index as an int: 0 = up, 1 = right, 2 = down, 3 = left.
    """
    direction = ("up", "right", "down", "left")

    explore = np.random.rand() < epsilon
    if explore:
        # Random move, drawn according to the probabilities in pi_0[s].
        chosen = np.random.choice(direction, p=pi_0[s, :])
    else:
        # Greedy move: best non-nan Q value for this state.
        chosen = direction[np.nanargmax(Q[s, :])]

    # Translate the direction name back to its position in the action order.
    return direction.index(chosen)


def get_s_next(s, a, Q, epsilon, pi_0):
    """Return the state reached from ``s`` by taking action ``a``.

    States are numbered so that moving up or down changes the index by 3 and
    moving left or right changes it by 1.

    Args:
        s: Current state index.
        a: Action index (0 = up, 1 = right, 2 = down, 3 = left).
        Q: Unused; accepted so the call signature stays the same.
        epsilon: Unused.
        pi_0: Unused.

    Returns:
        The next state index.
    """
    # State-index change for each action, in the order up, right, down, left.
    state_offsets = (-3, 1, 3, -1)
    return s + state_offsets[a]

def Q_learning(s, a, r, s_next, Q, eta, gamma):
    """Apply one Q-learning update to ``Q[s, a]`` and return ``Q``.

    The entry is moved towards a target by the step size ``eta``. When
    ``s_next`` is 8 (the goal) the target is just ``r``; otherwise it is
    ``r`` plus ``gamma`` times the largest non-nan value in row ``s_next``.

    Args:
        s: State in which the action was taken.
        a: Action index that was taken.
        r: Reward received.
        s_next: State that was reached.
        Q: 2-D array of action values; updated in place.
        eta: Learning rate.
        gamma: Discount factor.

    Returns:
        The same ``Q`` array that was passed in.
    """
    if s_next == 8:
        # Goal reached: nothing follows, so there is no bootstrapped term.
        target = r
    else:
        target = r + gamma * np.nanmax(Q[s_next, :])

    Q[s, a] += eta * (target - Q[s, a])
    return Q

def goal_maze_ret_s_a_Q(Q, epsilon, eta, gamma, pi):
    """Run one episode from state 0 until state 8, updating ``Q`` each step.

    Args:
        Q: 2-D array of action values, passed to ``get_action`` and
            ``Q_learning``.
        epsilon: Exploration threshold passed to ``get_action``.
        eta: Learning rate passed to ``Q_learning``.
        gamma: Discount factor passed to ``Q_learning``.
        pi: Policy table passed to ``get_action``.

    Returns:
        A two-element list ``[s_a_history, Q]``. ``s_a_history`` is a list of
        ``[state, action]`` pairs in visiting order; the final pair is the
        goal state with ``nan`` as its action.
    """
    goal_state = 8
    state = 0
    action = get_action(state, Q, epsilon, pi)
    s_a_history = [[0, np.nan]]

    while True:
        # Fill in the action for the state recorded most recently.
        s_a_history[-1][1] = action

        next_state = get_s_next(state, action, Q, epsilon, pi)
        # The action for the new state is not known yet, so store nan for now.
        s_a_history.append([next_state, np.nan])

        reached_goal = next_state == goal_state
        if reached_goal:
            reward = 1
            next_action = np.nan
        else:
            reward = 0
            # The next action is chosen before Q is updated for this step.
            next_action = get_action(next_state, Q, epsilon, pi)

        Q = Q_learning(state, action, reward, next_state, Q, eta, gamma)

        if reached_goal:
            break

        state, action = next_state, next_action

    return [s_a_history, Q]

# Hyper-parameters
eta = 0.1      # learning rate
gamma = 0.9    # discount factor
epsilon = 0.5  # starting value for epsilon-greedy; halved before every episode
v = np.nanmax(Q, axis=1)  # per-state value: largest non-nan Q in each row
is_continue = True
episode = 1

# Per-state values, one entry before training and one after every episode.
V = [np.nanmax(Q, axis=1)]

while is_continue:
    print(f"回合数:{episode}")

    epsilon /= 2

    # Run one episode; Q is updated along the way.
    s_a_history, Q = goal_maze_ret_s_a_Q(Q, epsilon, eta, gamma, pi_0)

    # Total absolute change of the per-state values during this episode.
    new_v = np.nanmax(Q, axis=1)
    print(np.abs(new_v - v).sum())
    v = new_v
    V.append(v)

    print(f"求迷宫需要：{len(s_a_history) - 1}步")

    episode += 1
    if episode > 100:
        # Stop once 100 episodes have been run.
        break