In [1]:
import pandas as pd
import numpy as np
import time
from IPython import display # 引入 display 模块目的方便程序运行展示

def init_env():
    """Print the initial 4x4 grid world as text.

    The agent ('L ') is placed at (0, 0), the goal ('$ ') at (3, 2) and the
    trap ('# ') at (2, 1); every other cell is drawn as '_ '.
    """
    grid = np.full((4, 4), '_ ')
    # Markers are written in this order so the agent symbol is applied last.
    markers = (((3, 2), '$ '), ((2, 1), '# '), ((0, 0), 'L '))
    for cell, symbol in markers:
        grid[cell] = symbol
    # One text line per grid row, each terminated by a newline.
    rendered = ''.join(''.join(row) + '\n' for row in grid)
    print(rendered)

# Print the starting layout once so the grid can be checked visually.
init_env()

L _ _ _ 
_ _ _ _ 
_ # _ _ 
_ _ $ _ 



In [2]:
'''Q-Table 初始化
'''
def init_q_table():
    """Build a zero-filled Q-table.

    Returns:
    q_table -- DataFrame with 16 rows (one per grid cell) and one column
               per action ('up', 'down', 'left', 'right'), all values 0.0
    """
    action_names = np.array(['up', 'down', 'left', 'right'])
    zeros = np.zeros((16, action_names.size))  # 16 states x 4 actions
    return pd.DataFrame(zeros, columns=action_names)

# Last expression of the cell: shows a freshly initialised Q-table as the cell output.
init_q_table()

Unnamed: 0,up,down,left,right
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0


In [3]:
def act_choose(state, q_table, epsilon):
    """Pick the next action for ``state`` with an epsilon-greedy rule.

    Parameters:
    state -- integer row position of the current state in ``q_table``
    q_table -- Q-Table (DataFrame, one row per state, one column per action)
    epsilon -- probability threshold; a uniform draw above it triggers a
               random (exploratory) action

    Returns:
    action -- label of the chosen action
    """
    state_act = q_table.iloc[state, :]
    actions = np.array(['up', 'down', 'left', 'right'])

    # Explore when the draw exceeds epsilon, or when this state has no learned
    # value at all. The check must be "every entry is zero": the previous
    # ``state_act.all() == 0`` was true as soon as ANY entry was zero, which
    # kept partially learned states acting randomly.
    if np.random.uniform() > epsilon or (state_act == 0).all():
        action = np.random.choice(actions)
    else:
        # Exploit: take the action with the largest Q-value (first one on ties).
        action = state_act.idxmax()
    return action

In [4]:
"""行为反馈
"""

def env_feedback(state, action, hole, terminal):
    """Apply ``action`` to ``state`` on the 4x4 grid and score the result.

    Parameters:
    state -- current (row, column) position
    action -- one of 'up', 'down', 'left', 'right'
    hole -- (row, column) position of the trap
    terminal -- (row, column) position of the goal

    Returns:
    next_state -- (row, column) position after the move; moves that would
                  leave the grid keep the agent on the edge
    reward -- 10.0 for reaching the goal, -10.0 for the trap, -1.0 otherwise
    end -- end flag: 2 when the goal is reached, 1 for the trap, 0 otherwise

    Raises:
    ValueError -- if ``action`` is not one of the four known moves
    """
    # (row delta, column delta) for every supported move.
    moves = {'up': (-1, 0), 'down': (1, 0), 'left': (0, -1), 'right': (0, 1)}
    if action not in moves:
        # Previously an unknown action left next_state unbound and surfaced
        # as an UnboundLocalError further down.
        raise ValueError('unknown action: {!r}'.format(action))

    last = 3  # highest valid row/column index on the 4x4 grid
    d_row, d_col = moves[action]
    row, col = state
    # Clamp to the grid so the agent stays in place when it hits a wall.
    next_state = (min(max(row + d_row, 0), last),
                  min(max(col + d_col, 0), last))

    if next_state == terminal:
        return next_state, 10., 2
    if next_state == hole:
        return next_state, -10., 1
    # Every ordinary step costs 1.
    return next_state, -1., 0

In [5]:
def update_q_table(q_table, state, action, next_state, terminal, gamma, alpha, reward):
    """Apply one Q-learning update for the transition state -> next_state.

    Parameters:
    q_table -- Q-Table (modified in place)
    state -- (row, column) position before the move
    action -- action label that was taken
    next_state -- (row, column) position after the move
    terminal -- (row, column) position of the goal
    gamma -- discount factor
    alpha -- learning rate
    reward -- reward received for the move

    Returns:
    q_table -- the same Q-Table object, after the update
    """
    row, col = state
    next_row, next_col = next_state
    current_idx = row * 4 + col  # flatten (row, column) into the table's row label
    old_value = q_table.loc[current_idx, action]

    if next_state == terminal:
        # Nothing to bootstrap from once the goal is reached.
        target = reward
    else:
        best_next = q_table.iloc[next_row * 4 + next_col].max()
        target = reward + gamma * best_next

    # Blend the old estimate with the new target according to the learning rate.
    q_table.loc[current_idx, action] = (1 - alpha) * old_value + alpha * target
    return q_table

In [6]:
"""状态可视化辅助函数
"""

def show_state(end, state, episode, step, q_table):
    """Redraw the grid with the agent at ``state`` and print the Q-table.

    Parameters:
    end -- end flag (not read by this function)
    state -- (row, column) position of the agent
    episode -- episode counter, shown when the goal is reached
    step -- step counter, shown when the goal is reached
    q_table -- Q-Table printed underneath the grid
    """
    goal, trap = (3, 2), (2, 1)
    board = np.full((4, 4), '_ ')
    board[goal] = '$ '
    board[trap] = '# '
    board[state] = 'L '  # written last, so the agent covers whatever it stands on
    picture = ''.join(''.join(row) + '\n' for row in board)

    display.clear_output(wait=True)  # wipe the previous frame before drawing
    if state != goal:
        print(picture)
        print(q_table)
        time.sleep(0.3)  # pacing between ordinary steps
        return

    # Goal reached: append the episode summary and hold the frame longer.
    print(picture + 'EPISODE: {}, STEP: {}'.format(episode, step))
    print("\n" + "q_table:")
    print(q_table)
    time.sleep(3)  # wait 3 seconds on the final frame

In [7]:
def q_learning(max_episodes, alpha, gamma, epsilon):
    """Train a Q-table on the 4x4 grid world, drawing every step.

    Parameters:
    max_episodes -- episode limit; the loop bound is inclusive, so training
                    stops once ``max_episodes + 1`` runs have reached the goal
    alpha -- learning rate
    gamma -- discount factor
    epsilon -- probability threshold passed to ``act_choose``

    Returns:
    q_table -- the Q-Table after training
    """
    q_table = init_q_table()
    terminal = (3, 2)
    hole = (2, 1)
    episodes = 0
    while episodes <= max_episodes:
        # Every attempt restarts from the top-left corner.
        step = 0
        state = (0, 0)
        end = 0
        show_state(end, state, episodes, step, q_table)
        while end == 0:
            x, y = state
            act = act_choose(x * 4 + y, q_table, epsilon)  # choose an action
            next_state, reward, end = env_feedback(
                state, act, hole, terminal)  # environment feedback
            q_table = update_q_table(
                q_table, state, act, next_state, terminal, gamma, alpha, reward)  # Q-table update
            state = next_state
            step += 1
            show_state(end, state, episodes, step, q_table)
        # Only attempts that reach the goal (end == 2) advance the counter;
        # falling into the trap (end == 1) retries under the same episode number.
        if end == 2:
            episodes += 1
    # The docstring always promised the trained table, but it was never returned.
    return q_table

In [8]:
# Run training with the learning rate (alpha), discount factor (gamma) and
# exploration threshold (epsilon) given explicitly as keyword arguments.
q_learning(max_episodes=10, alpha=0.8, gamma=0.9, epsilon=0.9)

_ _ _ _ 
_ _ _ _ 
_ # _ _ 
_ _ L _ 
EPISODE: 10, STEP: 7

q_table:
          up      down      left     right
0  -2.991758  3.113593  0.463113 -2.891725
1  -2.836070 -2.294528 -2.370161 -2.604380
2  -2.242560 -1.894554 -2.097920 -2.213120
3  -1.683200 -1.651200 -2.236160 -1.536000
4  -3.186036  4.578304 -2.381046 -2.299443
5  -2.314445 -8.000000 -2.350940 -1.833370
6  -2.425037 -0.999987 -1.491200 -1.568000
7  -1.536000 -0.998400 -1.651200 -1.536000
8  -1.514240  6.199745  3.469813 -9.920000
9   0.000000  0.000000  0.000000  0.000000
10 -1.861202  8.000000 -9.920000 -0.960000
11 -1.821440  0.000000 -0.992000  0.000000
12  3.309957 -0.960000 -0.960000  7.999991
13 -8.000000  7.980390  6.029653  9.999999
14  0.000000  0.000000  0.000000  0.000000
15  0.000000  0.000000  0.000000  0.000000
