# Temporal Difference

$$
\begin{align*}
&\text{Initialize Q(s,a),}\forall s \in S,a\in A(s),\text{arbitrarily, and Q(terminial-state,-) = 0} \\
&\text{Repeat (for each episode):} \\
&\quad\text{Initialize S} \\
&\quad\text{Choose A from S using policy derived from Q}(e.g.,\epsilon\,greedy) \\
&\quad\text{Repeat (for each step of episode):} \\
&\quad\quad\text{Take action A, observe R, S'} \\
&\quad\quad\text{Choose A' from S' using policy derived from Q}(e.g.,\epsilon\,greedy) \\
&\quad\quad Q(S,A) \leftarrow Q(S,A) + \alpha[R + \gamma Q(S', A') - Q(S,A)] \\
&\quad\quad S \leftarrow S'; A \leftarrow A';\\
&\quad\text{until S is terminal} \\
\end{align*}
$$

初始化 gym 环境

In [2]:
import gym
import numpy as np
np.random.seed(12345)

env = gym.make('LunarLander-v2')
env.seed(seed=12345)

print('action_space:', env.action_space)
print('observation_space:', env.observation_space)

print('action sample:', env.action_space.sample())
print('observation sample:', env.observation_space.sample())

action_space: Discrete(4)
observation_space: Box(8,)
action sample: 2
observation sample: [ 0.8233951   0.77894795 -0.7034708   0.72476166 -1.201629    0.08866955
  0.41255188  0.6011491 ]


### 数据预处理

这里的 observation_space 是一个 8 维的连续值,这里使用 [Tile-Coding](./Tile-Coding.ipynb#Tile-Coding) 转变成离散值:

(0.50585246 -0.68311024  0.7191891  -0.65380216  0.14889328 -0.72178286
 -0.2422974  -0.5457825) <=> 0

In [3]:
from math import floor
def tiles(ihtORsize, numtilings, floats):
    qfloats = [floor(f * numtilings) for f in floats]
    Tiles = []
    for tiling in range(numtilings):
        tilingX2 = tiling * 2
        coords = [tiling]
        b = tiling
        for q in qfloats:
            coords.append((q + b) // numtilings)
            b += tilingX2
        Tiles.append(hash(tuple(coords)) % ihtORsize)
    return Tiles

In [4]:
print(tiles(10, 8, env.observation_space.sample()))

[0, 6, 5, 9, 5, 0, 5, 9]


定义一个 dict 用于 obs2state 的转换

In [5]:
obs_dict = {}
def init_obs_state():
    global obs_dict
    obs_dict = {}

def obs2state(obs):
    scale_obs = lambda obs: obs * 0.5
    t = tuple(tiles(10, 8, scale_obs(obs)))
    if t not in obs_dict:
        l = len(obs_dict)
        obs_dict[t] = l
    return obs_dict[t]

In [6]:
init_obs_state()
for i in range(10000):
    obs2state(env.observation_space.sample())
print(len(obs_dict))

9989


### $\epsilon$-Greedy 探索

- 最简单的方法确保持续探索
- 所有 m 个动作都采取非零概率
- $1 - \epsilon$ 概率: 选择贪婪动作
- $\epsilon$ 概率: 选择随机动作

则:
$$
\pi(a|s) = \begin{cases} \frac\epsilon m + 1 - \epsilon & (a^* == \mathop{argmax}_{a\in A} Q(s, a)) \\ \frac\epsilon m & (otherwise) \end{cases}
$$

In [7]:
def epsilon_greedy_policy(a, epsilon):
    m = len(a)
    assert m > 0, 'a length must large then 0'
    p_a = epsilon / m
    p_a_star = 1. - epsilon + p_a
    action_probs = np.ones(m) * p_a
    action_probs[np.argmax(a)] = p_a_star
    np.testing.assert_allclose(np.sum(action_probs), 1)
    act_num = np.random.multinomial(1, action_probs).argmax()
    return act_num

因为将 **observation_space** 连续值转换成了离散值, 导致 state 的数量不确定, 因此不能使用以前的初始化函数:

```
q_s_a = np.random.rand(env.observation_space.n, np.prod([act_space.n for act_space in env.action_space]))
```

改用 **collections** 中的 _defaultdict_ 来实现:

In [8]:
from collections import defaultdict

q_s_a = defaultdict(np.random.rand) 
def init_q_value():
    global q_s_a
    q_s_a = defaultdict(np.random.rand) 

print(epsilon_greedy_policy([q_s_a[(0, aa)] for aa in range(env.action_space.n)], 0.1))
print(q_s_a)

0
defaultdict(<built-in method rand of mtrand.RandomState object at 0x1117820f0>, {(0, 0): 0.9296160928171479, (0, 1): 0.3163755545817859, (0, 2): 0.18391881167709445, (0, 3): 0.2045602785530397})


### 动作选择

In [9]:
def choose_action(obs, greedy = False):
    s = obs2state(obs)
    actions = [q_s_a[(s, aa)] for aa in range(env.action_space.n)]
    if greedy:
        return np.argmax(actions)
    else:
        return epsilon_greedy_policy(actions, 0.1)

In [10]:
print(choose_action(env.observation_space.sample()))

1


### 更新 q_value

$$
Q(S,A) \leftarrow Q(S,A) + \alpha[R + \gamma Q(S', A') - Q(S,A)]
$$

In [11]:
def update_q_value(s, a, alpha, gamma, reward, s_next, a_next):
    q_s_a[(s, a)] = q_s_a[(s, a)] + alpha * (reward + gamma * q_s_a[(s_next, a_next)] - q_s_a[(s, a)])

### 每个 episode:

In [12]:
def run_episode(max_step):
    obs = env.reset()
    s = obs2state(obs)
    a = choose_action(obs)
    for i in range(max_step):
        obs, reward, done, info = env.step(a)
        
        s_ = obs2state(obs) if not done else None
        a_ = choose_action(obs)
        update_q_value(s, a, 0.3, 0.99, reward, s_, a_)
        s = s_
        a = a_
        
        if done:
            break


In [13]:
init_q_value()
run_episode(100)
print(q_s_a)

defaultdict(<built-in method rand of mtrand.RandomState object at 0x1117820f0>, {(9990, 0): 0.8299604730925582, (9990, 1): 0.00838829794155349, (9990, 2): 0.10644437669771933, (9990, 3): 0.29870371376938154, (9991, 0): 0.4998359116033065, (9991, 1): 1.6363825307158908, (9991, 2): 0.6424753278958413, (9991, 3): 0.7174536208124137, (9992, 0): 0.5262551673498579, (9992, 1): 1.0357526560911088, (9992, 2): 0.05195754510253359, (9992, 3): 0.176362041564184, (9993, 0): 0.17805300588353812, (9993, 1): 2.127694574275161, (9993, 2): 0.1677422287864857, (9993, 3): 0.4687777417782972, (9994, 0): 0.7996067179474019, (9994, 1): 0.5735365651986432, (9994, 2): 1.7753377842348028, (9994, 3): 0.6340543771374786, (9995, 0): 0.5332322296552158, (9995, 1): 0.2524925945574267, (9995, 2): 1.0024680983343568, (9995, 3): 0.3674387637945773, (9996, 0): 0.6508517866232056, (9996, 1): 0.312932895305152, (9996, 2): 1.5563587540242223, (9996, 3): -0.4382436192563414, (9997, 0): -0.28825634348197177, (9997, 1): 0.04

### 训练 100 次

In [14]:
def train(times, max_step):
    init_obs_state()
    init_q_value()
    for i in range(times):
        run_episode(max_step)

In [28]:
train(1000, 200)
print(q_s_a)

defaultdict(<built-in method rand of mtrand.RandomState object at 0x1117820f0>, {(0, 0): -11.376661796847293, (0, 1): -11.61972964822371, (0, 2): -12.814318956488915, (0, 3): -5.45187702915562, (1, 0): -5.525801610800291, (1, 1): -5.364543082861674, (1, 2): -2.5978326670713527, (1, 3): -5.523125944520075, (2, 0): -4.747261719381289, (2, 1): -9.676593580073186, (2, 2): -6.947584868751755, (2, 3): -8.748773650727749, (3, 0): -0.5272778370512722, (3, 1): -0.7981549625272286, (3, 2): 0.4794856758349839, (3, 3): 0.014190022870802288, (4, 0): -3.19842610360649, (4, 1): -2.536157862801441, (4, 2): -2.398959569430333, (4, 3): -3.124522885557617, (5, 0): -1.8449396416532573, (5, 1): -0.9233896791945126, (5, 2): -1.2753449100455572, (5, 3): -1.706184413288027, (6, 0): 3.499867626152042, (6, 1): 3.0319250563546496, (6, 2): 2.8326425503739, (6, 3): 4.783345134936341, (7, 0): -3.634224729339193, (7, 1): -3.077524077214208, (7, 2): -3.7908444964745724, (7, 3): 1.9378561837727986, (8, 0): 2.804293680

### 测试结果

In [16]:
def test(max_step):
    env.reset()
    rewards = 0
    for i in range(max_step):
        obs, reward, done, info = env.step(env.action_space.sample())
        rewards = rewards + reward
        
        if done:
            break
    print('random action', rewards)
    
    obs = env.reset()
    rewards = 0
    for i in range(max_step):
        obs, reward, done, info = env.step(choose_action(obs, True))
        rewards = rewards + reward
        
        if done:
            break
    print('td action', rewards)
    

In [66]:
test(100)

random action -185.72638764410652
td action 11.756095393186143
