# Temporal Difference

$$
\begin{align*}
&\text{Initialize Q(s,a),}\forall s \in S,a\in A(s),\text{arbitrarily, and Q(terminial-state,-) = 0} \\
&\text{Repeat (for each episode):} \\
&\quad\text{Initialize S} \\
&\quad\text{Choose A from S using policy derived from Q}(e.g.,\epsilon\,greedy) \\
&\quad\text{Repeat (for each step of episode):} \\
&\quad\quad\text{Take action A, observe R, S'} \\
&\quad\quad\text{Choose A' from S' using policy derived from Q}(e.g.,\epsilon\,greedy) \\
&\quad\quad Q(S,A) \leftarrow Q(S,A) + \alpha[R + \gamma Q(S', A') - Q(S,A)] \\
&\quad\quad S \leftarrow S'; A \leftarrow A';\\
&\quad\text{until S is terminal} \\
\end{align*}
$$

初始化 gym 环境

In [1]:
import gym
import numpy as np
np.random.seed(12345)

env = gym.make('LunarLander-v2')
env.seed(seed=12345)

print('action_space:', env.action_space)
print('observation_space:', env.observation_space)

print('action sample:', env.action_space.sample())
print('observation sample:', env.observation_space.sample())

action_space: Discrete(4)
observation_space: Box(8,)
action sample: 0
observation sample: [-1.1974306  -0.05619778 -0.02249384  1.6664321  -0.01603351  1.365675
  0.90611714 -0.21044816]


### 数据预处理

这里的 observation_space 是一个 8 维的连续值,这里使用 [Tile-Coding](./Tile-Coding.ipynb#Tile-Coding) 转变成离散值:

(0.50585246 -0.68311024  0.7191891  -0.65380216  0.14889328 -0.72178286
 -0.2422974  -0.5457825) <=> 0

In [2]:
from math import floor
def tiles(ihtORsize, numtilings, floats):
    qfloats = [floor(f * numtilings) for f in floats]
    Tiles = []
    for tiling in range(numtilings):
        tilingX2 = tiling * 2
        coords = [tiling]
        b = tiling
        for q in qfloats:
            coords.append((q + b) // numtilings)
            b += tilingX2
        Tiles.append(hash(tuple(coords)) % ihtORsize)
    return Tiles

In [3]:
print(tiles(10, 8, env.observation_space.sample()))

[3, 3, 4, 8, 5, 1, 4, 6]


定义一个 dict 用于 obs2state 的转换

In [4]:
obs_dict = {}
def init_obs_state():
    global obs_dict
    obs_dict = {}

def obs2state(obs):
    scale_obs = lambda obs: obs * 0.5
    t = tuple(tiles(10, 8, scale_obs(obs)))
    if t not in obs_dict:
        l = len(obs_dict)
        obs_dict[t] = l
    return obs_dict[t]

In [5]:
init_obs_state()
for i in range(10000):
    obs2state(env.observation_space.sample())
print(len(obs_dict))

9988


### $\epsilon$-Greedy 探索

- 最简单的方法确保持续探索
- 所有 m 个动作都采取非零概率
- $1 - \epsilon$ 概率: 选择贪婪动作
- $\epsilon$ 概率: 选择随机动作

则:
$$
\pi(a|s) = \begin{cases} \frac\epsilon m + 1 - \epsilon & (a^* == \mathop{argmax}_{a\in A} Q(s, a)) \\ \frac\epsilon m & (otherwise) \end{cases}
$$

In [6]:
def epsilon_greedy_policy(a, epsilon):
    m = len(a)
    assert m > 0, 'a length must large then 0'
    p_a = epsilon / m
    p_a_star = 1. - epsilon + p_a
    action_probs = np.ones(m) * p_a
    action_probs[np.argmax(a)] = p_a_star
    np.testing.assert_allclose(np.sum(action_probs), 1)
    act_num = np.random.multinomial(1, action_probs).argmax()
    return act_num

因为将 **observation_space** 连续值转换成了离散值, 导致 state 的数量不确定, 因此不能使用以前的初始化函数:

```
q_s_a = np.random.rand(env.observation_space.n, np.prod([act_space.n for act_space in env.action_space]))
```

改用 **collections** 中的 _defaultdict_ 来实现:

In [7]:
from collections import defaultdict

q_s_a = defaultdict(np.random.rand) 
def init_q_value():
    global q_s_a
    q_s_a = defaultdict(np.random.rand) 

print(epsilon_greedy_policy([q_s_a[(0, aa)] for aa in range(env.action_space.n)], 0.1))
print(q_s_a)

0
defaultdict(<built-in method rand of mtrand.RandomState object at 0x10e8ce320>, {(0, 0): 0.9296160928171479, (0, 1): 0.3163755545817859, (0, 2): 0.18391881167709445, (0, 3): 0.2045602785530397})


### 动作选择

In [8]:
def choose_action(obs, greedy = False):
    s = obs2state(obs)
    actions = [q_s_a[(s, aa)] for aa in range(env.action_space.n)]
    if greedy:
        return np.argmax(actions)
    else:
        return epsilon_greedy_policy(actions, 0.1)

In [9]:
print(choose_action(env.observation_space.sample()))

1


### 更新 q_value

$$
Q(S,A) \leftarrow Q(S,A) + \alpha[R + \gamma Q(S', A') - Q(S,A)]
$$

In [33]:
def update_q_value(s, a, alpha, gamma, reward, s_next, a_next):
    global q_s_a
    q_s_a[(s, a)] = q_s_a[(s, a)] + alpha * (reward + gamma * (q_s_a[(s_next, a_next)] if s_next else 0) - q_s_a[(s, a)])

### 每个 episode:

In [11]:
def run_episode(max_step):
    obs = env.reset()
    s = obs2state(obs)
    a = choose_action(obs)
    for i in range(max_step):
        obs, reward, done, info = env.step(a)
        
        s_ = obs2state(obs) if not done else None
        a_ = choose_action(obs)
        update_q_value(s, a, 0.3, 0.99, reward, s_, a_)
        s = s_
        a = a_
        
        if done:
            break


In [12]:
init_q_value()
run_episode(100)
print(q_s_a)

defaultdict(<built-in method rand of mtrand.RandomState object at 0x10e8ce320>, {(9989, 0): 0.8299604730925582, (9989, 1): 0.00838829794155349, (9989, 2): 0.10644437669771933, (9989, 3): 0.29870371376938154, (9990, 0): 0.4998359116033065, (9990, 1): 1.6363825307158908, (9990, 2): 0.6424753278958413, (9990, 3): 0.7174536208124137, (9991, 0): 0.5262551673498579, (9991, 1): 1.0357526560911088, (9991, 2): 0.05195754510253359, (9991, 3): 0.176362041564184, (9992, 0): 0.17805300588353812, (9992, 1): 2.127694574275161, (9992, 2): 0.1677422287864857, (9992, 3): 0.4687777417782972, (9993, 0): 0.7996067179474019, (9993, 1): 0.5735365651986432, (9993, 2): 1.7753377842348028, (9993, 3): 0.6340543771374786, (9994, 0): 0.5332322296552158, (9994, 1): 0.2524925945574267, (9994, 2): 1.0024680983343568, (9994, 3): 0.3674387637945773, (9995, 0): 0.6508517866232056, (9995, 1): 0.312932895305152, (9995, 2): 1.5563587540242223, (9995, 3): -0.4382436192563414, (9996, 0): -0.28825634348197177, (9996, 1): 0.04

### 训练

In [13]:
def train(times, max_step):
    init_obs_state()
    init_q_value()
    for i in range(times):
        run_episode(max_step)

In [14]:
train(1000, 200)
print(q_s_a)

defaultdict(<built-in method rand of mtrand.RandomState object at 0x10e8ce320>, {(0, 0): 1.1049949074886476, (0, 1): -1.6282365843404003, (0, 2): -1.4683815496827368, (0, 3): -1.5008555516174704, (1, 0): 1.0782225023709042, (1, 1): -0.6315432221957606, (1, 2): 1.9575609659424988, (1, 3): 5.521307369828079, (2, 0): 1.946213491736804, (2, 1): 2.1939190534352413, (2, 2): 1.9267742727512767, (2, 3): 0.014376664809967021, (3, 0): -4.063417194252826, (3, 1): -3.5198262803439446, (3, 2): -3.9343224492667623, (3, 3): -2.6410079141897693, (4, 0): 0.7065461565639968, (4, 1): 0.16137651471483694, (4, 2): 4.575838428329265, (4, 3): -0.35553778402000435, (5, 0): 1.3352323178046026, (5, 1): 4.324846651332006, (5, 2): 9.284572322812183, (5, 3): 3.77284875359588, (6, 0): 4.186842518709615, (6, 1): 4.456909294440328, (6, 2): 7.722943521404978, (6, 3): 3.5715385621330142, (7, 0): 2.2037892258604437, (7, 1): 2.28214531568568, (7, 2): 7.446243250301157, (7, 3): 2.9715090504645323, (8, 0): -0.1380822531466

### 测试结果

In [15]:
def test(max_step):
    env.reset()
    rewards = 0
    for i in range(max_step):
        obs, reward, done, info = env.step(env.action_space.sample())
        rewards = rewards + reward
        
        if done:
            break
    print('random action', rewards)
    
    obs = env.reset()
    rewards = 0
    for i in range(max_step):
        obs, reward, done, info = env.step(choose_action(obs, True))
        rewards = rewards + reward
        
        if done:
            break
    print('td action', rewards)
    

In [32]:
test(100)

random action -111.63659151006682
td action -2.7819548502020757
