# Dyna-Q

从真实和模拟经验中学习和规划价值函数(或策略)

$$
\begin{align*}
&\text{Initialize Q(s,a) and Model(s,a) }\forall s \in S, a \in A(s) \\
&\text{Do forever}\\
&\quad S \leftarrow \text{current (nonterminal) state}\\
&\quad A \leftarrow \epsilon\text{-greedy}(S, Q)\\
&\quad\text{Execute action A; Observe resultant reward R and state S'}\\
&\quad Q(S, A) \leftarrow Q(S,A) + \alpha[R + \gamma\max_aQ(S', a) - Q(S, A)]\\
&\quad Model(S, A) \leftarrow R, S' \text{ (assuming deterministic environment)}\\
&\quad\text{Repeat n times:}\\
&\quad\quad S \leftarrow \text{random previously observed state}\\
&\quad\quad A \leftarrow \text{random action previously taken in S}\\
&\quad\quad R, S' \leftarrow Model(S, A)\\
&\quad\quad Q(S, A) \leftarrow Q(S, A) + \alpha[R + \gamma\max_aQ(S', a) - Q(S, A)]\\
\end{align*}
$$

初始化 gym 环境

In [1]:
import gym
import numpy as np
# One seed shared by NumPy and the environment so the run is reproducible.
SEED = 12345
np.random.seed(SEED)

# Create the Taxi environment and seed it.
env = gym.make('Taxi-v2')
env.seed(seed=SEED)

# Describe both spaces, then draw one random sample from each.
for label, space in (('action_space:', env.action_space),
                     ('observation_space:', env.observation_space)):
    print(label, space)

print('action sample:', env.action_space.sample())
print('observation sample:', env.observation_space.sample())

action_space: Discrete(6)
observation_space: Discrete(500)
action sample: 0
observation sample: 315


### 创建 Model 类保存 state , action , reward 和 state_next

In [2]:
import pandas as pd

class EnvModel:
    """Tabular memory of observed transitions, used as a deterministic model.

    Rows of ``database`` are the states seen so far and columns are the
    actions.  Each filled cell holds the ``(reward, next_state)`` tuple most
    recently stored for that state-action pair; cells for actions not yet
    tried in a state stay empty.
    """

    def __init__(self, actions):
        """Create an empty model.

        Args:
            actions: list of action labels; they become the table columns.
        """
        # The simplest model is a memory holding all past transition information.
        self.actions = actions
        # Builtin ``object`` is used because the ``np.object`` alias no longer
        # exists in current NumPy releases.
        self.database = pd.DataFrame(columns=actions, dtype=object)

    def store_transition(self, s, a, r, s_):
        """Record that taking action ``a`` in state ``s`` produced ``(r, s_)``.

        A later call with the same ``(s, a)`` overwrites the stored outcome.

        Args:
            s: state label (row).
            a: action label (column).
            r: observed reward.
            s_: observed next state; callers may pass ``None``.
        """
        if s not in self.database.index:
            # First visit to this state: add an all-empty row for it.
            # ``pd.concat`` is used because ``DataFrame.append`` was removed
            # in pandas 2.0.
            empty_row = pd.DataFrame(
                [[None] * len(self.actions)],
                index=[s],
                columns=self.database.columns,
                dtype=object,
            )
            self.database = pd.concat([self.database, empty_row])
        self.database.at[s, a] = (r, s_)

    def sample_s_a(self):
        """Return a random ``(state, action)`` pair that has been stored.

        The state is drawn from the stored rows, then the action is drawn
        from the actions recorded for that state.
        """
        s = np.random.choice(self.database.index)
        # dropna() removes the actions never tried in this state.
        a = np.random.choice(self.database.loc[s].dropna().index)
        return s, a

    def get_r_s_(self, s, a):
        """Return the ``(reward, next_state)`` stored for ``(s, a)``."""
        r, s_ = self.database.loc[s, a]
        return r, s_
    

def init_model():
    """Rebind the global ``envModel`` to a fresh, empty ``EnvModel``.

    The model gets one action label per action of ``env``.
    """
    global envModel
    action_labels = list(range(env.action_space.n))
    envModel = EnvModel(actions=action_labels)


# Build the initial module-level model.
init_model()

### $\epsilon$-Greedy 探索

- 确保持续探索的最简单方法
- 所有 m 个动作都以非零概率被选中
- 以 $1 - \epsilon$ 的概率选择贪婪动作
- 以 $\epsilon$ 的概率随机选择一个动作

则:
$$
\pi(a|s) = \begin{cases} \frac\epsilon m + 1 - \epsilon & (a = a^* = \mathop{\mathrm{argmax}}_{a' \in A} Q(s, a')) \\ \frac\epsilon m & (\text{otherwise}) \end{cases}
$$

In [3]:
def epsilon_greedy_policy(a, epsilon):
    """Sample an action index epsilon-greedily from the action values ``a``.

    The index of the largest value gets probability
    ``1 - epsilon + epsilon / m``; each of the other indices gets
    ``epsilon / m``, where ``m`` is ``len(a)``.

    Args:
        a: non-empty sequence of action values.
        epsilon: exploration rate.

    Returns:
        The sampled index into ``a``.
    """
    m = len(a)
    assert m > 0, 'a length must large then 0'
    explore_prob = epsilon / m
    # Start from the uniform exploration mass, then raise the greedy entry.
    action_probs = np.full(m, explore_prob, dtype=float)
    action_probs[np.argmax(a)] = 1. - epsilon + explore_prob
    np.testing.assert_allclose(np.sum(action_probs), 1)
    # A single multinomial trial yields a one-hot vector; argmax recovers
    # the index that was drawn.
    one_hot = np.random.multinomial(1, action_probs)
    return one_hot.argmax()

In [4]:
def init_q_value():
    """Rebind the global ``q_s_a`` table to fresh random values.

    The table has one row per state and one column per action of ``env``.
    """
    global q_s_a
    table_shape = (env.observation_space.n, env.action_space.n)
    q_s_a = np.random.rand(*table_shape)


# Create the initial table, then show its shape and one sampled action.
init_q_value()

print(q_s_a.shape)
print(epsilon_greedy_policy(q_s_a[0], 0.1))

(500, 6)
1


### 动作选择:

In [5]:
def choose_action(obs, greedy=False):
    """Pick an action for state ``obs`` from the global ``q_s_a`` table.

    Args:
        obs: state index selecting a row of ``q_s_a``.
        greedy: when true, return the arg-max action; otherwise sample
            epsilon-greedily with epsilon fixed at 0.1.

    Returns:
        The chosen action index.
    """
    action_values = q_s_a[obs]
    if greedy:
        return np.argmax(action_values)
    return epsilon_greedy_policy(action_values, 0.1)

### 更新 q_value

$$
Q(S, A) \leftarrow Q(S,A) + \alpha[R + \gamma\max_aQ(S', a) - Q(S, A)]
$$

In [6]:
def update_q_value(s, a, alpha, gamma, reward, s_next):
    """Apply one Q-learning backup to the global ``q_s_a`` table in place.

    Q(s, a) <- Q(s, a) + alpha * (reward + gamma * max_a' Q(s_next, a') - Q(s, a))

    Args:
        s: state index of the transition.
        a: action index taken in ``s``.
        alpha: step size.
        gamma: discount factor.
        reward: reward observed for the transition.
        s_next: successor state index, or ``None`` for a terminal
            transition, whose bootstrap value is taken as 0.
    """
    # Compare against None explicitly: state index 0 is falsy, yet it is a
    # real state whose value must still be bootstrapped from.
    next_value = 0 if s_next is None else np.max(q_s_a[s_next])
    td_target = reward + gamma * next_value
    q_s_a[s][a] += alpha * (td_target - q_s_a[s][a])

### 每个 episode:

In [7]:
def run_episode(max_step, n_times, alpha=0.1, gamma=0.9):
    """Run one Dyna-Q episode: act, learn from the real step, then plan.

    Every real step performs one direct Q update, stores the transition in
    the global ``envModel``, and then performs ``n_times`` extra Q updates
    on transitions sampled back from that model.

    Args:
        max_step: maximum number of real environment steps.
        n_times: number of planning updates after each real step.
        alpha: step size used for both real and planning updates.
        gamma: discount factor used for both real and planning updates.
    """
    s = env.reset()
    for _step in range(max_step):
        a = choose_action(s)
        s_, reward, done, _info = env.step(a)

        # A finished episode is recorded with no successor state, both for
        # the direct update and in the model used for planning.
        s_next = None if done else s_
        update_q_value(s, a, alpha, gamma, reward, s_next)
        envModel.store_transition(s, a, reward, s_next)

        # Planning: replay transitions drawn from the learned model.
        for _plan in range(n_times):
            s_sim, a_sim = envModel.sample_s_a()
            r_sim, s_next_sim = envModel.get_r_s_(s_sim, a_sim)
            update_q_value(s_sim, a_sim, alpha, gamma, r_sim, s_next_sim)

        if done:
            break
        s = s_

In [8]:
# Start from an empty model and a freshly randomized Q-table.
init_model()
init_q_value()
# Print the table before and after one episode (at most 100 real steps,
# 10 planning updates per step) to show which entries were updated.
print(q_s_a)
run_episode(100, 10)
print(q_s_a)

[[0.74869652 0.15671029 0.22323155 0.47084724 0.81668545 0.61839826]
 [0.20904312 0.99214169 0.54175797 0.5814712  0.93375944 0.13073146]
 [0.67290178 0.7416259  0.7656646  0.44903566 0.70600245 0.30681361]
 ...
 [0.27952323 0.26565864 0.90590145 0.57269107 0.53959644 0.07835126]
 [0.22998933 0.1997635  0.58645426 0.36092565 0.57610072 0.35979278]
 [0.77372826 0.9912789  0.33209604 0.64691804 0.28111215 0.8573636 ]]
[[ 0.74869652  0.15671029  0.22323155  0.47084724  0.81668545  0.61839826]
 [ 0.20904312  0.20283594  0.32677129  0.47565649 -0.04842492  0.13073146]
 [ 0.67290178  0.7416259   0.7656646   0.44903566  0.70600245  0.30681361]
 ...
 [ 0.27952323  0.26565864  0.90590145  0.57269107  0.53959644  0.07835126]
 [ 0.22998933  0.1997635   0.58645426  0.36092565  0.57610072  0.35979278]
 [ 0.77372826  0.9912789   0.33209604  0.64691804  0.28111215  0.8573636 ]]


### 训练

In [9]:
def train(times):
    """Train from scratch for ``times`` episodes.

    The model and the Q-table are both reset first, so earlier training is
    discarded.  Each episode runs at most 100 real steps with 10 planning
    updates per step.
    """
    init_model()
    init_q_value()
    for _ in range(times):
        run_episode(max_step=100, n_times=10)

In [10]:
# Train for 100 episodes.
train(100)

### 测试

In [11]:
def _rollout_return(pick_action, max_step):
    """Play one episode of at most ``max_step`` steps and return the summed reward.

    ``pick_action`` is called with the current observation before every step.
    """
    obs = env.reset()
    total = 0
    for _ in range(max_step):
        obs, reward, done, _info = env.step(pick_action(obs))
        total = total + reward
        if done:
            break
    return total


def test(max_step):
    """Print the episode return of random actions, then of the greedy learned policy.

    Args:
        max_step: maximum number of steps for each of the two episodes.
    """
    random_return = _rollout_return(lambda _obs: env.action_space.sample(), max_step)
    print('random action', random_return)

    greedy_return = _rollout_return(lambda obs: choose_action(obs, True), max_step)
    print('dyna q action', greedy_return)

In [33]:
# Compare the random and greedy returns, each episode capped at 100 steps.
test(100)

random action -388
dyna q action -100
