# Monte-Carlo

创建 gym 环境

In [1]:
import gym
import numpy as np
# One seed value, handed to both NumPy's global RNG and the environment.
SEED = 12345
np.random.seed(SEED)

env = gym.make('Copy-v0')
env.seed(seed=SEED)

# Describe both spaces first, then draw one random sample from each.
action_space = env.action_space
observation_space = env.observation_space
print('action_space', action_space)
print('observation_space', observation_space)

print(action_space.sample())
print(observation_space.sample())

action_space Tuple(Discrete(2), Discrete(2), Discrete(5))
observation_space Discrete(6)
(1, 1, 3)
2


这里的 action_space 是由 3 个离散空间组成的元组(Tuple),共有 2 × 2 × 5 = 20 种动作组合。为了能用一维下标索引 Q 表,需要把每个动作元组与一个整数编号一一对应:

(0,0,0) <=> 0   
(0,0,1) <=> 1   
...   
(1, 1, 4) <=> 19   

In [2]:
from itertools import product
# Every composite action, enumerated by itertools.product (last component
# varies fastest); an action's position in this list is its integer id.
action_list = list(product(*(range(act_space.n) for act_space in env.action_space)))

# Reverse lookup table: action tuple -> integer id.
num_action_dict = {act: idx for idx, act in enumerate(action_list)}


def num2action(num):
    """Return the action tuple stored at index ``num`` of ``action_list``."""
    return action_list[num]


def action2num(action):
    """Return the integer id that ``num_action_dict`` holds for ``action``."""
    return num_action_dict[action]

### $\epsilon$-Greedy 探索

- 最简单的方法确保持续探索
- 所有 m 个动作都以非零概率被选中
- $1 - \epsilon$ 概率: 选择贪婪动作
- $\epsilon$ 概率: 选择随机动作

则:
$$
\pi(a|s) = \begin{cases} \frac\epsilon m + 1 - \epsilon & (a = a^* = \mathop{argmax}_{a' \in A} Q(s, a')) \\ \frac\epsilon m & (otherwise) \end{cases}
$$

In [3]:
def epsilon_greedy_policy(a, epsilon):
    """Sample an action with the epsilon-greedy rule.

    Every entry receives probability ``epsilon / m`` and the entry with the
    largest value in ``a`` receives the remaining ``1 - epsilon`` on top.

    Args:
        a: NumPy array of action values for one state; must be non-empty.
        epsilon: exploration rate used to build the probabilities.

    Returns:
        Whatever ``num2action`` returns for the sampled action index.

    Raises:
        AssertionError: if ``a`` is empty or the probabilities do not sum to 1.
    """
    n_actions = len(a)
    assert n_actions > 0, 'a length must large then 0'

    explore_prob = epsilon / n_actions
    probs = np.full(n_actions, explore_prob, dtype=float)
    # The greedy entry keeps its exploration share plus the exploitation mass.
    probs[a.argmax()] = 1. - epsilon + explore_prob
    np.testing.assert_allclose(np.sum(probs), 1)

    # A single multinomial trial yields a one-hot vector; its argmax is the draw.
    one_hot_draw = np.random.multinomial(1, probs)
    return num2action(one_hot_draw.argmax())

In [4]:
# Randomly initialised Q table: one row per observation, one column per
# composite action.
num_states = env.observation_space.n
num_actions = np.prod([act_space.n for act_space in env.action_space])
q_s_a = np.random.rand(num_states, num_actions)
print(q_s_a.shape)
print(epsilon_greedy_policy(q_s_a[0], 0.1))

(6, 20)
(1, 1, 3)


In [5]:
def choose_action(state, epsilon=0.1):
    """Pick an action for ``state`` from the notebook-level Q table ``q_s_a``.

    Args:
        state: row index into ``q_s_a``.
        epsilon: exploration rate forwarded to ``epsilon_greedy_policy``.
            Defaults to 0.1, the value that used to be hard-coded here.

    Returns:
        The action returned by ``epsilon_greedy_policy``.
    """
    return epsilon_greedy_policy(q_s_a[state], epsilon)

### 使用 $\epsilon$-Greedy 完成 1 个 episode:

  $$
  \{S_1, A_1, R_2, ..., S_T\} \sim \pi
  $$

In [6]:
def run_episode(max_step):
    """Play one episode in ``env``, choosing actions with ``choose_action``.

    Args:
        max_step: upper bound on the number of environment steps.

    Returns:
        A pair ``(history_s_a, history_reward)``: the visited
        ``(state, action number)`` pairs and the reward received after each
        of them, in step order.
    """
    visited_pairs, rewards = [], []

    state = env.reset()
    for _ in range(max_step):
        chosen = choose_action(state)
        # Record the state the action was taken in, before stepping.
        visited_pairs.append((state, action2num(chosen)))

        state, step_reward, finished, _info = env.step(chosen)
        rewards.append(step_reward)

        if finished:
            break

    return visited_pairs, rewards

In [7]:
# Roll out one episode (at most 100 steps) and show its raw trajectory.
sample_episode = run_episode(100)
print(sample_episode)

([(2, 13), (2, 13), (2, 13), (2, 1), (2, 13), (2, 13), (5, 15)], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.5])


### 对每个 episode 中首次出现的状态-动作对 $(S_t, A_t)$ 进行更新(first-visit),其中 $G_t$ 是从时刻 $t$ 到 episode 结束的累计回报:
  $$
  \begin{align*}
  & N(S_t, A_t) = N(S_t, A_t) + 1 \\
  & Q(S_t, A_t) = Q(S_t, A_t) + \frac{1}{N(S_t, A_t)}(G_t - Q(S_t, A_t))
  \end{align*}
  $$

In [8]:
def update_q_value(history_s_a, history_reward, n_s_a, q_table=None):
    """Apply a first-visit Monte-Carlo update for one finished episode.

    For each distinct (state, action) pair of the episode, the return ``G_t``
    is the undiscounted sum of rewards from the pair's first occurrence to the
    end of the episode. The visit count is incremented and the Q value is
    moved towards ``G_t`` by ``1 / N(s, a)`` (incremental mean).

    Args:
        history_s_a: sequence of ``(state, action number)`` pairs, in step order.
        history_reward: rewards aligned with ``history_s_a``.
        n_s_a: visit-count table indexed as ``n_s_a[s][a]``; updated in place.
        q_table: action-value table indexed as ``q_table[s][a]``; updated in
            place. When omitted, the notebook-level ``q_s_a`` is used, which
            is what this function always did before the parameter existed.

    Returns:
        None. Both tables are modified in place.
    """
    if q_table is None:
        # Backward-compatible default: fall back to the notebook's global table.
        q_table = q_s_a

    # Step index of the first occurrence of every pair, found in one pass
    # instead of one list.index() scan per pair.
    first_visit = {}
    for t, s_a in enumerate(history_s_a):
        first_visit.setdefault(s_a, t)

    for (s, a), t in first_visit.items():
        g_t = np.sum(history_reward[t:])
        n_s_a[s][a] = n_s_a[s][a] + 1
        q_table[s][a] = q_table[s][a] + 1 / n_s_a[s][a] * (g_t - q_table[s][a])

In [9]:
# Fresh visit counts with one cell per (observation, composite action), then
# a single episode pushed through the update as a smoke test.
table_shape = (env.observation_space.n, np.prod([act_space.n for act_space in env.action_space]))
n_s_a = np.zeros(table_shape)
history_s_a, history_reward = run_episode(100)
update_q_value(history_s_a, history_reward, n_s_a)

### 训练 10000 个 episodes:

In [10]:
# Reset the visit counts, then learn from 10000 sampled episodes of at most
# 100 steps each.
n_states = env.observation_space.n
n_actions = np.prod([act_space.n for act_space in env.action_space])
n_s_a = np.zeros((n_states, n_actions))

for episode_idx in range(10000):
    history_s_a, history_reward = run_episode(100)
    update_q_value(history_s_a, history_reward, n_s_a)

# Visit counts first, learned action values second.
print(n_s_a)
print(q_s_a)

[[6.800e+01 6.100e+01 5.700e+01 6.200e+01 5.400e+01 1.260e+02 7.100e+01
  4.500e+01 6.200e+01 6.200e+01 7.100e+01 7.300e+01 5.200e+01 8.000e+01
  4.900e+01 6.279e+03 5.700e+01 5.100e+01 5.900e+01 6.900e+01]
 [6.800e+01 6.400e+01 5.900e+01 7.000e+01 6.000e+01 6.200e+01 4.800e+01
  7.200e+01 5.800e+01 6.000e+01 6.200e+01 7.400e+01 5.100e+01 7.400e+01
  5.200e+01 6.300e+01 6.286e+03 8.000e+01 5.500e+01 7.000e+01]
 [6.900e+01 7.000e+01 8.400e+01 6.100e+01 6.100e+01 7.000e+01 7.500e+01
  1.350e+02 6.400e+01 6.900e+01 6.200e+01 6.100e+01 5.800e+01 5.700e+01
  5.900e+01 4.600e+01 7.500e+01 6.203e+03 4.700e+01 5.100e+01]
 [6.700e+01 6.900e+01 5.800e+01 5.400e+01 7.000e+01 5.400e+01 6.200e+01
  6.100e+01 8.600e+01 5.800e+01 5.500e+01 5.200e+01 7.200e+01 5.300e+01
  6.200e+01 5.400e+01 6.900e+01 6.300e+01 6.224e+03 5.400e+01]
 [5.400e+01 7.600e+01 6.600e+01 5.800e+01 6.700e+01 6.500e+01 6.400e+01
  6.400e+01 6.300e+01 7.300e+01 5.400e+01 6.900e+01 6.100e+01 6.800e+01
  4.600e+01 5.500e+01 6.200e

In [11]:
# Roll out one more episode (at most 100 steps) with the trained Q table.
trained_episode = run_episode(100)
print(trained_episode)

([(2, 17), (2, 17), (4, 19), (1, 16), (0, 15), (0, 15), (4, 19), (4, 19)], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
