# CrossEntropy

初始化 gym 环境

In [1]:
import gym
import numpy as np
# Seed NumPy's global RNG so the theta sampling later in the notebook is repeatable.
np.random.seed(12345)

# Create the LunarLander environment and seed it with the same value.
env = gym.make('LunarLander-v2')
env.seed(seed=12345)

# Show the action and observation spaces; their sizes determine the size of theta.
print('action_space:', env.action_space)
print('observation_space:', env.observation_space)

# Draw one random element from each space as an example.
print('action sample:', env.action_space.sample())
print('observation sample:', env.observation_space.sample())

action_space: Discrete(4)
observation_space: Box(8,)
action sample: 2
observation sample: [ 1.1734517  -0.73084885  2.3164437   1.2350931  -0.19862553  0.8295835
 -1.9955839   0.2850155 ]


定义 $\theta$ :
$$
\theta_i  =  \{\mathbf{W}_i^{(t)}, \mathbf{b}_i^{(t)} \} \\
$$

In [2]:
# Number of weight entries: one per (observation dimension, action) pair.
theta_w_size = env.observation_space.shape[0] * env.action_space.n
# Number of bias entries: one per action.
theta_b_size = env.action_space.n
# A flat theta vector stores the weights first, followed by the biases.
theta_size = theta_w_size + theta_b_size

def init_theta():
    """Reset the global sampling distribution over theta.

    Sets the module-level ``theta_mu`` to ``theta_size`` uniform random values
    drawn from [0, 1) and ``theta_sigma`` to a vector of ones of the same
    length. Returns nothing; the result is only visible through the globals.
    """
    global theta_mu, theta_sigma
    theta_mu = np.random.rand(theta_size)
    # Start every component with unit spread.
    theta_sigma = np.ones_like(theta_mu)

# Initialise the global distribution parameters and display them.
init_theta()
print(theta_mu)
print(theta_sigma)

[0.92961609 0.31637555 0.18391881 0.20456028 0.56772503 0.5955447
 0.96451452 0.6531771  0.74890664 0.65356987 0.74771481 0.96130674
 0.0083883  0.10644438 0.29870371 0.65641118 0.80981255 0.87217591
 0.9646476  0.72368535 0.64247533 0.71745362 0.46759901 0.32558468
 0.43964461 0.72968908 0.99401459 0.67687371 0.79082252 0.17091426
 0.02684928 0.80037024 0.90372254 0.02467621 0.49174732 0.52625517]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


### 1. 产生 num 个基于 ($\mu, \Sigma$) 分布的 $\theta$ :
$$
\theta_i  \sim  \mathcal{N}(\mu^{(t)}, \Sigma^{(t)})
$$

In [3]:
def generate_theta(num):
    """Draw ``num`` candidate theta vectors from the current distribution.

    The samples come from a multivariate normal whose mean is the global
    ``theta_mu`` and whose covariance is the diagonal matrix built from the
    global ``theta_sigma``.

    Returns:
        Array with one sampled theta per row (``num`` rows).
    """
    # theta_sigma supplies the diagonal of the covariance; off-diagonals are zero.
    covariance = np.diag(theta_sigma)
    return np.random.multivariate_normal(mean=theta_mu, cov=covariance, size=num)

In [4]:
# Example: sample 500 theta vectors from the current (theta_mu, theta_sigma).
print(generate_theta(500))

[[ 0.35252879  0.44049683  0.48653237 ...  0.04686081  1.25011046
  -0.13426916]
 [ 1.79219618  0.30634365  0.23392817 ...  2.02837357  0.52135747
   1.32150832]
 [ 1.04772585 -0.43215599  0.76888855 ...  0.00766917 -0.73239797
  -1.27458475]
 ...
 [ 0.11282635  1.61699009 -0.38934844 ... -0.27681193  1.36009526
   1.36990173]
 [-0.39932985  0.56188765  0.08862742 ... -1.51137967  0.35013735
   0.95123173]
 [ 0.8702279  -0.94310184 -0.89758871 ... -0.68211973  1.60451904
   0.61878111]]


### 2. 基于抽样的 $\theta$ 中的 W 和 b 对 observation 作出行动

$$
\mathbf{z}_i = \text{obs} \cdot \mathbf{W}_i + \mathbf{b}_i \\
p(a_i^{(t + 1)})  =  \frac{e^{\mathbf{z}_i}}{\sum_j e^{z_{ij}}}  \\
a^{(t + 1)}  =  \arg \max_j p(a_j^{(t+1)})
$$

In [5]:
def act(obs, theta):
    """Pick the greedy action of a linear-softmax policy.

    Args:
        obs: observation vector with ``env.observation_space.shape[0]`` entries.
        theta: flat parameter vector; the first ``theta_w_size`` entries are
            the weight matrix (observation dim x number of actions, row-major),
            the remaining entries are the per-action biases.

    Returns:
        Index of the action with the highest softmax probability.
    """
    W = theta[: theta_w_size].reshape(env.observation_space.shape[0], env.action_space.n)
    b = theta[theta_w_size:]
    z = obs @ W + b
    # Shift by the largest logit before exponentiating. Softmax is invariant to
    # this shift, and it stops np.exp from overflowing to inf, which would turn
    # the probabilities into NaN and make argmax return the wrong action.
    exp_z = np.exp(z - np.max(z))
    p_a = exp_z / np.sum(exp_z)
    return p_a.argmax()

In [6]:
# Example: choose an action for a random observation using one sampled theta.
print(act(env.observation_space.sample(), generate_theta(1)[0]))

2


### 3. 根据获取的 reward 排序,取抽样 $\theta$ 中 reward 最高的 top n 来更新 $\mu$ 和 $\Sigma$
$$
            \mu^{(t+1)}  =  \text{avg}(\texttt{best_thetas}^{(t)}) \\
            \Sigma^{(t+1)}  =  \text{var}(\texttt{best_thetas}^{(t)})
$$

In [7]:
def update_theta(rewards, theta_samples, top_n):
    """Refit the global sampling distribution to the best-scoring samples.

    Args:
        rewards: one score per row of ``theta_samples``.
        theta_samples: array with one candidate theta per row.
        top_n: number of highest-reward samples to keep.

    Overwrites the globals ``theta_mu`` and ``theta_sigma`` with the
    per-component mean and variance of the kept samples. Returns nothing.
    """
    global theta_mu, theta_sigma
    # Indices ordered from highest reward to lowest.
    best_first = np.argsort(rewards)[::-1]
    elite = theta_samples[best_first[:top_n]]
    theta_mu = elite.mean(axis=0)
    theta_sigma = elite.var(axis=0)

In [8]:
# Example: refit on 100 sampled thetas scored with random rewards, keeping the top 30.
# Note that this overwrites the global theta_mu / theta_sigma.
update_theta(np.random.randn(100), generate_theta(100), 30)
print(theta_mu)
print(theta_sigma)

[ 0.92300045  0.39735124 -0.0920831   0.14242817  0.67636063  0.87452501
  1.01921441  0.62146951  0.78846477  0.86587383  0.34293731  1.1085286
  0.03555756 -0.07000202  0.63632119  0.7508391   0.93475531  0.66595909
  1.0139726   0.34905225  0.85527046  0.99458628  0.30325182  0.32247557
  0.43296203  0.59528167  0.84574627  0.77140558  0.87668468 -0.16143913
  0.35304345  0.95974224  0.87592094  0.07070885  0.5045008   0.47535789]
[0.58899074 0.89206837 1.12078998 0.94421964 0.83867856 1.0240016
 0.60023817 0.6815115  0.99665765 0.86677778 1.21293502 0.61111779
 0.93529723 0.74661631 1.29766816 1.28355831 0.72910956 0.62162613
 1.08281604 1.03923427 1.3068232  1.15538615 0.68602998 1.01640456
 0.77232201 0.86430182 1.02091305 0.77309621 1.54108435 1.0795563
 1.06339939 1.05233931 0.97429413 0.71725636 0.93071046 0.98550216]


---
## 在每个 episode 中基于给定的 $\theta$ 计算奖励

In [9]:
def episode(max_step, theta):
    """Roll out one episode with the policy defined by ``theta``.

    Args:
        max_step: maximum number of environment steps to take.
        theta: flat policy parameters forwarded to ``act``.

    Returns:
        Sum of the rewards collected until the environment reports ``done``
        or ``max_step`` steps have been taken.
    """
    state = env.reset()
    score = 0
    for _ in range(max_step):
        state, gained, finished, _info = env.step(act(state, theta))
        score = score + gained
        if finished:
            # The environment ended the episode before the step cap.
            return score
    return score

In [10]:
# Example: total reward of one episode (at most 100 steps) using one sampled theta.
print(episode(100, generate_theta(1)[0]))

-182.01952729832269


### 采样 num 个 $\theta$,统计各自的奖励并更新 ($\mu, \Sigma$)

In [11]:
def run_episode(max_step, num, percent):
    """Run one sample / evaluate / update round.

    Args:
        max_step: step cap for each rollout.
        num: number of theta vectors to sample and evaluate.
        percent: fraction of ``num`` kept as the top samples
            (truncated to an integer count).

    Samples ``num`` thetas, scores each with one call to ``episode``, then
    hands the scores and samples to ``update_theta``. Returns nothing.
    """
    candidates = generate_theta(num)
    scores = [episode(max_step, candidate) for candidate in candidates]
    keep = int(num * percent)
    update_theta(scores, candidates, keep)

In [12]:
# Reset the distribution and print it, then run a single round
# (100 samples, at most 100 steps each, keep the top 30%) and print the refit result.
init_theta()
print(theta_mu)
print(theta_sigma)
run_episode(100, 100, 0.3)
print(theta_mu)
print(theta_sigma)

[0.39105229 0.33700916 0.39288561 0.93504311 0.87138117 0.35151889
 0.32928704 0.1288713  0.7653364  0.30691525 0.86515211 0.24593486
 0.66560529 0.43184597 0.23778706 0.48015849 0.90004859 0.151403
 0.4172862  0.84758819 0.28995678 0.99159366 0.78188259 0.182274
 0.96475878 0.65636505 0.35199606 0.82352041 0.98768636 0.01190473
 0.83616786 0.09013997 0.81979493 0.0856294  0.82102356 0.37369443]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
[ 0.72269626  0.41859999  0.36597012  0.84772614  1.45241079  0.28702371
  0.06688839 -0.02115669  1.00589196  0.52202164  0.94974801 -0.05372265
  0.74900946  0.68544614  0.26640543  0.64872813  0.8390039   0.05673413
  0.30463409  1.10742726  0.3576464   0.99990112  1.02629469  0.58763278
  1.23125389  0.51131915  0.51875215  0.59345194  1.0823635   0.01459871
  0.59603492  0.31340022  1.04802385  0.03962114  0.60653672  0.10317206]
[1.14389424 0.8845821  1.64856519 0.81602161 0.8384

### 训练 30 轮,每轮采样 100 个 $\theta$(即每轮 100 个 episodes):

In [32]:
def train(num_iterations=30, max_step=100, num=100, percent=0.3):
    """Reset the sampling distribution and run repeated update rounds.

    Args:
        num_iterations: number of sample / evaluate / update rounds.
        max_step: step cap passed to ``run_episode`` for every rollout.
        num: number of theta samples drawn in each round.
        percent: fraction of the samples kept as the top set in each round.

    The defaults reproduce the schedule that used to be hard-coded
    (30 rounds of ``run_episode(100, 100, 0.3)``), so ``train()`` with no
    arguments behaves as before. Returns nothing.
    """
    # Always start from a fresh distribution so repeated calls are independent.
    init_theta()
    for _ in range(num_iterations):
        run_episode(max_step, num, percent)

In [33]:
# Run the full training loop; this re-initialises and then overwrites the
# global theta_mu / theta_sigma.
train()

### 测试训练结果(与随机动作策略对比)

In [16]:
def test():
    obs = env.reset()
    total_reward = 0
    for i in range(100):
        obs, r, done, info = env.step(env.action_space.sample())
        total_reward = total_reward + r
        if done:
            break
    return total_reward

In [65]:
# First line: total reward of the random-action baseline.
print(test())
# Second line: total reward of one theta sampled from the current (trained) distribution.
print(episode(100, generate_theta(1)[0]))

-59.33493326628151
91.34635743806807
