# Atari 游戏 SeaquestDeterministic-v4

In [1]:
%matplotlib inline
import os
import sys
import time
import itertools
import logging

import numpy as np
np.random.seed(0)
import pandas as pd
import gym
import tensorflow as tf
from tensorflow import keras
from PIL import Image
import matplotlib.pyplot as plt

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG,
        format='%(asctime)s [%(levelname)s] %(message)s')

2019-01-01 07:14:03,943 [DEBUG] Loaded backend module://ipykernel.pylab.backend_inline version unknown.


In [2]:
# env_spec_id = 'BreakoutDeterministic-v4'
# env_spec_id = 'PongDeterministic-v4'
env_spec_id = 'SeaquestDeterministic-v4'
# env_spec_id = 'SpaceInvadersDeterministic-v4'
# env_spec_id = 'BeamRiderDeterministic-v4'
env = gym.make(env_spec_id)
print('观测空间 = {}'.format(env.observation_space))
print('动作空间 = {}'.format(env.action_space))
print('回合最大步数 = {}'.format(env._max_episode_steps))
env.seed(0)

观测空间 = Box(210, 160, 3)
动作空间 = Discrete(18)
回合最大步数 = 100000


[0, 592379725]

### 深度 Q 网络智能体
经验回放

In [3]:
class DQNReplayer:
    def __init__(self, capacity):
        self.memory = pd.DataFrame(index=range(capacity),
                columns=['observation', 'action', 'reward',
                'next_observation', 'done'])
        self.i = 0
        self.count = 0
        self.capacity = capacity
    
    def store(self, *args):
        self.memory.loc[self.i] = args
        self.i = (self.i + 1) % self.capacity
        self.count = min(self.count + 1, self.capacity)
        
    def sample(self, size):
        indices = np.random.choice(self.count, size=size)
        return tuple(np.stack(self.memory.loc[indices, field]) for \
                field in self.memory.columns)

智能体

In [4]:
class DQNAgent:
    def __init__(self, env, input_shape, learning_rate=0.00025,
            load_path=None, gamma=0.99,
            replay_memory_size=1000000, batch_size=32,
            replay_start_size=0,
            epsilon=1., epsilon_decrease_rate=9e-7, min_epsilon=0.1,
            random_initial_steps=0,
            clip_reward=True, rescale_state=True,
            update_freq=1, target_network_update_freq=1):
        
        self.action_n = env.action_space.n
        self.gamma = gamma
        
        # 经验回放参数
        self.replay_memory_size = replay_memory_size
        self.replay_start_size = replay_start_size
        self.batch_size = batch_size
        self.replayer = DQNReplayer(replay_memory_size)
        
        # 图像输入参数
        self.img_shape = (input_shape[-1], input_shape[-2])
        self.img_stack = input_shape[-3]
        
        # 探索参数
        self.epsilon = epsilon
        self.epsilon_decrease_rate = epsilon_decrease_rate
        self.min_epsilon = min_epsilon
        self.random_initial_steps = random_initial_steps
        
        self.clip_reward = clip_reward
        self.rescale_state = rescale_state
        
        self.update_freq = update_freq
        self.target_network_update_freq = target_network_update_freq
        
        # 评估网络
        self.evaluate_net = self.build_network(
                input_shape=input_shape, output_size=self.action_n,
                conv_activation=tf.nn.relu,
                fc_hidden_sizes=[512,], fc_activation=tf.nn.relu,
                learning_rate=learning_rate, load_path=load_path)
        self.evaluate_net.summary() # 输出网络结构
        # 目标网络
        self.target_net = self.build_network(
                input_shape=input_shape, output_size=self.action_n,
                conv_activation=tf.nn.relu,
                fc_hidden_sizes=[512,], fc_activation=tf.nn.relu,
                )
        self.update_target_network()
        
        # 初始化计数值
        self.step = 0
        self.fit_count = 0


    def build_network(self, input_shape, output_size, conv_activation,
            fc_hidden_sizes, fc_activation, output_activation=None,
            learning_rate=0.001, load_path=None):
        # 网络输入格式: (样本, 通道, 行, 列)
        model = keras.models.Sequential()
        # tf 要求从 (样本, 通道, 行, 列) 改为 (样本, 行, 列, 通道)
        model.add(keras.layers.Permute((2, 3, 1), input_shape=input_shape))
        
        # 卷积层
        model.add(keras.layers.Conv2D(32, 8, strides=4,
                activation=conv_activation))
        model.add(keras.layers.Conv2D(64, 4, strides=2,
                activation=conv_activation))
        model.add(keras.layers.Conv2D(64, 3, strides=1,
                activation=conv_activation))
        
        model.add(keras.layers.Flatten())
        
        # 全连接层
        for hidden_size in fc_hidden_sizes:
            model.add(keras.layers.Dense(hidden_size,
                    activation=fc_activation))
        model.add(keras.layers.Dense(output_size,
                activation=output_activation))

        if load_path is not None:
            logging.info('载入网络权重 {}.'.format(load_path))
            model.load_weights(load_path)

        try: # tf2
            optimizer = keras.optimizers.RMSprop(learning_rate, 0.95,
                    momentum=0.95, epsilon=0.01)
        except: # tf1
            optimizer = tf.train.RMSPropOptimizer(learning_rate, 0.95,
                    momentum=0.95, epsilon=0.01)
        model.compile(loss=keras.losses.mse, optimizer=optimizer)
        return model
        
    def get_next_state(self, state=None, observation=None):
        img = Image.fromarray(observation, 'RGB') 
        img = img.resize(self.img_shape).convert('L') # 改大小,变灰度
        img = np.asarray(img.getdata(), dtype=np.uint8
                ).reshape(img.size[1], img.size[0]) # 转成 np.array
        
        # 堆叠图像
        if state is None:
            next_state = np.array([img,] * self.img_stack) # 初始化
        else:
            next_state = np.append(state[1:], [img,], axis=0)
        return next_state
    
    def decide(self, state, test=False, step=None):
        if step is not None and step < self.random_initial_steps:
            epsilon = 1.
        elif test:
            epsilon = 0.05
        else:
            epsilon = self.epsilon
        if np.random.rand() < epsilon:
            action = np.random.choice(self.action_n)
        else:
            if self.rescale_state:
                state = state / 128. - 1.
            q_values = self.evaluate_net.predict(state[np.newaxis])[0]
            action = np.argmax(q_values)
        return action

    def learn(self, state, action, reward, next_state, done):
        self.replayer.store(state, action, reward, next_state, done)

        self.step += 1
        
        if self.step % self.update_freq == 0 and \
                self.replayer.count >= self.replay_start_size:
            states, actions, rewards, next_states, dones = \
                    self.replayer.sample(self.batch_size) # 回放

            if self.rescale_state:
                states = states / 128. - 1.
                next_states = next_states / 128. - 1.
            if self.clip_reward:
                rewards = np.clip(rewards, -1., 1.)
            
            next_qs = self.target_net.predict(next_states)
            next_max_qs = next_qs.max(axis=-1)
            targets = self.evaluate_net.predict(states)
            targets[range(self.batch_size), actions] = rewards + \
                    self.gamma * next_max_qs * (1. - dones)

            h = self.evaluate_net.fit(states, targets, verbose=0)
            self.fit_count += 1
            
            if self.fit_count % 100 == 0:
                logging.info('训练 {}, 回合 {}, 存储大小 {}, 损失 {}' \
                        .format(self.fit_count, self.epsilon,
                        self.replayer.count, h.history['loss'][0]))
            
            if self.fit_count % self.target_network_update_freq == 0:
                self.update_target_network()
        
        # 更新 epsilon 的值：线性下降
        if self.step >= self.replay_start_size:
            self.epsilon = max(self.epsilon - self.epsilon_decrease_rate,
                               self.min_epsilon)

    def update_target_network(self): # 更新目标网络
        self.target_net.set_weights(self.evaluate_net.get_weights())
        logging.info('目标网络已更新')

    def save_network(self, path): # 保存网络
        dirname = os.path.dirname(save_path)
        if not os.path.exists(dirname):
            os.makedirs(dirname)
            logging.info('创建文件夹 {}'.format(dirname))
        self.evaluate_net.save_weights(path)
        logging.info('网络权重已保存 {}'.format(path))

测试

In [5]:
def test(env, agent, episodes=100, render=False, verbose=True):
    steps, episode_rewards = [], []
    for episode in range(episodes):
        episode_reward = 0
        observation = env.reset()
        state = agent.get_next_state(None, observation)
        for step in itertools.count():
            if render:
                env.render()
            action = agent.decide(state, test=True, step=step)
            observation, reward, done, info = env.step(action)
            state = agent.get_next_state(state, observation)
            episode_reward += reward
            if done:
                break
        step += 1
        steps.append(step)
        episode_rewards.append(episode_reward)
        logging.info('[测试] 回合 {}: 步骤 {}, 奖励 {}, 步数 {}'
                .format(episode, step, episode_reward, np.sum(steps)))
            
    if verbose:
        logging.info('[测试小结] 步数: 平均 = {}, 最小 = {}, 最大 = {}.' \
                .format(np.mean(steps), np.min(steps), np.max(steps)))
        logging.info('[测试小结] 奖励: 平均 = {}, 最小 = {}, 最大 = {}' \
                .format(np.mean(episode_rewards), np.min(episode_rewards),
                np.max(episode_rewards)))
    return episode_rewards

参数设置

In [6]:
render = False
load_path = None
save_path = './output/' + env.unwrapped.spec.id + '-' + \
        time.strftime('%Y%m%d-%H%M%S') + '/model.h5'

"""
Nature 文章使用的参数, 运行极慢, 请勿轻易尝试
"""
input_shape = (4, 110, 84) # 输入网络大小
batch_size = 32
replay_memory_size = 1000000
target_network_update_freq = 10000
gamma = 0.99
update_freq = 4 # 训练网络的间隔
learning_rate = 0.00025 # 优化器学习率
epsilon = 1. # 初始探索率
min_epsilon = 0.1 # 最终探索率
epsilon_decrease = 9e-7 # 探索减小速度
replay_start_size = 50000 # 开始训练前的经验数
random_initial_steps = 30 # 每个回合开始时随机步数
frames = 50000000 # 整个算法的总训练步数
test_freq = 50000 # 验证智能体的步数间隔
test_episodes = 100 # 每次验证智能体的回合数


"""
小规模参数, 运行时间数小时, 有一点点训练效果
"""
batch_size = 32
replay_memory_size = 50000
target_network_update_freq = 4000
replay_start_size = 10000
random_initial_steps = 30
frames = 100000
test_freq = 25000
test_episodes = 50


# """
# 超小规模参数, 数分钟即可运行完, 基本没有训练效果
# """
# batch_size = 6
# replay_memory_size = 5000
# target_network_update_freq = 80
# replay_start_size = 500
# random_initial_steps = 30
# frames = 7500
# test_freq = 2500
# test_episodes = 10

训练

In [7]:
agent = DQNAgent(env, input_shape=input_shape, batch_size=batch_size,
        replay_memory_size=replay_memory_size,
        learning_rate=learning_rate, gamma=gamma,
        epsilon=epsilon, epsilon_decrease_rate=epsilon_decrease,
        min_epsilon=min_epsilon, random_initial_steps=random_initial_steps,
        load_path=load_path,
        update_freq=update_freq,
        target_network_update_freq=target_network_update_freq)

logging.info("训练开始")

frame = 0
max_mean_episode_reward = float("-inf")
for episode in itertools.count():
    observation = env.reset()
    episode_reward = 0
    state = agent.get_next_state(None, observation)
    for step in itertools.count():
        if render:
            env.render()
        frame += 1
        action = agent.decide(state, step=step)
        observation, reward, done, _ = env.step(action)
        next_state = agent.get_next_state(state, observation)
        episode_reward += reward
        agent.learn(state, action, reward, next_state, done)
        
        # 验证
        if frame % test_freq == 0 or \
                (done and (frame + 1) % test_freq == 0):
            test_episode_rewards = test(env=env,
                    agent=agent, episodes=test_episodes, render=render)
            if max_mean_episode_reward < np.mean(test_episode_rewards):
                max_mean_episode_reward = np.mean(test_episode_rewards)
                agent.save_network(save_path)
                path = save_path[:-2] + str(agent.fit_count) + '.h5'
                agent.save_network(path)
        
        if done:
            step += 1
            frame += 1
            break
        state = next_state
    
    logging.info("回合 {}, 步数 {}, 奖励 {}, 总步数 {}".format(
            episode, step, episode_reward, frame))
    
    if frame > frames:
        break

logging.info("训练结束")

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
permute (Permute)            (None, 110, 84, 4)        0         
_________________________________________________________________
conv2d (Conv2D)              (None, 26, 20, 32)        8224      
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 12, 9, 64)         32832     
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 10, 7, 64)         36928     
_________________________________________________________________
flatten (Flatten)            (None, 4480)              0         
_________________________________________________________________
dense (Dense)                (None, 512)               2294272   
_________________________________________________________________
dense_1 (Dense)              (None, 18)                9234      
Total para

2019-01-01 08:00:38,195 [INFO] 回合 51, 步数 507, 奖励 60.0, 总步数 27305
2019-01-01 08:01:10,460 [INFO] 训练 6900, 回合 0.9751609000008178, 存储大小 27600, 损失 0.001699072658084333
2019-01-01 08:01:26,647 [INFO] 回合 52, 步数 523, 奖励 80.0, 总步数 27829
2019-01-01 08:01:47,928 [INFO] 训练 7000, 回合 0.9748009000008296, 存储大小 28000, 损失 0.0017145485617220402
2019-01-01 08:02:05,888 [INFO] 回合 53, 步数 417, 奖励 40.0, 总步数 28247
2019-01-01 08:02:25,051 [INFO] 训练 7100, 回合 0.9744409000008415, 存储大小 28400, 损失 0.00018454679229762405
2019-01-01 08:02:52,496 [INFO] 回合 54, 步数 496, 奖励 40.0, 总步数 28744
2019-01-01 08:03:02,779 [INFO] 训练 7200, 回合 0.9740809000008533, 存储大小 28800, 损失 1.7928719898918644e-05
2019-01-01 08:03:32,295 [INFO] 回合 55, 步数 429, 奖励 40.0, 总步数 29174
2019-01-01 08:03:40,227 [INFO] 训练 7300, 回合 0.9737209000008652, 存储大小 29200, 损失 9.458746717427857e-06
2019-01-01 08:04:13,942 [INFO] 回合 56, 步数 435, 奖励 20.0, 总步数 29610
2019-01-01 08:04:18,484 [INFO] 训练 7400, 回合 0.973360900000877, 存储大小 29600, 损失 0.0017526044975966215
2019-01-01

2019-01-01 08:40:03,223 [INFO] 回合 93, 步数 594, 奖励 120.0, 总步数 49065
2019-01-01 08:40:33,839 [INFO] 训练 12300, 回合 0.9557209000014578, 存储大小 49200, 损失 1.6057316315709613e-05
2019-01-01 08:41:25,515 [INFO] 训练 12400, 回合 0.9553609000014697, 存储大小 49600, 损失 1.7378642951371148e-05
2019-01-01 08:41:32,166 [INFO] 回合 94, 步数 679, 奖励 100.0, 总步数 49745
2019-01-01 08:42:27,828 [INFO] [测试] 回合 0: 步骤 522, 奖励 140.0, 步数 522
2019-01-01 08:43:15,044 [INFO] [测试] 回合 1: 步骤 1066, 奖励 320.0, 步数 1588
2019-01-01 08:43:46,848 [INFO] [测试] 回合 2: 步骤 707, 奖励 180.0, 步数 2295
2019-01-01 08:44:12,806 [INFO] [测试] 回合 3: 步骤 589, 奖励 140.0, 步数 2884
2019-01-01 08:44:32,024 [INFO] [测试] 回合 4: 步骤 539, 奖励 120.0, 步数 3423
2019-01-01 08:45:00,330 [INFO] [测试] 回合 5: 步骤 805, 奖励 200.0, 步数 4228
2019-01-01 08:45:33,847 [INFO] [测试] 回合 6: 步骤 749, 奖励 180.0, 步数 4977
2019-01-01 08:46:04,012 [INFO] [测试] 回合 7: 步骤 676, 奖励 180.0, 步数 5653
2019-01-01 08:47:00,370 [INFO] [测试] 回合 8: 步骤 1259, 奖励 300.0, 步数 6912
2019-01-01 08:47:54,540 [INFO] [测试] 回合 9: 步骤 1207, 

2019-01-01 09:40:18,206 [INFO] 回合 115, 步数 624, 奖励 160.0, 总步数 60631
2019-01-01 09:40:56,892 [INFO] 训练 15200, 回合 0.9452809000018015, 存储大小 50000, 损失 0.0018040966242551804
2019-01-01 09:41:34,635 [INFO] 回合 116, 步数 563, 奖励 100.0, 总步数 61195
2019-01-01 09:41:51,421 [INFO] 训练 15300, 回合 0.9449209000018134, 存储大小 50000, 损失 0.0017157591646537185
2019-01-01 09:42:43,687 [INFO] 训练 15400, 回合 0.9445609000018252, 存储大小 50000, 损失 1.3229073374532163e-05
2019-01-01 09:42:55,382 [INFO] 回合 117, 步数 609, 奖励 140.0, 总步数 61805
2019-01-01 09:43:37,855 [INFO] 训练 15500, 回合 0.9442009000018371, 存储大小 50000, 损失 9.710726772027556e-06
2019-01-01 09:43:50,650 [INFO] 回合 118, 步数 409, 奖励 60.0, 总步数 62215
2019-01-01 09:44:31,759 [INFO] 训练 15600, 回合 0.943840900001849, 存储大小 50000, 损失 2.3481705284211785e-05
2019-01-01 09:44:56,408 [INFO] 回合 119, 步数 487, 奖励 120.0, 总步数 62703
2019-01-01 09:45:26,179 [INFO] 训练 15700, 回合 0.9434809000018608, 存储大小 50000, 损失 1.0584591109363828e-05
2019-01-01 09:46:20,846 [INFO] 训练 15800, 回合 0.943120900001

2019-01-01 10:27:07,081 [INFO] [测试] 回合 40: 步骤 328, 奖励 20.0, 步数 19323
2019-01-01 10:27:26,288 [INFO] [测试] 回合 41: 步骤 410, 奖励 60.0, 步数 19733
2019-01-01 10:27:45,515 [INFO] [测试] 回合 42: 步骤 435, 奖励 60.0, 步数 20168
2019-01-01 10:28:09,252 [INFO] [测试] 回合 43: 步骤 538, 奖励 80.0, 步数 20706
2019-01-01 10:28:22,799 [INFO] [测试] 回合 44: 步骤 328, 奖励 0.0, 步数 21034
2019-01-01 10:28:43,268 [INFO] [测试] 回合 45: 步骤 459, 奖励 60.0, 步数 21493
2019-01-01 10:29:00,666 [INFO] [测试] 回合 46: 步骤 418, 奖励 40.0, 步数 21911
2019-01-01 10:29:17,009 [INFO] [测试] 回合 47: 步骤 403, 奖励 0.0, 步数 22314
2019-01-01 10:29:38,776 [INFO] [测试] 回合 48: 步骤 507, 奖励 60.0, 步数 22821
2019-01-01 10:29:58,297 [INFO] [测试] 回合 49: 步骤 477, 奖励 0.0, 步数 23298
2019-01-01 10:29:58,299 [INFO] [测试小结] 步数: 平均 = 465.96, 最小 = 326, 最大 = 734.
2019-01-01 10:29:58,302 [INFO] [测试小结] 奖励: 平均 = 34.0, 最小 = 0.0, 最大 = 120.0
2019-01-01 10:29:58,333 [INFO] 回合 142, 步数 168, 奖励 0.0, 总步数 75002
2019-01-01 10:30:37,557 [INFO] 训练 18800, 回合 0.9323209000022282, 存储大小 50000, 损失 8.581890142522752e-0

2019-01-01 11:11:15,904 [INFO] 训练 23700, 回合 0.914680900002809, 存储大小 50000, 损失 1.3763721653958783e-05
2019-01-01 11:11:33,484 [INFO] 回合 178, 步数 602, 奖励 80.0, 总步数 95113
2019-01-01 11:12:07,915 [INFO] 训练 23800, 回合 0.9143209000028208, 存储大小 50000, 损失 0.00019114372844342142
2019-01-01 11:12:54,305 [INFO] 回合 179, 步数 619, 奖励 120.0, 总步数 95733
2019-01-01 11:13:00,521 [INFO] 训练 23900, 回合 0.9139609000028327, 存储大小 50000, 损失 1.2359452739474364e-05
2019-01-01 11:13:53,473 [INFO] 训练 24000, 回合 0.9136009000028446, 存储大小 50000, 损失 1.3050831512373406e-05
2019-01-01 11:13:53,536 [INFO] 目标网络已更新
2019-01-01 11:14:27,230 [INFO] 回合 180, 步数 703, 奖励 100.0, 总步数 96437
2019-01-01 11:14:46,153 [INFO] 训练 24100, 回合 0.9132409000028564, 存储大小 50000, 损失 1.308241280639777e-05
2019-01-01 11:15:40,715 [INFO] 训练 24200, 回合 0.9128809000028683, 存储大小 50000, 损失 2.701804623939097e-05
2019-01-01 11:15:43,905 [INFO] 回合 181, 步数 570, 奖励 120.0, 总步数 97008
2019-01-01 11:16:35,659 [INFO] 训练 24300, 回合 0.9125209000028801, 存储大小 50000, 损失 0.0018

测试

In [8]:
test_agent = DQNAgent(env, input_shape=input_shape, load_path=save_path)
test_episode_rewards = test(env, test_agent, episodes=test_episodes)
print('平均回合奖励 = {}'.format(np.mean(test_episode_rewards)))

2019-01-01 11:50:05,601 [INFO] 载入网络权重 ./output/SeaquestDeterministic-v4-20190124-071404/model.h5.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
permute_2 (Permute)          (None, 110, 84, 4)        0         
_________________________________________________________________
conv2d_6 (Conv2D)            (None, 26, 20, 32)        8224      
_________________________________________________________________
conv2d_7 (Conv2D)            (None, 12, 9, 64)         32832     
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 10, 7, 64)         36928     
_________________________________________________________________
flatten_2 (Flatten)          (None, 4480)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 512)               2294272   
____________________________________________