# Atari 游戏 PongDeterministic-v4

In [1]:
%matplotlib inline
import os
import sys
import time
import itertools
import logging

import numpy as np
np.random.seed(0)
import pandas as pd
import gym
import tensorflow as tf
from tensorflow import keras
from PIL import Image
import matplotlib.pyplot as plt

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG,
        format='%(asctime)s [%(levelname)s] %(message)s')

2019-01-01 07:13:21,998 [DEBUG] Loaded backend module://ipykernel.pylab.backend_inline version unknown.


In [2]:
# env_spec_id = 'BreakoutDeterministic-v4'
env_spec_id = 'PongDeterministic-v4'
# env_spec_id = 'SeaquestDeterministic-v4'
# env_spec_id = 'SpaceInvadersDeterministic-v4'
# env_spec_id = 'BeamRiderDeterministic-v4'
env = gym.make(env_spec_id)
print('观测空间 = {}'.format(env.observation_space))
print('动作空间 = {}'.format(env.action_space))
print('回合最大步数 = {}'.format(env._max_episode_steps))
env.seed(0)

观测空间 = Box(210, 160, 3)
动作空间 = Discrete(6)
回合最大步数 = 100000


[0, 592379725]

### 深度 Q 网络智能体
经验回放

In [3]:
class DQNReplayer:
    def __init__(self, capacity):
        self.memory = pd.DataFrame(index=range(capacity),
                columns=['observation', 'action', 'reward',
                'next_observation', 'done'])
        self.i = 0
        self.count = 0
        self.capacity = capacity
    
    def store(self, *args):
        self.memory.loc[self.i] = args
        self.i = (self.i + 1) % self.capacity
        self.count = min(self.count + 1, self.capacity)
        
    def sample(self, size):
        indices = np.random.choice(self.count, size=size)
        return tuple(np.stack(self.memory.loc[indices, field]) for \
                field in self.memory.columns)

智能体

In [4]:
class DQNAgent:
    def __init__(self, env, input_shape, learning_rate=0.00025,
            load_path=None, gamma=0.99,
            replay_memory_size=1000000, batch_size=32,
            replay_start_size=0,
            epsilon=1., epsilon_decrease_rate=9e-7, min_epsilon=0.1,
            random_initial_steps=0,
            clip_reward=True, rescale_state=True,
            update_freq=1, target_network_update_freq=1):
        
        self.action_n = env.action_space.n
        self.gamma = gamma
        
        # 经验回放参数
        self.replay_memory_size = replay_memory_size
        self.replay_start_size = replay_start_size
        self.batch_size = batch_size
        self.replayer = DQNReplayer(replay_memory_size)
        
        # 图像输入参数
        self.img_shape = (input_shape[-1], input_shape[-2])
        self.img_stack = input_shape[-3]
        
        # 探索参数
        self.epsilon = epsilon
        self.epsilon_decrease_rate = epsilon_decrease_rate
        self.min_epsilon = min_epsilon
        self.random_initial_steps = random_initial_steps
        
        self.clip_reward = clip_reward
        self.rescale_state = rescale_state
        
        self.update_freq = update_freq
        self.target_network_update_freq = target_network_update_freq
        
        # 评估网络
        self.evaluate_net = self.build_network(
                input_shape=input_shape, output_size=self.action_n,
                conv_activation=tf.nn.relu,
                fc_hidden_sizes=[512,], fc_activation=tf.nn.relu,
                learning_rate=learning_rate, load_path=load_path)
        self.evaluate_net.summary() # 输出网络结构
        # 目标网络
        self.target_net = self.build_network(
                input_shape=input_shape, output_size=self.action_n,
                conv_activation=tf.nn.relu,
                fc_hidden_sizes=[512,], fc_activation=tf.nn.relu,
                )
        self.update_target_network()
        
        # 初始化计数值
        self.step = 0
        self.fit_count = 0


    def build_network(self, input_shape, output_size, conv_activation,
            fc_hidden_sizes, fc_activation, output_activation=None,
            learning_rate=0.001, load_path=None):
        # 网络输入格式: (样本, 通道, 行, 列)
        model = keras.models.Sequential()
        # tf 要求从 (样本, 通道, 行, 列) 改为 (样本, 行, 列, 通道)
        model.add(keras.layers.Permute((2, 3, 1), input_shape=input_shape))
        
        # 卷积层
        model.add(keras.layers.Conv2D(32, 8, strides=4,
                activation=conv_activation))
        model.add(keras.layers.Conv2D(64, 4, strides=2,
                activation=conv_activation))
        model.add(keras.layers.Conv2D(64, 3, strides=1,
                activation=conv_activation))
        
        model.add(keras.layers.Flatten())
        
        # 全连接层
        for hidden_size in fc_hidden_sizes:
            model.add(keras.layers.Dense(hidden_size,
                    activation=fc_activation))
        model.add(keras.layers.Dense(output_size,
                activation=output_activation))

        if load_path is not None:
            logging.info('载入网络权重 {}.'.format(load_path))
            model.load_weights(load_path)

        try: # tf2
            optimizer = keras.optimizers.RMSprop(learning_rate, 0.95,
                    momentum=0.95, epsilon=0.01)
        except: # tf1
            optimizer = tf.train.RMSPropOptimizer(learning_rate, 0.95,
                    momentum=0.95, epsilon=0.01)
        model.compile(loss=keras.losses.mse, optimizer=optimizer)
        return model
        
    def get_next_state(self, state=None, observation=None):
        img = Image.fromarray(observation, 'RGB') 
        img = img.resize(self.img_shape).convert('L') # 改大小,变灰度
        img = np.asarray(img.getdata(), dtype=np.uint8
                ).reshape(img.size[1], img.size[0]) # 转成 np.array
        
        # 堆叠图像
        if state is None:
            next_state = np.array([img,] * self.img_stack) # 初始化
        else:
            next_state = np.append(state[1:], [img,], axis=0)
        return next_state
    
    def decide(self, state, test=False, step=None):
        if step is not None and step < self.random_initial_steps:
            epsilon = 1.
        elif test:
            epsilon = 0.05
        else:
            epsilon = self.epsilon
        if np.random.rand() < epsilon:
            action = np.random.choice(self.action_n)
        else:
            if self.rescale_state:
                state = state / 128. - 1.
            q_values = self.evaluate_net.predict(state[np.newaxis])[0]
            action = np.argmax(q_values)
        return action

    def learn(self, state, action, reward, next_state, done):
        self.replayer.store(state, action, reward, next_state, done)

        self.step += 1
        
        if self.step % self.update_freq == 0 and \
                self.replayer.count >= self.replay_start_size:
            states, actions, rewards, next_states, dones = \
                    self.replayer.sample(self.batch_size) # 回放

            if self.rescale_state:
                states = states / 128. - 1.
                next_states = next_states / 128. - 1.
            if self.clip_reward:
                rewards = np.clip(rewards, -1., 1.)
            
            next_qs = self.target_net.predict(next_states)
            next_max_qs = next_qs.max(axis=-1)
            targets = self.evaluate_net.predict(states)
            targets[range(self.batch_size), actions] = rewards + \
                    self.gamma * next_max_qs * (1. - dones)

            h = self.evaluate_net.fit(states, targets, verbose=0)
            self.fit_count += 1
            
            if self.fit_count % 100 == 0:
                logging.info('训练 {}, 回合 {}, 存储大小 {}, 损失 {}' \
                        .format(self.fit_count, self.epsilon,
                        self.replayer.count, h.history['loss'][0]))
            
            if self.fit_count % self.target_network_update_freq == 0:
                self.update_target_network()
        
        # 更新 epsilon 的值：线性下降
        if self.step >= self.replay_start_size:
            self.epsilon = max(self.epsilon - self.epsilon_decrease_rate,
                               self.min_epsilon)

    def update_target_network(self): # 更新目标网络
        self.target_net.set_weights(self.evaluate_net.get_weights())
        logging.info('目标网络已更新')

    def save_network(self, path): # 保存网络
        dirname = os.path.dirname(save_path)
        if not os.path.exists(dirname):
            os.makedirs(dirname)
            logging.info('创建文件夹 {}'.format(dirname))
        self.evaluate_net.save_weights(path)
        logging.info('网络权重已保存 {}'.format(path))

测试

In [5]:
def test(env, agent, episodes=100, render=False, verbose=True):
    steps, episode_rewards = [], []
    for episode in range(episodes):
        episode_reward = 0
        observation = env.reset()
        state = agent.get_next_state(None, observation)
        for step in itertools.count():
            if render:
                env.render()
            action = agent.decide(state, test=True, step=step)
            observation, reward, done, info = env.step(action)
            state = agent.get_next_state(state, observation)
            episode_reward += reward
            if done:
                break
        step += 1
        steps.append(step)
        episode_rewards.append(episode_reward)
        logging.info('[测试] 回合 {}: 步骤 {}, 奖励 {}, 步数 {}'
                .format(episode, step, episode_reward, np.sum(steps)))
            
    if verbose:
        logging.info('[测试小结] 步数: 平均 = {}, 最小 = {}, 最大 = {}.' \
                .format(np.mean(steps), np.min(steps), np.max(steps)))
        logging.info('[测试小结] 奖励: 平均 = {}, 最小 = {}, 最大 = {}' \
                .format(np.mean(episode_rewards), np.min(episode_rewards),
                np.max(episode_rewards)))
    return episode_rewards

参数设置

In [6]:
render = False
load_path = None
save_path = './output/' + env.unwrapped.spec.id + '-' + \
        time.strftime('%Y%m%d-%H%M%S') + '/model.h5'

"""
Nature 文章使用的参数, 运行极慢, 请勿轻易尝试
"""
input_shape = (4, 110, 84) # 输入网络大小
batch_size = 32
replay_memory_size = 1000000
target_network_update_freq = 10000
gamma = 0.99
update_freq = 4 # 训练网络的间隔
learning_rate = 0.00025 # 优化器学习率
epsilon = 1. # 初始探索率
min_epsilon = 0.1 # 最终探索率
epsilon_decrease = 9e-7 # 探索减小速度
replay_start_size = 50000 # 开始训练前的经验数
random_initial_steps = 30 # 每个回合开始时随机步数
frames = 50000000 # 整个算法的总训练步数
test_freq = 50000 # 验证智能体的步数间隔
test_episodes = 100 # 每次验证智能体的回合数


"""
小规模参数, 运行时间数小时, 有一点点训练效果
"""
batch_size = 32
replay_memory_size = 50000
target_network_update_freq = 4000
replay_start_size = 10000
random_initial_steps = 30
frames = 100000
test_freq = 25000
test_episodes = 50


# """
# 超小规模参数, 数分钟即可运行完, 基本没有训练效果
# """
# batch_size = 6
# replay_memory_size = 5000
# target_network_update_freq = 80
# replay_start_size = 500
# random_initial_steps = 30
# frames = 7500
# test_freq = 2500
# test_episodes = 10

训练

In [7]:
agent = DQNAgent(env, input_shape=input_shape, batch_size=batch_size,
        replay_memory_size=replay_memory_size,
        learning_rate=learning_rate, gamma=gamma,
        epsilon=epsilon, epsilon_decrease_rate=epsilon_decrease,
        min_epsilon=min_epsilon, random_initial_steps=random_initial_steps,
        load_path=load_path,
        update_freq=update_freq,
        target_network_update_freq=target_network_update_freq)

logging.info("训练开始")

frame = 0
max_mean_episode_reward = float("-inf")
for episode in itertools.count():
    observation = env.reset()
    episode_reward = 0
    state = agent.get_next_state(None, observation)
    for step in itertools.count():
        if render:
            env.render()
        frame += 1
        action = agent.decide(state, step=step)
        observation, reward, done, _ = env.step(action)
        next_state = agent.get_next_state(state, observation)
        episode_reward += reward
        agent.learn(state, action, reward, next_state, done)
        
        # 验证
        if frame % test_freq == 0 or \
                (done and (frame + 1) % test_freq == 0):
            test_episode_rewards = test(env=env,
                    agent=agent, episodes=test_episodes, render=render)
            if max_mean_episode_reward < np.mean(test_episode_rewards):
                max_mean_episode_reward = np.mean(test_episode_rewards)
                agent.save_network(save_path)
                path = save_path[:-2] + str(agent.fit_count) + '.h5'
                agent.save_network(path)
        
        if done:
            step += 1
            frame += 1
            break
        state = next_state
    
    logging.info("回合 {}, 步数 {}, 奖励 {}, 总步数 {}".format(
            episode, step, episode_reward, frame))
    
    if frame > frames:
        break

logging.info("训练结束")

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
permute (Permute)            (None, 110, 84, 4)        0         
_________________________________________________________________
conv2d (Conv2D)              (None, 26, 20, 32)        8224      
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 12, 9, 64)         32832     
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 10, 7, 64)         36928     
_________________________________________________________________
flatten (Flatten)            (None, 4480)              0         
_________________________________________________________________
dense (Dense)                (None, 512)               2294272   
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 3078      
Total para

2019-01-01 08:11:15,941 [INFO] 目标网络已更新
2019-01-01 08:11:54,695 [INFO] 训练 8100, 回合 0.97084090000096, 存储大小 32400, 损失 0.010320146568119526
2019-01-01 08:11:55,155 [INFO] 回合 34, 步数 920, 奖励 -19.0, 总步数 32442
2019-01-01 08:12:33,435 [INFO] 训练 8200, 回合 0.9704809000009719, 存储大小 32800, 损失 0.00011695676221279427
2019-01-01 08:13:12,399 [INFO] 训练 8300, 回合 0.9701209000009837, 存储大小 33200, 损失 8.273567073047161e-05
2019-01-01 08:13:45,107 [INFO] 回合 35, 步数 1123, 奖励 -20.0, 总步数 33566
2019-01-01 08:13:52,417 [INFO] 训练 8400, 回合 0.9697609000009956, 存储大小 33600, 损失 0.005170745309442282
2019-01-01 08:14:32,865 [INFO] 训练 8500, 回合 0.9694009000010074, 存储大小 34000, 损失 0.00015319156227633357
2019-01-01 08:15:14,242 [INFO] 训练 8600, 回合 0.9690409000010193, 存储大小 34400, 损失 0.004856262821704149
2019-01-01 08:15:31,999 [INFO] 回合 36, 步数 1048, 奖励 -21.0, 总步数 34615
2019-01-01 08:15:54,264 [INFO] 训练 8700, 回合 0.9686809000010311, 存储大小 34800, 损失 0.005102431867271662
2019-01-01 08:16:34,705 [INFO] 训练 8800, 回合 0.968320900001043, 存储大

2019-01-01 09:08:19,604 [INFO] [测试] 回合 35: 步骤 810, 奖励 -21.0, 步数 29617
2019-01-01 09:08:58,718 [INFO] [测试] 回合 36: 步骤 812, 奖励 -21.0, 步数 30429
2019-01-01 09:09:39,276 [INFO] [测试] 回合 37: 步骤 837, 奖励 -20.0, 步数 31266
2019-01-01 09:10:19,009 [INFO] [测试] 回合 38: 步骤 821, 奖励 -21.0, 步数 32087
2019-01-01 09:10:59,027 [INFO] [测试] 回合 39: 步骤 820, 奖励 -21.0, 步数 32907
2019-01-01 09:11:37,161 [INFO] [测试] 回合 40: 步骤 792, 奖励 -21.0, 步数 33699
2019-01-01 09:12:15,732 [INFO] [测试] 回合 41: 步骤 838, 奖励 -20.0, 步数 34537
2019-01-01 09:12:56,635 [INFO] [测试] 回合 42: 步骤 837, 奖励 -20.0, 步数 35374
2019-01-01 09:13:35,522 [INFO] [测试] 回合 43: 步骤 783, 奖励 -21.0, 步数 36157
2019-01-01 09:14:12,259 [INFO] [测试] 回合 44: 步骤 764, 奖励 -21.0, 步数 36921
2019-01-01 09:14:51,713 [INFO] [测试] 回合 45: 步骤 812, 奖励 -21.0, 步数 37733
2019-01-01 09:15:31,907 [INFO] [测试] 回合 46: 步骤 837, 奖励 -20.0, 步数 38570
2019-01-01 09:16:10,127 [INFO] [测试] 回合 47: 步骤 810, 奖励 -21.0, 步数 39380
2019-01-01 09:16:50,934 [INFO] [测试] 回合 48: 步骤 837, 奖励 -20.0, 步数 40217
2019-01-01 09:17:36,

2019-01-01 10:05:08,106 [INFO] 回合 77, 步数 948, 奖励 -21.0, 总步数 71131
2019-01-01 10:05:27,398 [INFO] 训练 17800, 回合 0.9359209000021097, 存储大小 50000, 损失 0.0001252963556908071
2019-01-01 10:06:20,220 [INFO] 训练 17900, 回合 0.9355609000021216, 存储大小 50000, 损失 0.0001242768339579925
2019-01-01 10:06:49,322 [INFO] 回合 78, 步数 764, 奖励 -21.0, 总步数 71896
2019-01-01 10:07:14,489 [INFO] 训练 18000, 回合 0.9352009000021334, 存储大小 50000, 损失 0.00021455474779941142
2019-01-01 10:08:08,797 [INFO] 训练 18100, 回合 0.9348409000021453, 存储大小 50000, 损失 0.00480900751426816
2019-01-01 10:08:50,656 [INFO] 回合 79, 步数 884, 奖励 -21.0, 总步数 72781
2019-01-01 10:09:03,674 [INFO] 训练 18200, 回合 0.9344809000021571, 存储大小 50000, 损失 0.00011050832108594477
2019-01-01 10:09:58,157 [INFO] 训练 18300, 回合 0.934120900002169, 存储大小 50000, 损失 0.0048782615922391415
2019-01-01 10:10:52,101 [INFO] 训练 18400, 回合 0.9337609000021808, 存储大小 50000, 损失 0.00011736399756046012
2019-01-01 10:10:57,962 [INFO] 回合 80, 步数 939, 奖励 -19.0, 总步数 73721
2019-01-01 10:11:48,371 [INFO

2019-01-01 11:11:47,293 [INFO] 训练 21300, 回合 0.9233209000025245, 存储大小 50000, 损失 0.00011355702736182138
2019-01-01 11:12:17,971 [INFO] 回合 93, 步数 900, 奖励 -21.0, 总步数 85529
2019-01-01 11:12:39,880 [INFO] 训练 21400, 回合 0.9229609000025364, 存储大小 50000, 损失 0.0048707230016589165
2019-01-01 11:13:31,455 [INFO] 训练 21500, 回合 0.9226009000025482, 存储大小 50000, 损失 0.004719461314380169
2019-01-01 11:14:11,380 [INFO] 回合 94, 步数 869, 奖励 -21.0, 总步数 86399
2019-01-01 11:14:24,044 [INFO] 训练 21600, 回合 0.9222409000025601, 存储大小 50000, 损失 0.00017177779227495193
2019-01-01 11:15:16,487 [INFO] 训练 21700, 回合 0.921880900002572, 存储大小 50000, 损失 0.0001753100223140791
2019-01-01 11:15:54,063 [INFO] 回合 95, 步数 783, 奖励 -21.0, 总步数 87183
2019-01-01 11:16:10,823 [INFO] 训练 21800, 回合 0.9215209000025838, 存储大小 50000, 损失 0.013821765780448914
2019-01-01 11:17:04,187 [INFO] 训练 21900, 回合 0.9211609000025957, 存储大小 50000, 损失 0.004503559786826372
2019-01-01 11:17:56,868 [INFO] 训练 22000, 回合 0.9208009000026075, 存储大小 50000, 损失 9.02197789400816e-

2019-01-01 12:09:11,065 [INFO] [测试] 回合 48: 步骤 764, 奖励 -21.0, 步数 38535
2019-01-01 12:09:54,186 [INFO] [测试] 回合 49: 步骤 967, 奖励 -19.0, 步数 39502
2019-01-01 12:09:54,190 [INFO] [测试小结] 步数: 平均 = 790.04, 最小 = 764, 最大 = 967.
2019-01-01 12:09:54,202 [INFO] [测试小结] 奖励: 平均 = -20.82, 最小 = -21.0, 最大 = -19.0
2019-01-01 12:09:54,276 [INFO] 回合 110, 步数 25, 奖励 0.0, 总步数 100002
2019-01-01 12:09:54,284 [INFO] 训练结束


测试

In [8]:
test_agent = DQNAgent(env, input_shape=input_shape, load_path=save_path)
test_episode_rewards = test(env, test_agent, episodes=test_episodes)
print('平均回合奖励 = {}'.format(np.mean(test_episode_rewards)))

2019-01-01 12:09:54,897 [INFO] 载入网络权重 ./output/PongDeterministic-v4-20190124-071322/model.h5.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
permute_2 (Permute)          (None, 110, 84, 4)        0         
_________________________________________________________________
conv2d_6 (Conv2D)            (None, 26, 20, 32)        8224      
_________________________________________________________________
conv2d_7 (Conv2D)            (None, 12, 9, 64)         32832     
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 10, 7, 64)         36928     
_________________________________________________________________
flatten_2 (Flatten)          (None, 4480)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 512)               2294272   
________________________________________________