## 定义模型



In [15]:
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import parl

class CartpoleModel(parl.Model):
    """Fully connected Q-network for the Cartpole problem.

    The observation passes through two hidden layers of 128 units, each
    followed by a ReLU, and then through a final linear layer that produces
    one output per action.

    Args:
        n_states (int): Dimension of observation space.
        n_actions (int): Dimension of action space.
    """

    def __init__(self, n_states, n_actions):
        super().__init__()
        hidden_sizes = (128, 128)
        # Layers are created in input-to-output order.
        self.fc1 = nn.Linear(n_states, hidden_sizes[0])
        self.fc2 = nn.Linear(hidden_sizes[0], hidden_sizes[1])
        self.fc3 = nn.Linear(hidden_sizes[1], n_actions)

    def forward(self, obs):
        """Map an observation tensor to the network's per-action outputs.

        Args:
            obs: input accepted by the first linear layer.
        Returns:
            Output of the last linear layer (no activation applied).
        """
        hidden = F.relu(self.fc1(obs))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

In [16]:
import parl
import paddle
import numpy as np


class CartpoleAgent(parl.Agent):
    """Epsilon-greedy agent of Cartpole env.

    Args:
        algorithm(parl.Algorithm): algorithm used to solve the problem.
        n_actions(int): number of discrete actions to choose from.
        e_greed(float): initial probability of picking a random action in
            `sample`.
        e_greed_decrement(float): amount subtracted from `e_greed` after each
            `sample` call; `e_greed` is never lowered below 0.01.
    """

    def __init__(self, algorithm, n_actions, e_greed=0.1, e_greed_decrement=0):
        super(CartpoleAgent, self).__init__(algorithm)
        # The action count may arrive as a NumPy integer (for example when it
        # is read from a space object), so accept those too and store an int.
        assert isinstance(n_actions, (int, np.integer))
        self.n_actions = int(n_actions)

        # Number of `learn` calls so far; drives the target-network sync.
        self.global_step = 0
        self.update_target_steps = 200

        self.e_greed = e_greed
        self.e_greed_decrement = e_greed_decrement

    def sample(self, obs):
        """Sample an action `for exploration` when given an observation
        Args:
            obs(np.float32): shape of (n_states,)
        Returns:
            act(int): action
        """
        sample = np.random.random()
        if sample < self.e_greed:
            act = np.random.randint(self.n_actions)
        else:
            # Even on the greedy branch a random action is taken with an
            # extra 1% chance.
            if np.random.random() < 0.01:
                act = np.random.randint(self.n_actions)
            else:
                act = self.predict(obs)
        # Decay the exploration rate, keeping it at 0.01 or above.
        self.e_greed = max(0.01, self.e_greed - self.e_greed_decrement)
        return act

    def predict(self, obs):
        """Predict an action when given an observation
        Args:
            obs(np.float32): shape of (n_states,)
        Returns:
            act(int): action
        """
        obs = paddle.to_tensor(obs, dtype='float32')
        pred_q = self.alg.predict(obs)
        # argmax() without an axis may yield a 0-D tensor or a 1-element
        # tensor depending on the Paddle version; indexing with [0] fails on
        # the 0-D case, while .item() extracts the scalar from either.
        act = pred_q.argmax().numpy().item()
        return act

    def learn(self, obs, act, reward, next_obs, terminal):
        """Update model with a batch of transitions
        Args:
            obs(np.float32): shape of (batch_size, n_states)
            act(np.int32): shape of (batch_size)
            reward(np.float32): shape of (batch_size)
            next_obs(np.float32): shape of (batch_size, n_states)
            terminal(np.float32): shape of (batch_size)
        Returns:
            loss(float)
        """
        # Sync the target network every `update_target_steps` learn calls,
        # including the very first one.
        if self.global_step % self.update_target_steps == 0:
            self.alg.sync_target()
        self.global_step += 1

        # Append a trailing axis of size 1 to the per-sample vectors.
        act = np.expand_dims(act, axis=-1)
        reward = np.expand_dims(reward, axis=-1)
        terminal = np.expand_dims(terminal, axis=-1)

        obs = paddle.to_tensor(obs, dtype='float32')
        act = paddle.to_tensor(act, dtype='int32')
        reward = paddle.to_tensor(reward, dtype='float32')
        next_obs = paddle.to_tensor(next_obs, dtype='float32')
        terminal = paddle.to_tensor(terminal, dtype='float32')
        loss = self.alg.learn(obs, act, reward, next_obs, terminal)
        # .item() copes with both a 0-D and a 1-element loss tensor, whereas
        # [0] raises on a 0-D array.
        return loss.numpy().item()

In [17]:
import os
import gym
import numpy as np
import parl

from parl.utils import logger, ReplayMemory
from parl.algorithms import DQN

# Training hyperparameters shared by the functions below.
LEARN_FREQ = 5  # learn once every LEARN_FREQ steps within an episode
MEMORY_SIZE = 200_000  # capacity given to the replay memory
MEMORY_WARMUP_SIZE = 200  # transitions to collect before learning starts
BATCH_SIZE = 64  # transitions sampled per learning step
LEARNING_RATE = 5e-4  # passed to DQN as lr
GAMMA = 0.99  # passed to DQN as gamma

# train an episode
def run_train_episode(agent, env, rpm):
    """Run one exploratory episode, storing transitions and training the agent.

    Every transition is appended to `rpm`. Once `rpm` holds more than
    MEMORY_WARMUP_SIZE entries, a batch of BATCH_SIZE transitions is sampled
    and passed to `agent.learn` on every LEARN_FREQ-th step of the episode.

    Args:
        agent: object providing `sample(obs)` and `learn(...)`.
        env: environment providing `reset()` and a 4-tuple `step(action)`.
        rpm: replay memory providing `append`, `sample_batch` and `len()`.
    Returns:
        Sum of the rewards received during the episode.
    """
    obs = env.reset()
    episode_return = 0
    n_steps = 0
    done = False
    while not done:
        n_steps += 1
        action = agent.sample(obs)
        next_obs, reward, done, _ = env.step(action)
        rpm.append(obs, action, reward, next_obs, done)

        # train model
        warmed_up = len(rpm) > MEMORY_WARMUP_SIZE
        if warmed_up and n_steps % LEARN_FREQ == 0:
            # s,a,r,s',done
            (obs_b, action_b, reward_b, next_obs_b,
             done_b) = rpm.sample_batch(BATCH_SIZE)
            agent.learn(obs_b, action_b, reward_b, next_obs_b, done_b)

        episode_return += reward
        obs = next_obs
    return episode_return


# evaluate 5 episodes
def run_evaluate_episodes(agent, env, eval_episodes=5, render=False):
    """Play `eval_episodes` episodes with `agent.predict` and average the returns.

    Args:
        agent: object providing `predict(obs)`.
        env: environment providing `reset()`, a 4-tuple `step(action)` and,
            when `render` is true, `render()`.
        eval_episodes(int): number of episodes to play.
        render(bool): call `env.render()` after every step when true.
    Returns:
        Mean (via `np.mean`) of the per-episode reward sums.
    """
    episode_returns = []
    for _ in range(eval_episodes):
        obs = env.reset()
        score = 0
        finished = False
        while not finished:
            obs, reward, finished, _ = env.step(agent.predict(obs))
            score += reward
            if render:
                env.render()
        episode_returns.append(score)
    return np.mean(episode_returns)


def main(args):
    """Train a DQN agent on CartPole-v0, evaluating periodically, then save it.

    After filling the replay memory to MEMORY_WARMUP_SIZE, training runs in
    chunks of up to 50 episodes, each followed by an evaluation whose mean
    reward is logged. Training stops after exactly `args.max_episode`
    training episodes (warm-up episodes are not counted). Finally the
    parameters are saved to ./model.ckpt and an inference model to
    ./inference_model.

    Args:
        args: namespace with an integer `max_episode` attribute.
    """
    env = gym.make('CartPole-v0')
    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.n
    logger.info('n_states {}, n_actions {}'.format(n_states, n_actions))

    # set action_shape = 0 while in discrete control environment
    rpm = ReplayMemory(MEMORY_SIZE, n_states, 0)

    # build an agent
    model = CartpoleModel(n_states=n_states, n_actions=n_actions)
    alg = DQN(model, gamma=GAMMA, lr=LEARNING_RATE)
    agent = CartpoleAgent(
        alg, n_actions=n_actions, e_greed=0.1, e_greed_decrement=1e-6)

    # warmup memory
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_train_episode(agent, env, rpm)

    max_episode = args.max_episode
    # number of training episodes between two evaluations
    eval_interval = 50

    # start training
    episode = 0
    while episode < max_episode:
        # train part: cap the chunk at the remaining budget so the total
        # never exceeds max_episode when it is not a multiple of the interval
        for _ in range(min(eval_interval, max_episode - episode)):
            run_train_episode(agent, env, rpm)
            episode += 1

        # test part
        eval_reward = run_evaluate_episodes(agent, env, render=False)
        logger.info('episode:{}    e_greed:{}   Test reward:{}'.format(
            episode, agent.e_greed, eval_reward))

    # save the parameters to ./model.ckpt
    save_path = './model.ckpt'
    agent.save(save_path)

    # save the model and parameters of policy network for inference
    save_inference_path = './inference_model'
    input_shapes = [[None, env.observation_space.shape[0]]]
    input_dtypes = ['float32']
    agent.save_inference_model(save_inference_path, input_shapes, input_dtypes)



    

In [18]:
import argparse
# Command-line interface: a single option bounding the number of training
# episodes.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--max_episode', type=int, default=1000,
    help='stop condition: number of max episode')
# An explicit empty argument list makes argparse ignore sys.argv, so the
# defaults above are always used here.
args = parser.parse_args(args=[])

main(args)

[32m[08-01 21:48:19 MainThread @3996942455.py:64][0m obs_dim 4, act_dim 2
[32m[08-01 21:48:19 MainThread @3996942455.py:92][0m episode:50    e_greed:0.0988929999999989   Test reward:18.4
[32m[08-01 21:48:20 MainThread @3996942455.py:92][0m episode:100    e_greed:0.09794799999999795   Test reward:9.6
[32m[08-01 21:48:20 MainThread @3996942455.py:92][0m episode:150    e_greed:0.0973899999999974   Test reward:37.8
[32m[08-01 21:48:20 MainThread @3996942455.py:92][0m episode:200    e_greed:0.09684299999999685   Test reward:8.8
[32m[08-01 21:48:20 MainThread @3996942455.py:92][0m episode:250    e_greed:0.09635499999999636   Test reward:9.4
[32m[08-01 21:48:21 MainThread @3996942455.py:92][0m episode:300    e_greed:0.09585299999999586   Test reward:9.2
[32m[08-01 21:48:21 MainThread @3996942455.py:92][0m episode:350    e_greed:0.09535799999999536   Test reward:9.2
[32m[08-01 21:48:21 MainThread @3996942455.py:92][0m episode:400    e_greed:0.09486399999999487   Test reward:1