In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from model import NoisyDuelingMLP
from replay_buffer import PrioritizedReplayBuffer
from baselines.common.schedules import LinearSchedule
import gym

In [2]:
class DDQNAgent:
    """Double-DQN agent with an online and a target NoisyDuelingMLP network.

    The online network (``self.model``) is trained with Adam on a weighted
    Huber loss; the target network (``self.target_model``) is only refreshed
    through ``update_target``.
    """

    # Sizes handed to NoisyDuelingMLP. Observations are reshaped to
    # (-1, OBS_DIM) before every forward pass. The second value is presumably
    # the number of discrete actions (one Q-value per action) -- confirm
    # against model.NoisyDuelingMLP.
    OBS_DIM = 4
    N_ACTIONS = 2

    def __init__(self, args, env):
        """Build both networks, the per-sample Huber loss and the optimizer.

        Args:
            args: hyper-parameter object; ``args.sigma_init`` is read here and
                ``args.gamma`` is read in ``update``.
            env: environment handle; only stored on the instance.
        """
        self.args = args
        self.dtype = torch.FloatTensor
        self.atype = torch.LongTensor

        self.model = NoisyDuelingMLP(self.OBS_DIM, self.N_ACTIONS, self.dtype, args.sigma_init)
        self.target_model = NoisyDuelingMLP(self.OBS_DIM, self.N_ACTIONS, self.dtype, args.sigma_init)
        self.env = env
        # Unreduced loss: one value per sample so that each sample can be
        # scaled by its own importance weight in ``update``.
        self.huber_loss = nn.SmoothL1Loss(reduce=False)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-3)

    def sample_noise(self):
        """Call ``sample_noise()`` on both the online and the target network."""
        self.model.sample_noise()
        self.target_model.sample_noise()

    def reset_noise(self):
        """Call ``reset_noise()`` on both the online and the target network."""
        self.model.reset_noise()
        self.target_model.reset_noise()

    def update_target(self):
        """Copy the online network's state dict into the target network."""
        self.target_model.load_state_dict(self.model.state_dict())

    def act(self, ob):
        """Return the greedy action (argmax of the online Q-values) for ``ob``.

        Args:
            ob: numpy observation; reshaped to ``(-1, OBS_DIM)``.

        Returns:
            The action index of the first row, as ``numpy.int32``.
        """
        ob_var = Variable(torch.from_numpy(ob).type(self.dtype)).view(-1, self.OBS_DIM)
        q_out = self.model(ob_var)
        _, greedy_actions = q_out.data.max(1)
        return greedy_actions.cpu().numpy().astype(np.int32).reshape(-1)[0]

    def _as_column(self, array, tensor_type):
        """Convert a numpy array to a ``(B, 1)`` tensor of ``tensor_type``."""
        return Variable(torch.from_numpy(array).type(tensor_type)).view(-1, 1)

    def update(self, obs, actions, rewards, next_obs, dones, weights):
        """Run one Double-DQN gradient step on a batch of transitions.

        All arguments are numpy arrays with one entry (or row) per transition.

        Args:
            obs: observations; reshaped to ``(B, OBS_DIM)``.
            actions: action indices taken; cast to int.
            rewards: rewards received.
            next_obs: successor observations; reshaped to ``(B, OBS_DIM)``.
            dones: terminal flags; cast to float, 1 disables bootstrapping.
            weights: per-sample loss weights (importance-sampling weights).

        Returns:
            1-D numpy array of length B holding ``Q(s, a) - target`` per sample.
        """
        obs = Variable(torch.from_numpy(obs).type(self.dtype)).view(-1, self.OBS_DIM)
        next_obs = Variable(torch.from_numpy(next_obs).type(self.dtype)).view(-1, self.OBS_DIM)
        dones = self._as_column(dones.astype(float), self.dtype)
        rewards = self._as_column(rewards, self.dtype)
        actions = self._as_column(actions.astype(int), self.atype)
        weights = self._as_column(weights, self.dtype)

        # Double DQN: the online network picks the next action, the target
        # network evaluates it. Both are detached so no gradient reaches them.
        q_next = self.target_model(next_obs).detach()
        _, best_actions = self.model(next_obs).detach().max(1)
        q_next_best = q_next.gather(1, best_actions.view(-1, 1))
        target = rewards + self.args.gamma * q_next_best * (1 - dones)

        # Keep q as a (B, 1) column, the same shape as target and weights.
        # Squeezing it to (B,) would broadcast the loss (or the weighting) to
        # (B, B), mixing every sample with every other sample.
        q = self.model(obs).gather(1, actions)

        td_errors = q.data - target.data
        errors = self.huber_loss(q, target)
        weighted_error = (weights * errors).mean()

        # Step optimizer
        self.optimizer.zero_grad()
        weighted_error.backward()
        self.optimizer.step()

        return td_errors.numpy().flatten()

In [3]:
class Params:
    """Hyper-parameter container for the training run.

    Every setting is a plain instance attribute initialised to its default
    value, so it can be tweaked on the instance before training starts.
    """

    def __init__(self):
        defaults = (
            # Prioritized replay params
            ('prioritized_replay', True),
            ('prioritized_replay_alpha', 0.5),
            ('prioritized_replay_beta0', 0.4),
            ('prioritized_replay_beta_iters', None),
            ('prioritized_replay_eps', 1e-6),
            # Discount factor and initial noise scale for the networks
            ('gamma', 0.99),
            ('sigma_init', 0.17),
            # Training-loop settings
            ('batch_size', 32),
            ('buffer_size', 50000),
            ('exploration_steps', 1000),
            ('max_timesteps', 100000),
            ('exploration_fraction', 0.1),
            ('exploration_final_eps', 0.02),
            ('learning_starts', 1000),
            ('train_freq', 1),
            ('target_network_update_freq', 500),
            ('print_freq', 10),
        )
        for attr_name, attr_value in defaults:
            setattr(self, attr_name, attr_value)

In [4]:
# Build the hyper-parameters, the environment and the agent.
args = Params()
env = gym.make('CartPole-v0')
agent = DDQNAgent(args, env)

# Prioritized replay buffer; beta runs from prioritized_replay_beta0 to 1.0
# along a LinearSchedule.
replay_buffer = PrioritizedReplayBuffer(args.buffer_size, alpha=args.prioritized_replay_alpha)
# Only fall back to the full run length when no schedule length was configured,
# so an explicit prioritized_replay_beta_iters is honoured.
if args.prioritized_replay_beta_iters is None:
    args.prioritized_replay_beta_iters = args.max_timesteps
beta_schedule = LinearSchedule(args.prioritized_replay_beta_iters,
                               initial_p=args.prioritized_replay_beta0,
                               final_p=1.0)

episode_rewards = [0.0]
# Not read anywhere in this cell; kept in case a later cell relies on it.
saved_mean_reward = None
agent.sample_noise()
agent.update_target()
ob = env.reset()

for t in range(args.max_timesteps):
    action = agent.act(ob)

    new_ob, rew, done, _ = env.step(action)
    # Store transition in the replay buffer.
    replay_buffer.add(ob, action, rew, new_ob, float(done))
    ob = new_ob

    episode_rewards[-1] += rew
    if done:
        ob = env.reset()
        episode_rewards.append(0.0)

    if t > args.learning_starts and t % args.train_freq == 0:
        experience = replay_buffer.sample(args.batch_size, beta=beta_schedule.value(t))
        (obs, actions, rewards, obs_next, dones, weights, batch_idxes) = experience
        agent.sample_noise()
        td_errors = agent.update(obs, actions, rewards, obs_next, dones, weights)
        # New priority = |TD error| + configured epsilon, so no transition
        # ends up with a priority of exactly zero.
        new_priorities = np.abs(td_errors) + args.prioritized_replay_eps
        replay_buffer.update_priorities(batch_idxes, new_priorities)

    if t > args.learning_starts and t % args.target_network_update_freq == 0:
        # Update target network periodically.
        agent.update_target()

    if done and args.print_freq is not None and len(episode_rewards) % args.print_freq == 0:
        # The last list entry is the just-started episode, so it is excluded.
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        print('Step {} episode {} mean 100ep reward {}'.format(t, num_episodes, mean_100ep_reward))
    

[2017-12-15 06:30:00,346] Making new env: CartPole-v0


Step 84 episode 10 mean 100ep reward 9.4
Step 176 episode 20 mean 100ep reward 9.3
Step 272 episode 30 mean 100ep reward 9.4
Step 365 episode 40 mean 100ep reward 9.4
Step 458 episode 50 mean 100ep reward 9.4
Step 552 episode 60 mean 100ep reward 9.4
Step 645 episode 70 mean 100ep reward 9.4
Step 738 episode 80 mean 100ep reward 9.4
Step 829 episode 90 mean 100ep reward 9.3
Step 927 episode 100 mean 100ep reward 9.4
Step 1035 episode 110 mean 100ep reward 9.5
Step 1141 episode 120 mean 100ep reward 9.6
Step 1242 episode 130 mean 100ep reward 9.7
Step 1341 episode 140 mean 100ep reward 9.8
Step 1447 episode 150 mean 100ep reward 9.9
Step 1548 episode 160 mean 100ep reward 10.0
Step 1646 episode 170 mean 100ep reward 10.0
Step 1749 episode 180 mean 100ep reward 10.1
Step 1855 episode 190 mean 100ep reward 10.3
Step 1955 episode 200 mean 100ep reward 10.3
Step 2056 episode 210 mean 100ep reward 10.2
Step 2155 episode 220 mean 100ep reward 10.1
Step 2252 episode 230 mean 100ep reward 10.1


KeyboardInterrupt: 