In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from utils import ReplayBuffer, LinearSchedule
import gym
import pickle

from logger import Logger

  return f(*args, **kwds)


In [2]:
class FCNet(nn.Module):
    """Two-layer fully connected Q-network: 4 inputs -> 64 hidden units -> 2 outputs.

    Args:
        dims: Accepted for interface compatibility only. The layer sizes are
            fixed below and this argument is never read.
    """

    def __init__(self, dims):
        super().__init__()
        # Layer widths are fixed; `dims` does not influence them.
        n_inputs, n_hidden, n_outputs = 4, 64, 2
        self.fc1 = nn.Linear(n_inputs, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_outputs)

    def forward(self, x):
        """Return one row of 2 raw (un-activated) outputs per 4-element input row.

        The input is reshaped to (-1, 4) first, so any tensor whose element
        count is a multiple of 4 is accepted.
        """
        flat = x.view(-1, self.fc1.in_features)
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)

In [3]:
class DQNAgent():
    """Double-DQN agent holding an online network, a target network and an optimizer.

    The online network is trained with Adam (lr=1e-3) on an MSE Bellman loss;
    the target network only changes when `update_target()` copies the online
    weights into it.

    Args:
        args: Hyperparameter object. Only `args.discount` is read by this class.
        env: Environment handle. It is stored on the instance and not used by
            any method of this class.
    """

    def __init__(self, args, env):
        self.args = args
        # NOTE(review): FCNet as defined in this notebook ignores the dims list,
        # so both networks get FCNet's fixed layer sizes.
        self.online_network = FCNet([4, 50, 200, 2])
        self.target_network = FCNet([4, 50, 200, 2])
        self.total_steps = 0
        self.history_buffer = None
        self.env = env
        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.online_network.parameters(), lr=1e-3)

    def save(self, file_name):
        """Write the online network's state dict to `file_name` via `torch.save`."""
        torch.save(self.online_network.state_dict(), file_name)

    def update_target(self):
        """Copy the online network's weights into the target network."""
        self.target_network.load_state_dict(self.online_network.state_dict())

    def test_act(self, ob_var, eps, deterministic=False):
        """Choose an epsilon-greedy action for the first observation in `ob_var`.

        Args:
            ob_var: Observation tensor; reshaped to (-1, 4) before the forward pass.
            eps: Probability of replacing the greedy action with a uniformly
                random one.
            deterministic: When True, skip exploration and return the greedy
                action regardless of `eps`.

        Returns:
            Integer action index for the first row of the reshaped batch.
        """
        ob_var = ob_var.view(-1, 4)
        q_out = self.online_network(ob_var)
        _, greedy_actions = q_out.max(1)
        greedy_actions = greedy_actions.data

        if deterministic:
            return greedy_actions.cpu().numpy().astype(int).reshape(-1)[0]

        batch_size = ob_var.size(0)
        # Sample over however many actions the network outputs.
        n_actions = q_out.size(1)
        random_actions = torch.LongTensor(batch_size).random_(0, n_actions)
        choose_random = torch.Tensor(batch_size).uniform_(0, 1) < eps

        # Zero the entry that was not chosen in each row, then add: every row
        # ends up with either its greedy or its random action.
        greedy_actions[choose_random == 1] = 0
        random_actions[choose_random == 0] = 0
        stochastic_actions = greedy_actions + random_actions
        out = stochastic_actions.cpu().numpy().astype(int).reshape(-1)
        return out[0]

    def act(self, ob_var, eps, deterministic=False):
        """Choose an epsilon-greedy action for a single observation using numpy.

        Args:
            ob_var: Observation tensor passed directly to the online network.
            eps: Probability of returning a uniformly random action.
            deterministic: When True, always return the greedy action.

        Returns:
            Integer index of the chosen action.
        """
        action_value = self.online_network(ob_var)
        action_value = action_value.cpu().data.numpy().flatten()

        if not deterministic and np.random.rand() < eps:
            return np.random.randint(0, len(action_value))
        return np.argmax(action_value)

    def _learn_step(self, obs, actions, rewards, next_obs, dones):
        """Run one Double-DQN gradient step on a batch of transitions.

        Predictions and targets are both kept as (batch, 1) columns so the MSE
        is computed element-wise rather than broadcast to (batch, batch).
        """
        obs = Variable(torch.from_numpy(np.asarray(obs)).type(torch.FloatTensor)).view(-1, 4)
        next_obs = Variable(torch.from_numpy(np.asarray(next_obs)).type(torch.FloatTensor)).view(-1, 4)
        dones = Variable(torch.from_numpy(np.asarray(dones).astype(float)).type(torch.FloatTensor)).view(-1, 1)
        rewards = Variable(torch.from_numpy(np.asarray(rewards)).type(torch.FloatTensor)).view(-1, 1)
        actions = Variable(torch.from_numpy(np.asarray(actions).astype(int)).type(torch.LongTensor)).view(-1, 1)

        # Q(s, a) for the actions that were actually taken.
        q_selected = self.online_network(obs).gather(1, actions)

        # Double Q: the online network picks the next action, the target
        # network evaluates it. Both are detached so no gradient flows
        # through the target side.
        next_q_online = self.online_network(next_obs).detach()
        next_q_target = self.target_network(next_obs).detach()
        _, best_next_actions = next_q_online.max(1)
        next_q_best = next_q_target.gather(1, best_next_actions.view(-1, 1))

        # Bellman target; the bootstrap term is zeroed where the episode ended.
        next_q_best_masked = (1.0 - dones) * next_q_best
        td_target = rewards + self.args.discount * next_q_best_masked

        loss = self.criterion(q_selected, td_target)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def test_update(self, obs, actions, rewards, next_obs, dones, weights):
        """Perform one Double-DQN optimisation step on a sampled batch.

        Args:
            obs: Array of observations, reshaped to (-1, 4).
            actions: Array of action indices taken in `obs`.
            rewards: Array of rewards received.
            next_obs: Array of successor observations, reshaped to (-1, 4).
            dones: Array of episode-termination flags (truthy where the
                transition ended an episode).
            weights: Accepted for interface compatibility; currently ignored.
        """
        self._learn_step(obs, actions, rewards, next_obs, dones)

    def update(self, obs, actions, rewards, next_obs, dones, weights):
        """Perform one Double-DQN optimisation step on a sampled batch.

        Takes the same arguments as `test_update` and performs the same
        computation; `weights` is likewise ignored.
        """
        self._learn_step(obs, actions, rewards, next_obs, dones)

In [4]:
class Params():
    """Hyperparameter container for the DQN training run.

    Every attribute gets the default listed in `__init__`. Any of them can be
    overridden by keyword at construction, e.g. `Params(batch_size=64)`.

    Raises:
        TypeError: If a keyword does not name an existing hyperparameter.
    """

    def __init__(self, **overrides):
        # Learning target
        self.discount = 0.99
        self.double_q = True

        # Evaluation settings (not read anywhere in the visible notebook)
        self.test_interval = 100
        self.test_repetitions = 50
        self.success_threshold = 195

        # Replay buffer and optimisation schedule
        self.batch_size = 32
        self.buffer_size = 50000
        self.exploration_steps = 1000
        self.max_timesteps = 100000
        self.exploration_fraction = 0.1
        self.exploration_final_eps = 0.02
        self.learning_starts = 1000
        self.train_freq = 1
        self.target_network_update_freq = 500
        self.print_freq = 10

        # Apply overrides last and reject unknown names so that a typo cannot
        # silently create a new, unused attribute.
        for name, value in overrides.items():
            if not hasattr(self, name):
                raise TypeError('Unknown hyperparameter: {!r}'.format(name))
            setattr(self, name, value)

In [5]:
# Training script: builds the hyperparameters, environment and agent, then
# runs an epsilon-greedy DQN training loop for `args.max_timesteps` steps.
# There is no early-stopping condition in this loop.
args = Params()
env = gym.make('CartPole-v0')
agent = DQNAgent(args, env)

# logger.configure('./log', ['csv', 'stdout'])
# Exploration schedule queried once per step with the global step index.
# NOTE(review): LinearSchedule comes from `utils` (not visible here); presumably
# it anneals from initial_p to final_p over schedule_timesteps -- confirm.
exploration = LinearSchedule(schedule_timesteps=int(args.exploration_fraction * args.max_timesteps),
                                    initial_p=1.0,
                                    final_p=args.exploration_final_eps)
# NOTE(review): ReplayBuffer also comes from `utils`; its capacity/eviction
# behaviour is not visible here.
replay_buffer = ReplayBuffer(args.buffer_size)
# Running return of each episode; the last entry is the episode in progress.
episode_rewards = [0.0]
# `saved_mean_reward` and `reset` are assigned but never read in this cell.
saved_mean_reward = None
reset = True
ob = env.reset()

# Copy params to target network
agent.update_target()

for t in range(args.max_timesteps):
    # Current exploration rate for this step.
    update_eps = exploration.value(t)
    # Assigned but never read in this cell.
    update_param_noise_threshold = 0.

    ob_var = Variable(torch.from_numpy(ob).type(torch.FloatTensor))
    # action = agent.act(ob_var, update_eps)
    action = agent.test_act(ob_var, update_eps)

    reset = False
    new_ob, rew, done, _ = env.step(action)
    # Store transition in the replay buffer.
    replay_buffer.add(ob, action, rew, new_ob, float(done))
    ob = new_ob

    episode_rewards[-1] += rew
    if done:
        # Start a new episode and open a fresh slot for its return.
        ob = env.reset()
        episode_rewards.append(0.0)
        reset = True

    # Training begins strictly after `learning_starts` steps and then runs
    # every `train_freq` steps.
    if t > args.learning_starts and t % args.train_freq == 0:
        obs, actions, rewards, next_obs, dones = replay_buffer.sample(args.batch_size)
        # Uniform weights; `batch_idxes` is never read, and the agent's update
        # methods accept `weights` but do not use it.
        weights, batch_idxes = np.ones_like(rewards), None
        # Do training
        # agent.update(obs, actions, rewards, next_obs, dones, weights)
        agent.test_update(obs, actions, rewards, next_obs, dones, weights)


    if t > args.learning_starts and t % args.target_network_update_freq == 0:
        # Update target network periodically.
        agent.update_target()

    # Report whenever an episode has just ended and the length of
    # `episode_rewards` is a multiple of `print_freq`.
    if done and args.print_freq is not None and len(episode_rewards) % args.print_freq == 0:
        # Mean over up to the 100 most recent finished episodes; the slice
        # excludes the 0.0 placeholder just appended for the new episode.
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        # Count includes the placeholder entry for the episode that just started.
        num_episodes = len(episode_rewards)
        print('Step {} episode {} mean 100ep reward {}'.format(t, num_episodes, mean_100ep_reward))


[2017-12-10 20:09:41,356] Making new env: CartPole-v0


Step 197 episode 10 mean 100ep reward 22.0
Step 426 episode 20 mean 100ep reward 22.5
Step 638 episode 30 mean 100ep reward 22.0
Step 872 episode 40 mean 100ep reward 22.4
Step 1068 episode 50 mean 100ep reward 21.8
Step 1223 episode 60 mean 100ep reward 20.7
Step 1417 episode 70 mean 100ep reward 20.6
Step 1599 episode 80 mean 100ep reward 20.3
Step 1811 episode 90 mean 100ep reward 20.4
Step 2048 episode 100 mean 100ep reward 20.7
Step 2239 episode 110 mean 100ep reward 20.4
Step 2461 episode 120 mean 100ep reward 20.4
Step 2750 episode 130 mean 100ep reward 21.1
Step 3101 episode 140 mean 100ep reward 22.3
Step 3427 episode 150 mean 100ep reward 23.6
Step 3763 episode 160 mean 100ep reward 25.4
Step 4224 episode 170 mean 100ep reward 28.1
Step 5171 episode 180 mean 100ep reward 35.7
Step 6675 episode 190 mean 100ep reward 48.6
Step 8450 episode 200 mean 100ep reward 64.0
Step 10401 episode 210 mean 100ep reward 81.6
Step 12396 episode 220 mean 100ep reward 99.4
Step 14396 episode 23

KeyboardInterrupt: 