In [1]:
#!/usr/bin/env python3
import os
import time
import gym
import copy
import sys

import argparse
from tensorboardX import SummaryWriter
import numpy as np
import collections
from collections import namedtuple, deque

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

In [2]:
def float32_preprocessor(states):
    """
    Convert list of states into the form suitable for model. By default we assume Variable

    Args:
        states: list of numpy arrays with states

    Returns:
        cleaned variable in the form of np.float32
    """

    np_states = np.array(states, dtype=np.float32)
    return torch.tensor(np_states)


# Models

In [3]:
class DDPGActor(nn.Module):
    def __init__(self, obs_size, act_size):
        super(DDPGActor, self).__init__()

        self.net = nn.Sequential(
            nn.Linear(obs_size, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, act_size),
            nn.Tanh()
        )

    def forward(self, x):
        return self.net(x)

In [4]:
class DDPGCritic(nn.Module):
    def __init__(self, obs_size, act_size):
        super(DDPGCritic, self).__init__()

        self.obs_net = nn.Sequential(
            nn.Linear(obs_size, 128),
            nn.ReLU(),
        )

        self.out_net = nn.Sequential(
            nn.Linear(128 + act_size, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x, a):
        obs = self.obs_net(x)
        return self.out_net(torch.cat([obs, a], dim=1))


In [5]:
class TargetNetwork:
    """
    Wrapper around model which provides copy of it instead of trained weights
    """

    def __init__(self, model):
        self.model = model
        self.target_model = copy.deepcopy(model)

    def sync(self):
        self.target_model.load_state_dict(self.model.state_dict())

    def alpha_sync(self, alpha):
        """
        Blend params of target net with params from the model
        :param alpha:
        """
        assert isinstance(alpha, float)
        assert 0.0 < alpha <= 1.0
        
        state = self.model.state_dict()
        tgt_state = self.target_model.state_dict()
        for k, v in state.items():
            tgt_state[k] = tgt_state[k] * alpha + (1 - alpha) * v

        self.target_model.load_state_dict(tgt_state)

# Memory

In [6]:
class ReplayMemory:
    def __init__(self, state_dims, action_dims, buffer=1000000, min_buffer=50000, batch=64):
        self.buffer_size = buffer
        self.min_buffer_size = min_buffer
        self.state_dims = state_dims
        self.action_dims = action_dims
        self.batch_size = batch
        self.count = 0
        self.current = 0

        # preallocate memory
        self.actions = np.empty((self.buffer_size,) + self.action_dims, dtype = np.float32)
        self.rewards = np.empty(self.buffer_size, dtype = np.float32)
        self.states = np.empty((self.buffer_size,) + self.state_dims, dtype = np.float32)
        self.terminals = np.empty(self.buffer_size, dtype = np.bool)   
        
        self.state_batch = np.empty((self.batch_size,) + self.state_dims, dtype = np.float32)
        self.next_state_batch = np.empty((self.batch_size,) + self.state_dims, dtype = np.float32)
        
        
    def add(self, action, reward, state, terminal):        
        assert state.shape == self.state_dims
        assert action.shape == self.action_dims

        self.actions[self.current, ...] = action
        self.rewards[self.current] = reward
        self.states[self.current, ...] = state
        self.terminals[self.current] = terminal
        self.count = max(self.count, self.current + 1)
        self.current = (self.current + 1) % self.buffer_size
        
  
    def getState(self, index):
        # Returns the state at position 'index'.
        return self.states[index, ...]
         

    def getMinibatch(self):
        # memory should be initially populated with random actions up to 'min_buffer_size'
        assert self.count >= self.min_buffer_size, "Replay memory does not contain enough samples to start learning, take random actions to populate replay memory"
                
        # sample random indexes
        indexes = []
        # do until we have a full batch of states
        while len(indexes) < self.batch_size:
            # find random index 
            while True:
                # sample one index
                index = np.random.randint(1, self.count)
                # check index is ok
                # if state and next state wrap over current pointer, then get new one (as state from current pointer position will not be from same episode as state from previous position)
                if index == self.current:
                    continue
                # if state and next state wrap over episode end, i.e. current state is terminal, then get new one (note that next state can be terminal)
                if self.terminals[index-1]:
                    continue
                # index is ok to use
                break
            
            # Populate states and next_states with selected state and next_state
            # NB! having index first is fastest in C-order matrices
            self.state_batch[len(indexes), ...] = self.getState(index - 1)
            self.next_state_batch[len(indexes), ...] = self.getState(index)
            indexes.append(index)   
        
        actions = self.actions[indexes]
        rewards = self.rewards[indexes]
        terminals = self.terminals[indexes]
        
        return self.state_batch, actions, rewards, self.next_state_batch, terminals

# Runner

# Noise

In [7]:
class OrnsteinUhlenbeckActionNoise:
    def __init__(self, mu, sigma=0.3, theta=0.15, dt=1e-2, x0=None):
        self.theta = theta
        self.mu = mu
        self.sigma = sigma
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \
                self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)

    def __repr__(self):
        return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma)

# Agent

# Observe

In [8]:
def observe(env, replay_mem):
    sys.stdout.write('\nPopulating replay memory with random actions...\n')   
    sys.stdout.flush()          
    env.reset()

    for random_step in range(1, OBSERVATION+1):
        if RENDER:
            env.render()

        action = env.action_space.sample()
        state, reward, terminal, _ = env.step(action)
        replay_mem.add(action, reward, state, terminal)

        if terminal:
            env.reset()

        if random_step % 1000 == 0:
            sys.stdout.write('\x1b[2K\rStep {:d}/{:d}'.format(random_step, OBSERVATION))
            sys.stdout.flush() 



# Update Network

In [9]:
def update_networks(device, act_net, crt_net, act_opt, crt_opt, replay_mem, gamma=0.99):

    # Get minibatch
    states_batch, actions_batch, rewards_batch, next_states_batch, terminals_batch = replay_mem.getMinibatch()
    
    
    print()
    
    #preprocess 
    
    states, actions, rewards, dones, last_states = [], [], [], [], []
    
    for s,a,r,s_,d in zip(states_batch, actions_batch, rewards_batch, next_states_batch, terminals_batch):
        states.append(s)
        actions.append(a)
        rewards.append(r)
        dones.append(s_ is None)
        if s_ is None:
            last_states.append(s)
        else:
            last_states.append(s_)
            
            
    states_v = float32_preprocessor(states).to(device)
    actions_v = float32_preprocessor(actions).to(device)
    rewards_v = float32_preprocessor(rewards).to(device)
    last_states_v = float32_preprocessor(last_states).to(device)
    dones_t = float32_preprocessor(dones).to(device)
  
    
    # Critic training step 
    actions_next = tgt_act_net.target_model(last_states_v)
    Q_targets_next = tgt_crt_net.target_model(last_states_v, actions_next)
    
    Q_targets = rewards_v + (gamma * Q_targets_next * (1 - dones_t))
    
    Q_expected = crt_net(states_v, actions_v)
    
    critic_loss = F.mse_loss(Q_expected, Q_targets)
    
    crt_opt.zero_grad()
    critic_loss.backward()
#     torch.nn.utils.clip_grad_norm_(crt_net.parameters(), 1)
    crt_opt.step()
    
#     crt_opt.zero_grad()
#     # Predict actions for next states by passing next states through policy target network
#     future_action = tgt_act_net.target_model(last_states_v)
#     q_value = crt_net(states_v, actions_v)
#     # Predict target Q values by passing next states and actions through value target network
# #     future_Q = sess.run(critic_target.output, {state_ph:next_states_batch, action_ph:future_action})[:,0]
#     future_Q = tgt_crt_net.target_model(last_states_v, future_action)
#     # Q values of the terminal states is 0 by definition
#     future_Q[dones_t] = 0
#     targets = rewards_v + (future_Q * gamma)
    
#     critic_loss = F.mse_loss(q_value, future_Q)
#     critic_loss.backward()
#     crt_opt.step()
    
    
    # Actor training step
    actions_pred = act_net(states_v)
    actor_loss = -crt_net(states_v, actions_pred).mean()
    act_opt.zero_grad()
    actor_loss.backward()
    act_opt.step()
    
#     act_opt.zero_grad()
#     # Get policy network's action outputs for selected states
#     actor_actions = act_net(states_v)
#     actions_pred = act_net(states_v)
#     actor_loss = -crt_net(states_v, actions_pred).mean()
#     # Minimize the loss
#     actor_loss.backward()
#     act_opt.step()

    tgt_act_net.alpha_sync(alpha=TAU)
    tgt_crt_net.alpha_sync(alpha=TAU)
    

# Config

In [10]:
NAME = "msinto"
ENV = "Pendulum-v0"
SEED = 99999999
NOISE_SCALE = 0.1
CUDA = False
LRA = 0.0001
LRC = 0.001
RENDER = False
OBSERVATION = 50000
EXPLORATION = 1000
SAVE_CP = 200
TAU = 0.001


# Main

In [11]:

# Create Environment
env = gym.make(ENV)
state_dims = env.observation_space.shape
action_dims = env.action_space.shape
action_bound_low = env.action_space.low
action_bound_high = env.action_space.high

# Set random seeds for reproducability
env.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Initialise replay memory
replay_mem = ReplayMemory(state_dims, action_dims)

# Initialise Ornstein-Uhlenbeck Noise generator
exploration_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dims))
noise_scaling = NOISE_SCALE * (action_bound_high - action_bound_low)

# Networks
device = torch.device("cuda" if CUDA else "cpu")

act_net = DDPGActor(env.observation_space.shape[0], env.action_space.shape[0]).to(device)
crt_net = DDPGCritic(env.observation_space.shape[0], env.action_space.shape[0]).to(device)


print(act_net)
print(crt_net)

tgt_act_net = TargetNetwork(act_net)
tgt_crt_net = TargetNetwork(crt_net)

# setup save directory
save_path = os.path.join("saves", "ddpg-" + NAME)
os.makedirs(save_path, exist_ok=True)

# network optimizers
act_opt = optim.Adam(act_net.parameters(), lr=LRA)
crt_opt = optim.Adam(crt_net.parameters(), lr=LRC)

# Writer
writer = SummaryWriter(comment="-ddpg_" + NAME)


DDPGActor(
  (net): Sequential(
    (0): Linear(in_features=3, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=1, bias=True)
    (5): Tanh()
  )
)
DDPGCritic(
  (obs_net): Sequential(
    (0): Linear(in_features=3, out_features=128, bias=True)
    (1): ReLU()
  )
  (out_net): Sequential(
    (0): Linear(in_features=129, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=1, bias=True)
  )
)


# Training

In [12]:
# Initially populate replay memory by taking random actions 
observe(env, replay_mem)

sys.stdout.write('\n\nTraining...\n')   
sys.stdout.flush()

start_ep = 0
total_rewards = []

for train_ep in range(start_ep+1, EXPLORATION+1):      
    # Reset environment and noise process
    state = env.reset()
    exploration_noise.reset()
    
    train_step = 0
    episode_reward = 0
    duration_values = []
    ep_done = False
    
    
#     sys.stdout.write('\n')   
#     sys.stdout.flush()

    while not ep_done:
        train_step += 1
        start_time = time.time()            
       
        action = act_net(float32_preprocessor(state).to(device)).cpu().data.numpy()
        action += exploration_noise() * noise_scaling
        
        state, reward, terminal, _ = env.step(action)
        replay_mem.add(action, reward, state, terminal)
        
        episode_reward += reward 
            
        update_networks(device, act_net, crt_net, act_opt, crt_opt, replay_mem)
        
        
        # Display progress            
        duration = time.time() - start_time
        duration_values.append(duration)
        ave_duration = sum(duration_values)/float(len(duration_values))
        
        
            
        if terminal:
            total_rewards.append(episode_reward)
            reward100 = total_rewards[-100:]
            avg_reward = sum(reward100)/len(reward100)
            sys.stdout.write('\x1b[2K\rEpisode {:d}/{:d} \t Avg Reward = {:.3f} \t Reward = {:.3f} \t ({:.3f} s/step)'.format(train_ep, EXPLORATION, avg_reward, episode_reward, ave_duration))
            sys.stdout.flush() 
            ep_done = True
            
    
    if train_ep % SAVE_CP == 0:
        name = "ep_%d.dat" % (train_ep)
        fname = os.path.join(save_path, name)
        torch.save(act_net.state_dict(), fname)
        sys.stdout.write('\n Checkpoint saved.')   
        sys.stdout.flush() 


Populating replay memory with random actions...
Step 50000/50000[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K

Training...
Episode 60/1000 	 Avg Reward = -1522.868 	 Reward = -1196.262 	 (0.005 s/step)[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K

KeyboardInterrupt: 