In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import gym
import random
import imageio
import numpy as np
import tensorflow as tf
import keras.backend as K

from tensorflow import keras
from keras.models import Model
from keras.layers import Dense, GaussianNoise, Input, concatenate
from keras.layers import BatchNormalization, Flatten, Lambda
from keras.optimizers import Adam
from keras.initializers import RandomUniform

import sys
sys.path.append("..")
from core.replay_experience import ReplayMemory, Transition
from models.reward_functions import Identity


from pdb import set_trace as debug

In [4]:
# https://github.com/germain-hug/Deep-RL-Keras/tree/master/DDPG


class Critic:
    """State-action value network Q(s, a) used by the DDPG agent.

    The Keras model is not created in ``__init__``; call ``init_model()``
    first, then ``build_opt(...)`` if the network is going to be trained.
    """

    def __init__(self, obs_shape, act_shape, *args):
        """Store the input shapes.

        Args:
            obs_shape: shape tuple passed to the observation ``Input`` layer.
            act_shape: shape tuple passed to the action ``Input`` layer.
            *args: accepted and ignored.
        """
        self.obs_shape = obs_shape
        self.act_shape = act_shape

    def __init_model__(self, model_params=(256, 128, 64)):
        """Build the two-input network and store it on ``self.model``.

        Args:
            model_params: sequence of layer widths. The first width is applied
                to the observation alone; the action is concatenated afterwards
                and the remaining widths are applied to the joint vector.
        """
        obs_in = Input((self.obs_shape))
        act_in = Input((self.act_shape))

        hidden = Dense(model_params[0], activation='relu')(obs_in)
        # The action joins the network after the first observation-only layer.
        hidden = concatenate([hidden, act_in])
        for units in model_params[1:]:
            hidden = Dense(units, activation='relu')(hidden)
            hidden = BatchNormalization()(hidden)
        q_value = Dense(1, activation='linear', kernel_initializer=RandomUniform())(hidden)
        self.model = Model([obs_in, act_in], q_value)

    def __build_opt__(self, lr, b1, b2):
        """Return an Adam optimizer whose gradients are clipped to +/-0.5 element-wise."""
        return Adam(
            learning_rate=lr,
            beta_1=b1,
            beta_2=b2,
            clipvalue=0.5
        )

    def init_model(self, model_params=(256, 128, 64)):
        """Create ``self.model``; ``model_params`` is forwarded to ``__init_model__``."""
        self.__init_model__(model_params)

    def build_opt(self, learning_rate, beta_1, beta_2):
        """Compile ``self.model`` with an MSE loss and the clipped Adam optimizer."""
        self.model.compile(optimizer=self.__build_opt__(learning_rate, beta_1, beta_2), loss='mse')

    def predict(self, st, at):
        """Return ``self.model.predict`` for the given state(s) and action(s).

        Args:
            st: array with a ``.shape``; a state with fewer than two dimensions
                is treated as a single sample and given a leading batch axis.
            at: batch of actions; its length must match the number of states.

        Raises:
            AssertionError: if the number of states and actions differ.
        """
        if len(st.shape) < 2:
            assert len([st]) == len(at), 'mismatch between number of samples'
            # Same batching approach as Actor.predict: add the batch axis with
            # numpy instead of wrapping the array in a Python list.
            return self.model.predict([np.expand_dims(st, axis=0), at])
        else:
            assert len(st) == len(at), 'mistmatch between number of samples'
            return self.model.predict([st, at])

    def transfer_weights(self, model, tau):
        """Soft-update this network's weights toward ``model``'s weights.

        Each weight becomes ``tau * source + (1 - tau) * own``, so ``tau=1``
        copies the source exactly and a small ``tau`` moves slowly toward it.

        Args:
            model: object exposing ``get_weights()`` (the source network).
            tau: interpolation factor applied to the source weights.
        """
        own_weights = self.model.get_weights()
        source_weights = model.get_weights()
        self.model.set_weights(
            [tau * src + (1 - tau) * own for own, src in zip(own_weights, source_weights)]
        )

        
class Actor:
    """Deterministic policy network mu(s) used by the DDPG agent.

    The Keras model is not created in ``__init__``; call ``init_model()``
    first, then ``build_opt(...)`` to obtain the training function.
    """

    def __init__(self, obs_shape, act_shape, act_range):
        """Store the input/output sizes and the action scaling factor.

        Args:
            obs_shape: shape tuple passed to the observation ``Input`` layer.
            act_shape: action-space shape; must have fewer than two entries.
            act_range: factor the tanh output is multiplied by.

        Raises:
            AssertionError: if ``act_shape`` has two or more dimensions.
        """
        self.obs_shape = obs_shape

        assert len(act_shape) < 2, "Only Box environment allowed"

        # Stored as a plain int: the number of action components.
        self.act_shape = act_shape[0]
        self.act_range = act_range

    def __init_model__(self, model_params=(256, 128, 64)):
        """Build the policy network and store it on ``self.model``.

        Args:
            model_params: sequence of hidden layer widths.
        """
        obs_in = Input((self.obs_shape))
        hidden = Dense(model_params[0], activation='relu')(obs_in)
        for units in model_params[1:]:
            hidden = Dense(units, activation='relu')(hidden)
            hidden = BatchNormalization()(hidden)

        hidden = GaussianNoise(1.0)(hidden)
        # tanh squashes every action component into [-1, 1]
        out = Dense(self.act_shape, activation='tanh', kernel_initializer=RandomUniform())(hidden)

        # rescale to [-act_range, act_range]
        out = Lambda(lambda i: i * self.act_range)(out)
        self.model = Model(obs_in, out)

    def __build_opt__(self, lr, b1, b2):
        """Build the backend function that applies one policy-gradient step.

        The returned function takes ``[states, action_gradients]``. The action
        gradients are clipped to +/-0.5, negated (the optimizer minimises, the
        policy should ascend the critic's value) and back-propagated through
        the actor; the resulting Adam step is run as an update of the function.

        Returns:
            A ``K.function`` whose single output is the clipped action
            gradients summed over the batch axis.
        """
        act_grads = K.placeholder(shape=(None, self.act_shape))
        clipped_grads = K.clip(act_grads, -0.5, 0.5)
        grad_sum = K.sum(clipped_grads, axis=0)

        # grad_ys = -dQ/da turns d(mu)/d(theta) into the negated policy gradient.
        update_params = tf.gradients(self.model.output, self.model.trainable_weights, -clipped_grads)
        grads = zip(update_params, self.model.trainable_weights)
        train_op = tf.train.AdamOptimizer(learning_rate=lr, beta1=b1, beta2=b2).apply_gradients(grads)

        # The train op must stay in `updates`; slicing it away leaves the
        # function with nothing to apply.
        return K.function(
            inputs=[self.model.input, act_grads], outputs=[grad_sum],
            updates=[train_op]
        )

    def init_model(self, model_params=(256, 128, 64)):
        """Create ``self.model``; ``model_params`` is forwarded to ``__init_model__``."""
        self.__init_model__(model_params)

    def build_opt(self, learning_rate, beta_1, beta_2):
        """Return the training function built by ``__build_opt__``."""
        return self.__build_opt__(learning_rate, beta_1, beta_2)

    def predict(self, st):
        """Return ``self.model.predict`` for one state or a batch of states.

        A state with fewer than two dimensions is treated as a single sample
        and given a leading batch axis before prediction.
        """
        if len(st.shape) < 2:
            return self.model.predict(np.expand_dims(st, axis=0))
        else:
            return self.model.predict(st)

    def transfer_weights(self, model, tau):
        """Soft-update this network's weights toward ``model``'s weights.

        Each weight becomes ``tau * source + (1 - tau) * own``, so ``tau=1``
        copies the source exactly and a small ``tau`` moves slowly toward it.

        Args:
            model: object exposing ``get_weights()`` (the source network).
            tau: interpolation factor applied to the source weights.
        """
        own_weights = self.model.get_weights()
        source_weights = model.get_weights()
        self.model.set_weights(
            [tau * src + (1 - tau) * own for own, src in zip(own_weights, source_weights)]
        )

    
    
        
        
        

In [8]:
class DDPGAgent:
    """DDPG agent: behavior/target actor and critic, replay memory, epsilon-random exploration."""

    def __init__(
        self, env, reward_class, batch_size=256, memory_size=1028, gamma=0.95, epsilon = 1.0, 
        epsilon_min=0.01, epsilon_decay=0.995, exploration_fraction=0.1, update_timesteps=50, 
        tau=0.01, learning_rate = 0.001, beta_1 = 0.9, beta_2 = 0.99, logger_steps = 500,
        learning_starts = 500, render=False
    ):
        """Build the four networks, the replay memory and the logging state.

        Args:
            env: environment exposing ``observation_space``, ``action_space``,
                ``step``, ``reset``, ``render`` and ``close``.
            reward_class: class instantiated with no arguments; its
                ``reward_func`` is called as ``reward_func(batch, **params)``
                and must return ``(rewards, params)``.
            batch_size: number of transitions sampled per update.
            memory_size: capacity passed to ``ReplayMemory``.
            gamma: discount factor applied to the target critic's estimate.
            epsilon: initial probability of sampling a random action.
            epsilon_min: epsilon is no longer decayed once it is at or below this.
            epsilon_decay: multiplicative decay applied to epsilon per timestep.
            exploration_fraction: fraction of ``total_timesteps`` during which
                epsilon is decayed.
            update_timesteps: target networks are updated every this many timesteps.
            tau: factor passed to the networks' ``transfer_weights``.
            learning_rate, beta_1, beta_2: Adam parameters for both optimizers.
            logger_steps: a progress line is printed every this many timesteps.
            learning_starts: updates only run for timesteps greater than this.
            render: whether to call ``env.render()`` after every step.
        """
        # learning parameters
        self.learning_rate = learning_rate
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.tau = tau
        self.update_timesteps = update_timesteps
        self.exploration_fraction = exploration_fraction
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.epsilon = epsilon
        self.gamma = gamma
        self.memory = ReplayMemory(capacity=memory_size)
        self.batch_size = batch_size
        self.reward_function = reward_class().reward_func
        self.learning_starts = learning_starts

        # environment parameters
        self.env = env
        self.render = render

        obs_shape = self.env.observation_space.shape
        act_shape = self.env.action_space.shape

        # setup models and optimizers
        self.behavior_q = Critic(obs_shape, act_shape)
        self.behavior_q.init_model()
        self.behavior_q.build_opt(self.learning_rate, self.beta_1, self.beta_2)

        # The target critic is never trained directly, so it is not compiled.
        self.target_q = Critic(obs_shape, act_shape)
        self.target_q.init_model()

        self.behavior_pi = Actor(obs_shape, act_shape, self.env.action_space.high)
        self.behavior_pi.init_model()
        self.behavior_pi_AdamOpt = self.behavior_pi.build_opt(self.learning_rate, self.beta_1, self.beta_2)

        self.target_pi = Actor(obs_shape, act_shape, self.env.action_space.high)
        self.target_pi.init_model()

        # Start the targets as exact copies of the behavior networks; the
        # tau-weighted blend is only meant for the periodic updates.
        self.target_q.model.set_weights(self.behavior_q.model.get_weights())
        self.target_pi.model.set_weights(self.behavior_pi.model.get_weights())

        # dQ/da function, built once: rebuilding it on every update keeps
        # adding gradient ops to the backend graph.
        self._action_grads_fn = K.function(
            [self.behavior_q.model.input[0], self.behavior_q.model.input[1]],
            K.gradients(self.behavior_q.model.output, [self.behavior_q.model.input[1]])
        )

        # logging parameters
        self.logging_step = logger_steps
        self._current_timestep = 0
        self._num_episodes = 1
        self._eps_rew_list = []
        self._mean_eps_rew = 0
        self._eps_rew = 0

        self.__parameter_dict = None

    def update_dictionary(self):
        """Snapshot the agent's non-callable attributes for the reward function.

        Attributes whose name starts with ``__`` are skipped, and so is the
        name-mangled snapshot itself (``_DDPGAgent__parameter_dict``);
        otherwise every snapshot would contain the previous one and the chain
        would grow on each call.
        """
        self.__parameter_dict = {
            key: value
            for key, value in self.__dict__.items()
            if not key.startswith('__')
            and not key.startswith('_DDPGAgent__')
            and not callable(value)
        }

    def _finished_episode(self):
        """Record the finished episode's reward, reset the counter and the environment.

        Returns:
            The observation returned by ``env.reset()``.
        """
        # logging reward from environment not from shap
        self._num_episodes += 1
        self._eps_rew_list.append(self._eps_rew)
        self._mean_eps_rew = sum(self._eps_rew_list)/len(self._eps_rew_list)
        self._eps_rew = 0
        return self.env.reset()

    def act(self, st):
        """Take one environment step from ``st`` and store the transition.

        With probability ``epsilon`` the action is sampled from the action
        space, otherwise it comes from the behavior actor.

        Returns:
            The state to act from next: the observed next state, or the reset
            observation when the episode ended.
        """
        if np.random.rand() <= self.epsilon:
            at = self.env.action_space.sample()
        else:
            at = self.behavior_pi.predict(st)[0]

        snext, rt, done, _ = self.env.step(at)
        if self.render:
            self.env.render()

        # Credit the reward before the episode bookkeeping runs, so the final
        # reward belongs to the episode that earned it.
        self._eps_rew += rt
        # Store the observation actually reached, not the post-reset one.
        self.memory.push(st, at, snext, done, rt)

        if done:
            return self._finished_episode()
        return snext

    def transfer_weights(self):
        """Update both target networks from the behavior networks using ``self.tau``."""
        self.target_q.transfer_weights(self.behavior_q.model, self.tau)
        self.target_pi.transfer_weights(self.behavior_pi.model, self.tau)

    def update_on_batch(self):
        r"""Run one critic update and one actor update on a sampled batch.

        primed => target model
        non-primed => behavior model
        y_j = r_j + \gamma (1 - d_j) Q'(s_{j+1}, \mu'(s_{j+1} | \theta^{\mu'}) | \theta^{Q'})
        L_Q = \frac{1}{N} \sum_j (y_j - Q(s_j, a_j | \theta^Q))^2 ==> squared bellman update
        \nabla_{\theta^{\mu}} J = \frac{1}{N} \sum_j \nabla_a Q(s_j, a | \theta^Q)|_{a=\mu(s_j)} \nabla_{\theta^{\mu}} \mu(s_j | \theta^{\mu})
        """
        if self.__parameter_dict is None:
            # The reward function receives the snapshot as keyword arguments.
            self.update_dictionary()

        batch = Transition(*zip(*self.memory.sample(self.batch_size)))

        states = np.array(batch.state)
        actions = np.array(batch.action)
        states_tp1 = np.array(batch.next_state)
        actions_tp1 = self.target_pi.model.predict(states_tp1)

        #### CRITIC UPDATE ####
        # 1 for non-terminal transitions, 0 for terminal ones (no bootstrapping).
        mask = np.array([not flag for flag in batch.done], dtype=float).reshape((-1, 1))

        y = self.target_q.model.predict([states_tp1, actions_tp1])
        y *= self.gamma
        y *= mask

        # apply shap updates here if desired
        rewards, self.__parameter_dict = self.reward_function(batch, **self.__parameter_dict)

        y += np.array(rewards).reshape((-1, 1))
        self.behavior_q.model.fit([states, actions], y, verbose=0)

        #### ACTOR UPDATE ####
        # get actions from actor model
        acts = self.behavior_pi.predict(states)

        # gradients of the critic output wrt the actions chosen by the actor
        action_grads = self._action_grads_fn([states, acts])

        # apply gradients
        self.behavior_pi_AdamOpt(
            [states, np.array(action_grads).reshape(-1, self.env.action_space.shape[0])]
        )

    def learn(self, total_timesteps):
        """Interact with the environment for ``total_timesteps`` steps, training along the way.

        Raises:
            AssertionError: if ``learning_starts`` is not below ``total_timesteps``.
        """
        st = self.env.reset()
        assert self.learning_starts < total_timesteps

        for tt in range(total_timesteps):

            # Record the timestep first so the snapshot reflects the current step.
            self._current_timestep = tt
            self.update_dictionary()
            st = self.act(st)

            if(self.memory.can_sample(self.batch_size) and tt > self.learning_starts):
                self.update_on_batch()
                if (tt+1)%self.update_timesteps == 0:
                    self.transfer_weights()

            if (tt+1)%self.logging_step == 0:
                print(
                    f"Episodes: {self._num_episodes} \t Average Reward {self._mean_eps_rew:.4f}",
                    f"\t Timesteps: {tt+1:.1f}"
                )

            if self.epsilon > self.epsilon_min and tt < (self.exploration_fraction*total_timesteps):
                self.epsilon *= self.epsilon_decay

        self.env.close()

In [11]:
# MountainCarContinuous with the episode length capped at 256 steps.
env = gym.make("MountainCarContinuous-v0")
env._max_episode_steps = 256

# Identity reward; updates may begin right away and progress is logged every 200 steps.
model = DDPGAgent(
    env=env,
    reward_class=Identity,
    logger_steps=200,
    learning_starts=0,
)



In [12]:
# Train the agent for 2000 environment steps.
model.learn(total_timesteps=2000)

Episodes: 1 	 Average Reward 0.0000 	 Timesteps: 200.0
Episodes: 2 	 Average Reward -4.9249 	 Timesteps: 400.0
Episodes: 3 	 Average Reward -4.1968 	 Timesteps: 600.0
Episodes: 4 	 Average Reward -4.0091 	 Timesteps: 800.0
Episodes: 4 	 Average Reward -4.0091 	 Timesteps: 1000.0


KeyboardInterrupt: 