[medium](https://towardsdatascience.com/deep-deterministic-policy-gradients-explained-2d94655a9b7b)/ [github](https://github.com/thechrisyoon08/Reinforcement-learning/tree/master/DDPG)<br>
[paper](https://arxiv.org/pdf/1509.02971.pdf)<br>
[D4PG](https://github.com/msinto93/D4PG) <br>

In [20]:
%reload_ext autoreload
%autoreload 2
import torch
import torch.nn.functional as F 
import random
import numpy as np
from EXITrl.approx_v_base import ApproxVBase
from EXITrl.approx_policy_base import ApproxPolicyBase
from EXITrl.base import Base
from EXITrl.helpers import print_weight_size, copy_params, update_params, ExperienceReplay, convert_to_tensor
from EXITrl.nn_wrapper import NNWrapper
import gym

In [21]:
class NormalizedEnv(gym.ActionWrapper):
    """Rescale actions between [-1, 1] and the wrapped env's action bounds.

    ``_action`` maps -1 to ``action_space.low`` and +1 to
    ``action_space.high``; ``_reverse_action`` is its inverse.
    """

    def _action(self, action):
        """Map ``action`` from [-1, 1] onto [low, high] of the action space."""
        act_k = (self.action_space.high - self.action_space.low) / 2.
        act_b = (self.action_space.high + self.action_space.low) / 2.
        return act_k * action + act_b

    def _reverse_action(self, action):
        """Map ``action`` from [low, high] back onto [-1, 1]."""
        act_k_inv = 2. / (self.action_space.high - self.action_space.low)
        act_b = (self.action_space.high + self.action_space.low) / 2.
        # The original computed both terms but never returned, so callers
        # always received None.
        return act_k_inv * (action - act_b)

    # gym releases differ in which hook ActionWrapper.step() calls: older
    # ones use the underscore-prefixed names, newer ones the public names.
    # Exposing both keeps the wrapper working across versions.
    def action(self, action):
        """Public-name alias of ``_action``."""
        return self._action(action)

    def reverse_action(self, action):
        """Public-name alias of ``_reverse_action``."""
        return self._reverse_action(action)


In [22]:
class OUNoise:
    """Ornstein-Uhlenbeck style exploration noise added to an action.

    The internal ``state`` is pulled towards ``mu`` at rate ``theta`` and
    perturbed by Gaussian noise scaled by ``sigma``.  ``sigma`` is moved
    linearly from ``max_sigma`` to ``min_sigma`` over ``decay_period`` steps.
    Noisy actions are clipped to the bounds of ``action_space``.
    """

    def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=100000):
        # Bounds and dimensionality come from the action space.
        self.action_dim = action_space.shape[0]
        self.low = action_space.low
        self.high = action_space.high

        # Process parameters.
        self.mu = mu
        self.theta = theta
        self.max_sigma = max_sigma
        self.min_sigma = min_sigma
        self.decay_period = decay_period
        self.sigma = max_sigma  # current noise scale, starts at the maximum

        self.reset()

    def reset(self):
        """Set the noise state back to ``mu`` in every action dimension."""
        self.state = self.mu * np.ones(self.action_dim)

    def evolve_state(self):
        """Advance the noise process by one step and return the new state."""
        current = self.state
        drift = self.theta * (self.mu - current)
        diffusion = self.sigma * np.random.randn(self.action_dim)
        self.state = current + (drift + diffusion)
        return self.state

    def get_action(self, action, t=0):
        """Return ``action`` plus noise, clipped to the action-space bounds.

        The noise state is advanced first (using the sigma from the previous
        call); only afterwards is ``sigma`` updated for step ``t``.
        """
        noise = self.evolve_state()
        progress = min(1.0, t / self.decay_period)
        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * progress
        noisy_action = action + noise
        return np.clip(noisy_action, self.low, self.high)

In [23]:
class Critic(torch.nn.Module):
    """Three-layer MLP that scores a (state, action) pair.

    State and action are concatenated along dim 1, passed through two
    ReLU-activated hidden layers, and projected linearly to ``output_size``.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        nn = torch.nn
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, output_size)

    def forward(self, state, action):
        """
        Both ``state`` and ``action`` are torch tensors; they are joined
        along dim 1, so ``input_size`` must equal their combined width.
        """
        joint = torch.cat((state, action), dim=1)
        hidden = torch.relu(self.linear1(joint))
        hidden = torch.relu(self.linear2(hidden))
        return self.linear3(hidden)

class Actor(torch.nn.Module):
    """Three-layer MLP policy whose output is squashed into (-1, 1) by tanh.

    ``learning_rate`` is accepted for interface compatibility but is not
    used inside this class.
    """

    def __init__(self, input_size, hidden_size, output_size, learning_rate=3e-4):
        super().__init__()
        nn = torch.nn
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, output_size)

    def forward(self, state):
        """
        ``state`` is a torch tensor; returns the tanh-bounded action tensor.
        """
        hidden = torch.relu(self.linear1(state))
        hidden = torch.relu(self.linear2(hidden))
        pre_activation = self.linear3(hidden)
        return torch.tanh(pre_activation)

In [30]:
class DDPG(Base):
    """DDPG agent: an actor and a critic, each with a target-network copy.

    Every environment step is stored in a replay buffer and immediately
    followed by one critic update, one actor update and an update of both
    target networks.
    """

    # Hyper-parameters that used to be hard-coded inline; override them on a
    # subclass to tune without editing the training loop.
    HIDDEN_SIZE = 256       # width of both hidden layers in actor and critic
    REPLAY_CAPACITY = 2048  # passed to ExperienceReplay(num_experience=...)
    BATCH_SIZE = 512        # passed to ExperienceReplay.recall(batch_size=...)
    MAX_STEPS = 1000        # upper bound on steps per episode
    TAU = 1e-2              # third argument of update_params for target nets

    def __init__(self, *args, **kwargs):
        """Build the four networks, the loss, the replay buffer and the noise.

        All arguments are forwarded to ``Base.__init__``, which is expected to
        provide ``env``, ``num_state``, ``num_action``, ``alpha``, ``beta``
        and ``gamma`` (they are read below but not set in this class).
        """
        super().__init__(*args, **kwargs)
        # 1. Critic (learning rate: beta)
        # NOTE(review): the critic's output width is num_action rather than 1;
        # identical for 1-D action spaces such as Pendulum — confirm intent
        # before using with multi-dimensional actions.
        self.critic = NNWrapper(
            model=Critic(self.num_state + self.num_action, self.HIDDEN_SIZE, self.num_action),
            lr=self.beta
        )
        self.critic_target = NNWrapper(
            model=Critic(self.num_state + self.num_action, self.HIDDEN_SIZE, self.num_action),
            lr=self.beta
        )
        # presumably copies critic weights into the target — confirm against
        # EXITrl.helpers.copy_params
        copy_params(self.critic.model, self.critic_target.model)

        # 2. Actor (learning rate: alpha)
        self.actor = NNWrapper(
            model=Actor(self.num_state, self.HIDDEN_SIZE, self.num_action),
            lr=self.alpha
        )
        self.actor_target = NNWrapper(
            model=Actor(self.num_state, self.HIDDEN_SIZE, self.num_action),
            lr=self.alpha
        )
        copy_params(self.actor.model, self.actor_target.model)

        # init
        self.mse_loss = torch.nn.MSELoss(reduction='mean')
        self.experience_replay = ExperienceReplay(num_experience=self.REPLAY_CAPACITY)
        self.noise = OUNoise(self.env.action_space)

    def _loop(self, episode) -> int:
        """Run one episode, training after every environment step.

        ``episode`` is not used here.  Returns the sum of rewards collected,
        whether the episode ended via ``done`` or by reaching ``MAX_STEPS``.
        """
        total_reward = 0
        state = self.env.reset()
        self.noise.reset()
        for i in range(self.MAX_STEPS):
            # Act: deterministic policy output plus exploration noise.
            action = self.actor.forward(convert_to_tensor(state)).detach().numpy()
            action = self.noise.get_action(action, i)
            _state, reward, done, _ = self.env.step(action)
            self.experience_replay.remember(state, action, reward, _state, done)

            (batch_state,
             batch_action,
             batch_reward,
             batch_next_state,
             batch_done) = self.experience_replay.recall(batch_size=self.BATCH_SIZE)

            # TD target from the target networks.  It is a constant with
            # respect to the critic's parameters, so build it without autograd
            # instead of back-propagating through the target critic each step.
            with torch.no_grad():
                batch_next_actions = self.actor_target.forward(batch_next_state)
                next_Q = self.critic_target.forward(batch_next_state, batch_next_actions)
                # NOTE(review): batch_done is recalled but not used to mask
                # the bootstrap term for terminal transitions — confirm
                # whether that is intended.
                Q_prime = batch_reward + self.gamma * next_Q
            Q = self.critic.forward(batch_state, batch_action)

            # Critic step: regress Q(s, a) towards the TD target.
            critic_loss = self.mse_loss(Q, Q_prime)
            self.critic.backprop(critic_loss)
            # Actor step: maximise the critic's score of the actor's actions.
            actor_loss = -self.critic.forward(batch_state, self.actor.forward(batch_state)).mean()
            self.actor.backprop(actor_loss)

            # presumably a soft (Polyak) update of the targets with rate TAU —
            # confirm against EXITrl.helpers.update_params
            update_params(self.critic.model, self.critic_target.model, self.TAU)
            update_params(self.actor.model, self.actor_target.model, self.TAU)

            total_reward += reward
            state = _state
            if done:
                break
        # Previously the method fell through and returned None when the step
        # limit was reached without `done`.
        return total_reward
            
            
# Release the environment left over from a previous run of this cell, if any.
try:
    env.close()
except Exception:
    # Best-effort cleanup: `env` may not exist yet (NameError) or may fail to
    # close.  Unlike the former bare `except:`, this lets KeyboardInterrupt
    # and SystemExit propagate.
    pass

# Wrap Pendulum in NormalizedEnv before handing it to the agent.
env = NormalizedEnv(gym.make('Pendulum-v0'))
ddpg = DDPG(
    env,
    num_episodes=50,
    policy="gaussian_policy",
    alpha=0.0001,  # read by DDPG as the actor learning rate (self.alpha)
    beta=0.001,    # read by DDPG as the critic learning rate (self.beta)
    gamma=.99,     # discount factor in the TD target
)
ddpg.train(True)

  result = entry_point.load(False)


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
episode: 0 reward: -1251.4382782759806
episode: 1 reward: -1526.519256230102
episode: 2 reward: -1416.872703367815
episode: 3 reward: -1537.560381086706
episode: 4 reward: -987.202134913785
episode: 5 reward: -1552.3115435719662
episode: 6 reward: -1600.6971691524484
episode: 7 reward: -1508.9987200067712
episode: 8 reward: -1578.4174620312285
episode: 9 reward: -1204.978783705949
episode: 10 reward: -1598.3711877555252
episode: 11 reward: -1561.0498562929067
episode: 12 reward: -1531.6788382185205
episode: 13 reward: -1501.1720899181312
episode: 14 reward: -1541.9494440988146
episode: 15 reward: -1557.0434477663493
episode: 16 reward: -1488.3725891464167
episode: 17 reward: -1403.5415597500923
episode: 18 reward: -1491.290511081439
episode: 19 reward: -1477.2802512184326
e

KeyboardInterrupt: 

In [32]:
# Inspect Pendulum-v0's observation and action spaces (the trailing tuple is
# displayed as the cell's output).
env = gym.make('Pendulum-v0')
env.observation_space, env.action_space

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


  result = entry_point.load(False)


(Box(3,), Box(1,))

In [34]:
# Inspect CartPole-v1's observation and action spaces for comparison (the
# trailing tuple is displayed as the cell's output).
env = gym.make('CartPole-v1')
env.observation_space, env.action_space

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


  result = entry_point.load(False)


(Box(4,), Discrete(2))