[medium](https://towardsdatascience.com/deep-deterministic-policy-gradients-explained-2d94655a9b7b)/ [github](https://github.com/thechrisyoon08/Reinforcement-learning/tree/master/DDPG)<br>
[paper](https://arxiv.org/pdf/1509.02971.pdf)<br>
[D4PG](https://github.com/msinto93/D4PG) <br>

### TODO
- this repo they add batchnorm [link](https://github.com/floodsung/DDPG/blob/master/critic_network_bn.py)

In [1]:
%reload_ext autoreload
%autoreload 2
import torch
import torch.nn.functional as F 
import random
import numpy as np
from EXITrl.base import Base
from EXITrl.helpers import print_weight_size, copy_params, update_params, ExperienceReplay, convert_to_tensor, device
from EXITrl.nn_wrapper import NNWrapper
import gym

In [2]:
class NormalizedEnv(gym.ActionWrapper):
    """ Wrap action """

    def _action(self, action):
        act_k = (self.action_space.high - self.action_space.low)/ 2.
        act_b = (self.action_space.high + self.action_space.low)/ 2.
        return act_k * action + act_b

    def _reverse_action(self, action):
        act_k_inv = 2./(self.action_space.high - self.action_space.low)
        act_b = (self.action_space.high + self.action_space.low)/ 2.


In [3]:
class OUNoise(object):
    def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=100000):
        self.mu           = mu
        self.theta        = theta
        self.sigma        = max_sigma
        self.max_sigma    = max_sigma
        self.min_sigma    = min_sigma
        self.decay_period = decay_period
        self.action_dim   = action_space.shape[0]
        self.low          = action_space.low
        self.high         = action_space.high
        self.reset()
        
    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu
        
    def evolve_state(self):
        x  = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state
    
    def get_action(self, action, t=0):
        ou_state = self.evolve_state()
        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)
        return np.clip(action + ou_state, self.low, self.high)

In [4]:
class Critic(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Critic, self).__init__()
        self.linear1 = torch.nn.Linear(input_size, hidden_size)
        self.linear2 = torch.nn.Linear(hidden_size, hidden_size)
        self.linear3 = torch.nn.Linear(hidden_size, output_size)

    def forward(self, state, action):
        """
        Params state and actions are torch tensors
        """
        x = torch.cat([state, action], 1)
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        x = self.linear3(x)

        return x

class Actor(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size, learning_rate = 3e-4):
        super(Actor, self).__init__()
        self.linear1 = torch.nn.Linear(input_size, hidden_size)
        self.linear2 = torch.nn.Linear(hidden_size, hidden_size)
        self.linear3 = torch.nn.Linear(hidden_size, output_size)
        
    def forward(self, state):
        """
        Param state is a torch tensor
        """
        x = F.relu(self.linear1(state))
        x = F.relu(self.linear2(x))
        x = torch.tanh(self.linear3(x))

        return x

In [8]:
class DDPG(Base):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # 1. Critic
        self.critic = NNWrapper(
            model=Critic(self.num_state + self.num_action, 256, self.num_action),
            lr=self.beta
        )
        self.critic_target = NNWrapper(
            model=Critic(self.num_state + self.num_action, 256, self.num_action),
            lr=self.beta
        )
        copy_params(self.critic.model, self.critic_target.model)
        
        # 2. Actor
        self.actor = NNWrapper(
            model=Actor(self.num_state, 256, self.num_action),
            lr=self.alpha
        )
        self.actor_target = NNWrapper(
            model=Actor(self.num_state, 256, self.num_action),
            lr=self.alpha
        )
        copy_params(self.actor.model, self.actor_target.model)
        
        # init
        self.mse_loss = torch.nn.MSELoss(reduction='mean')
        self.experience_replay = ExperienceReplay(num_experience=2048, num_recall=64)
        self.noise = OUNoise(self.env.action_space)

    def _loop(self, episode) -> int:
        done = False
        total_reward, reward = 0, 0
        state = self.env.reset()
        self.noise.reset()
        for i in range(1000):
            action = self.actor.forward(convert_to_tensor(state)).detach().numpy()
            action = self.noise.get_action(action, i)
            _state, reward, done, _ = self.env.step(action)
            self.experience_replay.remember(state, action, reward, _state, done)
            
            batch_state, \
            batch_action, \
            batch_reward, \
            batch_next_state, \
            batch_done = self.experience_replay.recall()

            batch_next_actions = self.actor_target.forward(batch_next_state)
            next_Q = self.critic_target.forward(batch_next_state, batch_next_actions.detach())
            Q_prime = batch_reward + self.gamma * next_Q
            Q = self.critic.forward(batch_state, batch_action)
            
            critic_loss = self.mse_loss(Q_prime, Q)
            self.critic.backprop(critic_loss)
            actor_loss = -self.critic.forward(batch_state, self.actor.forward(batch_state)).mean()
            self.actor.backprop(actor_loss)
            
            update_params(self.critic.model, self.critic_target.model, 1e-2)
            update_params(self.actor.model, self.actor_target.model, 1e-2)
            
            total_reward += reward
            state = _state
            if done: return total_reward
            
    def _save(self, reward):
        torch.save({
            'critic': self.critic.model.state_dict(),
            'actor': self.actor.model.state_dict(),
        }, self.save_name)
        
    def _load(self):
        checkpoint = torch.load(self.save_name, map_location=device)
        self.critic.model.load_state_dict(checkpoint['critic'])
        self.actor.model.load_state_dict(checkpoint['actor'])
        
    def play(self, num_episode=3):
        self.critic.model.eval()
        self.actor.model.eval()
        super().play(num_episode)
        
    def policy(self, state):
        return self.actor.forward(convert_to_tensor(state)).detach().numpy()

            
try: env.close()
except: pass
env = NormalizedEnv(gym.make('Pendulum-v0'))
ddpg = DDPG(env, 
           num_episodes=150,
           alpha=0.0001, 
           beta=0.001,
           gamma=.99,
           save_name="checkpoint/Pendulum-v0-DDPG.pth")
ddpg.train(True)

  result = entry_point.load(False)


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Episode 10	Average Score: -1642.09 	other{}
Episode 20	Average Score: -1641.29 	other{}
Episode 30	Average Score: -1626.73 	other{}
Episode 40	Average Score: -1293.75 	other{}
Episode 50	Average Score: -1057.42 	other{}
Episode 60	Average Score: -1236.38 	other{}
Episode 70	Average Score: -1182.17 	other{}
Episode 80	Average Score: -1213.19 	other{}
Episode 90	Average Score: -1251.68 	other{}
Episode 100	Average Score: -1297.96 	other{}
Episode 110	Average Score: -1265.13 	other{}
Episode 120	Average Score: -1220.77 	other{}
Episode 130	Average Score: -1555.55 	other{}
Episode 140	Average Score: -1568.29 	other{}
Episode 150	Average Score: -1566.55 	other{}


In [6]:
ddpg.play()

[33mWARN: <class '__main__.NormalizedEnv'> doesn't implement 'action' method. Maybe it implements deprecated '_action' method.[0m
Episode 1	Average Score: -898.49
Episode 2	Average Score: -1002.51
Episode 3	Average Score: -927.90


In [7]:
env.close()