[ref1](https://towardsdatascience.com/deep-deterministic-policy-gradients-explained-2d94655a9b7b)<br>
[ref2](https://github.com/msinto93/D4PG) <br>
[paper](https://arxiv.org/pdf/1509.02971.pdf)

In [2]:
%reload_ext autoreload
%autoreload 2
import torch
import torch.nn.functional as F 
import random
import numpy as np
from EXITrl.approx_v_base import ApproxVBase
from EXITrl.approx_policy_base import ApproxPolicyBase
from EXITrl.base import Base
from EXITrl.helpers import print_weight_size, copy_params, ExperienceReplay, convert_to_tensor
from EXITrl.nn_wrapper import NNWrapper
import gym

In [3]:
class Critic(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Critic, self).__init__()
        self.linear1 = torch.nn.Linear(input_size, hidden_size)
        self.linear2 = torch.nn.Linear(hidden_size, hidden_size)
        self.linear3 = torch.nn.Linear(hidden_size, output_size)

    def forward(self, state, action):
        """
        Params state and actions are torch tensors
        """
        x = torch.cat([state, action], 1)
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        x = self.linear3(x)

        return x

class Actor(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size, learning_rate = 3e-4):
        super(Actor, self).__init__()
        self.linear1 = torch.nn.Linear(input_size, hidden_size)
        self.linear2 = torch.nn.Linear(hidden_size, hidden_size)
        self.linear3 = torch.nn.Linear(hidden_size, output_size)
        
    def forward(self, state):
        """
        Param state is a torch tensor
        """
        x = F.relu(self.linear1(state))
        x = F.relu(self.linear2(x))
        x = torch.tanh(self.linear3(x))

        return x

In [5]:
class DDPG(Base):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # 1. Actor
        self.actor = NNWrapper(
            model=Actor(self.num_state, 8, self.num_action),
            lr=self.alpha
        )
        self.actor_target = NNWrapper(
            model=Actor(self.num_state, 8, self.num_action),
            lr=self.alpha
        )
        copy_params(self.actor.model, self.actor_target.model)
        
        # 2. Critic
        self.critic = NNWrapper(
            model=Critic(self.num_state + self.num_action, 8, self.num_action),
            lr=self.beta
        )
        self.critic_target = NNWrapper(
            model=Critic(self.num_state + self.num_action, 8, self.num_action),
            lr=self.beta
        )
        copy_params(self.critic.model, self.critic_target.model)
        
        self.mse_loss = torch.nn.MSELoss(reduction='mean')
        self.experience_replay = ExperienceReplay(num_experience=128)

    def _loop(self, episode) -> int:
        done = False
        total_reward, reward = 0, 0
        state = self.env.reset()
        for i in range(1000):
            action = self.actor.forward(convert_to_tensor(state)).detach().numpy()
            _state, reward, done, _ = self.env.step(action)
            self.experience_replay.remember(state, action, reward, _state, done)
            
            batch_state, \
            batch_action, \
            batch_reward, \
            batch_next_state, \
            batch_done = self.experience_replay.recall(batch_size=64)

            Q = self.critic.forward(batch_state, batch_action)
            batch_next_actions = self.actor_target.forward(batch_next_state)
            next_Q = self.critic_target.forward(batch_next_state, batch_next_actions)
            Qprime = batch_reward + self.gamma * next_Q
            
            actor_loss = -self.critic.forward(batch_state, self.actor.forward(batch_state)).mean()
            self.actor.backprop(actor_loss)
            
            critic_loss = self.mse_loss(Q, Qprime)
            self.critic.backprop(critic_loss)

#             if done: 
#                 td_target = reward
#             else: 
#                 td_target = reward + self.gamma * self.get_v(_state)
#             estimate_v = self.get_v(state)
#             td_error = td_target - estimate_v
            
#             self.update_v(td_target, self.get_v(state))
#             loss = (-log_prob) * td_error # using the td error as our advantage estimate
#             self.update_policy(loss)

            total_reward += reward
#             state = _state
            if done: return total_reward
            
            
try: env.close()
except: pass
env = gym.make('Pendulum-v0')
ddpg = DDPG(env, 
           num_episodes=50,
           policy="gaussian_policy",
           alpha=0.0001, 
           beta=0.001,
           gamma=.99)
ddpg.train(True)

  result = entry_point.load(False)


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
episode: 0 reward: -869.5294897919525
episode: 1 reward: -1213.6850117463475
episode: 2 reward: -1179.1146833504367
episode: 3 reward: -1801.426573079381
episode: 4 reward: -1282.8330809296244
episode: 5 reward: -1067.1669709850235
episode: 6 reward: -1173.2220609376172
episode: 7 reward: -878.3578162135501
episode: 8 reward: -1071.3578565755615
episode: 9 reward: -1854.4712313556806
episode: 10 reward: -1208.932592005125
episode: 11 reward: -1377.81448622766
episode: 12 reward: -1122.546471065339
episode: 13 reward: -1826.5581714843415


KeyboardInterrupt: 

In [433]:
batch_state = torch.Tensor([[1,2,3]])
batch_action = torch.Tensor([[4]])
torch.cat([batch_state, batch_action], 1)

tensor([[1., 2., 3., 4.]])

In [6]:

memory = ExperienceReplay(2)
memory.remember([1,10],2,3,False)
memory.remember([11,110],22,33, True)
memory.remember([111,1110],222,333, False
memory.remember([1111,11110],2222,3333, True)
memory.recall(2)
memory.memories[0], memory.memories[0][[2,3]]

(tensor([[    1.,    10.],
         [   11.,   110.],
         [  111.,  1110.],
         [ 1111., 11110.]]), tensor([[  111.,  1110.],
         [ 1111., 11110.]]))

In [400]:
memories = []
memories.append([[1, 10]])
memories[0] = np.append(memories[0], [[11, 110]], axis=0)
print(memories[0])
memories[0] = np.append(memories[0], [[111, 1110]], axis=0)
memories

[[  1  10]
 [ 11 110]]


[array([[   1,   10],
        [  11,  110],
        [ 111, 1110]])]

In [148]:
a = np.array([-0.70774644 -0.70646655 -0.45610168])
b = [a]
torch.from_numpy(np.array(b))

tensor([[-1.8703]], dtype=torch.float64)

In [206]:
from collections import deque
memories = []
memories.append( (np.array([1.1, 2, 3]), False))
memories

[(array([1.1, 2. , 3. ]), False)]

In [253]:
memories = []
memories.append((np.array([1.1, 2, 3]), False))
memories = np.array(memories)
num_arg = memories[0].shape[0]
output = list([memories[:, i] for i in range(num_arg)])
output

[array([array([1.1, 2. , 3. ])], dtype=object), array([False], dtype=object)]

In [309]:
a = np.array([False])
 
newArray = np.append(a, [False], axis = 0)
 
print(newArray)

[False False]
