## Import necessary packages tutorial

In [1]:
import os
# Restrict this process to GPU index 1. The assignment is placed before
# `import renom` -- presumably so the restriction is already in effect when the
# library initialises CUDA (TODO confirm). Change the index on a machine that
# has no second GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import renom as rm
from renom.utility.initializer import Uniform, GlorotUniform
from renom_rl.continuous.ddpg import DDPG
from renom_rl.environ.openai import Pendulum


## Define the environment tutorial

In [2]:
# Environment wrapper imported from renom_rl.environ.openai, built with its
# default arguments; it is handed to the DDPG agent later in the notebook.
env = Pendulum()

## Define the actor network tutorial

In [3]:
class Actor(rm.Model):
    '''Actor (policy) network: maps state information to actions.

    The network has three layers, not counting the input: two ReLU hidden
    layers followed by a tanh output layer whose result is multiplied by
    ``high``. Feel free to change the network depth and check the results.
    The number of output neurons equals the number of actions; the defaults
    (one action, scale 2.0) are the values this notebook uses with the
    Pendulum environment.

    Args:
        layer_size: indexable whose entries 0 and 1 give the unit counts of
            the first and second hidden layers.
        action_size: number of neurons in the output layer.
        high: factor applied to the tanh output before it is returned.
    '''

    def __init__(self, layer_size, action_size=1, high=2.):
        # Kept from the original definition; nothing inside this class reads it.
        self._layers = []
        self.action_size = action_size
        self.high = high
        self._l1 = rm.Dense(layer_size[0], initializer=GlorotUniform())
        self._l2 = rm.Dense(layer_size[1], initializer=GlorotUniform())
        # The output layer starts from a narrow uniform range, so the initial
        # pre-activations (and therefore the initial actions) are close to zero.
        self._l3 = rm.Dense(self.action_size, initializer=Uniform(min=-0.003, max=0.003))

    def forward(self, x):
        '''Return the action for state input ``x``: tanh output scaled by ``high``.'''
        h1 = rm.relu(self._l1(x))
        h2 = rm.relu(self._l2(h1))
        h3 = rm.tanh(self._l3(h2))
        return h3 * self.high


## Define the Critic network tutorial

In [4]:
class Critic(rm.Model):
    '''Critic network: computes Q(s, a) for a (state, action) pair.

    The state goes through the first hidden layer on its own. The action is
    concatenated with that layer's activation and fed to the second hidden
    layer. The output layer has a single neuron and no activation.
    '''

    def __init__(self, layer_size):
        self._layers = []
        decay = 0.01  # the same weight-decay coefficient is used on all three layers
        self._l1 = rm.Dense(layer_size[0], initializer=GlorotUniform(), weight_decay=decay)
        self._l2 = rm.Dense(layer_size[1], initializer=GlorotUniform(), weight_decay=decay)
        self._l3 = rm.Dense(1, initializer=Uniform(min=-0.0003, max=0.0003), weight_decay=decay)

    def forward(self, x, action):
        '''Return the value estimate for state ``x`` and ``action``.'''
        state_features = rm.relu(self._l1(x))
        # The action joins the computation at the second hidden layer.
        joint = rm.concat(state_features, action)
        hidden = rm.relu(self._l2(joint))
        return self._l3(hidden)
    

## Instantiating the actor & critic networks

In [5]:
# Unit counts of the two hidden layers (first 400, then 300); the same list is
# passed to both the actor and the critic.
layer_size = [400, 300]
actor_network = Actor(layer_size=layer_size)
critic_network = Critic(layer_size=layer_size)

## Instantiating the DDPG class

In [6]:
# Build the DDPG agent from the environment and the two networks. Only these
# three arguments are passed, so every other setting uses the library default.
ddpg = DDPG(env, actor_network, critic_network)

## Training 

In [7]:
# Training run: one keyword argument per line so each setting is easy to tune.
ddpg.fit(
    episode=100,
    episode_step=200,
    exploration_step=20000,
    min_exploration_rate=0.001,
    max_exploration_rate=1.0,
    test_step=200,
)

episode 001 avg_loss: 4.103 total_reward [train:-1166.750 test:-] exploration:0.990: 100%|██████████| 200/200 [00:03<00:00, 56.09it/s]
episode 002 avg_loss: 0.157 total_reward [train:-1386.071 test:-] exploration:0.980: 100%|██████████| 200/200 [00:06<00:00, 29.29it/s]
episode 003 avg_loss: 0.064 total_reward [train:-1498.102 test:-] exploration:0.970: 100%|██████████| 200/200 [00:05<00:00, 34.24it/s]
episode 004 avg_loss: 0.028 total_reward [train:-1256.605 test:-] exploration:0.960: 100%|██████████| 200/200 [00:04<00:00, 43.12it/s]
episode 005 avg_loss: 0.024 total_reward [train:-1691.817 test:-] exploration:0.950: 100%|██████████| 200/200 [00:05<00:00, 35.23it/s]
episode 006 avg_loss: 0.033 total_reward [train:-1910.309 test:-] exploration:0.940: 100%|██████████| 200/200 [00:05<00:00, 35.15it/s]
episode 007 avg_loss: 0.045 total_reward [train:-1499.202 test:-] exploration:0.930: 100%|██████████| 200/200 [00:07<00:00, 25.69it/s]
episode 008 avg_loss: 0.056 total_reward [train:-1627.8

episode 061 avg_loss:10.126 total_reward [train:-394.436 test:-] exploration:0.391: 100%|██████████| 200/200 [00:06<00:00, 27.70it/s]
episode 062 avg_loss:13.227 total_reward [train:-567.400 test:-] exploration:0.381: 100%|██████████| 200/200 [00:05<00:00, 37.55it/s]
episode 063 avg_loss: 8.321 total_reward [train:-396.226 test:-] exploration:0.371: 100%|██████████| 200/200 [00:05<00:00, 38.47it/s]
episode 064 avg_loss: 9.313 total_reward [train:-641.517 test:-] exploration:0.361: 100%|██████████| 200/200 [00:05<00:00, 34.58it/s]
episode 065 avg_loss: 9.572 total_reward [train:-538.943 test:-] exploration:0.351: 100%|██████████| 200/200 [00:06<00:00, 33.01it/s]
episode 066 avg_loss: 7.657 total_reward [train:-395.497 test:-] exploration:0.341: 100%|██████████| 200/200 [00:06<00:00, 31.69it/s]
episode 067 avg_loss:12.014 total_reward [train:-499.647 test:-] exploration:0.331: 100%|██████████| 200/200 [00:05<00:00, 28.53it/s]
episode 068 avg_loss:12.024 total_reward [train:-396.999 test:

## Testing

In [13]:
# Run the trained agent once without rendering and print what test() returns.
# NOTE(review): the recorded output of this cell is a single float, so despite
# the name this looks like one total reward rather than a list -- confirm
# against the installed renom_rl version.
test_reward_list = ddpg.test(render=False)
print(test_reward_list)

-1659.56423196


## Plotting the test result

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(test_reward_list)
plt.xlabel("Episodes")
plt.ylabel("Total Rewards per Episode")