## Import necessary packages tutorial

In [1]:
import gym
import renom as rm
from renom.utility.initializer import Uniform, GlorotUniform
from renom_rl.continuous.a3c_2 import A3C
from renom_rl.env import BaseEnv
from renom.utility.initializer import Gaussian

## Define the environment tutorial

In [2]:
# Create the Gym pendulum task, then show the action-space shape followed by
# the observation-space shape, one per line.
env = gym.make('Pendulum-v0')
print(env.action_space.shape, env.observation_space.shape, sep="\n")

class CustomEnv(BaseEnv):
    """Adapter that exposes a Gym environment through the ``BaseEnv`` interface.

    Attributes:
        env: The wrapped Gym environment.
        action_shape: Shape of a single action, read from ``env.action_space``.
        state_shape: Shape of a single observation, read from
            ``env.observation_space``.
    """

    def __init__(self, env):
        """Wrap ``env`` and record its action / observation shapes.

        Args:
            env: A Gym environment whose ``action_space`` and
                ``observation_space`` expose a ``shape`` attribute.
        """
        self.env = env
        # Read the shapes from the wrapped environment instead of hard-coding
        # them; for 'Pendulum-v0' this yields (1,) and (3,), the values that
        # were previously written by hand.
        self.action_shape = tuple(env.action_space.shape)
        self.state_shape = tuple(env.observation_space.shape)
        # The shape attributes are assigned before the base initializer runs,
        # matching the original statement order.
        super(CustomEnv, self).__init__()

    def reset(self):
        """Reset the wrapped environment and return whatever it returns."""
        return self.env.reset()

    def sample(self):
        """Return a random action drawn from the wrapped env's action space."""
        return self.env.action_space.sample()

    def render(self):
        """Render the wrapped environment; returns ``None``."""
        self.env.render()

    def step(self, action):
        """Apply ``action`` and return the first three items of the result.

        NOTE(review): with the classic Gym ``step`` API those items are
        presumably (observation, reward, done), with ``info`` dropped - confirm
        against the installed Gym version.
        """
        return self.env.step(action)[:3]

# Wrap the Gym environment created above in the CustomEnv adapter.
custom_env = CustomEnv(env)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
(1,)
(3,)


## Define the actor-critic network tutorial

## Instantiating the actor & critic networks 

In [3]:
class AC(rm.Model):
    """Actor-critic network: a shared two-layer trunk feeding two linear heads.

    ``forward`` returns a pair ``(p, v)`` where ``p`` is the 2-unit output of
    the actor head and ``v`` is the 1-unit output of the critic head.
    """

    def __init__(self):
        """Create the shared trunk layers and the actor / critic heads."""
        # Initialise the base class before attaching layers.
        super(AC, self).__init__()
        # Passed straight to ``ignore_bias``; False leaves the flag off for
        # every layer (the old local name ``bias`` suggested the opposite).
        ignore_bias = False
        self._dn1 = rm.Dense(100, ignore_bias=ignore_bias)
        self._dn2 = rm.Dense(100, ignore_bias=ignore_bias)
        # Actor head (2 outputs) and critic head (1 output).
        self._dn_a = rm.Dense(2, ignore_bias=ignore_bias)
        self._dn_c = rm.Dense(1, ignore_bias=ignore_bias)

    def forward(self, x):
        """Run ``x`` through the shared trunk and both heads.

        Args:
            x: Input batch fed to the first Dense layer.

        Returns:
            Tuple ``(p, v)``: actor-head output and critic-head output, both
            computed from the same hidden activation.
        """
        h = rm.relu(self._dn1(x))
        h = rm.relu(self._dn2(h))
        p = self._dn_a(h)
        v = self._dn_c(h)
        return p, v
    
# Single network instance; it is handed to the A3C constructor further down.
nn_model = AC()

## Instantiating the A3C class

In [4]:
# NOTE(review): the variable is called `ddpg`, but the object built here is an
# A3C agent (16 workers, MSE loss). The name is kept because later cells use it.
ddpg = A3C(custom_env, nn_model, loss_func=rm.mse, num_worker=16)

## Training 

In [5]:
# Long-running cell: the recorded output shows about 1.5 minutes per logged
# line, and the run was stopped by hand (KeyboardInterrupt).
# NOTE(review): test_frequency=16*5 presumably means "5 x num_worker" - confirm.
ddpg.fit(episode=10000, episode_step=2000, test_frequency=16*5)

0000 Average Train reward: -1292.000 Test reward: -1496.317: : 80it [01:33,  1.17s/it]
0001 Average Train reward: -1279.227 Test reward: -824.983: : 80it [01:34,  3.40it/s]
0002 Average Train reward: -1033.457 Test reward: -1491.098: : 80it [01:35,  1.19s/it]
0003 Average Train reward: -1419.982 Test reward: -1388.784: : 80it [01:35,  1.19s/it]
0004 Average Train reward: -1274.359 Test reward: -1548.332: : 80it [01:33,  1.17s/it]
0005 Average Train reward: -1164.958 Test reward: -1489.267: : 80it [01:33,  2.48it/s]
0006 Average Train reward: -1165.232 Test reward: -1483.449: : 80it [01:34,  4.11it/s]
0007 Average Train reward: -1075.862 Test reward: -1478.427: : 80it [01:34,  1.18s/it]
0008 Average Train reward: -1284.431 Test reward: -1651.784: : 80it [01:33,  3.94it/s]
0009 Average Train reward: -1394.216 Test reward: -1648.264: : 80it [01:33,  2.61it/s]
0010 Average Train reward: -1054.183 Test reward: -1127.873: : 80it [01:33,  1.17s/it]
0011 Average Train reward: -1251.479 Test re

KeyboardInterrupt: 

70it [01:32,  1.83s/it]

## Testing

In [None]:
# Run the agent's test routine with rendering enabled and keep the result,
# which the plotting cell below consumes.
# NOTE(review): assumes `test` returns a sequence of per-episode rewards - confirm.
test_reward_list = ddpg.test(render=True)
print(test_reward_list)

## Plotting the test result

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(test_reward_list)
plt.xlabel("Episodes")
plt.ylabel("Total Rewards per Episode")

In [None]:
# (Intentionally empty: a stray `i` expression here raised NameError on a fresh kernel.)