In [1]:
import gym
import rospy
import numpy as np
import tensorflow as tf

from gym.envs.registration import register
from neuroracer_gym import neuroracer_env
from tf_agents.environments import tf_py_environment, utils
from tf_agents.networks import q_network
from tf_agents.networks import actor_distribution_network
from tf_agents.agents.dqn import dqn_agent
from tf_agents.agents.ddpg import ddpg_agent
from tf_agents.agents.reinforce import reinforce_agent
from tf_agents.utils import common
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.trajectories import time_step as ts
from tf_agents.policies.random_tf_policy import RandomTFPolicy
from tf_agents.replay_buffers.tf_uniform_replay_buffer import TFUniformReplayBuffer

# just to register env:
from neuroracer_gym.tasks.neuroracer_discrete_task import NeuroRacerTfAgents

None


In [2]:
# Register this notebook process as a ROS node before any environment is built.
rospy.init_node('neuroracer_qlearn', anonymous=True, log_level=rospy.INFO)

# Two separate environment instances: one for data collection/training and one
# created with val=True (presumably the validation variant — confirm in
# NeuroRacerTfAgents).
env = NeuroRacerTfAgents()
env_eval = NeuroRacerTfAgents(val=True)

# Print the action spec followed by every field of the time-step spec.
print('action_spec:', env.action_spec())
for _spec_field in ('observation', 'step_type', 'discount', 'reward'):
    print('time_step_spec.{}:'.format(_spec_field),
          getattr(env.time_step_spec(), _spec_field))

[ERROR] [1609759330.691125, 0.000000]: NOT Initialising Simulation Physics Parameters
[WARN] [1609759330.697017, 0.006000]: Start Init ControllersConnection
[WARN] [1609759330.698069, 0.006000]: END Init ControllersConnection
[ERROR] [1609759333.172889, 2.470000]: NOT Initialising Simulation Physics Parameters
[WARN] [1609759333.177111, 2.470000]: Start Init ControllersConnection
[WARN] [1609759333.178668, 2.470000]: END Init ControllersConnection


action_spec: BoundedArraySpec(shape=(), dtype=dtype('int32'), name='action', minimum=0, maximum=2)
time_step_spec.observation: BoundedArraySpec(shape=(30,), dtype=dtype('float32'), name='observation', minimum=0.0, maximum=10.0)
time_step_spec.step_type: ArraySpec(shape=(), dtype=dtype('int32'), name='step_type')
time_step_spec.discount: BoundedArraySpec(shape=(), dtype=dtype('float32'), name='discount', minimum=0.0, maximum=1.0)
time_step_spec.reward: ArraySpec(shape=(), dtype=dtype('float32'), name='reward')


In [3]:
# Drive the (still unwrapped) Python environment for one episode through
# tf_agents' environment validator as a sanity check before training.
utils.validate_py_environment(env, episodes=1)

Collided, Cumulated Reward: 109, n_steps: 34


In [4]:
# Start a fresh episode and take one step by hand to inspect the reward.
time_step = env.reset()

for _ in range(1):
    # The action is passed as a scalar int32 array holding action index 2.
    time_step = env.step(np.array(2, dtype=np.int32))
    print(time_step.reward)

# NOTE(review): despite the name, this keeps only the reward of the last step;
# nothing is summed. It is equivalent here only because the loop runs once.
cumulative_reward = time_step.reward

-0.5


In [5]:
# Show the observation and action specs of the environment.
print(env.observation_spec())
print(env.action_spec())

BoundedArraySpec(shape=(30,), dtype=dtype('float32'), name='observation', minimum=0.0, maximum=10.0)
BoundedArraySpec(shape=(), dtype=dtype('int32'), name='action', minimum=0, maximum=2)


In [6]:
# Wrap the Python environments in TFPyEnvironment for use with the TF agents
# and policies below.
# The isinstance guards make this cell safe to re-run: `env` / `env_eval` are
# rebound in place, so without the guard a second execution would try to wrap
# an already wrapped environment.
if not isinstance(env, tf_py_environment.TFPyEnvironment):
    env = tf_py_environment.TFPyEnvironment(env)
if not isinstance(env_eval, tf_py_environment.TFPyEnvironment):
    env_eval = tf_py_environment.TFPyEnvironment(env_eval)

# Q-network layout: a single fully connected layer of 128 units with a
# dropout setting of 0.15 for that layer.
fc_layer_params = (128, )
dropout_layer_params = (0.15, )

q_net = q_network.QNetwork(
    env.observation_spec(),
    env.action_spec(),
    fc_layer_params=fc_layer_params,
    dropout_layer_params=dropout_layer_params)

In [7]:
# Actor network (used by the REINFORCE agent cell): one fully connected layer
# of 128 units on top of the environment's observation/action specs.
fc_layer_params = (128,)
actor_net = actor_distribution_network.ActorDistributionNetwork(
    env.observation_spec(),
    env.action_spec(),
    fc_layer_params=fc_layer_params)

# Adam optimizer with a learning rate of 1e-3.
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3)

# Counter handed to the agent as its train_step_counter.
train_step_counter = tf.Variable(0)

# DQN agent built on the Q-network `q_net`, with single-step updates
# (n_step_update=1), element-wise squared TD-error loss and
# gradient_clipping=1.0.
# NOTE(review): the REINFORCE cell rebinds `agent`, `optimizer` and
# `train_step_counter`; whichever of the two cells ran last decides what the
# training loops use.
agent = dqn_agent.DqnAgent(
    env.time_step_spec(),
    env.action_spec(),
    q_network=q_net,
    n_step_update=1,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    gradient_clipping=1.0,
    train_step_counter=train_step_counter)

agent.initialize()

# Show the environment batch size and the trajectory spec the replay buffer
# must be able to store.
print(env.batch_size)
print(agent.collect_data_spec)

In [8]:
# Fresh Adam optimizer (learning rate 1e-3) and train-step counter for the
# REINFORCE agent; both names were also bound by the DQN cell and are
# overwritten here.
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3)

train_step_counter = tf.compat.v2.Variable(0)

# REINFORCE agent driven by `actor_net`, with return normalization and
# debug/gradient summaries switched on.
# NOTE(review): this replaces the DQN `agent` created in the previous cell.
agent = reinforce_agent.ReinforceAgent(
    env.time_step_spec(),
    env.action_spec(),
    actor_network=actor_net,
    optimizer=optimizer,
    normalize_returns=True,
    train_step_counter=train_step_counter,
    debug_summaries=True,
    summarize_grads_and_vars=True)
agent.initialize()

# Show the environment batch size and the trajectory spec the replay buffer
# must be able to store.
print(env.batch_size)
print(agent.collect_data_spec)

1
Trajectory(step_type=TensorSpec(shape=(), dtype=tf.int32, name='step_type'), observation=BoundedTensorSpec(shape=(30,), dtype=tf.float32, name='observation', minimum=array(0., dtype=float32), maximum=array(10., dtype=float32)), action=BoundedTensorSpec(shape=(), dtype=tf.int32, name='action', minimum=array(0, dtype=int32), maximum=array(2, dtype=int32)), policy_info=(), next_step_type=TensorSpec(shape=(), dtype=tf.int32, name='step_type'), reward=TensorSpec(shape=(), dtype=tf.float32, name='reward'), discount=BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)))


In [9]:
def compute_avg_return(environment, policy, num_episodes=10):
  """Average the undiscounted episode return of `policy` over several episodes.

  Args:
    environment: object exposing `reset()` and `step(action)`, both returning
      a time step with `is_last()` and `reward`.
    policy: object whose `action(time_step)` returns something with an
      `.action` attribute.
    num_episodes: number of full episodes to roll out.

  Returns:
    Element 0 of `.numpy()` called on the mean of the summed episode rewards.
  """

  def _rollout_return():
    # Play one episode from reset until the step flagged as last and add up
    # every reward received on the way.
    step = environment.reset()
    gathered = 0.0
    while True:
      if step.is_last():
        return gathered
      step = environment.step(policy.action(step).action)
      gathered += step.reward

  return_sum = 0.0
  for _episode in range(num_episodes):
    return_sum += _rollout_return()

  mean_return = return_sum / num_episodes
  return mean_return.numpy()[0]


# TF-Agents' metrics module (tf_agents.metrics) provides standard implementations
# of this and other evaluation metrics.

# Replay buffer matching the current agent's collect_data_spec and the
# environment batch size, created with max_length=2000.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=env.batch_size,
    max_length=2000)

def collect_episode(environment, policy, num_episodes, buffer=None):
    """Roll out `policy` for whole episodes and store every transition.

    Args:
        environment: environment exposing `reset()`, `current_time_step()`
            and `step(action)`.
        policy: policy whose `action(time_step)` result has an `.action`.
        num_episodes: number of episode boundaries to collect before stopping.
        buffer: replay buffer receiving the trajectories through
            `add_batch`. Defaults to the notebook-level `replay_buffer`, which
            is what this function always used before the parameter existed.
            Passing it explicitly avoids depending on which cell last rebound
            that global.
    """
    # Resolve the target once; the global is only consulted when the caller
    # did not supply a buffer.
    target_buffer = replay_buffer if buffer is None else buffer

    episode_counter = 0
    environment.reset()

    while episode_counter < num_episodes:
        time_step = environment.current_time_step()
        action_step = policy.action(time_step)
        next_time_step = environment.step(action_step.action)
        traj = trajectory.from_transition(time_step, action_step, next_time_step)

        # Add trajectory to the replay buffer
        target_buffer.add_batch(traj)

        # A boundary trajectory marks the end of an episode.
        if traj.is_boundary():
            episode_counter += 1


# This collection loop is so common in RL that TF-Agents provides standard
# implementations of it; see the drivers module (tf_agents.drivers) for details.

In [None]:
# Episode-based training loop: collect two full episodes, train once on
# everything in the buffer, then clear the buffer before the next iteration.

# Reset the train step
agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
# NOTE(review): evaluation here runs on `env`, while the later training cell
# evaluates on `env_eval` inside its loop — confirm which one is intended.
avg_return = compute_avg_return(env, agent.policy, 2)
returns = [avg_return]
losses = []
print('Starting training...')
for _ in range(1000):
    
    # Collect a few episodes using collect_policy and save to the replay buffer.
    collect_episode(env, agent.collect_policy, 2)

    # Use data from the buffer and update the agent's network.
    experience = replay_buffer.gather_all()
    train_loss = agent.train(experience)
    # Drop the just-used episodes so the next update only sees fresh data.
    replay_buffer.clear()
        
    losses.append(train_loss.loss.numpy())

    step = agent.train_step_counter.numpy()

    # `step % 1` is always 0, so the loss is printed on every iteration.
    if step % 1 == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss.loss))

    # Every 20 train steps, evaluate the greedy policy over two episodes.
    if step % 20 == 0:
        avg_return = compute_avg_return(env, agent.policy, 2)
        print('step = {0}: Average Return = {1}'.format(step, avg_return))
        returns.append(avg_return)

Collided, Cumulated Reward: 162, n_steps: 28
Collided, Cumulated Reward: 147, n_steps: 42
Starting training...
Collided, Cumulated Reward: 138, n_steps: 41
Collided, Cumulated Reward: 77, n_steps: 24
step = 1: loss = -0.818091094493866
Collided, Cumulated Reward: 160, n_steps: 46
Collided, Cumulated Reward: 199, n_steps: 60
step = 2: loss = 0.6095919609069824
Collided, Cumulated Reward: 229, n_steps: 65
Collided, Cumulated Reward: 101, n_steps: 31
step = 3: loss = 1.991593599319458
Collided, Cumulated Reward: 211, n_steps: 64
Collided, Cumulated Reward: 96, n_steps: 27
step = 4: loss = 1.2532529830932617
Collided, Cumulated Reward: 148, n_steps: 43
Collided, Cumulated Reward: 150, n_steps: 44
step = 5: loss = 0.7624613046646118
Collided, Cumulated Reward: 224, n_steps: 67
Collided, Cumulated Reward: 114, n_steps: 36
step = 6: loss = 0.008882641792297363
Collided, Cumulated Reward: 188, n_steps: 59
Collided, Cumulated Reward: 141, n_steps: 42
step = 7: loss = -1.25008225440979
Collided,

In [None]:
# Bare expression referencing the `losses` list filled by a training loop.
losses

def collect_step(environment, policy, buffer):
    """Advance the environment by one policy action and record the transition.

    Args:
        environment: environment exposing `current_time_step()` and
            `step(action)`.
        policy: policy whose `action(time_step)` result has an `.action`.
        buffer: replay buffer that receives the transition via `add_batch`.
    """
    current = environment.current_time_step()
    decision = policy.action(current)
    following = environment.step(decision.action)

    # Pack (current step, action, next step) into a trajectory and store it.
    buffer.add_batch(trajectory.from_transition(current, decision, following))


def collect_data(env, policy, buffer, steps):
    """Record `steps` consecutive single-step transitions into `buffer`.

    Each transition is gathered by `collect_step` with the same environment,
    policy and buffer.
    """
    step_indices = range(steps)
    for _step_index in step_indices:
        collect_step(env, policy, buffer)


def compute_avg_return(environment, policy, num_episodes=10):
    """Return the mean undiscounted episode return of `policy`.

    Runs `num_episodes` complete episodes on `environment`, sums the rewards
    of each one, averages the per-episode sums and returns element 0 of the
    averaged value's `.numpy()` result.
    """
    running_total = 0.0
    for _episode in range(num_episodes):
        step = environment.reset()
        collected = 0.0

        # Keep acting until the environment reports the final step.
        while True:
            if step.is_last():
                break
            chosen = policy.action(step)
            step = environment.step(chosen.action)
            collected += step.reward

        running_total += collected

    mean_return = running_total / num_episodes
    return mean_return.numpy()[0]

# Uniformly random policy over the environment's action spec.
random_policy = random_tf_policy.RandomTFPolicy(env.time_step_spec(),
                                                env.action_spec())

# Rebinds the notebook-level `replay_buffer` to a new buffer with
# max_length=1000.
# NOTE(review): the training loop in this cell samples from
# ExperienceReply's own buffer, so this one appears unused there — confirm.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=env.batch_size,
    max_length=1000)

# Restart the agent's train-step counter from zero.
agent.train_step_counter.assign(0)

class ExperienceReply(object):
    """Replay buffer bundled with the dataset iterator used for training.

    On construction the buffer is pre-filled with transitions gathered by a
    random policy, then exposed as a prefetched dataset; `iterator` yields the
    sampled training batches.

    Args:
        agent: agent whose `collect_data_spec` defines what the buffer stores.
        environment: environment providing `batch_size`, `time_step_spec()`,
            `action_spec()`, `current_time_step()` and `step(action)`.
        max_length: `max_length` passed to the replay buffer.
        initial_steps: number of random-policy steps stored before the
            dataset is created.
        sample_batch_size: `sample_batch_size` passed to `as_dataset`.
        num_steps: `num_steps` passed to `as_dataset`.
        num_parallel_calls: `num_parallel_calls` passed to `as_dataset`.
        prefetch: argument of the dataset's `prefetch` call.

    All keyword defaults equal the values that used to be hard-coded, so
    `ExperienceReply(agent, environment)` behaves as before.
    """

    def __init__(self, agent, environment, max_length=50000, initial_steps=100,
                 sample_batch_size=64, num_steps=2, num_parallel_calls=3,
                 prefetch=3):
        self._replay_buffer = TFUniformReplayBuffer(
            data_spec=agent.collect_data_spec,
            batch_size=environment.batch_size,
            max_length=max_length)

        self._random_policy = RandomTFPolicy(environment.time_step_spec(),
                                             environment.action_spec())

        # Seed the buffer before the dataset is built so the first sampled
        # batches have data to draw from.
        self._fill_buffer(environment, self._random_policy, steps=initial_steps)

        self.dataset = self._replay_buffer.as_dataset(
            num_parallel_calls=num_parallel_calls,
            sample_batch_size=sample_batch_size,
            num_steps=num_steps).prefetch(prefetch)

        self.iterator = iter(self.dataset)

    def _fill_buffer(self, environment, policy, steps):
        """Store `steps` transitions produced by `policy` in the buffer."""
        for _ in range(steps):
            self.timestamp_data(environment, policy)

    def timestamp_data(self, environment, policy):
        """Take one environment step with `policy` and add it to the buffer."""
        time_step = environment.current_time_step()
        action_step = policy.action(time_step)
        next_time_step = environment.step(action_step.action)
        timestamp_trajectory = trajectory.from_transition(time_step, action_step, next_time_step)

        self._replay_buffer.add_batch(timestamp_trajectory)

# Step-based training loop: add one transition per iteration, then train on
# the next batch drawn from the replay dataset.
# NOTE(review): `agent` is whichever agent cell was executed last; this loop
# samples fixed-size batches with num_steps=2, which looks intended for the
# DQN agent — confirm before running.

# Baseline evaluation before training (one episode, on the training env).
avg_return = compute_avg_return(env, agent.policy, 1)
returns = [avg_return]
losses = []
# iterator = iter(dataset)
# Building ExperienceReply also pre-fills its buffer with random-policy steps.
experience_replay = ExperienceReply(agent, env)
for _ in range(150000):
    # Collect a few steps using collect_policy and save to the replay buffer.
    # collect_data(env, agent.collect_policy, replay_buffer, 1)
    # collect_data(env, random_policy, replay_buffer, 1)
    # The inner loop reuses `_`; that is harmless because the outer loop does
    # not read it.
    for _ in range(1):
        experience_replay.timestamp_data(env, agent.collect_policy)

    # Sample a batch of data from the buffer and update the agent's network.
    # experience, unused_info = next(iterator)
    experience, unused_info = next(experience_replay.iterator)
    
    train_loss = agent.train(experience).loss
    
    losses.append(train_loss.numpy())

    step = agent.train_step_counter.numpy()

    # Log the loss every 200 train steps.
    if step % 200 == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss))

    # Every 10000 train steps, evaluate the greedy policy for one episode on
    # the separate evaluation environment.
    if step % 10000 == 0:
        avg_return = compute_avg_return(env_eval, agent.policy, 1)
        print('step = {0}: Average Return = {1}'.format(step, avg_return))
        returns.append(avg_return)

In [None]:
import matplotlib
import matplotlib.pyplot as plt

In [None]:
# Plot the training loss. `losses` gets one entry per training iteration, so
# the implicit x-axis (list index) is the iteration number.
# `iterations` is kept for compatibility with other cells but is not used by
# this plot.
iterations = range(0, 10000 + 1, 1)
plt.plot(losses)
# This curve shows the loss, not the return; the label previously read
# 'Average Return', copied from the returns plot.
plt.ylabel('Loss')
plt.xlabel('Iterations')
#plt.ylim(top=10)

In [None]:
# Plot the average returns recorded by the training loop.
# NOTE(review): `returns` only gets an entry at each evaluation, so the x-axis
# is the evaluation index rather than the training iteration, despite the
# label.
plt.plot(returns)
plt.ylabel('Average Return')
plt.xlabel('Iterations')

In [None]:
# Add 1000 more transitions to the experience-replay buffer using the agent's
# collect policy; no training happens in this cell.
for _ in range(1000):
    experience_replay.timestamp_data(env, agent.collect_policy)

In [None]:

        
        # NOTE(review): this cell is a fragment of a method body (it uses
        # `self` and `return` and starts indented), presumably pasted from the
        # environment's step logic. It cannot run on its own, and `time` is not
        # imported anywhere in this notebook.
        #
        # Stall check: once more than 30 s of wall-clock time have passed since
        # `self.timer2`, compare the current position with `self.last_pos`.
        # five_secs_ago = self.timer2 - rospy.Duration(5) # Time minus Duration is a Time
        if time.time() - self.timer2 > 30.0:
            # Moved less than 1.2 (units not visible here) since the last
            # check: end the episode.
            if self._get_distance(self.last_pos, self._get_pos_x_y()) < 1.2:
                print('new break after: {}'.format(self._get_distance(self.last_pos, self._get_pos_x_y())))
                self._episode_ended = True
                # With `val` set the terminal reward is the distance from the
                # origin; otherwise it is a fixed -10.
                if self.val:
                    reward = self._compute_dist_from_origin()
                else:
                    reward = -10.
                return ts.termination(np.array(self._state, dtype=np.float32), reward=reward)
            else:
                # Enough progress: remember the current position for the next check.
                self.last_pos = self._get_pos_x_y()
            # Restart the 30 s window (reached only when the episode was not ended).
            self.timer2 = time.time()

#### 