# Zephyrus RL

In [1]:
import tensorflow as tf

In [3]:
pip install -q gym

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install tf_agents

Collecting tf_agentsNote: you may need to restart the kernel to use updated packages.

  Using cached tf_agents-0.6.0-py3-none-any.whl (1.1 MB)
Collecting cloudpickle==1.3
  Using cached cloudpickle-1.3.0-py2.py3-none-any.whl (26 kB)
Collecting gin-config>=0.3.0
  Using cached gin_config-0.4.0-py2.py3-none-any.whl (46 kB)
Collecting tensorflow-probability>=0.11.0
  Using cached tensorflow_probability-0.12.1-py2.py3-none-any.whl (4.8 MB)
Collecting dm-tree
  Downloading dm_tree-0.1.5-cp36-cp36m-win_amd64.whl (85 kB)
Installing collected packages: dm-tree, cloudpickle, tensorflow-probability, gin-config, tf-agents
  Attempting uninstall: cloudpickle
    Found existing installation: cloudpickle 1.6.0
    Uninstalling cloudpickle-1.6.0:
      Successfully uninstalled cloudpickle-1.6.0
Successfully installed cloudpickle-1.3.0 dm-tree-0.1.5 gin-config-0.4.0 tensorflow-probability-0.12.1 tf-agents-0.6.0


In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import abc
import tensorflow as tf
import numpy as np

from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts

tf.compat.v1.enable_v2_behavior()



In [18]:
class zCarController(py_environment.PyEnvironment):
    
    #generate random seed
    #def create_seed(a=None, max_bytes=8):
    #    if a is None:
    #        a = _bigint_from_bytes(os.urandom(max_bytes))
    #    elif isinstance(a, str):
    #        a = a.encode('utf8')
    #        a += hashlib.sha512(a).digest()
    #        a = _bigint_from_bytes(a[:max_bytes])
    #    elif isinstance(a, int):
    #        a = a % 2**(8 * max_bytes)
    #    else:
    #        raise error.Error('Invalid type for seed: {} ({})'.format(type(a), a))

    #    return a
    
    def __init__(self):
        self.episode_ended = False

        #time constant
        self.tau = 0.02

        #distance between front and back wheels
        self.d = 0.09
        #back wheel radius
        self.r = 0.015
        #back wheel angular velocity
        self.ws_max_radians = 10 * 2 * np.pi / 360

        self.kinematics_integrator = 'euler'

        # Initial angle and acceleration at which to fail the episode
        self.theta_threshold_radians = 5 * 2 * np.pi / 360
        self.acc_threshold = 2.5

        high = np.array([self.acc_threshold * 2,
                        np.finfo(np.float32).max,
                        self.theta_threshold_radians * 2,
                        np.finfo(np.float32).max],

                        dtype=np.float32)

        #action space definition
        self.action_space = array_spec.BoundedArraySpec(
            shape=(2,), dtype=np.int_, minimum=[-100, -100], maximum=[100, 100], name='action')
        #observation space definition
        self.observation_space = array_spec.BoundedArraySpec(
            shape=(2,), dtype=np.float32, minimum=[-self.acc_threshold, -self.theta_threshold_radians], maximum=[self.acc_threshold, self.theta_threshold_radians], name='observation')

        #define seed, current state variables
        self.seed()
        state = (.0, .0, .0, .0)

    #reset episode, load new random initial values
    def _reset(self):
        state = self.np_random.uniform(low=-0.05, high=0.05, size=(4,))
        return np.array(state)
    
    def seed(self, seed=None):
        seed = 42 #create_seed(seed)
        rng = np.random.RandomState()
        self.np_random = rng
        return [seed]
    
    #get current action
    def action_spec(self):
        return self.action_space
    
    #get current observation
    def observation_spec(self):
        return self.observation_space
    
    #step calculations
    def _step(self, action):
        #err_msg = "%r (%s) invalid" % (action, type(action))
        #assert self.action_space.contains(action), err_msg
    
        #update state variables
        x, x_dot, theta, theta_dot = state

        cos_theta = math.cos(theta)
        sin_theta = math.sin(theta)

        vs = self.ws * self.r
        vx = vs * cos_theta
        vy = 0


        if self.kinematics_integrator == 'euler':
            x = x + self.tau * x_dot
            x_dot = vs * cos_theta
            theta = theta + self.tau * theta_dot
            theta_dot =(vs / self.d) * sin_theta

            state = (x, x_dot, theta, theta_dot)

        episode_ended = bool(
            x < -self.acc_threshold
            or x > self.acc_threshold
            or theta < -self.theta_threshold_radians
            or theta > self.theta_threshold_radians
        )

        if not episode_ended:
            reward = 1.0
        else:
            reward = 0.0

        if self.episode_ended:
            # The last action ended the episode. Ignore the current action and start
            # a new episode.
            return self._reset()

        return np.array(state), reward, done, {}

In [4]:
# Create the environment
environment = zCarController()

#utils.validate_py_environment(environment, episodes=5)
tf_env = tf_py_environment.TFPyEnvironment(environment)

In [5]:
import collections
import gym
import numpy as np
import tensorflow as tf
import tqdm

#from matplotlib import pyplot as plt
from tensorflow.keras import layers
from typing import Any, List, Sequence, Tuple

In [6]:
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)

In [7]:
eps = np.finfo(np.float32).eps.item()

In [8]:
class ActorCritic(tf.keras.Model):
  #Combined actor-critic network.

    def __init__(self, num_actions: int, num_hidden_units: int): 
        super().__init__()

        self.common = layers.Dense(num_hidden_units, activation="relu")
        self.actor = layers.Dense(num_actions)
        self.critic = layers.Dense(1)

    def call(self, inputs: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
        x = self.common(inputs)
        return self.actor(x), self.critic(x)

In [9]:
num_actions = 2
num_hidden_units = 128

model = ActorCritic(num_actions, num_hidden_units)

In [10]:
environment

<__main__.zCarController at 0x19451c33dd8>

In [11]:
def env_step(action: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    
    state, reward, done, _ = environment._step(action)
    return (state.astype(np.float32), 
          np.array(reward, np.int32), 
          np.array(done, np.int32))


def tf_env_step(action: tf.Tensor) -> List[tf.Tensor]:
    return tf.numpy_function(env_step, [action], 
                           [tf.float32, tf.int32, tf.int32])

In [12]:
def run_episode(
    initial_state: tf.Tensor,  
    model: tf.keras.Model, 
    max_steps: int) -> List[tf.Tensor]:

    action_probs = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
    values = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
    rewards = tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)

    initial_state_shape = initial_state.shape
    state = initial_state

    for t in tf.range(max_steps):
        # Convert state into a batched tensor (batch size = 1)
        state = tf.expand_dims(state, 0)

        # Run the model and to get action probabilities and critic value
        action_logits_t, value = model(state)

        # Sample next action from the action probability distribution
        action = tf.random.categorical(action_logits_t, 1)[0, 0]
        action_probs_t = tf.nn.softmax(action_logits_t)

        # Store critic values
        values = values.write(t, tf.squeeze(value))

        # Store log probability of the action chosen
        action_probs = action_probs.write(t, action_probs_t[0, action])

        # Apply action to the environment to get next state and reward
        state, reward, done = tf_env_step(action)
        state.set_shape(initial_state_shape)

        # Store reward
        rewards = rewards.write(t, reward)

        if tf.cast(done, tf.bool):
            break

    action_probs = action_probs.stack()
    values = values.stack()
    rewards = rewards.stack()

    return action_probs, values, rewards

In [13]:
def get_expected_return(
    rewards: tf.Tensor, 
    gamma: float, 
    standardize: bool = True) -> tf.Tensor:

    n = tf.shape(rewards)[0]
    returns = tf.TensorArray(dtype=tf.float32, size=n)

    # Start from the end of `rewards` and accumulate reward sums
    # into the `returns` array
    rewards = tf.cast(rewards[::-1], dtype=tf.float32)
    discounted_sum = tf.constant(0.0)
    discounted_sum_shape = discounted_sum.shape
    for i in tf.range(n):
        reward = rewards[i]
        discounted_sum = reward + gamma * discounted_sum
        discounted_sum.set_shape(discounted_sum_shape)
        returns = returns.write(i, discounted_sum)
    returns = returns.stack()[::-1]

    if standardize:
        returns = ((returns - tf.math.reduce_mean(returns)) / 
                   (tf.math.reduce_std(returns) + eps))

    return returns

In [14]:
huber_loss = tf.keras.losses.Huber(reduction=tf.keras.losses.Reduction.SUM)

def compute_loss(
    action_probs: tf.Tensor,  
    values: tf.Tensor,  
    returns: tf.Tensor) -> tf.Tensor:

    advantage = returns - values

    action_log_probs = tf.math.log(action_probs)
    actor_loss = -tf.math.reduce_sum(action_log_probs * advantage)

    critic_loss = huber_loss(values, returns)

    return actor_loss + critic_loss

In [15]:
rank_1_tensor = tf.constant([0.0, 0.0, 0.0, 0.0])
print(rank_1_tensor)

tf.Tensor([0. 0. 0. 0.], shape=(4,), dtype=float32)


In [16]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)


@tf.function
def train_step(
    initial_state: tf.Tensor, 
    model: tf.keras.Model, 
    optimizer: tf.keras.optimizers.Optimizer, 
    gamma: float, 
    max_steps_per_episode: int) -> tf.Tensor:
    
    #Runs a model training step.
    with tf.GradientTape() as tape:

        # Run the model for one episode to collect training data
        action_probs, values, rewards = run_episode(
            rank_1_tensor, model, max_steps_per_episode) 
        
        # Calculate expected returns
        returns = get_expected_return(rewards, gamma)

        # Convert training data to appropriate TF tensor shapes
        action_probs, values, returns = [
            tf.expand_dims(x, 1) for x in [action_probs, values, returns]] 

        # Calculating loss values to update our network
        loss = compute_loss(action_probs, values, returns)

    # Compute the gradients from the loss
    grads = tape.gradient(loss, model.trainable_variables)

    # Apply the gradients to the model's parameters
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    episode_reward = tf.math.reduce_sum(rewards)

    return episode_reward

In [17]:
%%time

max_episodes = 10000
max_steps_per_episode = 1000

# Is considered solved if average reward is >= 195 over 100 
# consecutive trials
reward_threshold = 195
running_reward = 0

# Discount factor for future rewards
gamma = 0.99

with tqdm.trange(max_episodes) as t:
    for i in t:
        initial_state = tf.constant(environment._reset(), dtype=tf.float32)
        episode_reward = int(train_step(
            initial_state, model, optimizer, gamma, max_steps_per_episode))

        running_reward = episode_reward*0.01 + running_reward*.99

        t.set_description(f'Episode {i}')
        t.set_postfix(
            episode_reward=episode_reward, running_reward=running_reward)

        # Show average episode reward every 10 episodes
        if i % 10 == 0:
            pass 

        if running_reward > reward_threshold:  
            break

print(f'\nSolved at episode {i}: average reward: {running_reward:.2f}!')

  0%|                                                                                        | 0/10000 [00:01<?, ?it/s]


UnknownError:  UnboundLocalError: local variable 'state' referenced before assignment
Traceback (most recent call last):

  File "D:\Anaconda\envs\tensorflow\lib\site-packages\tensorflow\python\ops\script_ops.py", line 249, in __call__
    ret = func(*args)

  File "D:\Anaconda\envs\tensorflow\lib\site-packages\tensorflow\python\autograph\impl\api.py", line 620, in wrapper
    return func(*args, **kwargs)

  File "<ipython-input-11-5505a9452fa4>", line 3, in env_step
    state, reward, done, _ = environment._step(action)

  File "<ipython-input-3-36ee5c0c50b1>", line 80, in _step
    x, x_dot, theta, theta_dot = state

UnboundLocalError: local variable 'state' referenced before assignment


	 [[{{node while/body/_1/while/PyFunc}}]] [Op:__inference_train_step_1983]

Function call stack:
train_step
