In [None]:
#!pip install gym
#!pip install --user tf-agents
#!pip install tensorflow
#!pip install ray

#!pip install stable_baselines3
#!pip install pyglet

#!pip install keras
#!pip install tf-agents

### Import gym library

In [54]:
import gym
import random
import numpy as np

In [55]:
# Gym environment id; passed to gym.make and suite_gym.load further down.
ENV_NAME = 'CartPole-v0'

In [4]:
# Instantiate the plain Gym environment for the id stored in ENV_NAME.
env = gym.make(ENV_NAME)

### Reset the environment to its initial state; `reset()` returns the initial observation as an array of 4 values

![Observation](obs.png)

In [12]:
# Start a fresh episode and keep the value returned by reset().
obs = env.reset()
obs  # bare last expression so the notebook displays it

array([-0.03146594,  0.04136642, -0.0028272 , -0.04354905], dtype=float32)

### observation_space returns the information about the environment's observation space
    # Box data type, i.e. for continuous observations
    # the returned Box object represents
    # Box([lower bounds of obs], [upper bounds of obs], (shape), data type)
![](env.png)

In [13]:
# Bare expression: the notebook displays the observation space as the cell output.
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

### action_space returns a Discrete object
### i.e. a sequence of integers that represents the available actions

![](action.png)

In [14]:
# Bare expression: the notebook displays the action space as the cell output.
env.action_space

Discrete(2)

In [None]:
#env.render()

### Here is a simple demonstration of pushing the cart to one side
### To run the cell below, change the cell's type from Raw to Code

### To take an action we need the step() function
    # the step function returns 4 values
    # [[env state], reward, terminal state (bool), info (dict)]

### Importing the tensorflow environments
    # they generate tensors
    # and replay buffer can be used to train the agent

In [56]:
import tensorflow as tf
from tf_agents.environments import tf_py_environment
from tf_agents.environments import suite_gym

### Running a test for average score using random action over 100 episodes

In [60]:
# Random-policy baseline: play `num_episodes` episodes with uniformly sampled
# actions, recording each episode's length and total reward, then report the
# totals and averages.
env = suite_gym.load(ENV_NAME)
tf_env = tf_py_environment.TFPyEnvironment(env)

time_step = tf_env.reset()
rewards = []
steps = []
num_episodes = 100

for _ in range(num_episodes):
    # Accumulate a plain Python float so `rewards` ends up as a flat list of
    # scalars instead of a list of one-element arrays.
    episode_reward = 0.0
    episode_steps = 0
    while not time_step.is_last():
        # maxval=2 is exclusive for integer dtypes, so the action is 0 or 1.
        # The action is given shape [1] (presumably the environment's batch
        # dimension of one -- confirm against the TFPyEnvironment docs).
        action = tf.random.uniform([1], 0, 2, dtype=tf.int32)
        time_step = tf_env.step(action)
        episode_steps += 1
        # `.item()` unwraps the single-element reward array into a scalar.
        episode_reward += time_step.reward.numpy().item()
    rewards.append(episode_reward)
    steps.append(episode_steps)
    # Begin the next episode from a fresh initial state.
    time_step = tf_env.reset()

num_steps = np.sum(steps)
avg_length = np.mean(steps)
avg_reward = np.mean(rewards)

print('num_episodes:', num_episodes, 'num_steps:', num_steps)
print('avg_length', avg_length, 'avg_reward:', avg_reward)

num_episodes: 100 num_steps: 2187
avg_length 21.87 avg_reward: 21.87


### Importing Q Policy libraries
    # The agents trained will be based on a Deep Q-Learning network (DQN)
    # Q is the quality of a given move
    # It predicts a Q value for each discrete action, Q: State × Action --> Reward.
![](bellman.svg)

In [10]:
from tf_agents.specs import tensor_spec
from tf_agents.networks import network
from tf_agents.policies import q_policy
from tf_agents.trajectories import time_step as ts

In [64]:
# Hand-written specs for the policy's interface: a 4-element float32
# observation and a scalar int32 action bounded to [0, 1].
input_tensor_spec = tensor_spec.TensorSpec(shape=(4,), dtype=tf.float32)
time_step_spec = ts.time_step_spec(input_tensor_spec)
action_spec = tensor_spec.BoundedTensorSpec(
    shape=(), dtype=tf.int32, minimum=0, maximum=1)

# Count of discrete actions in the inclusive [minimum, maximum] range.
num_actions = action_spec.maximum - action_spec.minimum + 1

In [80]:
class QNetwork(network.Network):
    """Minimal Q-network: a stack of Keras layers applied to the observation.

    The stack holds a single `Dense` layer with `num_actions` outputs.
    """

    def __init__(self, input_tensor_spec, action_spec, num_actions=num_actions, name=None):
        """Build the network.

        Args:
            input_tensor_spec: forwarded to the base `Network` constructor.
            action_spec: accepted but not read by this class.
            num_actions: number of units in the output `Dense` layer; the
                default is the notebook-level `num_actions` captured when the
                class is defined.
            name: optional name, forwarded to the base `Network` constructor.
        """
        super().__init__(
            input_tensor_spec=input_tensor_spec, state_spec=(), name=name)
        output_layer = tf.keras.layers.Dense(num_actions)
        self._sub_layers = [output_layer]

    def call(self, inputs, step_type=None, network_state=()):
        """Cast `inputs` to float32 and run them through every sub-layer.

        Args:
            inputs: the observations to evaluate.
            step_type: ignored.
            network_state: returned unchanged.

        Returns:
            A `(outputs, network_state)` tuple.
        """
        del step_type  # not used by this network
        activations = tf.cast(inputs, tf.float32)
        for sub_layer in self._sub_layers:
            activations = sub_layer(activations)
        return activations, network_state


# Smoke-test the policy on a dummy batch: two all-ones observations wrapped
# into time steps via ts.restart.
batch_size = 2
observation = tf.ones([batch_size, *time_step_spec.observation.shape.as_list()])
time_steps = ts.restart(observation, batch_size=batch_size)

# Wire the custom network into a QPolicy.
my_q_network = QNetwork(input_tensor_spec=input_tensor_spec,
                        action_spec=action_spec)
my_q_policy = q_policy.QPolicy(time_step_spec,
                               action_spec,
                               q_network=my_q_network)

# Query both the chosen actions and the action distribution for the batch.
action_step = my_q_policy.action(time_steps)
distribution_step = my_q_policy.distribution(time_steps)

print('Action:', action_step.action, sep='\n')
print('Action distribution:', distribution_step.action, sep='\n')

Action:
tf.Tensor([1 0], shape=(2,), dtype=int32)
Action distribution:
tfp.distributions.Categorical("Categorical", batch_shape=[2], event_shape=[], dtype=int32)
