In [28]:
#!pip install gym
#!pip install tensorflow
#!pip install pyglet
#!pip install keras
#!pip install tf-agents

#!pip install reverb

Found existing installation: reverb 2.0.1
Uninstalling reverb-2.0.1:
  Would remove:
    /Users/sahilbohot/miniforge3/lib/python3.8/site-packages/*
  Would not remove (might be manually added):
^C
[31mERROR: Operation cancelled by user[0m[31m
[0m

In [18]:
import tensorflow as tf
tf.__version__

'2.9.2'

In [12]:
import tensorflow as tf
tf.__version__

'2.9.2'

### Import gym library

In [46]:
import gym
import random
import numpy as np

In [41]:
env_name = 'CartPole-v0'

In [42]:
env = gym.make(env_name)

### Reset the enviornment to its initial state and return 4 values in array

![Observation](obs.png)

In [43]:
obs = env.reset()
obs

array([-0.0429323 , -0.01812948, -0.01866479, -0.04680137], dtype=float32)

### observation_space returns the information about the environment space
    # Box data type i.e. for continous observation
    # returned Box object represents
    # Box([[lower range of obs], [upper range obs], (number of dimensions), data type])
![](env.png)

In [44]:
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

### action_space returns the Discrete object
### sequence of integers, reprents the actions

![](action.png)

In [45]:
env.action_space

Discrete(2)

In [46]:
#env.render()

### Termination state is reached when one of these conditions are met
### and done parameter is set to True

![](eps.png)

### Here a simple demonstartion of pushing cart to one side
### To run the cell below change cell's type from Raw to Code

### To take an action we need step() function
    # step function returns 4 parameters
    # [[obs state], reward, terminal state(done = bool),info]

### Importing the tensorflow environments
    # they generate tensors
    # and replay buffer can be used to train the agent

In [47]:
import tensorflow as tf
from tf_agents.environments import tf_py_environment
from tf_agents.environments import suite_gym

### Running a test for average score using random action over 100 episodes

In [48]:
env = suite_gym.load(env_name)
tf_env = tf_py_environment.TFPyEnvironment(env)

time_step = tf_env.reset()
rewards = []
steps = []
num_episodes = 100

for _ in range(num_episodes):
    episode_reward = 0
    episode_steps = 0
    while not time_step.is_last():
        action = tf.random.uniform([1], 0, 2, dtype=tf.int32)
        #print(action)
        time_step = tf_env.step(action)
        episode_steps += 1
        episode_reward += time_step.reward.numpy()
    rewards.append(episode_reward)
    steps.append(episode_steps)
    time_step = tf_env.reset()

num_steps = np.sum(steps)
avg_length = np.mean(steps)
avg_reward = np.mean(rewards)

print('num_episodes:', num_episodes, 'num_steps:', num_steps)
print('avg_length', avg_length, 'avg_reward:', avg_reward)

num_episodes: 100 num_steps: 2197
avg_length 21.97 avg_reward: 21.97


### Importing Q Policy libraries
    # Agents trained will be based on Deep Q-Learning network (DQN)
    # Q is quality of given moves
    # It Predicts Q value for each discrete action Q: State * Action --> Reward.
    # Bellman equation is a detailed explanation of this algorithm
![](bellman.svg)

In [49]:
from tf_agents.specs import tensor_spec
from tf_agents.networks import network
from tf_agents.policies import q_policy
from tf_agents.trajectories import time_step as ts

In [50]:
input_tensor_spec = tensor_spec.TensorSpec((4,), tf.float32)
time_step_spec = ts.time_step_spec(input_tensor_spec)
action_spec = tensor_spec.BoundedTensorSpec((),
                                            tf.int32,
                                            minimum=0,
                                            maximum=1)

num_actions = action_spec.maximum - action_spec.minimum + 1
print(action_spec, "\n\n Number of actions: \t", num_actions)

BoundedTensorSpec(shape=(), dtype=tf.int32, name=None, minimum=array(0, dtype=int32), maximum=array(1, dtype=int32)) 

 Number of actions: 	 2


In [51]:
class QNetwork(network.Network):

    def __init__(self, input_tensor_spec, action_spec, num_actions=num_actions, name=None):
        super(QNetwork, self).__init__(
            input_tensor_spec=input_tensor_spec,
            state_spec=(),
            name=name)
        self._sub_layers = [
            tf.keras.layers.Dense(num_actions),
        ]

    def call(self, inputs, step_type=None, network_state=()):
        del step_type
        inputs = tf.cast(inputs, tf.float32)
        for layer in self._sub_layers:
            inputs = layer(inputs)
        return inputs, network_state


batch_size = 2
observation = tf.ones([batch_size] + time_step_spec.observation.shape.as_list())
time_steps = ts.restart(observation, batch_size=batch_size)

my_q_network = QNetwork(
    input_tensor_spec=input_tensor_spec,
    action_spec=action_spec)

my_q_policy = q_policy.QPolicy(
    time_step_spec, action_spec, q_network=my_q_network)

action_step = my_q_policy.action(time_steps)
distribution_step = my_q_policy.distribution(time_steps)

print('Action:')
print(action_step.action)

print('Action distribution:')
print(distribution_step.action)

Action:
tf.Tensor([0 1], shape=(2,), dtype=int32)
Action distribution:
tfp.distributions.Categorical("Categorical", batch_shape=[2], event_shape=[], dtype=int32)


### Importing TensorFlow Drivers
    # There are 2 types of drivers
    # DynamicStepDriver, which terminates after a given number of (valid) environment steps
    # DynamicEpisodeDriver, which terminates after a given number of episodes

In [52]:
from tf_agents.metrics import tf_metrics
from tf_agents.drivers import dynamic_episode_driver

In [57]:
num_episodes = tf_metrics.NumberOfEpisodes()
env_steps = tf_metrics.EnvironmentSteps()
observers = [num_episodes, env_steps]

driver = dynamic_episode_driver.DynamicEpisodeDriver(
    tf_env, my_q_policy, observers, num_episodes=100)

# Initial driver.run will reset the environment and initialize the policy.
final_time_step, policy_state = driver.run()

print('final_time_step', final_time_step)
print('Number of Steps: ', env_steps.result().numpy())
print('Number of Episodes: ', num_episodes.result().numpy())
print('policy_state', policy_state)

final_time_step TimeStep(
{'discount': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>,
 'observation': <tf.Tensor: shape=(1, 4), dtype=float32, numpy=
array([[-0.00905133, -0.00811464, -0.00313077,  0.01084824]],
      dtype=float32)>,
 'reward': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.], dtype=float32)>,
 'step_type': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>})
Number of Steps:  4709
Number of Episodes:  100
policy_state ()


### Importing Replay Buffers
    # replay buffers are used to store trajectories of experience when executing a policy in an environment.

In [54]:
from tf_agents.networks import q_network
from tf_agents.agents.dqn import dqn_agent
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.drivers import dynamic_step_driver

In [55]:
q_net = q_network.QNetwork(
    tf_env.time_step_spec().observation,
    tf_env.action_spec(),
    fc_layer_params=(100,))

agent = dqn_agent.DqnAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    q_network=q_net,
    optimizer=tf.compat.v1.train.AdamOptimizer(0.001))

replay_buffer_capacity = 1000

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    agent.collect_data_spec,
    batch_size=tf_env.batch_size,
    max_length=replay_buffer_capacity)

# Add an observer that adds to the replay buffer:
replay_observer = [replay_buffer.add_batch]

collect_steps_per_iteration = 10
collect_op = dynamic_step_driver.DynamicStepDriver(
  tf_env,
  agent.collect_policy,
  observers=replay_observer,
  num_steps=collect_steps_per_iteration).run()

In [56]:
collect_op

(TimeStep(
 {'discount': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>,
  'observation': <tf.Tensor: shape=(1, 4), dtype=float32, numpy=
 array([[ 0.03602472,  0.36235705, -0.03630416, -0.62500376]],
       dtype=float32)>,
  'reward': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>,
  'step_type': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>}),
 ())

## Above all cells where the basic idea of building a network and training an agent
## Now we proceed by putting all together the knowledge from tensorflow documentations

### Importing libraries for training the DQN

In [23]:
from __future__ import absolute_import, division, print_function

import base64
import imageio
import IPython
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import PIL.Image
import pyvirtualdisplay
#import reverb

import tensorflow as tf

from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import py_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import sequential
from tf_agents.policies import py_tf_eager_policy
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import reverb_replay_buffer
from tf_agents.replay_buffers import reverb_utils
from tf_agents.trajectories import trajectory
from tf_agents.specs import tensor_spec
from tf_agents.utils import common

#### Set hyperparameters

In [24]:
num_iterations = 20000 # @param {type:"integer"}

initial_collect_steps = 100  # @param {type:"integer"}
collect_steps_per_iteration =   1# @param {type:"integer"}
replay_buffer_max_length = 100000  # @param {type:"integer"}

batch_size = 64  # @param {type:"integer"}
learning_rate = 1e-3  # @param {type:"number"}
log_interval = 200  # @param {type:"integer"}

num_eval_episodes = 10  # @param {type:"integer"}
eval_interval = 1000  # @param {type:"integer"}

In [25]:
env_name = 'CartPole-v0'
env = suite_gym.load(env_name)

In [26]:
env.reset()
#PIL.Image.fromarray(env.render())


TimeStep(
{'discount': array(1., dtype=float32),
 'observation': array([-0.01356602,  0.04764484,  0.04043654,  0.04688595], dtype=float32),
 'reward': array(0., dtype=float32),
 'step_type': array(0, dtype=int32)})

In [77]:
#env.close()

In [27]:
print('Observation Spec:')
print(env.time_step_spec().observation)

Observation Spec:
BoundedArraySpec(shape=(4,), dtype=dtype('float32'), name='observation', minimum=[-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], maximum=[4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38])


In [28]:
print('Reward Spec:')
print(env.time_step_spec().reward)

Reward Spec:
ArraySpec(shape=(), dtype=dtype('float32'), name='reward')


In [29]:
print('Action Spec:')
print(env.action_spec())

Action Spec:
BoundedArraySpec(shape=(), dtype=dtype('int64'), name='action', minimum=0, maximum=1)


In [30]:
time_step = env.reset()
print('Time step:',)
print(time_step, end="\n\n")

action = np.array(1, dtype=np.int32)

next_time_step = env.step(action)
print('Next time step:')
print(next_time_step)

Time step:
TimeStep(
{'discount': array(1., dtype=float32),
 'observation': array([ 0.02317682, -0.01551813,  0.02965919,  0.03567937], dtype=float32),
 'reward': array(0., dtype=float32),
 'step_type': array(0, dtype=int32)})

Next time step:
TimeStep(
{'discount': array(1., dtype=float32),
 'observation': array([ 0.02286646,  0.17916623,  0.03037278, -0.2475002 ], dtype=float32),
 'reward': array(1., dtype=float32),
 'step_type': array(1, dtype=int32)})


In [31]:
train_py_env = suite_gym.load(env_name)
eval_py_env = suite_gym.load(env_name)

In [32]:
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

In [33]:
fc_layer_params = (100, 50)
action_tensor_spec = tensor_spec.from_spec(env.action_spec())
num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1

# Define a helper function to create Dense layers configured with the right
# activation and kernel initializer.
def dense_layer(num_units):
    return tf.keras.layers.Dense(
      num_units,
      activation=tf.keras.activations.relu,
      kernel_initializer=tf.keras.initializers.VarianceScaling(
          scale=2.0, mode='fan_in', distribution='truncated_normal'))

# QNetwork consists of a sequence of Dense layers followed by a dense layer
# with `num_actions` units to generate one q_value per available action as
# its output.
dense_layers = [dense_layer(num_units) for num_units in fc_layer_params]
q_values_layer = tf.keras.layers.Dense(
    num_actions,
    activation=None,
    kernel_initializer=tf.keras.initializers.RandomUniform(
        minval=-0.03, maxval=0.03),
    bias_initializer=tf.keras.initializers.Constant(-0.2))
q_net = sequential.Sequential(dense_layers + [q_values_layer])

In [34]:
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

train_step_counter = tf.Variable(0)

agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)

agent.initialize()

In [35]:
eval_policy = agent.policy
collect_policy = agent.collect_policy

In [36]:
def compute_avg_return(environment, policy, num_episodes=10):

    total_return = 0.0
    for _ in range(num_episodes):

        time_step = environment.reset()
        episode_return = 0.0

        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return

    avg_return = total_return / num_episodes
    return avg_return.numpy()[0]

In [42]:
compute_avg_return(eval_env, eval_policy, num_eval_episodes)

9.6

In [39]:
from collections import deque

In [40]:
memory = deque(maxlen=replay_buffer_max_length)

In [41]:
def data_collection(action, discount, next_step_type, observation, policy, reward, step_type):
        memory.append((action, discount, next_step_type, observation, policy, reward, step_type))
        


### Create class

In [None]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

ENV_NAME = "CartPole-v0"

GAMMA = 0.95
LEARNING_RATE = 0.001

MEMORY_SIZE = 1000000
BATCH_SIZE = 20

EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.995


class DQNSolver:

    def __init__(self, observation_space, action_space):
        self.exploration_rate = EXPLORATION_MAX

        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE)

        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        self.model.add(Dense(self.action_space, activation="linear"))
        self.model.compile(loss="mse", optimizer=Adam(lr=LEARNING_RATE))

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])

    def experience_replay(self):
        if len(self.memory) < BATCH_SIZE:
            return
        batch = random.sample(self.memory, BATCH_SIZE)
        for state, action, reward, state_next, terminal in batch:
            q_update = reward
            if not terminal:
                q_update = (reward + GAMMA * np.amax(self.model.predict(state_next)[0]))
            q_values = self.model.predict(state)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        self.exploration_rate *= EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate)


def cartpole():
    env = gym.make(ENV_NAME)
    #score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    while True:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        step = 0
        while True:
            step += 1
            #env.render()
            action = dqn_solver.act(state)
            state_next, reward, terminal, info = env.step(action)
            reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                print ("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                #score_logger.add_score(step, run)
                break
            dqn_solver.experience_replay()


cartpole()



  super(Adam, self).__init__(name, **kwargs)


