<a href="https://colab.research.google.com/github/ereshmittal/CartPole/blob/main/CartPole_TF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!apt-get update
!apt-get install -y xvfb ffmpeg freeglut3-dev
!pip install 'imageio==2.4.0'
!pip install pyvirtualdisplay
!pip install tf-agents[reverb]
!pip install pyglet

In [None]:
from __future__ import absolute_import, division, print_function

import base64
import imageio
import IPython
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import PIL.Image
import pyvirtualdisplay
import reverb

import tensorflow as tf

from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import py_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import sequential
from tf_agents.policies import py_tf_eager_policy
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import reverb_replay_buffer
from tf_agents.replay_buffers import reverb_utils
from tf_agents.trajectories import trajectory
from tf_agents.specs import tensor_spec
from tf_agents.utils import common

In [None]:
# Start a virtual display (visible=0, 1400x900) and keep the handle in `display`.
# NOTE(review): presumably required so rendering works without a physical
# screen (e.g. on Colab) — confirm.
display = pyvirtualdisplay.Display(visible=0, size=(1400, 900)).start()

In [None]:
# Tunable settings for the run. The trailing `# @param` annotations are kept
# verbatim (presumably so Colab can render them as form fields — confirm).
# `learning_rate` is passed to the Adam optimizer; the remaining values are
# presumably consumed by data-collection / training cells not all visible here.
num_iterations = 30000 # @param {type: 'integer'}
initial_collect_steps = 100  # @param {type:"integer"}
collect_steps_per_iteration =   1# @param {type:"integer"}
replay_buffer_max_length = 100000  # @param {type:"integer"}

batch_size = 64  # @param {type:"integer"}
learning_rate = 1e-3  # @param {type:"number"}
log_interval = 200  # @param {type:"integer"}

num_eval_episodes = 10  # @param {type:"integer"}
eval_interval = 1000  # @param {type:"integer"}

In [None]:
# Load the Gym environment identified by `env_name` through the suite_gym
# loader; this instance is used below for spec inspection and a manual step.
env_name = 'CartPole-v0'
env = suite_gym.load(env_name)

In [None]:
# Print the observation entry of the environment's time-step spec.
print(env.time_step_spec().observation)

BoundedArraySpec(shape=(4,), dtype=dtype('float32'), name='observation', minimum=[-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], maximum=[4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38])


In [None]:
# Reward entry of the time-step spec, shown as the cell's result.
env.time_step_spec().reward

ArraySpec(shape=(), dtype=dtype('float32'), name='reward')

In [None]:
# Spec of the actions the environment accepts, shown as the cell's result.
env.action_spec()

BoundedArraySpec(shape=(), dtype=dtype('int64'), name='action', minimum=0, maximum=1)

In [None]:
# Reset the environment and show the first time step it hands back.
time_step = env.reset()
print("Time Spec: ", time_step, sep="\n")

# Take a single step with action 1 and show the resulting time step.
action = np.array(1, dtype=np.int32)
next_time_step = env.step(action)
print("next time step: ", next_time_step, sep="\n")

Time Spec: 
TimeStep(
{'discount': array(1., dtype=float32),
 'observation': array([0.01695262, 0.02168   , 0.02792102, 0.02186547], dtype=float32),
 'reward': array(0., dtype=float32),
 'step_type': array(0, dtype=int32)})
next time step: 
TimeStep(
{'discount': array(1., dtype=float32),
 'observation': array([ 0.01738622,  0.21639064,  0.02835833, -0.26187894], dtype=float32),
 'reward': array(1., dtype=float32),
 'step_type': array(1, dtype=int32)})


In [None]:
# Two independent instances of the same environment: one for training and one
# for evaluation.
train_py_env = suite_gym.load(env_name)
eval_py_env = suite_gym.load(env_name)

In [None]:
# Wrap each Python environment in a TFPyEnvironment; the agent below is built
# from `train_env`'s specs.
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

In [None]:
# Convert the environment's rendered frame into a PIL image for display.
# NOTE(review): assumes env.render() returns an image array — confirm.
PIL.Image.fromarray(env.render())

In [None]:
# Show the action spec converted by tensor_spec.from_spec next to the
# environment's own action spec.
tensor_spec.from_spec(env.action_spec()), env.action_spec()

(BoundedTensorSpec(shape=(), dtype=tf.int64, name='action', minimum=array(0), maximum=array(1)),
 BoundedArraySpec(shape=(), dtype=dtype('int64'), name='action', minimum=0, maximum=1))

In [None]:
# Unit counts of the hidden layers, one entry per layer.
fc_layer_params = (100,50)
action_tensor_spec = tensor_spec.from_spec(env.action_spec())
# Number of discrete actions: the spec bounds are inclusive, hence the +1.
num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum +1  

In [None]:
def dense_layer(num_units):
  """Build a ReLU-activated Dense layer with `num_units` units.

  The kernel is initialised with VarianceScaling (scale=2.0, fan-in mode,
  truncated-normal distribution).

  Args:
    num_units: Number of units in the layer.

  Returns:
    A new `tf.keras.layers.Dense` instance.
  """
  kernel_init = tf.keras.initializers.VarianceScaling(
      scale=2.0, mode='fan_in', distribution='truncated_normal')
  return tf.keras.layers.Dense(
      num_units, activation='relu', kernel_initializer=kernel_init)

# Hidden stack: one layer from dense_layer() per entry in `fc_layer_params`.
dense_layers = list(map(dense_layer, fc_layer_params))

# Output head: `num_actions` units with no activation argument, small uniform
# kernel initialisation and a constant bias of 0.2.
q_values_layer = tf.keras.layers.Dense(
    num_actions,
    kernel_initializer=tf.keras.initializers.RandomUniform(
        minval=-0.03, maxval=0.03),
    bias_initializer=tf.keras.initializers.Constant(0.2),
)

# Chain the hidden layers and the output head into the Q-network.
q_net = sequential.Sequential([*dense_layers, q_values_layer])

In [None]:
# Adam optimizer driven by the configured learning rate, plus a counter
# variable handed to the agent as its train-step counter.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
train_step_counter = tf.Variable(0)

# DQN agent built from the training environment's specs and the Q-network,
# using an element-wise squared loss on the TD errors.
agent = dqn_agent.DqnAgent(
    time_step_spec=train_env.time_step_spec(),
    action_spec=train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter,
)
agent.initialize()

In [None]:
# Keep handles to the agent's two policies: `policy` for evaluation and
# `collect_policy` for data collection.
# NOTE(review): for a DQN agent these are expected to be the greedy and the
# exploratory policy respectively — confirm against the TF-Agents docs.
eval_policy = agent.policy
collect_policy = agent.collect_policy

In [None]:
def compute_avg_return(environment, policy, num_episodes):
  """Average the undiscounted episode return of `policy` over several episodes.

  Each episode starts with `environment.reset()` and is stepped with the
  policy's actions until the environment reports a final time step. The
  reward of every step taken is summed into that episode's return.

  Args:
    environment: Object exposing `reset()` and `step(action)`, both returning
      a time step that provides `is_last()` and `reward`.
    policy: Object whose `action(time_step)` returns a step with an `.action`
      attribute.
    num_episodes: Number of episodes to run; must be at least 1.

  Returns:
    Element 0 of the mean episode return after conversion with `.numpy()`.
  """
  total_return = 0.0

  for _ in range(num_episodes):
    time_step = environment.reset()
    episode_return = 0.0

    while not time_step.is_last():
      action_step = policy.action(time_step)
      time_step = environment.step(action_step.action)
      # Accumulate the reward of the step just taken; without this the return
      # stays the plain float 0.0 and the `.numpy()` call below fails.
      episode_return += time_step.reward
    total_return += episode_return

  avg_return = total_return / num_episodes
  return avg_return.numpy()[0]

In [None]:
# Baseline: average return of the agent's evaluation policy on the evaluation
# environment, over the configured number of evaluation episodes.
compute_avg_return(eval_env, eval_policy, num_eval_episodes)