In [1]:
import tensorflow as tf
import numpy as np
import gym
from tensorflow.keras.models import load_model
!pip3 install box2d-py

Collecting box2d-py
[?25l  Downloading https://files.pythonhosted.org/packages/06/bd/6cdc3fd994b0649dcf5d9bad85bd9e26172308bbe9a421bfc6fdbf5081a6/box2d_py-2.3.8-cp36-cp36m-manylinux1_x86_64.whl (448kB)
[K     |▊                               | 10kB 26.7MB/s eta 0:00:01[K     |█▌                              | 20kB 5.7MB/s eta 0:00:01[K     |██▏                             | 30kB 8.1MB/s eta 0:00:01[K     |███                             | 40kB 7.5MB/s eta 0:00:01[K     |███▋                            | 51kB 6.7MB/s eta 0:00:01[K     |████▍                           | 61kB 7.6MB/s eta 0:00:01[K     |█████▏                          | 71kB 7.9MB/s eta 0:00:01[K     |█████▉                          | 81kB 8.7MB/s eta 0:00:01[K     |██████▋                         | 92kB 8.9MB/s eta 0:00:01[K     |███████▎                        | 102kB 8.9MB/s eta 0:00:01[K     |████████                        | 112kB 8.9MB/s eta 0:00:01[K     |████████▊                       | 12

In [2]:
env= gym.make("LunarLanderContinuous-v2")
state_low = env.observation_space.low
state_high = env.observation_space.high
action_low = env.action_space.low 
action_high = env.action_space.high
print(state_low)
print(state_high)
print(action_low)
print(action_high)

[-inf -inf -inf -inf -inf -inf -inf -inf]
[inf inf inf inf inf inf inf inf]
[-1. -1.]
[1. 1.]


In [3]:
 len(env.action_space.high)

2

In [4]:
class RBuffer():
    def __init__(self, maxsize, statedim, naction):
    self.cnt = 0
    self.maxsize = maxsize
    self.state_memory = np.zeros((maxsize, *statedim), dtype=np.float32)
    self.action_memory = np.zeros((maxsize, naction), dtype=np.float32)
    self.reward_memory = np.zeros((maxsize,), dtype=np.float32)
    self.next_state_memory = np.zeros((maxsize, *statedim), dtype=np.float32)
    self.done_memory = np.zeros((maxsize,), dtype= np.bool)

    def storexp(self, state, next_state, action, done, reward):
    index = self.cnt % self.maxsize
    self.state_memory[index] = state
    self.action_memory[index] = action
    self.reward_memory[index] = reward
    self.next_state_memory[index] = next_state
    self.done_memory[index] = 1- int(done)
    self.cnt += 1

    def sample(self, batch_size):
    max_mem = min(self.cnt, self.maxsize)
    batch = np.random.choice(max_mem, batch_size, replace= False)  
    states = self.state_memory[batch]
    next_states = self.next_state_memory[batch]
    rewards = self.reward_memory[batch]
    actions = self.action_memory[batch]
    dones = self.done_memory[batch]
    return states, next_states, rewards, actions, dones



In [5]:
class Critic(tf.keras.Model):
    def __init__(self):
    super(Critic, self).__init__()
    self.f1 = tf.keras.layers.Dense(512, activation='relu')
    self.f2 = tf.keras.layers.Dense(512, activation='relu')
    self.v =  tf.keras.layers.Dense(1, activation=None)

    def call(self, inputstate, action):
    x = self.f1(tf.concat([inputstate, action], axis=1))
    x = self.f2(x)
    x = self.v(x)
    return x


class Actor(tf.keras.Model):
    def __init__(self, no_action):
    super(Actor, self).__init__()    
    self.f1 = tf.keras.layers.Dense(512, activation='relu')
    self.f2 = tf.keras.layers.Dense(512, activation='relu')
    self.mu =  tf.keras.layers.Dense(no_action, activation='tanh')

    def call(self, state):
    x = self.f1(state)
    x = self.f2(x)
    x = self.mu(x)  
    return x

 

In [6]:
class Agent():
    def __init__(self, n_action= len(env.action_space.high)):
    self.actor_main = Actor(n_action)
    self.actor_target = Actor(n_action)
    self.critic_main = Critic()
    self.critic_target = Critic()
    self.batch_size = 64
    self.n_actions = len(env.action_space.high)
    self.a_opt = tf.keras.optimizers.Adam(1e-4)
    # self.actor_target = tf.keras.optimizers.Adam(.001)
    self.c_opt = tf.keras.optimizers.Adam(1e-4)
    # self.critic_target = tf.keras.optimizers.Adam(.002)
    self.memory = RBuffer(1_00_000, env.observation_space.shape, len(env.action_space.high))
    self.trainstep = 0
    self.replace = 5
    self.gamma = 0.99
    self.min_action = env.action_space.low[0]
    self.max_action = env.action_space.high[0]

    def act(self, state, evaluate=False):
        state = tf.convert_to_tensor([state], dtype=tf.float32)
        actions = self.actor_main(state)
        if not evaluate:
            actions += tf.random.normal(shape=[self.n_actions], mean=0.0, stddev=0.1)

            actions = self.max_action * (tf.clip_by_value(actions, self.min_action, self.max_action))
      #print(actions)
      return actions[0]


    def savexp(self,state, next_state, action, done, reward):
        self.memory.storexp(state, next_state, action, done, reward)

    def update_target(self):
        self.actor_target.set_weights(self.actor_main.get_weights())
        self.critic_target.set_weights(self.critic_main.get_weights())

  
  def train(self):
        if self.memory.cnt < self.batch_size:
            return 


        states, next_states, rewards, actions, dones = self.memory.sample(self.batch_size)
  
        states = tf.convert_to_tensor(states, dtype= tf.float32)
        next_states = tf.convert_to_tensor(next_states, dtype= tf.float32)
        rewards = tf.convert_to_tensor(rewards, dtype= tf.float32)
        actions = tf.convert_to_tensor(actions, dtype= tf.float32)
      #dones = tf.convert_to_tensor(dones, dtype= tf.bool)

      with tf.GradientTape() as tape1, tf.GradientTape() as tape2:
            
            target_actions = self.actor_target(next_states)
            target_next_state_values = tf.squeeze(self.critic_target(next_states, target_actions), 1)
            critic_value = tf.squeeze(self.critic_main(states, actions), 1)
            target_values = rewards + self.gamma * target_next_state_values * dones
            critic_loss = tf.keras.losses.MSE(target_values, critic_value)

            new_policy_actions = self.actor_main(states)
            actor_loss = -self.critic_main(states, new_policy_actions)
            actor_loss = tf.math.reduce_mean(actor_loss)

            grads1 = tape1.gradient(actor_loss, self.actor_main.trainable_variables)
            grads2 = tape2.gradient(critic_loss, self.critic_main.trainable_variables)
            self.a_opt.apply_gradients(zip(grads1, self.actor_main.trainable_variables))
            self.c_opt.apply_gradients(zip(grads2, self.critic_main.trainable_variables))

        if self.trainstep % self.replace == 0:
               self.update_target()
           
               self.trainstep +=1

# At 722 steps we get a total reward of 283.80202996342393 and avg reward of 226.905081999605

In [None]:
agent = Agent(2)
tf.random.set_seed(336699)

episods = 2000
ep_reward = []
total_avgr = []
target = False

for s in range(episods):
    if target == True:
    break
    total_reward = 0 
    state = env.reset()
    done = False

    while not done:
    action = agent.act(state)
    next_state, reward, done, _ = env.step(action)
    agent.savexp(state, next_state, action, done, reward)
    agent.train()
    state = next_state
    total_reward += reward
    if done:
        ep_reward.append(total_reward)
        avg_reward = np.mean(ep_reward[-100:])
        total_avgr.append(avg_reward)
        print("total reward after {} steps is {} and avg reward is {}".format(s, total_reward, avg_reward))
        if int(avg_reward) == 200:
            target = True





total reward after 0 steps is -302.3123078236962 and avg reward is -302.3123078236962
total reward after 1 steps is -68.06222041058956 and avg reward is -185.1872641171429
total reward after 2 steps is -123.22902439223049 and avg reward is -164.5345175421721
total reward after 3 steps is -339.0951266678694 and avg reward is -208.17466982359642
total reward after 4 steps is -147.729761712367 and avg reward is -196.08568820135054
total reward after 5 steps is -147.52981650218874 and avg reward is -187.9930429181569
total reward after 6 steps is -161.2887440736686 and avg reward is -184.17814308323
total reward after 7 steps is -34.60345838283136 and avg reward is -165.48130749568017
total reward after 8 steps is -105.17495478960271 and avg reward is -158.78060163944934
total reward after 9 steps is -323.0676013782203 and avg reward is -175.20930161332643
total reward after 10 steps is -71.28975136949644 and avg reward is -165.76206977297826
total reward after 11 steps is -30.715540957463