In [1]:
import gym
import random
import numpy as np
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Activation, concatenate

In [2]:
def ou_noise(x, mu, sigma, theta):
    return theta * (mu - x) + sigma * np.random.randn(1)

In [3]:
def actor_network(env):
    actor = Sequential()
    actor.add(Dense(32, input_shape=(env.observation_space.shape[0],)))
    actor.add(Activation('relu'))
    actor.add(Dense(16))
    actor.add(Activation('relu'))
    actor.add(Dense(env.action_space.shape[0]))
    actor.add(Activation('linear'))
    return actor

In [4]:
def critic_network(env):
    action_input = tf.keras.Input((env.action_space.shape[0],))
    state_input = tf.keras.Input((env.observation_space.shape[0],))
    concatenate_input = concatenate([state_input, action_input])
    critic = Dense(32)(concatenate_input)
    critic = Activation('relu')(critic)
    critic = Dense(16)(critic)
    critic = Activation('relu')(critic)
    critic = Dense(1)(critic)
    critic = Activation('linear')(critic)
    critic = tf.keras.Model(inputs=[state_input, action_input], outputs=critic)
    return critic

In [5]:
def actor_loss(critic, state):
    def maximize_Q(action_true, action_pred):
        return - critic([tf.reshape(state, (1, -1)), tf.reshape(action_pred, (1, action_pred.shape[1]))])
    return maximize_Q

In [6]:
def critic_loss():
    def mse(y, Q):
        return tf.reduce_mean(tf.square(y - Q))
    return mse

In [7]:
def target_network_loss(tau, weights, target_weights):
    updated_weights = []
    for i, weight in enumerate(weights):
        updated_weights.append(tau * weight + (1 - tau) * target_weights[i])
    return updated_weights

In [8]:
def set_replay_buffer(mem_limit, env):
    memory_buffer = []
    for _ in range(mem_limit):
        observation = env.reset().reshape(1, -1)
        action = env.action_space.sample().reshape(1, -1)
        next_observation, reward, _, __ = env.step(action)
        memory_buffer.append([observation, action, reward, next_observation.reshape(1, -1)])
    return memory_buffer

In [9]:
mu = 0
N = 100
tau = 0.99
sigma = 0.3
theta = 0.15
mem_limit = 10000
discount_factor = 0.4
env = gym.make("Ant-v2")

In [10]:
actor = actor_network(env)
actor.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 32)                3584      
_________________________________________________________________
activation (Activation)      (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                528       
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 136       
_________________________________________________________________
activation_2 (Activation)    (None, 8)                 0         
Total params: 4,248
Trainable params: 4,248
Non-trainable params: 0
______________________________________________________

In [11]:
critic = critic_network(env)
critic.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 111)]        0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 8)]          0                                            
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 119)          0           input_2[0][0]                    
                                                                 input_1[0][0]                    
__________________________________________________________________________________________________
dense_3 (Dense)                 (None, 32)           3840        concatenate[0][0]            

In [12]:
target_actor = actor_network(env)
target_actor.set_weights(actor.get_weights())
target_actor.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 32)                3584      
_________________________________________________________________
activation_6 (Activation)    (None, 32)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 16)                528       
_________________________________________________________________
activation_7 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 8)                 136       
_________________________________________________________________
activation_8 (Activation)    (None, 8)                 0         
Total params: 4,248
Trainable params: 4,248
Non-trainable params: 0
____________________________________________________

In [13]:
target_critic = critic_network(env)
target_critic.set_weights(critic.get_weights())
target_critic.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 111)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 8)]          0                                            
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 119)          0           input_4[0][0]                    
                                                                 input_3[0][0]                    
__________________________________________________________________________________________________
dense_9 (Dense)                 (None, 32)           3840        concatenate_1[0][0]        

In [14]:
actor.compile(optimizer='adam', loss=actor_loss(critic, env.reset().reshape(-1, 1)))

In [15]:
critic.compile(optimizer='adam', loss=critic_loss())

In [16]:
replay_buffer = set_replay_buffer(mem_limit, env)

In [17]:
for episode in tqdm(range(10)):
    observation, done = env.reset().reshape(1, -1), False
    avg_reward = 0
    count = 0
    noise = np.random.randn(1, env.action_space.shape[0])

    while not done:
        action = actor.predict(observation) + ou_noise(noise, mu, sigma, theta)
        next_observation, reward, done, _ = env.step(action)
        next_observation = next_observation.reshape(1, -1)
        replay_buffer.append([observation, action, reward, next_observation])
        minibatch = random.sample(replay_buffer, (N % len(replay_buffer)))
        y = reward + (discount_factor * target_critic.predict(
            [next_observation, target_actor.predict(next_observation)]))

        # y = []
        # observations = []
        # actions = []
        # for index in range(len(minibatch)):
        #     y.append(minibatch[index][2] + discount_factor * target_critic.predict([minibatch[index][3], target_actor.predict(minibatch[index][3])]).flatten())
        #     observations.append(minibatch[index][0].flatten())
        #     actions.append(minibatch[index][1].flatten())
        #
        # observations = np.array(observations).reshape(N, env.observation_space.shape[0])
        # actions = np.array(actions).reshape(N, env.action_space.shape[0])
        # y = np.array(y).reshape(N, 1)

        critic.fit(x=[observation, action], y=y, verbose=0)
        actor.fit(x=observation, y=action, verbose=0)

        target_critic.set_weights(target_network_loss(tau, critic.weights, target_critic.weights))
        target_actor.set_weights(target_network_loss(tau, actor.weights, target_actor.weights))

        count += 1
        avg_reward += reward
        observation = next_observation

#         print("Step: ", count + 1, "Total Reward per epoch: ", avg_reward)
    print("Average Reward:", avg_reward / count, " Step:", count)

 10%|█         | 1/10 [00:38<05:46, 38.45s/it]

Average Reward: -3.60272475283096  Step: 173


 20%|██        | 2/10 [04:17<19:19, 144.93s/it]

Average Reward: -2.627168649728635  Step: 1000


 30%|███       | 3/10 [07:51<20:35, 176.49s/it]

Average Reward: -1.9677854651851605  Step: 1000


 40%|████      | 4/10 [11:15<18:43, 187.17s/it]

Average Reward: -1.294204787597786  Step: 950


 50%|█████     | 5/10 [14:49<16:24, 196.87s/it]

Average Reward: -1.7045887067176337  Step: 1000


 60%|██████    | 6/10 [18:23<13:30, 202.74s/it]

Average Reward: -0.7855829708490001  Step: 1000


 70%|███████   | 7/10 [21:59<10:20, 206.96s/it]

Average Reward: -0.5709716178153303  Step: 1000


 80%|████████  | 8/10 [25:34<06:59, 209.52s/it]

Average Reward: -0.6451678852228392  Step: 1000


 90%|█████████ | 9/10 [29:09<03:31, 211.26s/it]

Average Reward: -0.9009272506314983  Step: 1000


100%|██████████| 10/10 [32:43<00:00, 196.35s/it]

Average Reward: -0.5746881089840349  Step: 1000





In [19]:
obs = env.reset().reshape(1, -1)
while not done:
    obs, _, __, ___ = env.step(actor.predict(obs))
    obs = obs.reshape(1, -1)
env.close()