In [1]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
import matplotlib.pyplot as plt
import random

In [10]:
# Ask TensorFlow for the name of the first GPU it can use. The notebook is
# meant to run on an accelerator, so stop right away when none is reported.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [11]:
# Make every physical GPU that TensorFlow can enumerate visible to the runtime.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_visible_devices(gpu_devices, 'GPU')

In [12]:
import random

class ReplayBuffer:
    """Fixed-capacity ring buffer of experience transitions.

    Each stored item is a ``(state, action, reward, next_state, done)`` tuple.
    Once ``capacity`` items have been stored, every new item overwrites the
    oldest one.
    """

    def __init__(self, capacity):
        """Create an empty buffer.

        Args:
            capacity: Maximum number of transitions kept; must be positive.

        Raises:
            ValueError: If ``capacity`` is zero or negative.
        """
        # A non-positive capacity would only fail later, inside store(), with
        # a ZeroDivisionError/IndexError; reject it up front instead.
        if capacity <= 0:
            raise ValueError(f"capacity must be positive, got {capacity!r}")
        self.buffer = []
        self.capacity = capacity
        self.position = 0  # index of the slot the next store() writes to

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def store(self, state, action, reward, next_state, done):
        """Insert one transition, overwriting the oldest entry when full."""
        data = (state, action, reward, next_state, done)
        if len(self.buffer) < self.capacity:
            self.buffer.append(data)
        else:
            self.buffer[self.position] = data
        # Advance the write cursor and wrap around at capacity.
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Draw a random batch of distinct transitions.

        When fewer than ``batch_size`` transitions are stored, all of them are
        returned (in random order).

        Args:
            batch_size: Requested number of transitions; must be positive.

        Returns:
            Tuple ``(state, action, reward, next_state, done)`` where each
            element is the ``np.stack`` of that field over the sampled
            transitions, so the first axis indexes the batch.

        Raises:
            ValueError: If the buffer is empty or ``batch_size`` is not
                positive.
        """
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size!r}")
        if not self.buffer:
            raise ValueError("cannot sample from an empty replay buffer")

        batch = random.sample(self.buffer, min(batch_size, len(self.buffer)))

        # zip(*batch) regroups the transitions field by field.
        state, action, reward, next_state, done = map(np.stack, zip(*batch))
        return state, action, reward, next_state, done

class DDPGAgent:
    """Deep Deterministic Policy Gradient (DDPG) agent for continuous actions.

    Holds an actor (state -> action), a critic ((state, action) -> Q-value),
    a slowly-tracking target copy of each, one Adam optimizer per network and
    a replay buffer of past transitions.
    """

    def __init__(self, state_size, action_size, gamma=0.99, tau=0.005, buffer_size=100000, batch_size=128,
                 action_bound=2.0, noise_std=0.1, actor_lr=0.001, critic_lr=0.001):
        """Build the networks, optimizers and replay buffer.

        Args:
            state_size: Length of the flat observation vector.
            action_size: Length of the action vector.
            gamma: Discount factor used in the critic's bootstrap target.
            tau: Soft-update rate for the target networks.
            buffer_size: Capacity of the replay buffer.
            batch_size: Default number of transitions per ``train()`` step.
            action_bound: Actions are kept inside ``[-action_bound,
                action_bound]``. The default 2.0 is the limit this class
                previously hard-coded in its clip.
            noise_std: Standard deviation of the Gaussian exploration noise.
            actor_lr: Learning rate of the actor's Adam optimizer.
            critic_lr: Learning rate of the critic's Adam optimizer.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        # Must be set before build_actor() runs, which reads action_bound.
        self.action_bound = action_bound
        self.noise_std = noise_std

        # Define actor and critic networks
        self.actor = self.build_actor()
        self.critic = self.build_critic()
        self.target_actor = self.build_actor()
        self.target_critic = self.build_critic()

        # Initialize target networks: tau=1.0 makes them exact copies.
        self.update_target_networks(1.0)

        # Define optimizers
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)
        self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=critic_lr)

        # Define replay buffer
        self.replay_buffer = ReplayBuffer(buffer_size)

    @staticmethod
    def _blend_weights(online_weights, target_weights, tau):
        """Return ``tau * online + (1 - tau) * target`` for each weight array."""
        return [tau * online + (1 - tau) * target
                for online, target in zip(online_weights, target_weights)]

    @staticmethod
    def _as_column(values):
        """Return ``values`` as a float32 array of shape ``(n, 1)``."""
        return np.asarray(values, dtype=np.float32).reshape(-1, 1)

    def build_actor(self):
        """Create the policy network: two 64-unit ReLU layers and a tanh head.

        The tanh output is multiplied by ``action_bound`` so the policy can
        reach the whole ``[-action_bound, action_bound]`` range instead of
        being confined to [-1, 1].
        """
        inputs = layers.Input(shape=(self.state_size,))
        x = layers.Dense(64, activation='relu')(inputs)
        x = layers.Dense(64, activation='relu')(x)
        outputs = layers.Dense(self.action_size, activation='tanh')(x)
        outputs = outputs * self.action_bound
        model = models.Model(inputs, outputs)
        return model

    def build_critic(self):
        """Create the Q-network.

        State and action inputs are concatenated, passed through two 64-unit
        ReLU layers and reduced to a single linear output.
        """
        state_inputs = layers.Input(shape=(self.state_size,))
        action_inputs = layers.Input(shape=(self.action_size,))
        x = layers.Concatenate()([state_inputs, action_inputs])
        x = layers.Dense(64, activation='relu')(x)
        x = layers.Dense(64, activation='relu')(x)
        outputs = layers.Dense(1)(x)
        model = models.Model([state_inputs, action_inputs], outputs)
        return model

    def get_action(self, state, explore=True):
        """Return the actor's action for a single state.

        Args:
            state: One observation with ``state_size`` elements.
            explore: When True (the default), add Gaussian noise with standard
                deviation ``noise_std`` to the actor output.

        Returns:
            float32 array of shape ``(action_size,)`` clipped to
            ``[-action_bound, action_bound]``.
        """
        state = np.reshape(state, [1, self.state_size]).astype(np.float32)
        action = np.asarray(self.actor(state))[0]
        if explore:
            # Add exploration noise
            action = action + np.random.normal(0, self.noise_std, self.action_size)
        return np.clip(action, -self.action_bound, self.action_bound).astype(np.float32)

    def update_target_networks(self, tau):
        """Move both target networks toward the online networks.

        Every target weight becomes ``tau * online + (1 - tau) * target``, so
        ``tau=1.0`` copies the online weights outright.
        """
        self.target_actor.set_weights(
            self._blend_weights(self.actor.get_weights(), self.target_actor.get_weights(), tau))
        self.target_critic.set_weights(
            self._blend_weights(self.critic.get_weights(), self.target_critic.get_weights(), tau))

    def train(self, batch_size=None):
        """Run one critic step, one actor step and one soft target update.

        Args:
            batch_size: Number of transitions to sample. Defaults to the
                ``batch_size`` given to the constructor.

        Does nothing when the replay buffer is still empty.
        """
        if batch_size is None:
            batch_size = self.batch_size
        if len(self.replay_buffer.buffer) == 0:
            return

        state, action, reward, next_state, done = self.replay_buffer.sample(batch_size)
        state = np.asarray(state, dtype=np.float32).reshape(-1, self.state_size)
        action = np.asarray(action, dtype=np.float32).reshape(-1, self.action_size)
        next_state = np.asarray(next_state, dtype=np.float32).reshape(-1, self.state_size)
        # The buffer returns reward/done with shape (batch,), while the critic
        # outputs (batch, 1). Without the column reshape the product below
        # broadcasts to (batch, batch) and the critic fits a meaningless target.
        reward = self._as_column(reward)
        not_done = 1.0 - self._as_column(done)

        # Update critic: one-step bootstrap target from the target networks,
        # computed outside the tape so no gradient flows through it.
        target_actions = self.target_actor(next_state)
        target_values = self.target_critic([next_state, target_actions])
        targets = self.gamma * target_values * not_done + reward

        with tf.GradientTape() as tape:
            values = self.critic([state, action])
            critic_loss = tf.reduce_mean(tf.square(targets - values))

        critic_grad = tape.gradient(critic_loss, self.critic.trainable_variables)
        self.critic_optimizer.apply_gradients(zip(critic_grad, self.critic.trainable_variables))

        # Update actor: maximise the critic's value of the actor's own actions.
        with tf.GradientTape() as tape:
            actions = self.actor(state)
            values = self.critic([state, actions])
            actor_loss = -tf.reduce_mean(values)

        actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(zip(actor_grad, self.actor.trainable_variables))

        # Update target networks
        self.update_target_networks(self.tau)

In [None]:
# Create the environment and agent
env = gym.make('Pendulum-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]
agent = DDPGAgent(state_size, action_size)

# Training loop
max_episodes = 1000
log_every = 1  # print a progress line every `log_every` episodes
scores = []

for episode in range(max_episodes):
    # Newer gym releases return (observation, info) from reset(); older ones
    # return the observation alone. Accept both.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    episode_reward = 0
    counter_each_episode = 0
    while True:
        counter_each_episode += 1
        action = agent.get_action(state)

        step_result = env.step(action)
        if len(step_result) == 5:
            # Newer API: termination and time-limit truncation are separate.
            next_state, reward, terminated, truncated, _ = step_result
        else:
            # Older API: a single `done` flag; the TimeLimit wrapper marks
            # truncation in the info dict.
            next_state, reward, done, info = step_result
            truncated = bool(info.get('TimeLimit.truncated', False))
            terminated = bool(done) and not truncated

        # Store only true termination as `done`. Hitting the step limit is not
        # a terminal state, so the critic should still bootstrap from
        # next_state on the last step of an episode.
        agent.replay_buffer.store(state, action, reward, next_state, terminated)
        agent.train()
        state = next_state
        episode_reward += reward

        if terminated or truncated:
            scores.append(episode_reward)
            break
    if episode % log_every == 0:
        print(f"Episode {episode}: Reward in episode = {episode_reward}, Reward/transition: {episode_reward/counter_each_episode}")

# Plot the learning curve
plt.plot(scores)
plt.title('Learning Curve')
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.show()

Episode 0: Reward in episode = -1064.9816823463, Reward/transition: -5.3249084117315
Episode 1: Reward in episode = -1454.0166178124841, Reward/transition: -7.270083089062421
Episode 2: Reward in episode = -1161.7592051311788, Reward/transition: -5.8087960256558935
Episode 3: Reward in episode = -1208.4632637842267, Reward/transition: -6.0423163189211335
Episode 4: Reward in episode = -1190.9998840529593, Reward/transition: -5.954999420264796
Episode 5: Reward in episode = -1564.2688467377066, Reward/transition: -7.821344233688532
Episode 6: Reward in episode = -1057.106397368276, Reward/transition: -5.28553198684138
Episode 7: Reward in episode = -1701.1572701239008, Reward/transition: -8.505786350619504
Episode 8: Reward in episode = -1683.3885025154689, Reward/transition: -8.416942512577345
Episode 9: Reward in episode = -1251.3685415017535, Reward/transition: -6.256842707508768
Episode 10: Reward in episode = -1243.4867568228892, Reward/transition: -6.217433784114446
Episode 11: Re