In [1]:
# pip install matplotlib

In [2]:
import os
import json
import numpy as np
import gymnasium as gym
from matplotlib import pyplot as plt
from PIL import Image
import PIL.ImageDraw as ImageDraw
import imageio
import os
import wandb
import numpy as np
from time import time
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.optimizers import Adam
# from DQN_analysis import DQN
env = gym.make('Pendulum-v1', g=9.81,render_mode="rgb_array")

In [3]:
lr = 0.0009
epsilon = 1.0
 # we want epsilon to be 0.01 after n episodes
gamma = 0.99
training_episodes = 200
epsilon_decay = (0.01 / epsilon) ** (1/150) 
target_update_interval = 5

In [4]:
class ReplayBuffer:
    def __init__(self, max_length, state_size, action_size, is_sarsa=False):
        self.is_sarsa = is_sarsa
        self.memory_counter = 0
        self.max_length = max_length
        self.state_memory = np.zeros((self.max_length, state_size))
        self.new_state_memory = np.zeros((self.max_length, state_size))
        self.action_memory = np.zeros((self.max_length, action_size), dtype=np.int8)
        if is_sarsa:
            self.new_action_memory = np.zeros((self.max_length, action_size), dtype=np.int8)
        self.reward_memory = np.zeros(self.max_length)
        self.done_memory = np.zeros(self.max_length, dtype=np.float32)

    def append(self, state, action, reward, new_state, done, new_action=None):
        idx = self.memory_counter % self.max_length

        self.state_memory[idx] = state
        self.action_memory[idx] = action  # Assuming action is a single float

        if self.is_sarsa:
            self.new_action_memory[idx] = new_action  # Assuming new_action is a single float

        self.new_state_memory[idx] = new_state
        self.reward_memory[idx] = reward
        self.done_memory[idx] = 1 - done
        self.memory_counter += 1

    def sample(self, batch_size):
        max_memory = min(self.memory_counter, self.max_length)
        sampled_batch = np.random.choice(max_memory, batch_size, replace=False)

        states= self.state_memory[sampled_batch]
        actions = self.action_memory[sampled_batch]
        rewards= self.reward_memory[sampled_batch]
        new_states = self.new_state_memory[sampled_batch]
        if self.is_sarsa:
            new_actions = self.new_action_memory[sampled_batch]
        dones = self.done_memory[sampled_batch]

        if not self.is_sarsa:
            return states, actions, rewards, new_states, dones
        else:
            return states, actions, rewards, new_states, new_actions, dones

In [5]:
class MyEpisodeSaver:
    def __init__(self, env, frames, algo, episode_number):
        self.env = env
        self.frames = frames
        self.dir = f'./gifs/{algo}/'
        self.episode_number = episode_number
        self.fname = f'episode_{self.episode_number}.gif'

        if not os.path.exists('./gifs'):
            os.mkdir('./gifs')

        if not os.path.exists(self.dir):
            os.mkdir(self.dir)

        self.labeled_frames = self.label_frames()

    def label_frames(self):
        labeled_frames = []

        for frame in self.frames:
            img = Image.fromarray(frame)
            draw = ImageDraw.Draw(img)
            # draw on each frame
            draw.text((10, 10), f'Episode: {self.episode_number}', fill=(255, 255, 255))
            labeled_frames.append(np.array(img))

        return labeled_frames

    def save(self):
        # labeled_frames = self.label_frames()
        # imageio.mimsave(self.dir + self.fname, labeled_frames, fps=60)
        imageio.mimsave(self.dir + self.fname, self.labeled_frames, fps=60)

In [6]:
import imageio, random
import pandas as pd
from keras.regularizers import l2

class DDQN:
    def __init__(self, env, lr, gamma, epsilon, epsilon_decay, epsilon_min=0.01, batch_size=128, fname='DDQN_model_improvement'):
        self.env = env
        self.action_size = 10
        self.state_size = env.observation_space.shape[0]
        self.action_space = [i for i in range(self.action_size)] 
        self.discrete_actions = np.linspace(-2.0, 2.0, num = self.action_size)

        self.alpha = lr  # learning rate
        self.gamma = gamma
        self.epsilon = epsilon
        self.tau = 0.01
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.batch_size = batch_size
        self.target_update_interval = 100

        self.fname = fname
        self.memory = ReplayBuffer(10000, self.state_size, self.action_size)
        self.dqn_model = self.create_dqn('DQN-Model')
        self.update_counter = 0
        self.dqn_target_model = self.create_dqn('DQN-Target-Model')


    def create_dqn(self, name):
        model = Sequential([
            Dense(units=64, activation='leaky_relu', input_shape=(self.state_size,), kernel_regularizer=l2(0.01)),
            Dense(units=32, activation='leaky_relu', kernel_regularizer=l2(0.01)),
            Dense(units=self.action_size, activation='linear', kernel_regularizer=l2(0.01))
        ], name=name)

        model.compile(loss='mse', optimizer=Adam(learning_rate=self.alpha))
        return model

    def remember(self, state, action, reward, new_state, done):
        self.memory.append(state, action, reward, new_state, done)

    def act(self, state):
        state = np.reshape(state, [1, self.state_size])
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            q_values = self.dqn_model.predict(state, verbose=0)
            return np.argmax(q_values[0])

        # q_values = self.dqn_model.predict(state, verbose=0)[0]

        # # Sample action from a normal distribution centered around the action with the highest Q-value
        # chosen_action = np.random.normal(loc=self.action_space[np.argmax(q_values)], scale=self.epsilon)

        # # Clip the action to be within the action space
        # chosen_action = np.clip(chosen_action, self.action_space.min(), self.action_space.max())

        # return chosen_action

    def update(self):
        if self.memory.memory_counter > self.batch_size:
            state, action, reward, new_state, done = self.memory.sample(self.batch_size)

            q_current = self.dqn_target_model.predict(new_state, verbose=0) 
            q_future = self.dqn_model.predict(new_state, verbose=0)
            q_target = self.dqn_model.predict(state, verbose=0)

            best_actions = np.argmax(q_current, axis=1)

            for i in range(self.batch_size):
                q_target[i, action[i]] = reward[i] + self.gamma * q_future[i, best_actions[i]] * done[i]

            self.dqn_model.fit(x=state, y=q_target, verbose=0)
            self.update_counter += 1

            # soft update with tau
            main_weights = self.dqn_model.get_weights()
            target_weights = self.dqn_target_model.get_weights()

            for i in range(len(target_weights)):
                target_weights[i] = self.tau * main_weights[i] + (1 - self.tau) * target_weights[i]
            
            self.dqn_target_model.set_weights(target_weights)

            # # Soft target update with tau
            # if self.update_counter % self.target_update_interval == 0:
            #     main_weights = self.dqn_model.get_weights()
            #     target_weights = self.dqn_target_model.get_weights()

            #     new_weights = [(1 - self.tau) * target_w + self.tau * main_w for target_w, main_w in zip(target_weights, main_weights)]
            #     self.dqn_target_model.set_weights(new_weights)

    def train(self, n_episodes, max_steps=200, log_wandb=False,
              update=True, save_episodes=True, save_interval=10):
        history = {'reward': [], 'avg_reward_100': [], 'steps': []}

        for episode in range(n_episodes):
            # print(episode)
            start_time = time()
            state, info = self.env.reset()
            state = np.reshape(state, [1, self.state_size])
            done = False
            episode_reward = 0
            episode_steps = 0
            episode_frames  = []
            # self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)

            for _ in range(max_steps):
                action = self.act(state)

                new_state, reward, done, _, _ = self.env.step([self.discrete_actions[action]]) # 5 values returned
                
                theta = new_state[0]
                theta_dt = new_state[1]

                # Modify our reward
                reward = reward + (0.05 * (np.pi - np.abs(theta)))
                # reward = reward - (0.01 * np.abs(theta_dt))

                new_state = np.reshape(new_state, [1, self.state_size])
                episode_frames.append(self.env.render())

                if update:
                    self.remember(state, action, reward, new_state, done)
                    self.update()

                state = new_state
                episode_reward += reward
                episode_steps += 1

                if done:
                    break

            if log_wandb:
                wandb.log({
                    'reward': episode_reward,
                    'steps': episode_steps,
                    'epsilon': self.epsilon
                })

            if save_episodes:
                if (episode + 1) % save_interval == 0 or (episode == 0):
                    s = MyEpisodeSaver(self.env, episode_frames , self.fname, episode + 1)
                    s.save()

            print(f'[EP {episode + 1}/{n_episodes}] - Reward: {episode_reward:.4f} - Steps: {episode_steps} - Eps: {self.epsilon:.4f} - Time: {time() - start_time:.2f}s')
            self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)
            history['reward'].append(episode_reward)
            history['avg_reward_100'].append(np.mean(history['reward'][-100:]))
            history['steps'].append(episode_steps)
            df = pd.DataFrame(history)
            df.to_csv(f'./assets/{self.fname}.csv')

        self.env.close()

        if log_wandb:
            wandb.finish()

        self.save(f'{self.fname}.h5')

        return history

    def save(self, fname):
        if not os.path.exists('./assets'):
            os.mkdir('./assets')

        self.dqn_model.save(f'./assets/{fname}')

    def load(self, fname):
        self.dqn_model = load_model(f'./assets/{fname}')

In [7]:
# pip install gymnasium[classic-control]

In [8]:
model = DDQN(env=env, lr=lr, gamma=gamma, epsilon=epsilon, epsilon_decay=epsilon_decay)
model.train(training_episodes,save_episodes=True, save_interval=20)

[EP 1/200] - Reward: -1364.7476 - Steps: 200 - Eps: 1.0000 - Time: 16.06s
[EP 2/200] - Reward: -1524.8235 - Steps: 200 - Eps: 0.9698 - Time: 33.10s
[EP 3/200] - Reward: -1668.3026 - Steps: 200 - Eps: 0.9404 - Time: 33.15s
[EP 4/200] - Reward: -994.0826 - Steps: 200 - Eps: 0.9120 - Time: 33.22s
[EP 5/200] - Reward: -955.3590 - Steps: 200 - Eps: 0.8844 - Time: 33.69s
[EP 6/200] - Reward: -1258.7390 - Steps: 200 - Eps: 0.8577 - Time: 33.85s
[EP 7/200] - Reward: -901.9402 - Steps: 200 - Eps: 0.8318 - Time: 34.18s
[EP 8/200] - Reward: -1247.6549 - Steps: 200 - Eps: 0.8066 - Time: 34.40s
[EP 9/200] - Reward: -1289.0888 - Steps: 200 - Eps: 0.7822 - Time: 34.85s
[EP 10/200] - Reward: -956.6223 - Steps: 200 - Eps: 0.7586 - Time: 35.19s
[EP 11/200] - Reward: -1162.7771 - Steps: 200 - Eps: 0.7356 - Time: 35.04s
[EP 12/200] - Reward: -988.5713 - Steps: 200 - Eps: 0.7134 - Time: 35.41s
[EP 13/200] - Reward: -991.2593 - Steps: 200 - Eps: 0.6918 - Time: 35.66s
[EP 14/200] - Reward: -1391.1684 - Steps

{'reward': [-1364.7476202295459,
  -1524.8234689937105,
  -1668.3026265506135,
  -994.0826174615685,
  -955.3590457826151,
  -1258.7389856666064,
  -901.940193053861,
  -1247.6549181221058,
  -1289.088783954735,
  -956.6222575547481,
  -1162.7771157281743,
  -988.5713403646057,
  -991.2592778102706,
  -1391.168356505743,
  -1151.6666964610224,
  -1251.41981791936,
  -1295.2701146124516,
  -1292.2987916487596,
  -1185.8989291557982,
  -1188.8007456615883,
  -754.0424333784166,
  -1078.2300773329805,
  -881.3765264780086,
  -867.190618229208,
  -854.9915530148867,
  -955.8387656330233,
  -833.4274053401699,
  -841.6024795621739,
  -744.20110522944,
  -853.7047343065034,
  -623.4803195270993,
  -721.2865424892296,
  -730.8109654951394,
  -731.3506627435964,
  -730.9696240714501,
  -723.8630562864266,
  -727.885292898631,
  -611.0632675368742,
  -663.8710086733788,
  -611.8786916015102,
  -607.8311468986478,
  -612.0386123539918,
  -678.0072006014753,
  -604.5427194557884,
  -716.154047113