<a href="https://colab.research.google.com/github/dane-meister/Machine-Learning-Algos/blob/main/dqn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DQN

The goal is to implement the DQN (Deep Q-Network) algorithm.

- We will be using 4 Gymnasium environments: 2 state-based (CartPole-v1 and MountainCar-v0) and 2 image-based (ALE/Breakout-v5 and ALE/Boxing-v5).
- We will use an epsilon-greedy behavior policy with epsilon decay for off-policy exploration.
- We will use an MLP to implement the Q-network for state-based environments, and a CNN for image-based environments.
- We will need the **GPU runtime** of Colab. Make sure you are on it. Please be patient when training on image-based environments.

In [None]:
!pip install gymnasium[classic-control,atari,accept-rom-license]
!pip install opencv-python

In [None]:
import imageio
from IPython.display import Image, display
from io import BytesIO

# Code for visualizing the episode

class GIFMaker:
    """Collect rendered frames and show them inline as an animated GIF."""

    def __init__(self):
        # Delegate to reset() so a new instance and a reset one look the same.
        self.reset()

    def __len__(self):
        """Return how many frames have been collected so far."""
        return len(self.images)

    def reset(self):
        """Drop every collected frame and start again with a new byte buffer."""
        self.buffer = BytesIO()
        self.images = []

    def append(self, img):
        """Store one more frame at the end of the sequence."""
        self.images.append(img)

    def display(self):
        """Encode the frames as a GIF, show it in the notebook and return it."""
        imageio.mimsave(self.buffer, self.images, format='gif')
        encoded = self.buffer.getvalue()
        gif = Image(data=encoded)
        display(gif)
        return gif

## Env1 CartPole-v1
Detailed information of this environment:
https://gymnasium.farama.org/environments/classic_control/cart_pole/

In [None]:
import gymnasium as gym
# Demo: play CartPole-v1 with uniformly random actions for at most 500 steps,
# collecting the frame returned by env.render() after every step.
env = gym.make("CartPole-v1", render_mode="rgb_array")
g = GIFMaker() # visualization
observation, info = env.reset(seed=42)
for i in range(500):
    action = env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action)
    # print(i, action, observation, reward)
    g.append(env.render()) # save one frame
    if terminated or truncated:
        # The episode is over: reset the environment and stop recording.
        observation, info = env.reset()
        break
g.display() # show GIF animation
env.close()

## Env2 MountainCar-v0
Detailed information of this environment: https://gymnasium.farama.org/environments/classic_control/mountain_car/

In [None]:
# Demo: play MountainCar-v0 with uniformly random actions for at most 200 steps,
# collecting the frame returned by env.render() after every step.
env = gym.make("MountainCar-v0", render_mode="rgb_array")

observation, info = env.reset(seed=42)
g = GIFMaker()
for i in range(200):
    action = env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action)
    # print(i, action, observation, reward)
    g.append(env.render())
    if terminated or truncated:
        # The episode is over: reset the environment and stop recording.
        observation, info = env.reset()
        break
g.display()
env.close()

## Env3 Breakout, an Atari game
Detailed information of this environment: https://gymnasium.farama.org/environments/atari/breakout/

In [None]:
# Demo: play ALE/Breakout-v5 with uniformly random actions for at most 200 steps,
# collecting the frame returned by env.render() after every step.
env = gym.make("ALE/Breakout-v5", render_mode="rgb_array")

observation, info = env.reset(seed=42)
g = GIFMaker()
for i in range(200):
    action = env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action)
    # print(i, action, observation, reward)
    g.append(env.render())
    if terminated or truncated:
        # The episode is over: reset the environment and stop recording.
        observation, info = env.reset()
        break
g.display()
env.close()

## Env4 Boxing, an Atari game
Detailed information of this environment: https://gymnasium.farama.org/environments/atari/boxing/

In [None]:
# Demo: play ALE/Boxing-v5 with uniformly random actions for at most 200 steps,
# collecting the frame returned by env.render() after every step.
env = gym.make("ALE/Boxing-v5", render_mode="rgb_array")

observation, info = env.reset(seed=42)
g = GIFMaker()
for i in range(200):
    action = env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action)
    # print(i, action, observation, reward)
    g.append(env.render())
    if terminated or truncated:
        # The episode is over: reset the environment and stop recording.
        observation, info = env.reset()
        break
g.display()
env.close()

## Import the packages we need






In [None]:
import os
import gc
import math
import random

from tqdm import tqdm
import matplotlib.pyplot as plt
from collections import deque

import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import cv2

## Replay Buffer
Implement a replay buffer with capacity n. (Different tasks/environments may use buffers of different sizes.)

In [None]:
class ReplayBuffer(object):
    """Fixed-capacity FIFO store of samples with uniform random sampling."""

    def __init__(self, n):
        # A bounded deque drops its oldest entry once n items are held.
        self.buffer = deque(maxlen=n)

    def __len__(self):
        """Return the number of samples currently stored."""
        return len(self.buffer)

    def add(self, x):
        """Append sample x as the newest entry."""
        self.buffer.append(x)

    def sample(self, batch_size):
        """Draw batch_size stored samples without replacement.

        Returns an empty list when fewer than batch_size samples are stored.
        """
        enough = batch_size <= len(self.buffer)
        return random.sample(self.buffer, batch_size) if enough else []

## DQN for numerical states

In [None]:
# MLP for Q function network
class QNetwork(nn.Module):
    """Three-layer MLP that maps a state vector to one Q value per action."""

    def __init__(
        self,
        state_dim,
        action_dim
    ):
        super().__init__()
        hidden_sizes = (128, 64)
        # Two hidden layers followed by a linear head with action_dim outputs,
        # i.e. a Q value for every action.
        self.fc1 = nn.Linear(state_dim, hidden_sizes[0])
        self.fc2 = nn.Linear(hidden_sizes[0], hidden_sizes[1])
        self.fc3 = nn.Linear(hidden_sizes[1], action_dim)

    def forward(self, x):
        """Return the Q values for input x (ReLU after each hidden layer)."""
        for hidden in (self.fc1, self.fc2):
            x = torch.relu(hidden(x))
        # The output layer is left without an activation.
        return self.fc3(x)

# DQN
class DQN():
    """DQN agent for environments whose observations are 1-D state vectors.

    The agent keeps a policy (online) Q-network and a target Q-network, fills
    a replay buffer while acting epsilon-greedily, and periodically fits the
    policy network to one-step TD targets computed with the target network.
    """

    def __init__(
        self,
        env,
        env_name,
        eps=0.1,
        eps_decay_steps=20000,
        batch_size=64,
        train_freq=4,
        target_network_update_freq=200,
        train_start=2000,
        gamma=0.95,
    ):
        """Build the networks, optimizer and replay buffer.

        Args:
            env: environment exposing reset/step/render, observation_space and
                a discrete action_space.
            env_name: label used in the title of the evaluation plot.
            eps: final exploration rate once the decay has finished.
            eps_decay_steps: number of transitions over which the exploration
                rate decays linearly from 1 to eps.
            batch_size: number of transitions sampled per optimization step.
            train_freq: optimize once every train_freq transitions.
            target_network_update_freq: copy the policy weights into the
                target network when the transition count is a multiple of this.
            train_start: number of stored transitions required before any
                optimization happens; actions are random until the transition
                count reaches it.
            gamma: discount factor.
        """
        # initialize env and env name
        self.env = env
        self.env_name = env_name

        # set state dimension and number of actions
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        # create two networks: 1) Policy (online) network, 2) Target network
        self.policy_network = QNetwork(self.state_dim, self.action_dim)
        self.target_network = QNetwork(self.state_dim, self.action_dim)

        # make target network parameters same as policy network
        self.target_network.load_state_dict(self.policy_network.state_dict())

        # assign gpu if available
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # move the networks to device/gpu
        self.policy_network = self.policy_network.to(self.device)
        self.target_network = self.target_network.to(self.device)

        # Create Replay Buffer with size 50000
        self.rbuff = ReplayBuffer(50000)

        # (end) epsilon value for the epsilon-greedy policy
        self.eps = eps
        # how many steps epsilon decays from 1 to eps
        self.eps_decay_steps = eps_decay_steps

        # set optimizer and loss function
        self.optimizer = optim.Adam(self.policy_network.parameters())
        self.loss_function = nn.MSELoss()

        # batch size for sampling data from replay buffer
        self.batch_size = batch_size

        # mode is used to switch between optimization of DQN and evaluation
        self.mode = "eval"
        self.total_transitions = 0
        self.train_freq = train_freq
        self.target_network_update_freq = target_network_update_freq
        self.train_start = train_start

        self.gamma = gamma
        self.g = None # for visualization

    def _current_epsilon(self):
        """Return the linearly decayed exploration rate for the current step."""
        # Using >= also covers eps_decay_steps == 0 without dividing by zero;
        # at the boundary the linear formula evaluates to self.eps anyway.
        if self.total_transitions >= self.eps_decay_steps:
            return self.eps
        progress = self.total_transitions / self.eps_decay_steps
        return 1 - progress * (1 - self.eps)

    def _random_action(self, state):
        """Return a uniformly random action tensor, one action per state row."""
        return torch.randint(
            0, self.action_dim, (state.size(0),), device=self.device, dtype=torch.long
        ).unsqueeze(0)

    def _greedy_action(self, state):
        """Return the action tensor that maximizes the policy network's Q values."""
        q_values = self.policy_network(state)
        return torch.argmax(q_values, dim=1).unsqueeze(0)

    def act(self, state):
        """Choose an action for a batched state tensor.

        In "eval" mode the action is greedy. In "train" mode it is random
        until train_start transitions have been seen and epsilon-greedy
        afterwards. Any other mode yields None.
        """
        with torch.no_grad():
            if self.mode == "eval":
                return self._greedy_action(state)
            if self.mode == "train":
                if self.total_transitions < self.train_start:
                    # before actually learning, do random actions
                    return self._random_action(state)
                if np.random.rand() < self._current_epsilon():
                    return self._random_action(state)
                return self._greedy_action(state)
        return None

    # optimize the DQN
    def optimization(self):
        """Run one gradient step on a sampled batch.

        Returns:
            The scalar loss, or None when the buffer cannot supply a batch.
        """
        batch = self.rbuff.sample(self.batch_size)
        if not batch:
            # The buffer returns [] when it holds fewer than batch_size items.
            return None

        # make sure the policy network is in train mode
        self.policy_network.train()

        state, action, next_state, reward, not_done = zip(*batch)
        state = torch.cat(state, dim=0).to(self.device)
        next_state = torch.cat(next_state, dim=0).to(self.device)
        # Force every per-sample scalar into a (B, 1) column. Mixing (B,) and
        # (B, 1) tensors would silently broadcast the TD target to (B, B).
        action = torch.cat(action, dim=0).to(self.device).reshape(-1, 1)
        reward = torch.cat(reward, dim=0).to(self.device).reshape(-1, 1)
        not_done = torch.cat(not_done, dim=0).to(self.device).reshape(-1, 1)

        # Q(s, a) of the actions that were actually taken
        q_values = self.policy_network(state).gather(1, action)
        with torch.no_grad():
            next_q_values = self.target_network(next_state)
            max_next_q_values = next_q_values.max(dim=1, keepdim=True).values
            # not_done zeroes the bootstrap term for terminal transitions
            target_q_values = reward + self.gamma * not_done * max_next_q_values
        loss = self.loss_function(q_values, target_q_values)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # This check only runs on optimization steps, so the target network is
        # synced when the transition count is a multiple of both frequencies.
        if self.total_transitions % self.target_network_update_freq == 0:
            self.target_network.load_state_dict(self.policy_network.state_dict())

        return loss.item()

    # run for one episode
    def run_one_episode(self):
        """Play one episode and return its cumulative reward.

        Frames are recorded into self.g. In "train" mode every transition is
        stored in the replay buffer and the network is optimized every
        train_freq transitions once train_start transitions are stored.
        """
        self.g = GIFMaker()
        r = 0

        # reset the environment
        state, info = self.env.reset()
        # Render from the agent's own environment, not a notebook global.
        self.g.append(self.env.render())

        done = False
        while not done:
            # state as a batch of one, on the compute device
            state_tensor = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0)
            action_tensor = self.act(state_tensor)
            # the environment expects a plain number, hence .item()
            next_state, reward, terminated, truncated, info = self.env.step(action_tensor.item())
            done = terminated or truncated

            r += reward
            self.g.append(self.env.render())

            if self.mode == "train":
                self.total_transitions += 1

                next_state_tensor = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0)
                reward_tensor = torch.tensor([[float(reward)]], dtype=torch.float32)
                # 1 for not done, 0 for done. Only true termination stops
                # bootstrapping; a time-limit truncation does not.
                not_done = torch.tensor([[0.0 if terminated else 1.0]], dtype=torch.float32)

                self.rbuff.add([
                    state_tensor.cpu(),
                    action_tensor.cpu(),
                    next_state_tensor,
                    reward_tensor,
                    not_done,
                ])

                # Before train_start samples are stored we only gather data.
                # No `continue` here: the state must still advance below.
                if len(self.rbuff) >= self.train_start and self.total_transitions % self.train_freq == 0:
                    self.optimization()

            state = next_state

        return r

    def train(self):
        """Run one training episode and return its cumulative reward."""
        self.mode = "train"
        return self.run_one_episode()

    # Evaluation of policy
    def eval(self, n):
        """Return the mean cumulative reward of n greedy episodes."""
        self.mode = "eval" # put in eval mode
        returns = [self.run_one_episode() for _ in range(n)]
        return np.mean(returns)

    # the function called to perform optimization and evaluation
    def execute(self, total_ep=5000, eval_freq=100, eval_ep=100):
        """Train for total_ep episodes, evaluating and plotting along the way.

        Every eval_freq training episodes the policy is evaluated over eval_ep
        episodes; the evaluation rewards are plotted at the end.
        """
        rewards = []   # evaluation rewards
        episodes = []  # training-episode index at each evaluation

        prog_bar = tqdm(range(0, total_ep))
        for i in prog_bar:
            self.train()
            if (i+1) % eval_freq == 0:
                reward = self.eval(eval_ep)
                print (f"Eval Reward: {reward:.2f}")
                rewards.append(reward)
                episodes.append(i)

        plt.plot(episodes, rewards)   # plot evaluation reward vs episodes
        plt.xlabel('episodes')
        plt.ylabel('Rewards')
        plt.title('DQN on '+self.env_name)
        plt.show()

    def visualize(self):
        """Display the frames recorded during the most recent episode."""
        self.g.display()

In [None]:
# Cart Pole DQN: train for 500 episodes, evaluating over 100 episodes after
# every 100 training episodes.
gc.collect() # free some unused RAM, don't worry about this
env = gym.make("CartPole-v1", render_mode="rgb_array")
dqn_cartpole = DQN(env, "cartpole", eps=0.01, batch_size=64)
dqn_cartpole.execute(total_ep=500, eval_freq=100, eval_ep=100) # total_ep=5000 will take ~25 mins on GPU

In [None]:
# Mean cumulative reward of the trained CartPole agent over 100 evaluation episodes.
cartpole_eval = dqn_cartpole.eval(100)
print(f"CartPole Final Evaluation: {cartpole_eval}")

In [None]:
# Mountain Car DQN: train for 200 episodes, evaluating over 20 episodes after
# every 100 training episodes.
gc.collect() # free some unused RAM, don't worry about this
env = gym.make("MountainCar-v0", render_mode="rgb_array")
dqn_mountaincar = DQN(env, "mountaincar", eps=0.01, batch_size=64)
dqn_mountaincar.execute(total_ep=200, eval_freq=100, eval_ep=20) # 2000 episodes take ~34 mins

In [None]:
# Mean cumulative reward of the trained MountainCar agent over 20 evaluation episodes.
mountaincar_eval = dqn_mountaincar.eval(20)
print(f"MountainCar Final Evaluation: {mountaincar_eval}")

MountainCar Final Evaluation: -200.0


# DQN for image states (observations)

In [None]:
class QNetwork_Image(nn.Module):
    """Small CNN that maps an 84x84 image batch to one Q value per action.

    Args:
        num_actions: number of outputs (one Q value per action).
        num_channels: number of channels of the input images.
    """

    def __init__(self, num_actions, num_channels):
        super(QNetwork_Image, self).__init__()

        # Honor num_channels instead of hard-coding a single input channel.
        self.conv1 = nn.Conv2d(num_channels, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)

        # 5184 = 16 * 18 * 18: an 84x84 input becomes 80 -> 40 after
        # conv1 + pool and 36 -> 18 after conv2 + pool, with 16 channels.
        self.fc1 = nn.Linear(5184, 128)
        self.fc2 = nn.Linear(128, 84)
        self.fc3 = nn.Linear(84, num_actions)

    def forward(self, x):
        """Return Q values of shape (batch, num_actions) for image batch x."""
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # flatten everything but the batch dimension
        x = torch.reshape(x, (x.size(0), -1))

        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # no activation on the output layer
        x = self.fc3(x)
        return x

def resize_and_normalize(state, image_size=(84, 84)):
    """Resize an image, scale it by 1/255 and return it channel-first.

    Args:
        state: image array, either 2-D (grayscale) or 3-D with the channel
            axis last. Presumably uint8 pixel values in [0, 255]; verify
            against the environment.
        image_size: target size passed to cv2.resize as dsize, i.e.
            (width, height).

    Returns:
        A float32 array of shape (1, height, width) for 2-D input, or
        (channels, height, width) for 3-D input.
    """
    resized_img = cv2.resize(state, image_size)
    normalized_img = resized_img.astype(np.float32) / 255.0
    if normalized_img.ndim == 2:  # Grayscale
        # Add the channel axis without reshaping: cv2 returns (height, width),
        # so reshaping to (1, image_size[0], image_size[1]) would scramble
        # the pixels whenever the target size is not square.
        return normalized_img[np.newaxis, :, :]
    return np.transpose(normalized_img, (2, 0, 1))

class DQN_Image():
    """DQN agent for environments whose observations are images.

    Observations are preprocessed with resize_and_normalize and fed to a
    convolutional policy (online) Q-network; a target Q-network provides the
    one-step TD targets.
    """

    def __init__(
        self,
        env,
        env_name,
        eps=0.1,
        eps_decay_steps=20000,
        batch_size=64,
        train_freq=4,
        target_network_update_freq=200,
        train_start=2000,
        gamma=0.95,
    ):
        """Build the networks, optimizer and replay buffer.

        Args:
            env: environment exposing reset/step/render, observation_space and
                a discrete action_space.
            env_name: label used in the title of the evaluation plot.
            eps: final exploration rate once the decay has finished.
            eps_decay_steps: number of transitions over which the exploration
                rate decays linearly from 1 to eps.
            batch_size: number of transitions sampled per optimization step.
            train_freq: optimize once every train_freq transitions.
            target_network_update_freq: copy the policy weights into the
                target network when the transition count is a multiple of this.
            train_start: number of stored transitions required before any
                optimization happens; actions are random until the transition
                count reaches it.
            gamma: discount factor.
        """
        # initialize env and env name
        self.env = env
        self.env_name = env_name
        # set state dimension and number of actions
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        # create two networks: 1) Policy (online) network, 2) Target network
        self.policy_network = QNetwork_Image(self.action_dim, 1)
        self.target_network = QNetwork_Image(self.action_dim, 1)
        # make target network parameters same as policy network
        self.target_network.load_state_dict(self.policy_network.state_dict())
        self.target_network.eval()
        # assign gpu if available
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # move the networks to device/gpu
        self.policy_network.to(self.device)
        self.target_network.to(self.device)
        # Create Replay Buffer with size 50000
        self.rbuff = ReplayBuffer(50000)

        # (end) epsilon value for the epsilon-greedy policy
        self.eps = eps
        # how many steps epsilon decays from 1 to eps
        self.eps_decay_steps = eps_decay_steps

        # set optimizer and loss function
        self.optimizer = optim.Adam(self.policy_network.parameters(), lr=0.001)
        self.loss_func = nn.MSELoss()

        # batch size for sampling data from replay buffer
        self.batch_size = batch_size

        # mode is used to switch between optimization of DQN and evaluation
        self.mode = "eval"
        self.total_transitions = 0
        self.train_freq = train_freq
        self.target_network_update_freq = target_network_update_freq
        self.train_start = train_start

        self.gamma = gamma
        self.g = None # for visualization

    def _current_epsilon(self):
        """Return the linearly decayed exploration rate for the current step."""
        # Using >= also covers eps_decay_steps == 0 without dividing by zero;
        # at the boundary the linear formula evaluates to self.eps anyway.
        if self.total_transitions >= self.eps_decay_steps:
            return self.eps
        slope = (self.eps - 1) / self.eps_decay_steps
        return 1 + slope * self.total_transitions

    def _random_action(self, state):
        """Return a uniformly random action tensor, one action per state row."""
        return torch.randint(
            0, self.action_dim, (state.size(0),), device=self.device, dtype=torch.long
        ).unsqueeze(0)

    def _greedy_action(self, state):
        """Return the action tensor that maximizes the policy network's Q values."""
        q_values = self.policy_network(state)
        return torch.argmax(q_values, dim=1).unsqueeze(0)

    # function to choose action given a state
    def act(self, state):
        """Choose an action for a batched state tensor.

        In "eval" mode the action is greedy. In "train" mode it is random
        until train_start transitions have been seen and epsilon-greedy
        afterwards. Any other mode yields None.
        """
        with torch.no_grad():
            if self.mode == "eval":
                return self._greedy_action(state)
            if self.mode == "train":
                if self.total_transitions < self.train_start:
                    # before actually learning, do random actions
                    return self._random_action(state)
                if np.random.rand() < self._current_epsilon():
                    return self._random_action(state)
                return self._greedy_action(state)
        return None

    # optimize the DQN
    def optimization(self):
        """Run one gradient step on a sampled batch.

        Returns:
            The scalar loss, or None when the buffer cannot supply a batch.
        """
        batch = self.rbuff.sample(self.batch_size)
        if not batch:
            # The buffer returns [] when it holds fewer than batch_size items.
            return None

        # make sure the policy network is in train mode
        self.policy_network.train()

        state, action, next_state, reward, not_done = zip(*batch)
        state = torch.cat(state, dim=0).to(self.device)
        next_state = torch.cat(next_state, dim=0).to(self.device)
        # Force every per-sample scalar into a (B, 1) column. Mixing (B,) and
        # (B, 1) tensors would silently broadcast the TD target to (B, B).
        action = torch.cat(action, dim=0).to(self.device).reshape(-1, 1)
        reward = torch.cat(reward, dim=0).to(self.device).reshape(-1, 1)
        not_done = torch.cat(not_done, dim=0).to(self.device).reshape(-1, 1)

        # Q(s, a) of the actions that were actually taken
        q_values = self.policy_network(state).gather(1, action)
        with torch.no_grad():
            next_q_values = self.target_network(next_state)
            max_next_q_values = next_q_values.max(dim=1, keepdim=True).values
            # not_done zeroes the bootstrap term for terminal transitions
            target_q_values = reward + self.gamma * not_done * max_next_q_values
        loss = self.loss_func(q_values, target_q_values)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # This check only runs on optimization steps, so the target network is
        # synced when the transition count is a multiple of both frequencies.
        if self.total_transitions % self.target_network_update_freq == 0:
            self.target_network.load_state_dict(self.policy_network.state_dict())

        return loss.item()

    # run for one episode
    def run_one_episode(self):
        """Play one episode and return its cumulative reward.

        Frames are recorded into self.g. In "train" mode every transition is
        stored in the replay buffer and the network is optimized every
        train_freq transitions once train_start transitions are stored.
        """
        self.g = GIFMaker()
        r = 0

        # reset the environment
        state, info = self.env.reset()
        state = resize_and_normalize(state)
        # Render from the agent's own environment, not a notebook global.
        self.g.append(self.env.render())

        done = False
        while not done:
            # state as a batch of one, on the compute device
            state_tensor = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0)
            action_tensor = self.act(state_tensor)
            # the environment expects a plain number, hence .item()
            next_state, reward, terminated, truncated, info = self.env.step(action_tensor.item())
            next_state = resize_and_normalize(next_state)
            done = terminated or truncated

            r += reward
            self.g.append(self.env.render())

            if self.mode == "train":
                self.total_transitions += 1

                next_state_tensor = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0)
                reward_tensor = torch.tensor([[float(reward)]], dtype=torch.float32)
                # 1 for not done, 0 for done. Only true termination stops
                # bootstrapping; a time-limit truncation does not.
                not_done = torch.tensor([[0.0 if terminated else 1.0]], dtype=torch.float32)

                self.rbuff.add([
                    state_tensor.cpu(),
                    action_tensor.cpu(),
                    next_state_tensor,
                    reward_tensor,
                    not_done,
                ])

                # Before train_start samples are stored we only gather data.
                # No `continue` here: the state must still advance below.
                if len(self.rbuff) >= self.train_start and self.total_transitions % self.train_freq == 0:
                    self.optimization()

            # assign next_state to state for the next loop
            state = next_state

        return r

    def train(self):
        """Run one training episode and return its cumulative reward."""
        self.mode = "train"
        return self.run_one_episode()

    # Evaluation of policy
    def eval(self, n):
        """Return the mean cumulative reward of n greedy episodes."""
        self.mode = "eval" # put in eval mode
        returns = [self.run_one_episode() for _ in range(n)]
        return np.mean(returns)

    # the function called to perform optimization and evaluation
    def execute(self, total_ep=5000, eval_freq=100, eval_ep=100):
        """Train for total_ep episodes, evaluating and plotting along the way.

        Every eval_freq training episodes the policy is evaluated over eval_ep
        episodes; the evaluation rewards are plotted at the end.
        """
        rewards = []   # evaluation rewards
        episodes = []  # training-episode index at each evaluation

        # The tqdm bar already shows the episode counter.
        prog_bar = tqdm(range(0, total_ep))
        for i in prog_bar:
            self.train()
            if (i+1) % eval_freq == 0:
                reward = self.eval(eval_ep)
                print (f"Eval Reward: {reward:.2f}")
                rewards.append(reward)
                episodes.append(i)

        plt.plot(episodes, rewards)   # plot evaluation reward vs episodes
        plt.xlabel('episodes')
        plt.ylabel('Rewards')
        plt.title('DQN on '+self.env_name)
        plt.show()

    def visualize(self):
        """Display the frames recorded during the most recent episode."""
        self.g.display()



In [None]:
# Breakout DQN: train for 850 episodes on grayscale observations, evaluating
# over 10 episodes after every 100 training episodes.
# breakout could be a bit hard to learn
gc.collect()
env = gym.make("ALE/Breakout-v5", render_mode="rgb_array", obs_type="grayscale") # optionally you can try obs_type=rgb, which gives you rgb images. if using rgb images, you may need to adjust the size of replay buffer to avoid out-of-memory (RAM)
dqn_image = DQN_Image(env, "breakout", eps=0.01, batch_size=64)
dqn_image.execute(total_ep=850, eval_freq=100, eval_ep=10) # 1000 episodes takes ~60mins on GPU

In [None]:
# Mean cumulative reward of the trained Breakout agent over 20 evaluation episodes.
breakout_eval = dqn_image.eval(20)
print(f"Breakout Final Evaluation: {breakout_eval}")

In [None]:
# Boxing DQN: train for 500 episodes on grayscale observations, evaluating
# over 10 episodes after every 100 training episodes.
# NOTE: this rebinds dqn_image, replacing the Breakout agent created earlier.
gc.collect()
env = gym.make("ALE/Boxing-v5", render_mode="rgb_array", obs_type="grayscale") # optionally you can try obs_type=rgb, which gives you rgb images. if using rgb images, you may need to adjust the size of replay buffer to avoid out-of-memory (RAM)
dqn_image = DQN_Image(env, "boxing", eps=0.01, batch_size=64)
dqn_image.execute(total_ep=500, eval_freq=100, eval_ep=10) # 500 episodes takes ~53mins on GPU

In [None]:
# Mean cumulative reward of the trained Boxing agent over 20 evaluation episodes.
boxing_eval = dqn_image.eval(20)
print(f"Boxing Final Evaluation: {boxing_eval}")