## Tutorial: Implementing and Improving Deep Q-Networks (DQN)

### 1. Implementation of Deep Q-Networks (DQN)
Description:
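Deep Q-Networks (DQN) use a neural network to approximate the Q-value function in reinforcement learning, which predicts the total reward an agent can expect to receive when starting from a given state and taking a specific action. The full DQN algorithm uses experience replay and target networks to stabilize training; the baseline implementation below omits both and updates the network after every environment step. Experience replay is added in Section 2 and a target network in Section 3.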

In [None]:
import gym
import torch
from torch.autograd import Variable
import random
import matplotlib.pyplot as plt

In [None]:
class DQN():
    """Q-value estimator: one hidden ReLU layer, MSE loss, Adam optimizer."""

    def __init__(self, n_state, n_action, n_hidden=50, lr=0.001):
        """
        Builds the network, the loss and the optimizer.
        @param n_state: number of input features (size of a state)
        @param n_action: number of outputs (one Q-value per action)
        @param n_hidden: number of units in the hidden layer
        @param lr: learning rate handed to Adam
        """
        self.criterion = torch.nn.MSELoss()
        layers = [
            torch.nn.Linear(n_state, n_hidden),
            torch.nn.ReLU(),
            torch.nn.Linear(n_hidden, n_action),
        ]
        self.model = torch.nn.Sequential(*layers)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr)

    def update(self, s, y):
        """
        Takes one gradient step that moves the prediction for s towards y.
        @param s: state
        @param y: target value
        """
        prediction = self.model(torch.Tensor(s))
        target = torch.Tensor(y)
        loss = self.criterion(prediction, target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def predict(self, s):
        """
        Evaluates the network on a state without tracking gradients.
        @param s: input state
        @return: Q-values for all actions
        """
        with torch.no_grad():
            q_values = self.model(torch.Tensor(s))
        return q_values

In [None]:
# Create the MountainCar-v0 environment used by the cells of this section.
env = gym.envs.make("MountainCar-v0")

In [None]:
def gen_epsilon_greedy_policy(estimator, epsilon, n_action):
    """
    Creates an epsilon-greedy policy on top of a Q-value estimator.
    @param estimator: object whose predict(state) returns the Q-values of all actions
    @param epsilon: probability of choosing a random action
    @param n_action: number of actions
    @return: function mapping a state to an action index
    """
    def policy_function(state):
        explore = random.random() < epsilon
        if explore:
            return random.randint(0, n_action - 1)
        # Exploit: take the action with the highest predicted Q-value
        return torch.argmax(estimator.predict(state)).item()
    return policy_function

In [None]:
def q_learning(env, estimator, n_episode, gamma=1.0, epsilon=0.1, epsilon_decay=0.99):
    """
    Trains a DQN estimator online, with one network update per environment step.
    @param env: Gym environment
    @param estimator: DQN estimator object
    @param n_episode: number of episodes
    @param gamma: discount factor
    @param epsilon: epsilon-greedy strategy parameter
    @param epsilon_decay: epsilon decay rate
    @return: list with the total (unshaped) environment reward of each episode
    """
    # (position threshold, bonus) pairs checked from the highest threshold down
    position_bonuses = ((0.5, 100), (0.25, 20), (0.1, 10), (0, 5))
    total_reward_episode = [0] * n_episode

    for episode in range(n_episode):
        policy = gen_epsilon_greedy_policy(estimator, epsilon, env.action_space.n)
        state = env.reset()
        is_done = False
        while not is_done:
            action = policy(state)
            next_state, reward, is_done, _ = env.step(action)
            total_reward_episode[episode] += reward

            # Modify reward based on the position of the car
            position = next_state[0]
            modified_reward = reward + (position + 0.5)
            for threshold, bonus in position_bonuses:
                if position >= threshold:
                    modified_reward += bonus
                    break

            # Only the entry of the action actually taken is changed, so the
            # other actions contribute no error to the update.
            td_target = estimator.predict(state).tolist()
            if is_done:
                td_target[action] = modified_reward
            else:
                best_next = torch.max(estimator.predict(next_state)).item()
                td_target[action] = modified_reward + gamma * best_next

            estimator.update(state, td_target)
            state = next_state

        print(f'Episode: {episode}, Total reward: {total_reward_episode[episode]}, Epsilon: {epsilon}')
        # Decay exploration after every episode, never going below 0.01
        epsilon = max(epsilon * epsilon_decay, 0.01)

    return total_reward_episode

In [None]:
# Size the network from the environment's observation and action spaces
n_state = env.observation_space.shape[0]
n_action = env.action_space.n
n_hidden, lr = 50, 0.001
dqn = DQN(n_state, n_action, n_hidden=n_hidden, lr=lr)

# Train the estimator and keep the per-episode rewards for plotting
n_episode = 1000
total_reward_episode = q_learning(env, dqn, n_episode=n_episode, gamma=0.99, epsilon=0.3)

In [None]:
# Learning curve: total reward collected in each episode
plt.plot(total_reward_episode)
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('Total Reward per Episode over Time')
plt.show()

### 2. Improving DQN with Experience Replay

Description:
Experience replay improves the stability and efficiency of DQN by storing the agent's experiences in a replay buffer and sampling random batches from it for training. This helps in breaking the correlation between consecutive experiences and provides more diverse training data.

In [None]:
import gym
import torch
from collections import deque
import random
from torch.autograd import Variable
import matplotlib.pyplot as plt

In [None]:
# Create the MountainCar-v0 environment used by the experience-replay cells below.
env = gym.envs.make("MountainCar-v0")

In [None]:
class DQN():
    """Q-value estimator (one hidden ReLU layer) trained from replayed experiences."""

    def __init__(self, n_state, n_action, n_hidden=50, lr=0.001):
        """
        Builds the network, the loss and the optimizer.
        @param n_state: number of input features (size of a state)
        @param n_action: number of outputs (one Q-value per action)
        @param n_hidden: number of units in the hidden layer
        @param lr: learning rate handed to Adam
        """
        self.criterion = torch.nn.MSELoss()
        layers = [
            torch.nn.Linear(n_state, n_hidden),
            torch.nn.ReLU(),
            torch.nn.Linear(n_hidden, n_action),
        ]
        self.model = torch.nn.Sequential(*layers)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr)

    def update(self, s, y):
        """
        Takes one gradient step that moves the prediction for s towards y.
        @param s: state (or list of states)
        @param y: target value (or list of target values)
        """
        prediction = self.model(torch.Tensor(s))
        target = torch.Tensor(y)
        loss = self.criterion(prediction, target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def predict(self, s):
        """
        Evaluates the network on a state without tracking gradients.
        @param s: input state
        @return: Q-values for all actions
        """
        with torch.no_grad():
            q_values = self.model(torch.Tensor(s))
        return q_values

    def replay(self, memory, replay_size, gamma):
        """
        Samples a random batch from memory and fits the network to its TD targets.
        Does nothing until memory holds at least replay_size experiences.
        @param memory: experience replay buffer of
                       (state, action, next_state, reward, is_done) tuples
        @param replay_size: number of samples to use for model update
        @param gamma: discount factor
        """
        if len(memory) < replay_size:
            return

        batch = random.sample(memory, replay_size)
        states, td_targets = [], []
        for state, action, next_state, reward, is_done in batch:
            # Start from the current prediction so only the taken action changes
            target = self.predict(state).tolist()
            if is_done:
                target[action] = reward
            else:
                best_next = torch.max(self.predict(next_state)).item()
                target[action] = reward + gamma * best_next
            states.append(state)
            td_targets.append(target)

        self.update(states, td_targets)

In [None]:
def gen_epsilon_greedy_policy(estimator, epsilon, n_action):
    """
    Generates an epsilon-greedy policy.
    @param estimator: object whose predict(state) returns the Q-values of all actions
    @param epsilon: probability of choosing a random action
    @param n_action: number of actions
    @return: function mapping a state to an action index
    """
    def policy_function(state):
        if not (random.random() < epsilon):
            # Greedy branch: best action according to the estimator
            return torch.argmax(estimator.predict(state)).item()
        return random.randint(0, n_action - 1)
    return policy_function

In [None]:
def q_learning(env, estimator, n_episode, replay_size, gamma=1.0, epsilon=0.1, epsilon_decay=0.99):
    """
    Trains a DQN estimator from a replay buffer of past experiences.
    @param env: Gym environment
    @param estimator: DQN estimator object
    @param n_episode: number of episodes
    @param replay_size: number of samples to use for model update
    @param gamma: discount factor
    @param epsilon: epsilon-greedy strategy parameter
    @param epsilon_decay: epsilon decay rate
    @return: list with the total (unshaped) environment reward of each episode
    """
    def shape_reward(reward, position):
        # Modify reward based on the position of the car
        shaped = reward + (position + 0.5)
        if position >= 0.5:
            return shaped + 100
        if position >= 0.25:
            return shaped + 20
        if position >= 0.1:
            return shaped + 10
        if position >= 0:
            return shaped + 5
        return shaped

    total_reward_episode = [0] * n_episode
    # Bounded buffer: the oldest experiences are dropped beyond 10000 entries
    memory = deque(maxlen=10000)

    for episode in range(n_episode):
        policy = gen_epsilon_greedy_policy(estimator, epsilon, env.action_space.n)
        state = env.reset()
        is_done = False
        while not is_done:
            action = policy(state)
            next_state, reward, is_done, _ = env.step(action)
            total_reward_episode[episode] += reward

            modified_reward = shape_reward(reward, next_state[0])
            memory.append((state, action, next_state, modified_reward, is_done))

            # No replay step is run on the final transition of an episode
            if not is_done:
                estimator.replay(memory, replay_size, gamma)
                state = next_state

        print(f'Episode: {episode}, Total reward: {total_reward_episode[episode]}, Epsilon: {epsilon}')
        epsilon = max(epsilon * epsilon_decay, 0.01)

    return total_reward_episode

In [None]:
# Size the network from the environment's observation and action spaces
n_state = env.observation_space.shape[0]
n_action = env.action_space.n
n_hidden, lr = 50, 0.001
dqn = DQN(n_state, n_action, n_hidden=n_hidden, lr=lr)

# Train with experience replay, sampling replay_size experiences per update
n_episode, replay_size = 600, 20
total_reward_episode = q_learning(env, dqn, n_episode, replay_size=replay_size, gamma=0.9, epsilon=0.3)

In [None]:
# Learning curve: total reward collected in each episode
plt.plot(total_reward_episode)
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('Total Reward per Episode over Time')
plt.show()

### 3. Implementing Double DQN

Description:
Double DQN addresses the overestimation bias in Q-learning by decoupling the selection of actions and the evaluation of Q-values. It uses two networks: a primary network for selecting actions and a target network for evaluating the Q-values of those actions.

In [None]:
import gym
import torch
from collections import deque
import random
import copy
from torch.autograd import Variable
import matplotlib.pyplot as plt

In [None]:
# Create the MountainCar-v0 environment used by the Double DQN cells below.
env = gym.envs.make("MountainCar-v0")

In [None]:
class DQN():
    """
    Double DQN estimator.

    Holds an online network (self.model), which is trained and used to choose
    actions, and a target network (self.model_target), which is a periodically
    refreshed copy used to evaluate the chosen actions when building TD targets.
    """

    def __init__(self, n_state, n_action, n_hidden=50, lr=0.01):
        """
        Builds the online network, its optimizer and the target network.
        @param n_state: number of input features (size of a state)
        @param n_action: number of outputs (one Q-value per action)
        @param n_hidden: number of units in the hidden layer
        @param lr: learning rate handed to Adam
        """
        self.criterion = torch.nn.MSELoss()
        self.model = torch.nn.Sequential(
            torch.nn.Linear(n_state, n_hidden),
            torch.nn.ReLU(),
            torch.nn.Linear(n_hidden, n_action)
        )
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr)
        # The target network starts as an exact copy of the online network
        self.model_target = copy.deepcopy(self.model)

    def target_predict(self, s):
        """
        Predicts Q-values using the target network.
        @param s: input state
        @return: target Q-values for all actions
        """
        with torch.no_grad():
            return self.model_target(torch.Tensor(s))

    def copy_target(self):
        """
        Copies the parameters from the main network to the target network.
        """
        self.model_target.load_state_dict(self.model.state_dict())

    def replay(self, memory, replay_size, gamma):
        """
        Experience replay with Double DQN targets.
        For a non-terminal transition the online network selects the best next
        action and the target network supplies that action's value, so that
        selection and evaluation are decoupled. Does nothing until memory
        holds at least replay_size experiences.
        @param memory: experience replay buffer of
                       (state, action, next_state, reward, is_done) tuples
        @param replay_size: number of samples to use for model update
        @param gamma: discount factor
        """
        if len(memory) >= replay_size:
            replay_data = random.sample(memory, replay_size)
            states = []
            td_targets = []
            for state, action, next_state, reward, is_done in replay_data:
                states.append(state)
                # Start from the current prediction so only the taken action changes
                q_values = self.predict(state).tolist()
                if is_done:
                    q_values[action] = reward
                else:
                    # Selection by the online network, evaluation by the target
                    # network; taking max over the target network alone would
                    # be plain DQN with a target network.
                    best_action = torch.argmax(self.predict(next_state)).item()
                    q_next = self.target_predict(next_state)[best_action].item()
                    q_values[action] = reward + gamma * q_next

                td_targets.append(q_values)

            self.update(states, td_targets)

    def update(self, s, y):
        """
        Updates DQN weights given a training sample.
        @param s: state (or list of states)
        @param y: target value (or list of target values)
        """
        y_pred = self.model(torch.Tensor(s))
        # torch.autograd.Variable is a deprecated no-op wrapper; a plain
        # tensor is an equivalent loss target.
        loss = self.criterion(y_pred, torch.Tensor(y))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def predict(self, s):
        """
        Predicts Q-values for all actions in a given state using the online network.
        @param s: input state
        @return: Q-values for all actions
        """
        with torch.no_grad():
            return self.model(torch.Tensor(s))

In [None]:
def gen_epsilon_greedy_policy(estimator, epsilon, n_action):
    """
    Builds a policy that explores with probability epsilon and is greedy otherwise.
    @param estimator: DQN estimator
    @param epsilon: exploration rate
    @param n_action: number of actions
    @return: epsilon-greedy policy function
    """
    def _random_action():
        return random.randint(0, n_action - 1)

    def _greedy_action(state):
        return torch.argmax(estimator.predict(state)).item()

    def policy_function(state):
        if random.random() < epsilon:
            return _random_action()
        return _greedy_action(state)

    return policy_function

In [None]:
def q_learning(env, estimator, n_episode, replay_size, target_update=10, gamma=1.0, epsilon=0.1, epsilon_decay=0.99):
    """
    Trains an estimator with experience replay and a periodically synced target network.
    @param env: Gym environment
    @param estimator: DQN estimator
    @param n_episode: number of episodes
    @param replay_size: number of samples for model update
    @param target_update: update target network every `target_update` episodes
    @param gamma: discount factor
    @param epsilon: epsilon-greedy strategy parameter
    @param epsilon_decay: epsilon decay rate
    @return: list with the total (unshaped) environment reward of each episode
    """
    # (position threshold, bonus) pairs checked from the highest threshold down
    position_bonuses = ((0.5, 100), (0.25, 20), (0.1, 10), (0, 5))
    total_reward_episode = [0] * n_episode
    memory = deque(maxlen=10000)

    for episode in range(n_episode):
        sync_target = episode % target_update == 0
        if sync_target:
            estimator.copy_target()
        policy = gen_epsilon_greedy_policy(estimator, epsilon, env.action_space.n)
        state = env.reset()
        is_done = False

        while not is_done:
            action = policy(state)
            next_state, reward, is_done, _ = env.step(action)
            total_reward_episode[episode] += reward

            # Modify reward based on the position of the car
            position = next_state[0]
            bonus = next(
                (extra for floor, extra in position_bonuses if position >= floor),
                None,
            )
            modified_reward = reward + (position + 0.5)
            if bonus is not None:
                modified_reward += bonus

            memory.append((state, action, next_state, modified_reward, is_done))

            # No replay step is run on the final transition of an episode
            if not is_done:
                estimator.replay(memory, replay_size, gamma)
                state = next_state
        print(f'Episode: {episode}, Total reward: {total_reward_episode[episode]}, Epsilon: {epsilon}')
        epsilon = max(epsilon * epsilon_decay, 0.01)

    return total_reward_episode

In [None]:
# Size the network from the environment's observation and action spaces
n_state = env.observation_space.shape[0]
n_action = env.action_space.n
n_hidden, lr = 50, 0.01
dqn = DQN(n_state, n_action, n_hidden=n_hidden, lr=lr)

# Training schedule: batch of 20 per update, target network synced every 10 episodes
n_episode, replay_size, target_update = 600, 20, 10

total_reward_episode = q_learning(
    env,
    dqn,
    n_episode,
    replay_size,
    target_update=target_update,
    gamma=0.9,
    epsilon=0.3,
)

In [None]:
# Learning curve: total reward collected in each episode
plt.plot(total_reward_episode)
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('Total Reward per Episode over Time')
plt.show()

### 4. Hyperparameter Tuning for Double DQN in CartPole Environment

Description:
Hyperparameter tuning involves experimenting with different values for the hyperparameters of a model to find the combination that results in the best performance. For Double DQN, important hyperparameters include the learning rate, the hidden layer size, the replay batch size (the number of experiences sampled from the replay buffer for each update), and the target network update frequency.

In [None]:
import gym
import torch
from collections import deque
import random
import copy
from torch.autograd import Variable
import numpy as np

In [None]:
# Create the CartPole-v0 environment used by the hyperparameter search below.
env = gym.envs.make("CartPole-v0")

In [None]:
class DQN():
    """
    Double DQN estimator.

    Holds an online network (self.model), which is trained and used to choose
    actions, and a target network (self.model_target), which is a periodically
    refreshed copy used to evaluate the chosen actions when building TD targets.
    """

    def __init__(self, n_state, n_action, n_hidden=50, lr=0.01):
        """
        Builds the online network, its optimizer and the target network.
        @param n_state: number of input features (size of a state)
        @param n_action: number of outputs (one Q-value per action)
        @param n_hidden: number of units in the hidden layer
        @param lr: learning rate handed to Adam
        """
        self.criterion = torch.nn.MSELoss()
        self.model = torch.nn.Sequential(
            torch.nn.Linear(n_state, n_hidden),
            torch.nn.ReLU(),
            torch.nn.Linear(n_hidden, n_action)
        )
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr)
        # The target network starts as an exact copy of the online network
        self.model_target = copy.deepcopy(self.model)

    def target_predict(self, s):
        """
        Predicts Q-values using the target network.
        @param s: input state
        @return: target Q-values for all actions
        """
        with torch.no_grad():
            return self.model_target(torch.Tensor(s))

    def copy_target(self):
        """
        Copies the parameters from the main network to the target network.
        """
        self.model_target.load_state_dict(self.model.state_dict())

    def replay(self, memory, replay_size, gamma):
        """
        Experience replay with Double DQN targets.
        For a non-terminal transition the online network selects the best next
        action and the target network supplies that action's value, so that
        selection and evaluation are decoupled. Does nothing until memory
        holds at least replay_size experiences.
        @param memory: experience replay buffer of
                       (state, action, next_state, reward, is_done) tuples
        @param replay_size: number of samples to use for model update
        @param gamma: discount factor
        """
        if len(memory) >= replay_size:
            replay_data = random.sample(memory, replay_size)
            states = []
            td_targets = []
            for state, action, next_state, reward, is_done in replay_data:
                states.append(state)
                # Start from the current prediction so only the taken action changes
                q_values = self.predict(state).tolist()
                if is_done:
                    q_values[action] = reward
                else:
                    # Selection by the online network, evaluation by the target
                    # network; taking max over the target network alone would
                    # be plain DQN with a target network.
                    best_action = torch.argmax(self.predict(next_state)).item()
                    q_next = self.target_predict(next_state)[best_action].item()
                    q_values[action] = reward + gamma * q_next

                td_targets.append(q_values)

            self.update(states, td_targets)

    def update(self, s, y):
        """
        Updates DQN weights given a training sample.
        @param s: state (or list of states)
        @param y: target value (or list of target values)
        """
        y_pred = self.model(torch.Tensor(s))
        # torch.autograd.Variable is a deprecated no-op wrapper; a plain
        # tensor is an equivalent loss target.
        loss = self.criterion(y_pred, torch.Tensor(y))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def predict(self, s):
        """
        Predicts Q-values for all actions in a given state using the online network.
        @param s: input state
        @return: Q-values for all actions
        """
        with torch.no_grad():
            return self.model(torch.Tensor(s))

In [None]:
def gen_epsilon_greedy_policy(estimator, epsilon, n_action):
    """
    Returns a policy that picks a random action with probability epsilon
    and the estimator's best action otherwise.
    @param estimator: DQN estimator
    @param epsilon: exploration rate
    @param n_action: number of actions
    @return: epsilon-greedy policy function
    """
    def policy_function(state):
        # Only the selected branch of the conditional expression is evaluated
        return (
            random.randint(0, n_action - 1)
            if random.random() < epsilon
            else torch.argmax(estimator.predict(state)).item()
        )
    return policy_function

In [None]:
def q_learning(env, estimator, n_episode, replay_size, target_update=10, gamma=1.0, epsilon=0.1, epsilon_decay=0.99):
    """
    Trains an estimator with experience replay and a periodically synced
    target network, using the environment's reward unchanged.
    @param env: Gym environment
    @param estimator: DQN estimator
    @param n_episode: number of episodes
    @param replay_size: number of samples for model update
    @param target_update: update target network every `target_update` episodes
    @param gamma: discount factor
    @param epsilon: epsilon-greedy strategy parameter
    @param epsilon_decay: epsilon decay rate
    @return: list with the total reward of each episode
    """
    total_reward_episode = [0] * n_episode
    memory = deque(maxlen=10000)

    for episode in range(n_episode):
        sync_target = episode % target_update == 0
        if sync_target:
            estimator.copy_target()
        policy = gen_epsilon_greedy_policy(estimator, epsilon, env.action_space.n)
        state = env.reset()
        is_done = False

        while not is_done:
            action = policy(state)
            next_state, reward, is_done, _ = env.step(action)
            total_reward_episode[episode] += reward

            transition = (state, action, next_state, reward, is_done)
            memory.append(transition)

            # No replay step is run on the final transition of an episode
            if not is_done:
                estimator.replay(memory, replay_size, gamma)
                state = next_state

        # Decay exploration after every episode, never going below 0.01
        epsilon = max(epsilon * epsilon_decay, 0.01)

    return total_reward_episode

In [None]:
# Candidate values for each hyperparameter
n_hidden_options = [30, 40]
lr_options = [0.001, 0.003]
replay_size_options = [20, 25]
target_update_options = [30, 35]

n_state = env.observation_space.shape[0]
n_action = env.action_space.n
n_episode = 600
# Configurations are scored on the mean reward of the last `last_episode` episodes
last_episode = 200

# Exhaustive grid search, visiting combinations in the same order as four nested loops
results = []
param_grid = [
    (hidden, rate, batch, sync)
    for hidden in n_hidden_options
    for rate in lr_options
    for batch in replay_size_options
    for sync in target_update_options
]

for n_hidden, lr, replay_size, target_update in param_grid:
    # Re-seed everything so each configuration starts from the same random state
    env.seed(1)
    random.seed(1)
    torch.manual_seed(1)
    dqn = DQN(n_state, n_action, n_hidden, lr)
    total_reward_episode = q_learning(env, dqn, n_episode, replay_size, target_update, gamma=0.9, epsilon=1)
    average_reward = sum(total_reward_episode[-last_episode:]) / last_episode
    results.append((n_hidden, lr, replay_size, target_update, average_reward))
    print(f"n_hidden: {n_hidden}, lr: {lr}, replay_size: {replay_size}, target_update: {target_update}, average_reward: {average_reward}")

# Displaying the best result (the last tuple field is the average reward)
best_params = max(results, key=lambda trial: trial[4])
print(f"Best parameters: n_hidden: {best_params[0]}, lr: {best_params[1]}, replay_size: {best_params[2]}, target_update: {best_params[3]}, average_reward: {best_params[4]}")

# Re-train with the best configuration from the same seeds, so that its
# learning curve can be plotted
env.seed(1)
random.seed(1)
torch.manual_seed(1)
dqn = DQN(n_state, n_action, *best_params[:2])
total_reward_episode = q_learning(env, dqn, n_episode, best_params[2], target_update=best_params[3], gamma=0.9, epsilon=1)

In [None]:
# This section's import cell does not import matplotlib, so import it here
# to keep the section runnable without the earlier sections.
import matplotlib.pyplot as plt

# Learning curve of the re-trained best configuration
plt.plot(total_reward_episode)
plt.title('Total Reward per Episode over Time')
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.show()

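Output (truncated notebook stderr): the cells above emitted library warnings, of which only the fragments `logger.warn(` (once) and `deprecation(` (twice) were captured.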


### 5. Implementing Dueling DQN

Description:
Dueling DQN introduces a new neural network architecture that separately estimates the state value function and the advantage function for each action. This allows the agent to learn which states are valuable without needing to know the value of each action in those states.

In [None]:
import gym
import torch
from collections import deque
import random
from torch.autograd import Variable
import torch.nn as nn
import matplotlib.pyplot as plt

In [None]:
# Create the MountainCar-v0 environment used by the Dueling DQN cells below.
env = gym.envs.make("MountainCar-v0")

In [None]:
class DuelingModel(nn.Module):
    """
    Dueling network: separate advantage and state-value streams, each with one
    hidden ReLU layer, combined as Q = V + (A - mean over actions of A).
    """

    def __init__(self, n_input, n_output, n_hidden):
        """
        @param n_input: number of input features (size of a state)
        @param n_output: number of actions (width of the advantage stream output)
        @param n_hidden: number of hidden units in each stream
        """
        super().__init__()
        self.adv1 = nn.Linear(n_input, n_hidden)
        self.adv2 = nn.Linear(n_hidden, n_output)
        self.val1 = nn.Linear(n_input, n_hidden)
        self.val2 = nn.Linear(n_hidden, 1)

    def forward(self, x):
        """
        @param x: a single state of shape (n_input,) or a batch (..., n_input)
        @return: Q-values with the same leading shape and n_output as last dim
        """
        adv = nn.functional.relu(self.adv1(x))
        adv = self.adv2(adv)
        val = nn.functional.relu(self.val1(x))
        val = self.val2(val)
        # Centre the advantages per sample, over the action dimension only.
        # A bare adv.mean() would also average across the batch, making each
        # sample's Q-values depend on the other samples in the batch.
        return val + adv - adv.mean(dim=-1, keepdim=True)

In [None]:
class DQN():
    """Q-value estimator backed by a DuelingModel and trained from replayed experiences."""

    def __init__(self, n_state, n_action, n_hidden=50, lr=0.001):
        """
        Builds the dueling network, the loss and the optimizer.
        @param n_state: number of input features (size of a state)
        @param n_action: number of actions
        @param n_hidden: number of hidden units passed to DuelingModel
        @param lr: learning rate handed to Adam
        """
        self.criterion = torch.nn.MSELoss()
        self.model = DuelingModel(n_state, n_action, n_hidden)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr)

    def update(self, s, y):
        """
        Takes one gradient step that moves the prediction for s towards y.
        @param s: state (or list of states)
        @param y: target value (or list of target values)
        """
        prediction = self.model(torch.Tensor(s))
        target = torch.Tensor(y)
        loss = self.criterion(prediction, target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def predict(self, s):
        """
        Evaluates the network on a state without tracking gradients.
        @param s: input state
        @return: Q-values for all actions
        """
        with torch.no_grad():
            q_values = self.model(torch.Tensor(s))
        return q_values

    def replay(self, memory, replay_size, gamma):
        """
        Samples a random batch from memory and fits the network to its TD targets.
        Does nothing until memory holds at least replay_size experiences.
        @param memory: experience replay buffer of
                       (state, action, next_state, reward, is_done) tuples
        @param replay_size: number of samples to use for model update
        @param gamma: discount factor
        """
        if len(memory) < replay_size:
            return

        batch = random.sample(memory, replay_size)
        states, td_targets = [], []
        for state, action, next_state, reward, is_done in batch:
            # Start from the current prediction so only the taken action changes
            target = self.predict(state).tolist()
            if is_done:
                target[action] = reward
            else:
                best_next = torch.max(self.predict(next_state)).item()
                target[action] = reward + gamma * best_next
            states.append(state)
            td_targets.append(target)

        self.update(states, td_targets)

In [None]:
def gen_epsilon_greedy_policy(estimator, epsilon, n_action):
    """
    Wraps an estimator in an epsilon-greedy action-selection rule.
    @param estimator: DQN estimator
    @param epsilon: exploration rate
    @param n_action: number of actions
    @return: epsilon-greedy policy function
    """
    last_action = n_action - 1

    def policy_function(state):
        draw = random.random()
        if draw < epsilon:
            # Explore: uniformly random action index
            return random.randint(0, last_action)
        # Exploit: action with the highest predicted Q-value
        q_values = estimator.predict(state)
        return torch.argmax(q_values).item()

    return policy_function

In [None]:
def q_learning(env, estimator, n_episode, replay_size, gamma=1.0, epsilon=0.1, epsilon_decay=0.99):
    """
    Trains a DQN estimator from a replay buffer of past experiences.
    @param env: Gym environment
    @param estimator: DQN estimator object
    @param n_episode: number of episodes
    @param replay_size: number of samples for model update
    @param gamma: discount factor
    @param epsilon: epsilon-greedy strategy parameter
    @param epsilon_decay: epsilon decay rate
    @return: list with the total (unshaped) environment reward of each episode
    """
    def position_bonus(position):
        # Extra reward for the car's position; None when the position is below 0
        if position >= 0.5:
            return 100
        if position >= 0.25:
            return 20
        if position >= 0.1:
            return 10
        if position >= 0:
            return 5
        return None

    total_reward_episode = [0] * n_episode
    # Bounded buffer: the oldest experiences are dropped beyond 10000 entries
    memory = deque(maxlen=10000)

    for episode in range(n_episode):
        policy = gen_epsilon_greedy_policy(estimator, epsilon, env.action_space.n)
        state = env.reset()
        is_done = False

        while not is_done:
            action = policy(state)
            next_state, reward, is_done, _ = env.step(action)
            total_reward_episode[episode] += reward

            # Modify reward based on the position of the car
            modified_reward = reward + (next_state[0] + 0.5)
            bonus = position_bonus(next_state[0])
            if bonus is not None:
                modified_reward += bonus

            memory.append((state, action, next_state, modified_reward, is_done))

            # No replay step is run on the final transition of an episode
            if not is_done:
                estimator.replay(memory, replay_size, gamma)
                state = next_state

        print(f'Episode: {episode}, Total reward: {total_reward_episode[episode]}, Epsilon: {epsilon}')
        epsilon = max(epsilon * epsilon_decay, 0.01)

    return total_reward_episode

In [None]:
# Size the network from the environment's observation and action spaces
n_state = env.observation_space.shape[0]
n_action = env.action_space.n
n_hidden, lr = 50, 0.001
dqn = DQN(n_state, n_action, n_hidden=n_hidden, lr=lr)

# Train with experience replay, sampling replay_size experiences per update
n_episode, replay_size = 600, 20

total_reward_episode = q_learning(env, dqn, n_episode, replay_size=replay_size, gamma=0.9, epsilon=0.3)

In [None]:
# Learning curve: total reward collected in each episode
plt.plot(total_reward_episode)
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('Total Reward per Episode over Time')
plt.show()

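Output (truncated notebook stderr): the cells above emitted library warnings, of which only two `deprecation(` fragments were captured.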


# Summary
- Deep Q-Networks (DQN): Use neural networks to approximate Q-values, enabling agents to learn optimal policies from high-dimensional state spaces.
- Experience Replay: Stabilizes DQN training by storing experiences and sampling random batches for training, reducing correlations between consecutive updates.
- Double DQN: Reduces overestimation bias by decoupling action selection and evaluation, using separate primary and target networks.
- Hyperparameter Tuning for Double DQN: Involves systematically experimenting with different hyperparameters to optimize performance in a specific environment (e.g., CartPole).
- Dueling DQN: Improves Q-value estimation by separating state value and advantage functions, allowing better generalization and learning efficiency.