# Deep Q-Learning Algorithm 

Simple Q-learning is basic and is all about learning state action value for all possible 

### Algorithm 
![alt text](sampling-training.jpg)

In [1]:
import gym
import random
import numpy as np
from collections import deque
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

In [2]:
class DQN(nn.Module):
    def __init__(self, hidden_units, state_size, action_size):
        super(DQN, self).__init__()
        self.hidden_units = hidden_units
        self.input_dim = state_size
        self.output_dim = action_size

        self.q_cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            
        )
    
    def forward(self, x):
        x = torch.tensor(x, dtype=torch.float32)
        return self.q_netwrok(x)




In [1]:
# env = gym.make("CartPole-v1", new_step_api=True)
import gymnasium as gym
env = gym.make('ALE/Breakout-v5')
state_size = env.observation_space.shape[0]
action_size = env.action_space
action_size = 6
# Hyperparameters
gamma = 0.99             
epsilon = 1.0           
epsilon_min = 0.01
epsilon_decay = 0.995
learning_rate = 0.001
batch_size = 64
memory_size = 10000

NamespaceNotFound: Namespace ALE not found. Have you installed the proper package for ALE?

In [4]:
!pip uninstall gym atari_py

^C


In [5]:
!pip install gymnasium[atari]

Defaulting to user installation because normal site-packages is not writeable
Collecting ale_py>=0.9 (from gymnasium[atari])
  Downloading ale_py-0.11.1-cp310-cp310-win_amd64.whl.metadata (9.2 kB)
Downloading ale_py-0.11.1-cp310-cp310-win_amd64.whl (3.5 MB)
   ---------------------------------------- 0.0/3.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/3.5 MB ? eta -:--:--
   --- ------------------------------------ 0.3/3.5 MB ? eta -:--:--
   ------------------ --------------------- 1.6/3.5 MB 4.7 MB/s eta 0:00:01
   ---------------------------------------- 3.5/3.5 MB 7.6 MB/s eta 0:00:00
Installing collected packages: ale_py
  Attempting uninstall: ale_py
    Found existing installation: ale-py 0.7.5
    Uninstalling ale-py-0.7.5:
      Successfully uninstalled ale-py-0.7.5
Successfully installed ale_py-0.11.1


  You can safely remove it manually.


In [12]:
memory = deque(maxlen=memory_size)

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

target_network = DQN(24, state_size=state_size, action_size=action_size).to(device)
q_network = DQN(24, state_size=state_size, action_size=action_size).to(device)
target_network.load_state_dict(q_network.state_dict())

target_network.eval()
optimizer = optim.Adam(q_network.parameters(), lr=learning_rate)
loss_fn = nn.MSELoss()

In [14]:
def epsilon_greedy_action(state, epsilon=epsilon):
    if random.random() < epsilon:
        return random.choice(range(action_size))
    else:
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            q_values = q_network(state)
            
        return q_values.argmax().item()

In [15]:
def replay():
    if len(memory) < batch_size:
        return 
    
    minibatch = random.sample(memory, batch_size)

    current_states, actions, rewards, next_states, dones = zip(*minibatch)

    current_states = torch.FloatTensor(current_states).to(device=device)
    actions = torch.LongTensor(actions).unsqueeze(1).to(device)
    rewards = torch.FloatTensor(rewards).unsqueeze(1).to(device)
    next_states = torch.FloatTensor(next_states).to(device)
    dones = torch.FloatTensor(dones).unsqueeze(1).to(device)

    current_q = q_network(current_states).gather(1, actions)

    pred_q = target_network(next_states).max(1)[0].detach().unsqueeze(1)
    target_q = rewards + (gamma * pred_q * (1 - dones))

    loss = loss_fn(current_q, target_q) # current_q --> [64, 2]    target_q = [64, 1]

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


In [16]:
episodes = 200
target_update_freq = 1

for episode in tqdm(range(episodes)):
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    total_reward = 0

    for t in range(500):
        action = epsilon_greedy_action(state, epsilon)
        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            next_state, reward, done, _ = step_result

        memory.append((state, action, reward, next_state, done))
        state = next_state
        total_reward += reward

        replay()
        if done:
            break

    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

    if episode % target_update_freq == 0:
        target_network.load_state_dict(q_network.state_dict())
    
    if episode % 50 == 0:
        print(f"Episode {episode}, Total Reward: {total_reward}, Epsilon: {epsilon:.3f}")

  0%|          | 0/200 [00:00<?, ?it/s]


ValueError: Action dimension mismatch. Expected (6,), found ()

In [None]:
# Assuming 'q_network' is your model and training is finished
import torch

torch.save(q_network.state_dict(), 'q_network_weights.pth')

In [None]:
import gymnasium as gym
import torch
import time
import numpy as np

# --- Assume your QNetwork and other necessary components are defined here ---
# from your_model_file import QNetwork # Make sure to import your network architecture

# --- Hyperparameters and Setup ---
# Load the environment with rendering enabled
env = gym.make('CartPole-v1', render_mode='human') # <-- IMPORTANT

# Initialize your Q-network and load the trained weights
# Ensure the state_size and action_size match your environment
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
q_network = DQN(24, state_size, action_size) # Your network class
q_network.load_state_dict(torch.load('q_network_weights.pth'))
q_network.eval() # Set the network to evaluation mode

# --- The Greedy Action Function ---
def greedy_action(state):
    """Chooses the best action based on the Q-network's prediction."""
    with torch.no_grad():
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        q_values = q_network(state_tensor)
        return np.argmax(q_values.cpu().data.numpy())

# --- The Visual Testing Loop ---
num_test_episodes = 10

for episode in range(num_test_episodes):
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    total_reward = 0
    terminated = False
    truncated = False

    while not (terminated or truncated):
        # Render the environment
        env.render()

        # Agent chooses the best action (no epsilon)
        action = greedy_action(state)

        # Environment takes a step
        step_result = env.step(action)

        # Unpack the step result
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
        else: # For older gym versions
            next_state, reward, done, _ = step_result
            terminated = done
            truncated = done # Simplification for older API

        state = next_state
        total_reward += reward

        # Optional: Add a small delay to make the visualization easier to follow
        time.sleep(0.02)

    print(f"Test Episode {episode + 1}, Total Reward: {total_reward}")

env.close()

  from pkg_resources import resource_stream, resource_exists
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
  x = torch.tensor(x, dtype=torch.float32)


Test Episode 1, Total Reward: 500.0
Test Episode 2, Total Reward: 166.0
Test Episode 3, Total Reward: 261.0
Test Episode 4, Total Reward: 197.0


KeyboardInterrupt: 

In [None]:
env.reset()

(array([-0.00681502,  0.03622616,  0.04379575,  0.03946735], dtype=float32),
 {})