# Deep Q-learning

Tabular Q-learning is appealing because it comes with a convergence / optimality statement (via the Bellman equation). However, it is severely limited to small environments, because storing the Q-table in memory quickly becomes prohibitive. It is also not very well suited to continuous spaces, but we'll get to that later.

We'll still use the same sort of environment for now, with a slightly different implementation.

In [1]:
import numpy as np

class GridWorld:
    """Rectangular grid environment with a single goal cell.

    The agent always starts in the top-left cell ``(0, 0)``. Positions are
    ``(row, column)`` tuples. Moving into a wall leaves the agent in place.
    """

    def __init__(self, size=(3, 3), goal=(2, 2)):
        self.size = size
        self.goal = goal
        self.state = (0, 0)  # Agent begins in the top-left corner
        self.actions = ['up', 'down', 'left', 'right']

    def reset(self):
        """Put the agent back on the start cell and return that cell."""
        self.state = (0, 0)
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done)``.

        An action name that is not one of the four directions leaves the
        position unchanged; the step is still rewarded like any other.
        """
        row, col = self.state
        last_row = self.size[0] - 1
        last_col = self.size[1] - 1

        # Candidate position for every direction, clipped at the grid border.
        moves = {
            'up': (max(0, row - 1), col),
            'down': (min(last_row, row + 1), col),
            'left': (row, max(0, col - 1)),
            'right': (row, min(last_col, col + 1)),
        }
        if action in moves:
            self.state = moves[action]

        # Every step costs 1, except the one that lands on the goal (+10).
        reached_goal = self.state == self.goal
        reward = 10 if reached_goal else -1
        return self.state, reward, reached_goal

    def render(self):
        """Print the grid: '.' for empty cells, 'G' for goal, 'A' for agent."""
        cells = np.full(self.size, '.')
        cells[self.goal] = 'G'  # Goal location
        cells[self.state] = 'A'  # Agent location (drawn last, so it wins)
        lines = [' '.join(cell_row) for cell_row in cells]
        print('\n'.join(lines))


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim

class DQNAgent:
    """Q-learning agent whose Q-function is a small feed-forward network.

    The network maps a state vector of length ``state_dim`` to one Q-value
    per action. Actions are handled as integer indices into ``self.actions``.

    Args:
        state_dim: Number of features in a state.
        action_dim: Number of discrete actions (size of the network output).
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Probability of picking a uniformly random action in ``act``.
        learning_rate: Step size passed to the Adam optimizer.
    """

    def __init__(self, state_dim, action_dim, gamma=0.9, epsilon=0.1, learning_rate=0.01):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.epsilon = epsilon
        self.learning_rate = learning_rate

        # Define the Q-network (simple feed-forward neural network)
        self.model = nn.Sequential(
            nn.Linear(state_dim, 24),
            nn.ReLU(),
            nn.Linear(24, 24),
            nn.ReLU(),
            nn.Linear(24, action_dim)
        )
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.loss_fn = nn.MSELoss()

        # Define possible actions
        self.actions = ['up', 'down', 'left', 'right']

    @staticmethod
    def _to_batch(state):
        """Convert a state sequence into a float tensor of shape (1, len(state))."""
        return torch.tensor(state, dtype=torch.float32).unsqueeze(0)

    def act(self, state):
        """Select an action index using an epsilon-greedy policy.

        With probability ``self.epsilon`` a uniformly random index in
        ``[0, action_dim)`` is returned; otherwise the index with the largest
        predicted Q-value for ``state``.

        Returns:
            int: Index of the chosen action.
        """
        # Explore: torch.rand draws from [0, 1), so epsilon=0 never explores
        # and epsilon=1 always does.
        if torch.rand(1).item() < self.epsilon:
            return int(torch.randint(self.action_dim, (1,)).item())

        # Exploit: no gradient is needed just to rank the actions.
        with torch.no_grad():
            q_values = self.model(self._to_batch(state))
        return int(torch.argmax(q_values, dim=1).item())

    def learn(self, state, action, reward, next_state, done):
        """Update the Q-network from one observed transition.

        Args:
            state: State in which the action was taken.
            action: Integer index of the action taken.
            reward: Reward received for the transition.
            next_state: State reached after the action.
            done: Whether ``next_state`` is terminal.
        """
        state_tensor = self._to_batch(state)
        next_state_tensor = self._to_batch(next_state)

        # Prediction for the current state; this is the only tensor that
        # carries gradients into the loss.
        q_values = self.model(state_tensor)

        with torch.no_grad():
            next_q_values = self.model(next_state_tensor)

            # TD target: when the episode is done there is no next state to
            # bootstrap from, so only the reward remains.
            target = reward + (1.0 - float(done)) * self.gamma * torch.max(next_q_values)

            # Desired output: identical to the current prediction except for
            # the taken action. Working on a detached copy keeps the target
            # constant and avoids an in-place write on a graph tensor.
            target_q_values = q_values.detach().clone()
            target_q_values[0, action] = target

        # Compute the loss
        loss = self.loss_fn(q_values, target_q_values)

        # Backpropagate and optimize
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

### Training Loop

In [9]:
# Train the agent online: one network update per environment step.
env = GridWorld()
agent = DQNAgent(state_dim=2, action_dim=4)

num_episodes = 100
for episode in range(num_episodes):
    state = env.reset()
    done = False
    total_reward = 0

    # An episode ends only when the environment reports the goal was reached.
    while not done:
        # Render the environment (optional)
        #env.render()

        # Choose an action (an integer index into agent.actions)
        action = agent.act(state)

        # Take the action in the environment (which expects the action name)
        next_state, reward, done = env.step(agent.actions[action])

        # Learn from the experience
        agent.learn(state, action, reward, next_state, done)

        state = next_state
        total_reward += reward

    print(f"Episode {episode+1}/{num_episodes}, Total Reward: {total_reward}")

Episode 1/100, Total Reward: -2
Episode 2/100, Total Reward: -226
Episode 3/100, Total Reward: -37
Episode 4/100, Total Reward: -37
Episode 5/100, Total Reward: 7
Episode 6/100, Total Reward: 7
Episode 7/100, Total Reward: 7
Episode 8/100, Total Reward: 5
Episode 9/100, Total Reward: 7
Episode 10/100, Total Reward: 5
Episode 11/100, Total Reward: 7
Episode 12/100, Total Reward: 7
Episode 13/100, Total Reward: 7
Episode 14/100, Total Reward: 7
Episode 15/100, Total Reward: -5
Episode 16/100, Total Reward: -5
Episode 17/100, Total Reward: 7
Episode 18/100, Total Reward: 5
Episode 19/100, Total Reward: 7
Episode 20/100, Total Reward: 7
Episode 21/100, Total Reward: 6
Episode 22/100, Total Reward: 7
Episode 23/100, Total Reward: 6
Episode 24/100, Total Reward: 7
Episode 25/100, Total Reward: 7
Episode 26/100, Total Reward: 6
Episode 27/100, Total Reward: 7
Episode 28/100, Total Reward: 7
Episode 29/100, Total Reward: 6
Episode 30/100, Total Reward: 5
Episode 31/100, Total Reward: 7
Episode