## Assignment 6: Reinforcement Learning
Implement reinforcement learning (tabular Q-learning) using the example of a maze
environment that the agent needs to explore in order to reach the goal.

In [None]:
import numpy as np
import random

# Define the Maze
class Maze:
    """Grid-world maze environment for a single agent.

    Cell values in ``grid``: ``0`` is free space, ``1`` is a wall and ``2``
    marks the goal. States are ``(row, column)`` tuples.

    Args:
        grid: Optional 2-D array-like describing the maze. Defaults to the
            built-in 5x5 layout.
        start_state: ``(row, column)`` the agent is placed on by ``reset``.
        goal_state: ``(row, column)`` of the goal. When omitted, the first
            cell of ``grid`` holding the value ``2`` is used.

    Raises:
        ValueError: If ``grid`` is not 2-D, or no goal is given and the grid
            contains no cell with value ``2``.
    """

    # (row, column) offset applied by each action id.
    _MOVES = {
        0: (-1, 0),  # Up
        1: (1, 0),   # Down
        2: (0, -1),  # Left
        3: (0, 1),   # Right
    }

    def __init__(self, grid=None, start_state=(0, 0), goal_state=None):
        if grid is None:
            grid = [[0, 0, 0, 0, 0],
                    [0, 1, 1, 1, 0],
                    [0, 1, 0, 0, 0],
                    [0, 1, 1, 1, 0],
                    [0, 0, 0, 0, 2]]
        self.grid = np.array(grid)
        if self.grid.ndim != 2:
            raise ValueError("grid must be two-dimensional")

        if goal_state is None:
            goal_cells = np.argwhere(self.grid == 2)
            if len(goal_cells) == 0:
                raise ValueError("grid has no goal cell (value 2)")
            goal_state = goal_cells[0]

        # Store plain-int tuples so states compare and index consistently.
        self.start_state = tuple(int(v) for v in start_state)  # Starting position
        self.goal_state = tuple(int(v) for v in goal_state)    # Goal position
        self.state = self.start_state

    def reset(self):
        """Move the agent back to the start position and return that state."""
        self.state = self.start_state
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done)``.

        Actions are ``0`` = up, ``1`` = down, ``2`` = left, ``3`` = right; any
        other value leaves the position unchanged. Moves are clamped to the
        grid bounds. Moving into a wall keeps the agent in place and yields a
        reward of ``-1``; reaching the goal yields ``10`` and ``done=True``;
        every other step yields ``-0.1``.
        """
        row, col = self.state
        d_row, d_col = self._MOVES.get(action, (0, 0))

        # Clamp to the actual grid size instead of a hard-coded 5x5 bound.
        last_row = self.grid.shape[0] - 1
        last_col = self.grid.shape[1] - 1
        row = min(max(row + d_row, 0), last_row)
        col = min(max(col + d_col, 0), last_col)

        if self.grid[row, col] == 1:  # If hitting a wall
            return self.state, -1, False  # Return current state, penalty, and not done

        self.state = (row, col)

        if self.state == self.goal_state:  # Reached the goal
            return self.state, 10, True  # Return goal state, reward, and done
        return self.state, -0.1, False  # Penalty for each step taken

    def render(self):
        """Print a copy of the grid with the agent's cell marked as ``3``."""
        maze_copy = self.grid.copy()
        row, col = self.state
        maze_copy[row, col] = 3  # Mark the agent's position
        print(maze_copy)


In [None]:
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Args:
        maze: Environment exposing a 2-D ``grid`` array, ``reset()`` returning
            a ``(row, column)`` state, and ``step(action)`` returning
            ``(next_state, reward, done)``.

    Attributes set here (all may be overridden before calling ``learn``):
    ``learning_rate`` 0.1, ``discount_factor`` 0.9, ``epsilon`` 1.0,
    ``epsilon_decay`` 0.99, ``min_epsilon`` 0.1, ``num_episodes`` 1000.
    """

    def __init__(self, maze):
        self.maze = maze
        # One Q-value per (row, column, action); sized from the maze grid
        # rather than assuming a 5x5 layout. There are 4 actions.
        rows, cols = maze.grid.shape
        self.q_table = np.zeros((rows, cols, 4))
        self.learning_rate = 0.1
        self.discount_factor = 0.9
        self.epsilon = 1.0  # Exploration rate
        self.epsilon_decay = 0.99
        self.min_epsilon = 0.1
        self.num_episodes = 1000

    def choose_action(self, state):
        """Return an action id in ``0..3`` for ``state`` (epsilon-greedy).

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest Q-value for ``state``.
        """
        if random.uniform(0, 1) < self.epsilon:
            return random.randint(0, 3)  # Explore: choose random action
        # Exploit: choose best action; cast so both branches return a plain int.
        return int(np.argmax(self.q_table[state[0], state[1]]))

    def learn(self):
        """Run ``num_episodes`` episodes, updating ``q_table`` in place.

        Each episode starts from ``maze.reset()`` and runs until the
        environment reports ``done``. After every episode ``epsilon`` is
        decayed by ``epsilon_decay`` but never taken below ``min_epsilon``.
        """
        for _ in range(self.num_episodes):
            state = self.maze.reset()
            done = False

            while not done:
                action = self.choose_action(state)
                next_state, reward, done = self.maze.step(action)

                # Q-learning target. A terminal transition has no future
                # return, so it must not bootstrap from the next state.
                if done:
                    td_target = reward
                else:
                    best_next_value = np.max(
                        self.q_table[next_state[0], next_state[1]]
                    )
                    td_target = reward + self.discount_factor * best_next_value

                td_delta = td_target - self.q_table[state[0], state[1], action]
                self.q_table[state[0], state[1], action] += self.learning_rate * td_delta

                state = next_state

            # Decay epsilon, clamping so the last step cannot undershoot the floor.
            if self.epsilon > self.min_epsilon:
                self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)

    def print_q_table(self):
        """Print the full Q-table."""
        print(self.q_table)


In [None]:
# Script entry point: build the maze, train a Q-learning agent on it, and
# print the learned Q-table.
if __name__ == "__main__":
    maze = Maze()
    agent = QLearningAgent(maze)
    # Runs the agent's full training loop (agent.num_episodes episodes).
    agent.learn()
    # The printed array is indexed as [row, column, action].
    agent.print_q_table()


[[[ 3.67891766  4.2612659   3.70524496  2.93099989]
  [ 0.25911092 -1.13448552  3.66160516 -0.24015041]
  [-0.23742012 -1.09635977 -0.28128027 -0.07940785]
  [-0.15046056 -1.06099571 -0.23713696  0.25047185]
  [-0.03695421  1.05294566 -0.16423133  0.04985236]]

 [[ 3.38950147  4.845851    4.12813274  3.32961286]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]
  [-0.06086446  2.665014   -0.45584193  0.12069716]]

 [[ 4.20683011  5.49539     4.61224376  3.73097334]
  [ 0.          0.          0.          0.        ]
  [-0.19173019 -0.27273019 -0.19       -0.01802881]
  [-0.1        -0.19022438 -0.02883019  0.47994892]
  [ 0.30372228  5.22939365 -0.03048037  1.32718924]]

 [[ 4.61724753  6.2171      5.41306523  4.53426366]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]
  [ 1.