In [19]:
import numpy as np
from matplotlib import pyplot as plt
import random

Part 2

In [20]:
class MazeEnvironment:
    """Stochastic grid-world maze.

    Cell codes stored in ``self.maze``: 0 = free, 1 = obstacle, 2 = trap,
    3 = goal.  Actions are integer keys of ``self.actions`` mapping to
    ``(row_offset, column_offset)``: 0 = up, 1 = down, 2 = left, 3 = right.
    """

    # Named cell codes so the transition logic does not rely on bare numbers.
    FREE, OBSTACLE, TRAP, GOAL = 0, 1, 2, 3

    def __init__(self):
        """Define the maze layout, the rewards and the action space."""
        self.start_pos = (0, 0)  # Start position of the agent
        self.current_pos = self.start_pos
        self.state_penalty = -1    # reward for a blocked move (wall or obstacle)
        self.trap_penalty = -100   # reward for entering a trap (ends the episode)
        self.goal_reward = 100     # reward for entering the goal (ends the episode)
        self.actions = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}
        self.maze = np.array([
            [0,1,1,1,1,1,1,0,0,1,1],
            [0,0,0,0,1,1,0,2,0,0,1],
            [0,1,1,0,0,0,0,1,0,1,1],
            [0,1,1,0,1,1,1,1,0,0,0],
            [0,0,1,0,0,0,1,1,0,1,0],
            [0,1,1,1,1,0,0,1,0,1,3],
            [0,0,0,0,0,2,1,0,0,0,1],
            [1,0,1,0,1,0,0,0,1,1,0],
            [1,0,1,1,1,1,1,0,0,1,0],
            [1,0,0,0,0,1,1,1,0,1,0],
            [1,1,1,1,0,0,0,0,0,0,0],
        ])
        self.number_rows, self.number_columns = self.maze.shape

    def reset(self):
        """Move the agent back to the start cell and return that position."""
        self.current_pos = self.start_pos
        return self.current_pos

    def step(self, action):
        """Apply ``action`` with stochastic slippage and return ``(reward, done)``.

        The executed move is the intended one with probability 0.75, the
        opposite direction with probability 0.05, and each of the two
        perpendicular directions with probability 0.10.

        A move that would leave the grid or enter an obstacle keeps the agent
        in place and yields ``self.state_penalty``.  Entering a trap yields
        ``self.trap_penalty`` and entering the goal yields ``self.goal_reward``;
        both end the episode.  Any other successful move yields 0.

        Raises:
            KeyError: if ``action`` is not a key of ``self.actions``.
        """
        d_row, d_col = self.actions[action]
        roll = random.random()
        if roll < 0.75:
            move = (d_row, d_col)        # intended direction
        elif roll < 0.80:
            move = (-d_row, -d_col)      # opposite direction
        elif roll < 0.90:
            move = (d_col, d_row)        # first perpendicular direction
        else:
            move = (-d_col, -d_row)      # second perpendicular direction
        target = (self.current_pos[0] + move[0], self.current_pos[1] + move[1])

        in_bounds = (0 <= target[0] < self.number_rows
                     and 0 <= target[1] < self.number_columns)
        # Bounds are checked first so the maze is never indexed out of range.
        if not in_bounds or self.maze[target] == self.OBSTACLE:
            return self.state_penalty, False

        self.current_pos = target
        cell = self.maze[target]
        if cell == self.TRAP:
            return self.trap_penalty, True
        if cell == self.GOAL:
            return self.goal_reward, True
        return 0, False
        

In [38]:
import os
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap

def plot_value_function(value_function, maze, alpha, gamma, epsilon, num_episode, method):
    """Save an annotated heatmap of the state values for one training snapshot.

    Args:
        value_function: 2-D array of state values with the same shape as ``maze``.
        maze: 2-D array of cell codes (1 = obstacle, 2 = trap, 3 = goal).
        alpha, gamma, epsilon: hyper-parameters, used only for the title and
            the output folder name.
        num_episode: episode number, used in the title and the file name.
        method: name of the learning method, used in the title and folder name.

    The figure is written to
    ``plots_alpha_<alpha>_gamma_<gamma>_epsilon_<epsilon>_method_<method>/``
    and then closed; nothing is returned.
    """
    # Non-free cells are hidden from the heatmap and painted as solid patches.
    hidden = (maze == 1) | (maze == 2) | (maze == 3)

    fig, ax = plt.subplots(figsize=(10, 10))
    cmap = LinearSegmentedColormap.from_list('rg', ["r", "w", "g"], N=256)
    sns.heatmap(value_function, mask=hidden, annot=True, fmt=".1f", cmap=cmap,
                cbar=False, linewidths=1, linecolor='black', ax=ax)

    # Every special cell goes through the same loop, so the plot works with any
    # number of goals/traps/obstacles and Rectangle receives scalar coordinates.
    for cell_code, colour in ((3, 'darkgreen'), (2, 'darkred'), (1, 'gray')):
        for row, col in np.argwhere(maze == cell_code):
            # Heatmap x is the column index and y is the row index.
            ax.add_patch(plt.Rectangle((col, row), 1, 1, fill=True,
                                       edgecolor='black', facecolor=colour))
    ax.set_title(f"Value Function for alpha={alpha}, gamma={gamma}, epsilon={epsilon}, episode={num_episode}, method={method}")

    folder_name = f"plots_alpha_{alpha}_gamma_{gamma}_epsilon_{epsilon}_method_{method}"
    os.makedirs(folder_name, exist_ok=True)
    fig.savefig(os.path.join(folder_name, f"episode_{num_episode}, value_function.png"))
    plt.close(fig)  # release the figure; many snapshots are produced per run

def _greedy_policy_arrows(value_function, maze):
    """Return a grid of arrow glyphs pointing at the best neighbour of each free cell."""
    moves = (('↑', -1, 0), ('↓', 1, 0), ('←', 0, -1), ('→', 0, 1))
    values = np.asarray(value_function)
    n_rows, n_cols = maze.shape
    arrows = np.full(maze.shape, '', dtype='<U2')

    for i in range(n_rows):
        for j in range(n_cols):
            if maze[i, j] in (1, 2, 3):
                continue  # obstacles, traps and goals get no arrow
            best_arrow, best_value = '', float('-inf')
            for arrow, d_row, d_col in moves:
                next_i, next_j = i + d_row, j + d_col
                if not (0 <= next_i < n_rows and 0 <= next_j < n_cols):
                    continue
                target = maze[next_i, next_j]
                # Obstacles cannot be entered and traps should never be the
                # preferred move; their stored value is not a learned estimate.
                if target == 1 or target == 2:
                    continue
                # An adjacent goal always wins: its own stored value is not a
                # learned estimate either and would otherwise lose to neighbours.
                if target == 3:
                    best_arrow = arrow
                    break
                if values[next_i, next_j] > best_value:
                    best_value = values[next_i, next_j]
                    best_arrow = arrow
            arrows[i, j] = best_arrow
    return arrows


def plot_policy(value_function, maze, alpha, gamma, epsilon, num_episode, method):
    """Save a heatmap of the state values annotated with the greedy policy.

    For every free cell an arrow points to the in-bounds neighbour with the
    highest value.  Obstacle and trap neighbours are never chosen, and a
    neighbouring goal cell is always chosen.

    Args:
        value_function: 2-D array of state values with the same shape as ``maze``.
        maze: 2-D array of cell codes (1 = obstacle, 2 = trap, 3 = goal).
        alpha, gamma, epsilon: hyper-parameters, used only for the title and
            the output folder name.
        num_episode: episode number, used in the title and the file name.
        method: name of the learning method, used in the title and folder name.

    The figure is written to
    ``plots_alpha_<alpha>_gamma_<gamma>_epsilon_<epsilon>_method_<method>/``
    and then closed; nothing is returned.
    """
    policy_grid = _greedy_policy_arrows(value_function, maze)

    # Non-free cells are hidden from the heatmap and painted as solid patches.
    hidden = (maze == 1) | (maze == 2) | (maze == 3)

    fig, ax = plt.subplots(figsize=(10, 10))
    cmap = LinearSegmentedColormap.from_list('rg', ["r", "w", "g"], N=256)
    sns.heatmap(value_function, mask=hidden, annot=policy_grid, fmt="", cmap=cmap,
                cbar=False, linewidths=1, linecolor='black', ax=ax)

    for cell_code, colour in ((3, 'darkgreen'), (2, 'darkred'), (1, 'gray')):
        for row, col in np.argwhere(maze == cell_code):
            # Heatmap x is the column index and y is the row index.
            ax.add_patch(plt.Rectangle((col, row), 1, 1, fill=True,
                                       edgecolor='black', facecolor=colour))
    ax.set_title(f"Policy Map for alpha={alpha}, gamma={gamma}, epsilon={epsilon}, episode={num_episode}, method={method}")

    folder_name = f"plots_alpha_{alpha}_gamma_{gamma}_epsilon_{epsilon}_method_{method}"
    os.makedirs(folder_name, exist_ok=True)
    fig.savefig(os.path.join(folder_name, f"episode_{num_episode}, policy.png"))
    plt.close(fig)  # release the figure; many snapshots are produced per run


Part 3

In [39]:
class MazeTD0(MazeEnvironment):  # Inherits from MazeEnvironment
    """TD(0) state-value learner with an epsilon-greedy behaviour policy."""

    def __init__(self, alpha=0.1, gamma=0.95, epsilon=0.2, episodes=10000):
        super().__init__()
        self.alpha = alpha  # Learning Rate
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon  # Exploration Rate
        self.episodes = episodes
        self.utility = np.zeros(self.maze.shape)  # Initialize utility values to zero

    def _peek(self, action):
        """Return ``(next_state, reward, done)`` for the intended outcome of ``action``.

        This is a pure lookahead: it neither moves the agent nor draws random
        numbers, so it is safe to call while choosing an action.
        """
        d_row, d_col = self.actions[action]
        row, col = self.current_pos[0] + d_row, self.current_pos[1] + d_col
        inside = 0 <= row < self.number_rows and 0 <= col < self.number_columns
        if not inside or self.maze[row, col] == 1:
            # Blocked move: the agent would stay where it is.
            return self.current_pos, self.state_penalty, False
        cell = self.maze[row, col]
        if cell == 2:
            return (row, col), self.trap_penalty, True
        if cell == 3:
            return (row, col), self.goal_reward, True
        return (row, col), 0, False

    def choose_action(self):
        """Pick an action epsilon-greedily without disturbing the environment.

        With probability ``epsilon`` a uniformly random action is returned.
        Otherwise each action is scored by ``reward + gamma * utility[next]``
        for its intended outcome and one of the best-scoring actions is
        returned, with ties broken at random.

        The environment's ``step`` must not be used for this scoring: it moves
        the agent, so evaluating four candidate actions would relocate the
        agent up to four times before the real step is taken.
        """
        action_ids = list(self.actions.keys())
        if random.random() < self.epsilon:
            return random.choice(action_ids)

        scores = []
        for action in action_ids:
            next_state, reward, done = self._peek(action)
            future = 0.0 if done else self.utility[next_state]
            scores.append(reward + self.gamma * future)
        best_score = max(scores)
        # Random tie-breaking avoids always preferring action 0 while the
        # utilities are still all equal.
        return random.choice([a for a, s in zip(action_ids, scores) if s == best_score])

    def update_utility_value(self):
        """Run one episode from the current position, applying the TD(0) update each step."""
        done = False
        while not done:
            current_state = self.current_pos
            current_value = self.utility[current_state]
            action = self.choose_action()
            reward, done = self.step(action)
            new_state = self.current_pos
            # TD target: immediate reward plus discounted value of the state reached.
            new_value = reward + self.gamma * self.utility[new_state]
            self.utility[current_state] += self.alpha * (new_value - current_value)

    def run_episodes(self):
        """Train for ``self.episodes`` episodes and return the utility grid.

        Snapshot plots are written at episodes 1, 50, 100, 1000, 5000 and
        10000, but only when the run is configured with exactly 10000 episodes.
        """
        for j in range(1, self.episodes + 1):
            self.reset()
            self.update_utility_value()
            # Runs of any other length produce no snapshots.
            if self.episodes == 10000 and j in (1, 50, 100, 1000, 5000, 10000):
                plot_value_function(value_function=self.utility, maze=self.maze, alpha=self.alpha, gamma=self.gamma, epsilon=self.epsilon, num_episode=j, method="Temporal Difference Learning")
                plot_policy(value_function=self.utility, maze=self.maze, alpha=self.alpha, gamma=self.gamma, epsilon=self.epsilon, num_episode=j, method="Temporal Difference Learning")
        return self.utility

In [None]:
# Long TD(0) run; the final utilities are plotted explicitly after training.
td_agent = MazeTD0(alpha=0.1, gamma=0.95, epsilon=0.2, episodes=100000)
final_utility = td_agent.run_episodes()
# `method` is a required argument of both plotting helpers; leaving it out
# raises TypeError.  Hyper-parameters are read from the agent so the plot
# labels cannot drift from the values actually used for training.
plot_value_function(value_function=final_utility, maze=td_agent.maze, alpha=td_agent.alpha, gamma=td_agent.gamma,
                    epsilon=td_agent.epsilon, num_episode=td_agent.episodes, method="Temporal Difference Learning")
plot_policy(value_function=final_utility, maze=td_agent.maze, alpha=td_agent.alpha, gamma=td_agent.gamma,
            epsilon=td_agent.epsilon, num_episode=td_agent.episodes, method="Temporal Difference Learning")

Part 4

In [40]:
class MazeQLearning(MazeEnvironment):  # Inherits from MazeEnvironment
    """Tabular Q-learning agent acting epsilon-greedily in the maze."""

    def __init__(self, alpha=0.1, gamma=0.95, epsilon=0.2, episodes=10000):
        super().__init__()
        # Learning rate, discount factor and exploration rate.
        self.alpha, self.gamma, self.epsilon = alpha, gamma, epsilon
        self.episodes = episodes
        # One Q-value per (row, column, action), all starting at zero.
        table_shape = (self.number_rows, self.number_columns, len(self.actions))
        self.q_table = np.zeros(table_shape)

    def choose_action(self):
        """Return a random action with probability epsilon, else the greedy one."""
        row, col = self.current_pos
        explore = random.random() < self.epsilon
        if explore:
            return random.choice(list(self.actions))
        return np.argmax(self.q_table[row, col, :])

    def update_q_table(self):
        """Play one episode to termination, updating Q after every step."""
        finished = False
        while not finished:
            row, col = self.current_pos
            action = self.choose_action()
            reward, finished = self.step(action)
            next_row, next_col = self.current_pos
            # Target: reward plus discounted best Q-value of the state reached.
            target = reward + self.gamma * np.max(self.q_table[next_row, next_col, :])
            td_error = target - self.q_table[row, col, action]
            self.q_table[row, col, action] += self.alpha * td_error

    def run_episodes(self):
        """Train for ``self.episodes`` episodes and return the max-Q value grid.

        Snapshot plots are written at episodes 1, 50, 100, 1000, 5000 and 10000.
        """
        snapshot_episodes = (1, 50, 100, 1000, 5000, 10000)
        for episode in range(1, self.episodes + 1):
            self.reset()
            self.update_q_table()
            if episode not in snapshot_episodes:
                continue
            plot_value_function(value_function=np.max(self.q_table, axis=2), maze=self.maze,
                                alpha=self.alpha, gamma=self.gamma, epsilon=self.epsilon,
                                num_episode=episode, method='Q_learning')
            plot_policy(value_function=np.max(self.q_table, axis=2), maze=self.maze,
                        alpha=self.alpha, gamma=self.gamma, epsilon=self.epsilon,
                        num_episode=episode, method="Q_learning")

        return np.max(self.q_table, axis=2)




In [None]:
# Example usage: train a Q-learning agent with the default hyper-parameters.
# run_episodes() saves snapshot plots at fixed episode checkpoints and returns
# the per-state maximum over actions of the Q-table (a 2-D value grid), so
# `final_q_table` holds state values rather than the full 3-D Q-table.
q_learning_agent = MazeQLearning(alpha=0.1, gamma=0.95, epsilon=0.2, episodes=10000)
final_q_table = q_learning_agent.run_episodes()

Part 5

In [42]:
# TD(0) learning-rate sweep: gamma and epsilon are held fixed while alpha varies.
# Each run uses exactly 10000 episodes, which is the condition under which
# MazeTD0.run_episodes() writes its snapshot plots; the output folder name
# includes alpha, so each setting is saved separately.
alpha_values = [0.001, 0.01, 0.1, 0.5, 1]
for alpha in alpha_values:
    td_agent = MazeTD0(alpha=alpha, gamma=0.95, epsilon=0.2, episodes=10000)
    final_utility = td_agent.run_episodes()
    maze = td_agent.maze  # not used within this cell; only the last agent's values survive the loop


In [43]:
# TD(0) discount-factor sweep: alpha and epsilon are held fixed while gamma varies.
# Each run uses exactly 10000 episodes, which is the condition under which
# MazeTD0.run_episodes() writes its snapshot plots; the output folder name
# includes gamma, so each setting is saved separately.
gamma_values = [0.1, 0.25, 0.5, 0.75]
for gamma in gamma_values:
    td_agent = MazeTD0(alpha=0.1, gamma=gamma, epsilon=0.2, episodes=10000)
    final_utility = td_agent.run_episodes()
    maze = td_agent.maze  # not used within this cell; only the last agent's values survive the loop
    

In [44]:
# TD(0) exploration-rate sweep: alpha and gamma are held fixed while epsilon varies
# from 0 (never take the random branch) to 1 (always take the random branch).
# Each run uses exactly 10000 episodes, which is the condition under which
# MazeTD0.run_episodes() writes its snapshot plots.
epsilon_values = [0, 0.5, 0.8, 1]
for epsilon in epsilon_values:
    td_agent = MazeTD0(alpha=0.1, gamma=0.95, epsilon=epsilon, episodes=10000)
    final_utility = td_agent.run_episodes()
    maze = td_agent.maze  # not used within this cell; only the last agent's values survive the loop

In [45]:
# Q-learning learning-rate sweep: gamma and epsilon are held fixed while alpha varies.
# MazeQLearning.run_episodes() writes snapshot plots at episodes
# 1, 50, 100, 1000, 5000 and 10000 into a folder whose name includes alpha.
alpha_values = [0.001, 0.01, 0.1, 0.5, 1]
for alpha in alpha_values:
    q_learning_agent = MazeQLearning(alpha=alpha, gamma=0.95, epsilon=0.2, episodes=10000)
    final_q_table = q_learning_agent.run_episodes()  # 2-D max-over-actions value grid
    maze = q_learning_agent.maze  # not used within this cell

In [46]:
# Q-learning discount-factor sweep: alpha and epsilon are held fixed while gamma varies.
# MazeQLearning.run_episodes() writes snapshot plots at episodes
# 1, 50, 100, 1000, 5000 and 10000 into a folder whose name includes gamma.
gamma_values = [0.1, 0.25, 0.5, 0.75]
for gamma in gamma_values:
    q_learning_agent = MazeQLearning(alpha=0.1, gamma=gamma, epsilon=0.2, episodes=10000)
    final_q_table = q_learning_agent.run_episodes()  # 2-D max-over-actions value grid
    maze = q_learning_agent.maze  # not used within this cell

In [47]:
# Q-learning exploration-rate sweep: alpha and gamma are held fixed while epsilon
# varies from 0 (never take the random branch) to 1 (always take the random branch).
# MazeQLearning.run_episodes() writes snapshot plots at episodes
# 1, 50, 100, 1000, 5000 and 10000 into a folder whose name includes epsilon.
epsilon_values = [0, 0.5, 0.8, 1]
for epsilon in epsilon_values:
    q_learning_agent = MazeQLearning(alpha=0.1, gamma=0.95, epsilon=epsilon, episodes=10000)
    final_q_table = q_learning_agent.run_episodes()  # 2-D max-over-actions value grid
    maze = q_learning_agent.maze  # not used within this cell
    