In [19]:
%matplotlib inline
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns

In [20]:
import numpy as np

import gym
from gym import spaces

import pygame

class RCMazeEnv(gym.Env):
    def __init__(self, maze_size_x=10, maze_size_y=10):
        self.maze_size_x = maze_size_x
        self.maze_size_y = maze_size_y
        self.maze = self.generate_maze()
        self.car_position = (0, 0)
        self.possible_actions = range(3)
        self.car_orientation = 'N'
        self.sensor_readings = {'front': 0, 'left': 0, 'right': 0}
        self.steps = 0
        self.previous_distance = 0
        self.reset()

    def generate_maze(self):
        # For simplicity, create a static maze with walls
        # '1' represents a wall, and '0' represents an open path
        maze = np.zeros((self.maze_size_y, self.maze_size_x), dtype=int)
        # Add walls to the maze (this can be customized)
        maze[1::2, :] = 1
        maze[:, 1::2] = 0
        # Add goal
        maze[-1, -1] = 0
        
        return maze

    def reset(self):
        self.car_position = (0, 0)
        self.car_orientation = 'N'
        self.update_sensor_readings()
        self.steps = 0
        self.previous_distance = 0
        return self.get_state()

    def step(self, action):
        if action == 0:
            self.move_forward()
        elif action == 1:
            self.turn_left()
        elif action == 2:
            self.turn_right()
        self.update_sensor_readings()
        reward = self.compute_reward()
        self.steps += 1
        done = self.is_done()
        return self.get_state(), reward, done

    # def move_forward(self):
    #     x, y = self.car_position
    #     if self.car_orientation == 'N':
    #         self.car_position = (x, max(y - 1, 0))
    #     elif self.car_orientation == 'S':
    #         self.car_position = (x, min(y + 1, self.maze_size_y - 1))
    #     elif self.car_orientation == 'E':
    #         self.car_position = (min(x + 1, self.maze_size_x - 1), y)
    #     elif self.car_orientation == 'W':
    #         self.car_position = (max(x - 1, 0), y)
    
    def move_forward(self):
        x, y = self.car_position
        if self.car_orientation == 'N' and y > 0 and self.maze[y - 1][x] != 1:
            self.car_position = (x, y - 1)
        elif self.car_orientation == 'S' and y < self.maze_size_y - 1 and self.maze[y + 1][x] != 1:
            self.car_position = (x, y + 1)
        elif self.car_orientation == 'E' and x < self.maze_size_x - 1 and self.maze[y][x + 1] != 1:
            self.car_position = (x + 1, y)
        elif self.car_orientation == 'W' and x > 0 and self.maze[y][x - 1] != 1:
            self.car_position = (x - 1, y)
        

    def turn_left(self):
        orientations = ['N', 'W', 'S', 'E']
        idx = orientations.index(self.car_orientation)
        self.car_orientation = orientations[(idx + 1) % 4]

    def turn_right(self):
        orientations = ['N', 'E', 'S', 'W']
        idx = orientations.index(self.car_orientation)
        self.car_orientation = orientations[(idx + 1) % 4]

    def update_sensor_readings(self):
        # Simple sensor implementation: counts steps to the nearest wall
        self.sensor_readings['front'] = self.distance_to_wall('front')
        self.sensor_readings['left'] = self.distance_to_wall('left')
        self.sensor_readings['right'] = self.distance_to_wall('right')

    def distance_to_wall(self, direction):
        x, y = self.car_position
        distance = 0
        if direction == 'front':
            if self.car_orientation == 'N':
                while y - distance >= 0 and self.maze[y - distance][x] != 1:
                    distance += 1
            # Similar logic for other orientations...
        # Implement for left and right...
        return distance

    def compute_reward(self):
        reward = 0

        # Penalty for hitting walls or going out of bounds
        if self.sensor_readings['front'] == 0 or self.sensor_readings['left'] == 0 or self.sensor_readings['right'] == 0:
            reward -= 5

        # Reward for reaching the goal
        if self.car_position == (self.maze_size_x - 1, self.maze_size_y - 1):
            reward += 100
            return reward  # Return immediately as this is the terminal state

        # Calculate reward based on reduced distance to goal
        x, y = self.car_position
        goal_x, goal_y = (self.maze_size_x - 1, self.maze_size_y - 1)
        distance = abs(x - goal_x) + abs(y - goal_y)
        

        # Assuming previous_distance is stored after each move
        if distance < self.previous_distance:
            reward += 10  # Positive reward for moving closer to the goal
        elif distance > self.previous_distance:
            reward -= 5   # Negative reward for moving farther from the goal

        # Update previous_distance for the next step
        self.previous_distance = distance

        return reward

        

    def is_done(self):
        # Define when the episode ends
        # ends when the car reaches the goal or it takes more than 100 steps
        return self.car_position == (self.maze_size_x - 1, self.maze_size_y - 1)
        
        
    def get_state(self):
        return (self.car_position, self.car_orientation, self.sensor_readings)

    # def render(self):
    #     rendered_maze = np.array(self.maze, dtype=str)
    #     x, y = self.car_position
    #     rendered_maze[y][x] = 'C'  # Representing the car
        
    #     #print array
    #     print(rendered_maze, '\n') 

    
    def init_pygame(self):
        # Initialize Pygame and set up the display
        pygame.init()
        self.cell_size = 40  # Size of each cell in pixels
        self.width = self.maze_size_x * self.cell_size
        self.height = self.maze_size_y * self.cell_size
        self.screen = pygame.display.set_mode((self.width, self.height))
        self.clock = pygame.time.Clock()

    def render(self):
        # Render the environment using Pygame
        for y in range(self.maze_size_y):
            for x in range(self.maze_size_x):
                rect = pygame.Rect(x * self.cell_size, y * self.cell_size, self.cell_size, self.cell_size)
                if (x, y) == (self.maze_size_x - 1, self.maze_size_y - 1):  # Goal position
                    color = (0, 255, 0)  # Green color for the goal
                elif self.maze[y][x] == 0:
                    color = (255, 255, 255)  # White color for empty space
                else:
                    color = (0, 0, 0)  # Black color for walls
                pygame.draw.rect(self.screen, color, rect)

        # Draw the car
        car_x, car_y = self.car_position
        car_rect = pygame.Rect(car_x * self.cell_size, car_y * self.cell_size, self.cell_size, self.cell_size)
        pygame.draw.rect(self.screen, (255, 0, 0), car_rect)  # Red color for the car

        pygame.display.flip()
        self.clock.tick(60)  # Limit the frame rate to 60 FPS


    def close_pygame(self):
        # Close the Pygame window
        pygame.quit()

In [21]:
class QAgent:
    def __init__(self, alpha=0.1, gamma=0.9, epsilon=0.1, possible_actions=3, min_epsilon=0.01, epsilon_decay=0.99):
        self.q_table = {}
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.possible_actions = possible_actions
        self.min_epsilon = min_epsilon
        self.epsilon_decay = epsilon_decay
        

    def state_to_tuple(self, state):
        
        #((0, 0), 'N', {'front': 1, 'left': 0, 'right': 0})
        # if like this convert to ((0, 0), 'N', (1, 0, 0))
        if not isinstance(state[2], dict):
            # print(state)
            # print(state[2])
            #take state[2] and make it from this (1, 0, 0) to this {'front': 1, 'left': 0, 'right': 0}
            newState = {'front': state[2][0], 'left': state[2][1], 'right': state[2][2]}
            # print(newState)
            #create a new state with the [2] being the new dictionary
            state = (state[0], state[1], newState)
            
        # Convert the state dictionary to a hashable tuple
        # Adjust this based on the specific format of your state
        position, orientation, sensor_readings = state
        sensor_readings_tuple = tuple(sensor_readings.values())
        return (position, orientation, sensor_readings_tuple)

    def get_q_value(self, state, action):
        state_tuple = self.state_to_tuple(state)
        return self.q_table.get((state_tuple, action), 0)

    def choose_action(self, state):
        if random.random() < self.epsilon:
            return random.choice(range(self.possible_actions))
        else:
            state_tuple = self.state_to_tuple(state)
            q_values = [self.get_q_value(state_tuple, action) for action in range(self.possible_actions)]
            max_q = max(q_values)
            actions_with_max_q = [action for action, q in enumerate(q_values) if q == max_q]
            return random.choice(actions_with_max_q)

    def update_q_value(self, state, action, reward, next_state):
        state_tuple = self.state_to_tuple(state)
        next_state_tuple = self.state_to_tuple(next_state)
        max_q_next = max([self.get_q_value(next_state_tuple, next_action) for next_action in range(self.possible_actions)])
        current_q = self.get_q_value(state_tuple, action)
        new_q = current_q + self.alpha * (reward + self.gamma * max_q_next - current_q)
        self.q_table[(state_tuple, action)] = new_q

    def train(self, environment, num_episodes):
        reward_history = []
        for _ in range(num_episodes):
            state = environment.reset()
            done = False
            total_reward = 0
            while not done:
                
                action = self.choose_action(state)
                next_state, reward, done = environment.step(action)
                self.update_q_value(state, action, reward, next_state)
                total_reward += reward
                state = next_state

            # Add the total reward for this episode to the history
            reward_history.append(total_reward)

            # Decay epsilon, but not below the minimum value
            self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)

            # Print episode summary
            print("Episode finished after {} timesteps".format(environment.steps))
            print("Total reward: {}, Epsilon: {:.3f}".format(total_reward, self.epsilon))

        return reward_history
            
    def test(self, env):
        state = env.reset()
        done = False
        total_reward = 0

        while not done:
            env.render()
            action = self.choose_action(state)
            next_state, reward, done = env.step(action)
            total_reward += reward
            state = next_state
            

        print(f"Test Total Reward: {total_reward}")

# Example usage:
# env = RCMazeEnv()
# agent = QAgent()
# agent.train(env, 1000)


In [27]:
EPSILON = 0.9
ALPHA = 0.1
GAMMA = 0.6
DECAY = 0.999
MINEPSILON = 0.1
DECAY_RATE = 0.993


env = RCMazeEnv()
agent = QAgent(alpha=ALPHA, gamma=GAMMA, epsilon=EPSILON, min_epsilon=MINEPSILON, epsilon_decay=DECAY_RATE)
env.init_pygame()
agent.train(env, 1000)
env.close_pygame()


Episode finished after 761 timesteps
Total reward: -3140, Epsilon: 0.894
Episode finished after 1292 timesteps
Total reward: -5570, Epsilon: 0.887
Episode finished after 683 timesteps
Total reward: -2835, Epsilon: 0.881
Episode finished after 312 timesteps
Total reward: -1150, Epsilon: 0.875
Episode finished after 428 timesteps
Total reward: -1695, Epsilon: 0.869
Episode finished after 261 timesteps
Total reward: -945, Epsilon: 0.863
Episode finished after 574 timesteps
Total reward: -2330, Epsilon: 0.857
Episode finished after 296 timesteps
Total reward: -1080, Epsilon: 0.851
Episode finished after 209 timesteps
Total reward: -725, Epsilon: 0.845
Episode finished after 243 timesteps
Total reward: -895, Epsilon: 0.839
Episode finished after 770 timesteps
Total reward: -3245, Epsilon: 0.833
Episode finished after 277 timesteps
Total reward: -1000, Epsilon: 0.827
Episode finished after 148 timesteps
Total reward: -440, Epsilon: 0.821
Episode finished after 384 timesteps
Total reward: -14

In [29]:
#test
env = RCMazeEnv()

env.init_pygame()

# Example of running the environment
agent.test(env)


env.close_pygame()

Test Total Reward: 135
