In [49]:
import numpy as np
import random
from keras import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras import utils
from collections import deque

# Turn off Keras' interactive logging so the many predict()/fit() calls made
# during training do not flood the notebook output.
utils.disable_interactive_logging()

### Step 1: Define the DQNAgent class with Neural Network

In [50]:
class DQNAgent:
    """Deep Q-Network agent with experience replay and a target network.

    Observations may be given in any array shape matching ``state_size``
    (for example a 2D grid); they are flattened and given a leading batch
    axis before being passed to the networks.

    Args:
        state_size: Shape of one observation, either an int or a tuple of ints.
        action_size: Number of discrete actions.
    """

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        # Number of scalar features per observation once flattened; this is
        # the input width of the dense network.
        self._input_dim = int(np.prod(state_size))
        self.memory = deque(maxlen=10000)
        self.memory_len = 0  # Mirrors len(self.memory); kept for existing callers
        self.gamma = 0.99  # Discount factor for future rewards
        self.epsilon = 1.0  # Exploration rate, start with full exploration
        self.epsilon_min = 0.01  # Minimum exploration rate
        # The exploration rate follows the linear schedule in epsilon_decay().
        self.learning_rate = 0.001  # Learning rate for the neural network
        self.target_update_frequency = 100
        self.weight_update_frequency = 20
        self.model = self._build_model()
        self.target_model = self._build_model()

    def _build_model(self):
        """Build and compile the Q-network: two hidden layers of 32 units."""
        model = Sequential()
        # The network consumes flattened observations, so one sample is a
        # vector of length _input_dim and predictions have shape
        # (batch, action_size).
        model.add(Dense(32, input_shape=(self._input_dim,), activation='relu'))
        model.add(Dense(32, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def _to_batch(self, state):
        """Return ``state`` as a float32 array of shape (1, _input_dim)."""
        return np.asarray(state, dtype=np.float32).reshape(1, self._input_dim)

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))
        # The deque silently evicts old entries at maxlen, so derive the count
        # from the buffer instead of incrementing without bound.
        self.memory_len = len(self.memory)

    def act(self, state):
        """Choose an action with an epsilon-greedy policy.

        Returns:
            int: An action index in ``range(action_size)``.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        q_values = self.model.predict(self._to_batch(state))[0]
        return int(np.argmax(q_values))

    def epsilon_decay(self, episode, max_episodes):
        """Linear exploration schedule: 1.0 at episode 0, 0.3 at ``max_episodes``.

        The result is clamped to ``[epsilon_min, 1.0]`` so that episodes beyond
        ``max_episodes`` cannot yield a negative exploration rate.
        """
        epsilon_t = 1 - 0.7 * episode / max_episodes
        return min(1.0, max(self.epsilon_min, epsilon_t))

    def replay(self, batch_size):
        """Train the online network on one random minibatch from the buffer.

        Targets are ``reward`` for terminal transitions and
        ``reward + gamma * max_a Q_target(next_state, a)`` otherwise. The whole
        minibatch is evaluated and fitted in a single batched call rather than
        one Keras call per sample. Does nothing when the buffer holds fewer
        than ``batch_size`` transitions.
        """
        if batch_size <= 0 or len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        state_batch = np.vstack([self._to_batch(s) for s in states])
        next_state_batch = np.vstack([self._to_batch(s) for s in next_states])
        action_idx = np.asarray(actions, dtype=int)
        reward_vec = np.asarray(rewards, dtype=np.float32)
        # 1.0 where the episode continues, 0.0 where it ended: masks the
        # bootstrap term for terminal transitions.
        not_done = np.asarray([not d for d in dones], dtype=np.float32)

        next_q = np.asarray(self.target_model.predict(next_state_batch))
        targets = reward_vec + self.gamma * np.max(next_q, axis=1) * not_done

        # Start from the current predictions so only the taken action's
        # Q-value contributes to the loss.
        target_f = np.array(self.model.predict(state_batch))
        target_f[np.arange(batch_size), action_idx] = targets
        self.model.fit(state_batch, target_f, batch_size=batch_size, epochs=1, verbose=0)

### Step 2: Create the GridWorld environment and function for 2D grid representation

In [51]:
class GridWorld:
    """Square grid environment with walls, a start cell and a goal cell.

    Positions are ``(row, col)`` tuples. An episode ends when the agent reaches
    the goal, or tries to move into a wall or off the grid.

    Args:
        grid_size: Side length of the square grid.
        walls: Collection of ``(row, col)`` cells that cannot be entered.
        start_position: Cell the agent occupies after ``reset()``.
        goal_position: Cell that ends the episode with the goal reward.
    """

    def __init__(self, grid_size, walls, start_position, goal_position):
        self.grid_size = grid_size
        self.walls = walls
        self.start_position = start_position
        self.goal_position = goal_position
        self.agent_position = start_position
        # Cell occupied before the most recent move; used to detect backtracking.
        self.prev_position = start_position
        self.done = False

    def reset(self):
        """Put the agent back on the start cell and return the initial state."""
        self.agent_position = self.start_position
        self.prev_position = self.start_position
        self.done = False
        return self.get_state()

    def get_state(self):
        """Return the grid as a ``(grid_size, grid_size)`` float array.

        Cell values: 1 = agent, 2 = goal, -1 = wall, 0 = empty. The goal and
        walls are written after the agent, so they win if cells coincide.
        """
        state = np.zeros((self.grid_size, self.grid_size))
        state[self.agent_position[0], self.agent_position[1]] = 1
        state[self.goal_position[0], self.goal_position[1]] = 2
        for wall in self.walls:
            state[wall[0], wall[1]] = -1
        return state

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done)``.

        Actions index the offsets ``(-1, 0), (1, 0), (0, -1), (0, 1)`` applied
        to ``(row, col)``.

        Rewards:
            -10 and episode end when the move leaves the grid or enters a wall
                (the agent stays where it was);
            +10 and episode end on reaching the goal;
            -2  when the move returns to the cell occupied before the last move;
            +2  for any other valid move.

        Raises:
            ValueError: If called after the episode has ended.
        """
        if self.done:
            raise ValueError("Cannot step in a terminal state.")

        actions = [(-1, 0), (1, 0), (0, -1), (0, 1)]
        move = actions[action]
        new_position = (self.agent_position[0] + move[0], self.agent_position[1] + move[1])

        inside_grid = (0 <= new_position[0] < self.grid_size
                       and 0 <= new_position[1] < self.grid_size)
        if not inside_grid or new_position in self.walls:
            self.done = True
            return self.get_state(), -10, self.done

        # Decide this BEFORE overwriting prev_position. Comparing after the
        # update would test the new cell against the cell just left, which can
        # never be equal, making the penalty unreachable.
        backtracked = new_position == self.prev_position

        self.prev_position = self.agent_position
        self.agent_position = new_position

        if self.agent_position == self.goal_position:
            self.done = True
            step_reward = 10
        elif backtracked:
            step_reward = -2
        else:
            step_reward = 2

        return self.get_state(), step_reward, self.done

### Step 3: Initialize the GridWorld environment and agent

In [52]:
# Seed the random number generators before the networks are built so that
# weight initialisation and exploration are repeatable across
# "Restart Kernel -> Run All".
SEED = 42
utils.set_random_seed(SEED)

grid_size = 5
walls = [(1, 3), (1, 2), (2, 3), (3, 3), (2, 0), (2, 1)]  # Adding walls to the grid
start_position = (0, 0)
goal_position = (4, 4)

# The state size represents the dimensions of the grid (grid_size x grid_size)
state_size = (grid_size, grid_size)
action_size = 4  # One action per entry in GridWorld.step's move table

env = GridWorld(grid_size, walls, start_position, goal_position)
agent = DQNAgent(state_size, action_size)

### Step 4: Train the DQN

In [53]:
def train_agent(num_episodes=1000, max_steps_per_episode=500, batch_size=32):
    """Train the global ``agent`` on the global ``env``.

    Each episode runs until the environment reports ``done`` or
    ``max_steps_per_episode`` steps have been taken, whichever comes first.
    Every transition is stored in the agent's replay memory. After an episode:

    * every ``agent.weight_update_frequency`` episodes one replay minibatch is
      trained, provided the buffer holds more than ``batch_size`` transitions;
    * every ``agent.target_update_frequency`` episodes the target network is
      synchronised;
    * the exploration rate is advanced along ``agent.epsilon_decay``.

    Args:
        num_episodes: Number of training episodes.
        max_steps_per_episode: Upper bound on steps in one episode.
        batch_size: Minibatch size handed to ``agent.replay``.

    Returns:
        list: Total reward collected in each episode, in order.
    """
    print("Training started...")

    total_rewards_collected = []
    for episode in range(1, num_episodes+1):
        state = env.reset()
        done = False
        total_reward = 0
        step_count = 0
        # The step cap guards against policies that wander forever without
        # ever hitting a wall or the goal.
        while not done and step_count < max_steps_per_episode:
            action = agent.act(state)
            next_state, reward, done = env.step(action)
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward
            step_count += 1

        total_rewards_collected.append(total_reward)

        print('Episode:',episode, ': ',total_reward)
        print('Agent Position: ',env.agent_position)

        if env.agent_position == env.goal_position:
            print('**************Reached the Goal!******************')

        # Check the real buffer size: the replay memory is bounded, so a
        # running counter can overstate how many samples are available.
        if episode % agent.weight_update_frequency == 0 and len(agent.memory) > batch_size:
            agent.replay(batch_size)

        if episode % agent.target_update_frequency == 0:
            agent.update_target_model()

        if agent.epsilon > agent.epsilon_min:
            # Never let the schedule push exploration below the floor.
            agent.epsilon = max(agent.epsilon_min, agent.epsilon_decay(episode, num_episodes))

    print("Training Completed!")
    return total_rewards_collected

### Step 5: Evaluate the trained agent

In [54]:
def test_agent(num_episodes=10, max_steps_per_episode=1000):
    print("Testing started...")

    for episode in range(num_episodes):
        state = env.reset()
        done = False
        total_reward = 0
        step_count = 0
        
        while step_count < max_steps_per_episode:
            action = agent.act(state)
            next_state, reward, done = env.step(action)
            state = next_state
            total_reward += reward
            step_count += 1

            if done:
                print(env.agent_position)
                if env.agent_position == env.goal_position:
                    print('**************Reached the Goal!******************')
                else:
                    print('xxxxxxxxxxxxxxxxxxxx Hit a wall xxxxxxxxxxxxxxxxxxxx')
                break

        print(f"Evaluation Episode: {episode + 1}, Reward: {total_reward}")
    
    print("Finished testing!")

### Final Training and Testing

In [55]:
# Episode budgets: one for the training run, one for the follow-up evaluation.
num_episodes_training, num_episodes_testing = 2000, 3

train_agent(num_episodes=num_episodes_training)

Training started...
Episode: 1 :  -10
Agent Position:  (0, 0)
Episode: 2 :  -8
Agent Position:  (1, 0)
Episode: 3 :  -10
Agent Position:  (0, 0)
Episode: 4 :  -10
Agent Position:  (0, 0)
Episode: 5 :  -10
Agent Position:  (0, 0)
Episode: 6 :  -6
Agent Position:  (1, 1)
Episode: 7 :  -8
Agent Position:  (0, 1)
Episode: 8 :  -6
Agent Position:  (1, 1)
Episode: 9 :  -8
Agent Position:  (1, 0)
Episode: 10 :  -10
Agent Position:  (0, 0)
Episode: 11 :  -10
Agent Position:  (0, 0)
Episode: 12 :  -8
Agent Position:  (1, 0)
Episode: 13 :  -10
Agent Position:  (0, 0)
Episode: 14 :  -10
Agent Position:  (0, 0)
Episode: 15 :  -10
Agent Position:  (0, 0)
Episode: 16 :  -10
Agent Position:  (0, 0)
Episode: 17 :  -6
Agent Position:  (1, 1)
Episode: 18 :  -8
Agent Position:  (0, 1)
Episode: 19 :  -10
Agent Position:  (0, 0)
Episode: 20 :  -10
Agent Position:  (0, 0)
Episode: 21 :  -10
Agent Position:  (0, 0)
Episode: 22 :  -10
Agent Position:  (0, 0)
Episode: 23 :  -10
Agent Position:  (0, 0)
Episode:

KeyboardInterrupt: 

In [None]:
# agent.epsilon = 0
# test_agent(num_episodes_testing)

In [None]:
# Sanity check of the exploration schedule: draw one explore/exploit decision
# per episode over a 10000-episode horizon and tally both outcomes.
tally = {True: 0, False: 0}
for episode in range(10000):
    explores = random.uniform(0, 1) <= agent.epsilon_decay(episode, 10000)
    tally[explores] += 1
explore_count, exploit_count = tally[True], tally[False]
print("Explored: ",explore_count)
print("Exploit: ", exploit_count)