In [27]:
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tqdm import tqdm
import gym
import matplotlib.pyplot as plt
import time

In [28]:
# Hyperparameters
n_episodes = 100
batch_size = 10
gamma = 0.95
learning_rate = 0.1
epsilon_decay = 0.09
n_actions = 4  # Number of actions in FrozenLake

In [29]:
negative_reward_enabled = False
is_slippery_enabled = False

In [30]:
custom_map = [
    'SFFF',
    'FHFF',
    'FFHF',
    'HFGF'
]

In [31]:
# Initialize the environment
env = gym.make('FrozenLake-v1', desc=custom_map, is_slippery=is_slippery_enabled)
n_states = env.observation_space.n

# Create the Q-network model
model = keras.Sequential([
    keras.layers.InputLayer(batch_input_shape=(1, n_states)),
    keras.layers.Dense(10, activation='relu'),
    keras.layers.Dense(n_actions, activation='linear')
])
optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
loss_fn = keras.losses.mean_squared_error

In [32]:
model.compile(optimizer='adam', loss='mean_squared_error')

def one_hot(state, state_space=16):
    return np.identity(state_space)[state:state + 1].reshape(1, -1)

total_rewards = []
total_steps = []
success_rate = []

# Early stopping parameters
early_stopping_enabled = True
early_stopping_threshold = 0.75  # 75% average reward
early_stopping_check_every = 100  # check every 100 episodes
early_stopping_patience = 100  # patience counter (you may not need this depending on your early stopping criteria)

start_time = time.time()

episodes_to_train = n_episodes

for episode in tqdm(range(n_episodes)):
    state = env.reset()
    state = one_hot(state)  # One-hot encode the initial state
    done = False
    episode_reward = 0
    episode_steps = 0

    while not done:
        # Epsilon-greedy action selection
        epsilon = max(1 - episode * epsilon_decay / (n_episodes - 1), 0.01)
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            q_values = model.predict(state)
            action = np.argmax(q_values[0])

        # Take action
        next_state, reward, done, _ = env.step(action)
        next_state = one_hot(next_state)  # One-hot encode the next state

        # Modify reward for the hole case if needed
        if done and reward == 0:
            reward = -1

        episode_reward += reward
        episode_steps += 1

        # Store in replay buffer and train
        target = reward if done else reward + gamma * np.max(model.predict(next_state)[0])
        target_f = model.predict(state)
        target_f[0][action] = target
        model.fit(state, target_f, epochs=1, verbose=0)

        # Update state
        state = next_state

    total_rewards.append(episode_reward)
    total_steps.append(episode_steps)
    success_rate.append(int(episode_reward > 0))

    # Print metrics every 100 episodes
    if (episode + 1) % 100 == 0:
        average_reward = np.mean(total_rewards[-100:])
        average_steps = np.mean(total_steps[-100:])
        success_percentage = np.mean(success_rate[-100:])
        print(f"Episode: {episode + 1}, Avg Reward: {average_reward}, Avg Steps: {average_steps}, Success Rate: {success_percentage * 100}%")

    # Early stopping check
    if early_stopping_enabled and (episode + 1) % early_stopping_check_every == 0:
        average_reward = np.mean(total_rewards[-early_stopping_check_every:])
        success_percentage = np.mean(success_rate[-early_stopping_check_every:])
        print(f"Episode: {episode + 1}, Avg Reward: {average_reward}, Success Rate: {success_percentage * 100}%")
        
        # Check if average reward is greater than the early stopping threshold
        if average_reward >= early_stopping_threshold:
            print(f"Early stopping triggered at episode {episode + 1}.")
            episodes_to_train = episode + 1
            break  # Break out of the loop to stop training

# End training timer
end_time = time.time()
training_time = end_time - start_time


  0%|          | 0/100 [00:00<?, ?it/s]



  1%|          | 1/100 [00:02<04:22,  2.65s/it]



  2%|▏         | 2/100 [00:06<05:35,  3.42s/it]



  3%|▎         | 3/100 [00:10<05:31,  3.42s/it]



  4%|▍         | 4/100 [00:12<05:08,  3.22s/it]



  5%|▌         | 5/100 [00:17<05:52,  3.72s/it]



  6%|▌         | 6/100 [00:20<05:19,  3.40s/it]



  7%|▋         | 7/100 [00:20<03:42,  2.39s/it]



  8%|▊         | 8/100 [00:21<02:44,  1.79s/it]



  9%|▉         | 9/100 [00:22<02:44,  1.81s/it]



 10%|█         | 10/100 [00:25<03:12,  2.14s/it]



 11%|█         | 11/100 [00:28<03:27,  2.33s/it]



 12%|█▏        | 12/100 [00:31<03:37,  2.48s/it]



 13%|█▎        | 13/100 [00:32<03:05,  2.14s/it]



 14%|█▍        | 14/100 [00:33<02:24,  1.68s/it]



 15%|█▌        | 15/100 [00:34<01:59,  1.40s/it]



 16%|█▌        | 16/100 [00:35<01:45,  1.26s/it]



 17%|█▋        | 17/100 [00:35<01:33,  1.13s/it]



 18%|█▊        | 18/100 [00:36<01:22,  1.00s/it]



 19%|█▉        | 19/100 [00:37<01:16,  1.06it/s]



 20%|██        | 20/100 [00:38<01:21,  1.02s/it]



 21%|██        | 21/100 [00:39<01:18,  1.01it/s]



 22%|██▏       | 22/100 [00:39<01:02,  1.26it/s]



 23%|██▎       | 23/100 [00:41<01:10,  1.09it/s]



 24%|██▍       | 24/100 [00:42<01:10,  1.08it/s]



 25%|██▌       | 25/100 [00:42<01:00,  1.24it/s]



 26%|██▌       | 26/100 [00:43<01:03,  1.16it/s]



 27%|██▋       | 27/100 [00:44<01:00,  1.21it/s]



 28%|██▊       | 28/100 [00:46<01:34,  1.32s/it]



 29%|██▉       | 29/100 [00:48<01:38,  1.39s/it]



 30%|███       | 30/100 [00:50<01:52,  1.61s/it]



 31%|███       | 31/100 [00:50<01:25,  1.25s/it]



 32%|███▏      | 32/100 [00:51<01:21,  1.20s/it]



 33%|███▎      | 33/100 [00:54<01:48,  1.62s/it]



 34%|███▍      | 34/100 [00:55<01:37,  1.47s/it]



 35%|███▌      | 35/100 [00:57<01:37,  1.49s/it]



 36%|███▌      | 36/100 [01:00<02:18,  2.17s/it]



 37%|███▋      | 37/100 [01:08<04:05,  3.90s/it]



 38%|███▊      | 38/100 [01:09<02:58,  2.87s/it]



 39%|███▉      | 39/100 [01:11<02:46,  2.72s/it]



 40%|████      | 40/100 [01:12<02:14,  2.25s/it]



 41%|████      | 41/100 [01:15<02:15,  2.30s/it]



 42%|████▏     | 42/100 [01:16<01:47,  1.85s/it]



 43%|████▎     | 43/100 [01:17<01:42,  1.80s/it]



 44%|████▍     | 44/100 [01:21<02:09,  2.30s/it]



 45%|████▌     | 45/100 [01:21<01:39,  1.81s/it]



 46%|████▌     | 46/100 [01:23<01:40,  1.86s/it]



 47%|████▋     | 47/100 [01:24<01:15,  1.42s/it]



 48%|████▊     | 48/100 [01:25<01:02,  1.20s/it]



 49%|████▉     | 49/100 [01:26<00:59,  1.17s/it]



 50%|█████     | 50/100 [01:27<01:00,  1.22s/it]



 51%|█████     | 51/100 [01:29<01:16,  1.56s/it]



 52%|█████▏    | 52/100 [01:30<01:08,  1.43s/it]



 53%|█████▎    | 53/100 [01:32<01:06,  1.42s/it]



 54%|█████▍    | 54/100 [01:40<02:36,  3.40s/it]



 55%|█████▌    | 55/100 [01:42<02:14,  2.98s/it]



 56%|█████▌    | 56/100 [01:44<01:55,  2.62s/it]



 57%|█████▋    | 57/100 [01:44<01:27,  2.03s/it]



 58%|█████▊    | 58/100 [01:45<01:06,  1.58s/it]



 59%|█████▉    | 59/100 [01:48<01:28,  2.15s/it]



 60%|██████    | 60/100 [01:53<01:52,  2.81s/it]



 61%|██████    | 61/100 [01:55<01:47,  2.76s/it]



 62%|██████▏   | 62/100 [02:00<02:01,  3.21s/it]



 63%|██████▎   | 63/100 [02:05<02:25,  3.94s/it]



 64%|██████▍   | 64/100 [02:06<01:47,  2.98s/it]



 65%|██████▌   | 65/100 [02:07<01:23,  2.38s/it]



 66%|██████▌   | 66/100 [02:12<01:43,  3.06s/it]



 67%|██████▋   | 67/100 [02:13<01:21,  2.47s/it]



 68%|██████▊   | 68/100 [02:13<00:59,  1.86s/it]



 69%|██████▉   | 69/100 [02:13<00:42,  1.39s/it]



 70%|███████   | 70/100 [02:15<00:41,  1.38s/it]



 71%|███████   | 71/100 [02:15<00:31,  1.10s/it]



 72%|███████▏  | 72/100 [02:19<00:53,  1.89s/it]



 73%|███████▎  | 73/100 [02:20<00:48,  1.79s/it]



 74%|███████▍  | 74/100 [02:21<00:36,  1.40s/it]



 75%|███████▌  | 75/100 [02:25<00:56,  2.25s/it]



 76%|███████▌  | 76/100 [02:26<00:41,  1.71s/it]



 77%|███████▋  | 77/100 [02:26<00:29,  1.28s/it]



 78%|███████▊  | 78/100 [02:31<00:53,  2.45s/it]



 79%|███████▉  | 79/100 [02:33<00:47,  2.27s/it]



 80%|████████  | 80/100 [02:34<00:36,  1.82s/it]



 81%|████████  | 81/100 [02:34<00:27,  1.47s/it]



 82%|████████▏ | 82/100 [02:43<01:05,  3.66s/it]



 83%|████████▎ | 83/100 [02:46<00:55,  3.27s/it]



 84%|████████▍ | 84/100 [02:47<00:44,  2.81s/it]



 85%|████████▌ | 85/100 [02:48<00:34,  2.31s/it]



 86%|████████▌ | 86/100 [02:52<00:37,  2.70s/it]



 87%|████████▋ | 87/100 [03:01<00:57,  4.45s/it]



 88%|████████▊ | 88/100 [03:01<00:40,  3.34s/it]



 89%|████████▉ | 89/100 [03:06<00:41,  3.75s/it]



 90%|█████████ | 90/100 [03:06<00:27,  2.76s/it]



 91%|█████████ | 91/100 [03:08<00:21,  2.36s/it]



 92%|█████████▏| 92/100 [03:08<00:13,  1.74s/it]



 93%|█████████▎| 93/100 [03:15<00:21,  3.13s/it]



 94%|█████████▍| 94/100 [03:15<00:13,  2.32s/it]



 95%|█████████▌| 95/100 [03:17<00:11,  2.26s/it]



 96%|█████████▌| 96/100 [03:18<00:07,  1.85s/it]



 97%|█████████▋| 97/100 [03:21<00:06,  2.19s/it]



 98%|█████████▊| 98/100 [03:23<00:04,  2.22s/it]



 99%|█████████▉| 99/100 [03:24<00:01,  1.78s/it]



100%|██████████| 100/100 [03:32<00:00,  2.12s/it]

Episode: 100, Avg Reward: -0.98, Avg Steps: 8.13, Success Rate: 1.0%
Episode: 100, Avg Reward: -0.98, Success Rate: 1.0%





In [15]:
# Post training metrics
overall_average_reward = np.mean(total_rewards)
overall_average_steps = np.mean(total_steps)
overall_success_rate = np.mean(success_rate)

print('----------------------------------------------------------')
print("Overall Average reward:", overall_average_reward)
print("Overall Average number of steps:", overall_average_steps)
print("Success rate (%):", overall_success_rate * 100)
print('----------------------------------------------------------')

----------------------------------------------------------
Overall Average reward: -0.96
Overall Average number of steps: 9.81
Success rate (%): 2.0
----------------------------------------------------------


In [19]:
def test_dqn_policy(env, model, num_tests=100):
    success_count = 0
    step_list = []  # To store the number of steps for successful episodes

    for _ in range(num_tests):
        state = env.reset()
        state = one_hot(state)  # Apply one-hot encoding to the initial state
        done = False
        step_count = 0
        
        while not done:
            q_values = model.predict(state)
            action = np.argmax(q_values[0])
            
            next_state, reward, done, _ = env.step(action)
            next_state = one_hot(next_state)  # Apply one-hot encoding to the next state
            step_count += 1
            
            # Assuming reward of 1 or higher indicates success
            if done and reward >= 1:
                success_count += 1
                step_list.append(step_count)
            
            state = next_state  # Update the state

    success_rate = success_count / num_tests
    average_steps_when_winning = np.mean(step_list) if step_list else float('inf')

    return success_rate, average_steps_when_winning

# After training your DQN model, you can call this function to evaluate it.
post_training_success_rate, average_steps_when_winning = test_dqn_policy(env, model)




KeyboardInterrupt: 

In [None]:
print('')
print('Q-Learning with Negative reward: ', negative_reward_enabled, '; and slippery: ', is_slippery_enabled)
print('==========================================================')
print('The number of episodes', episodes_to_train)
print('Post-Training Success rate (%):', post_training_success_rate * 100)
print('Average number of steps when winning:', average_steps_when_winning)
print('Training Time (seconds):', training_time)
print('==========================================================')

In [None]:
# Plotting metrics
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
axs[0].plot(total_rewards, 'tab:green')
axs[0].set_title('Reward per Episode')
axs[1].plot(total_steps, 'tab:purple')
axs[1].set_title('Steps per Episode')
plt.show()