In [11]:
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tqdm import tqdm
import gym
import matplotlib.pyplot as plt
import time

In [12]:
# Hyperparameters
n_episodes = 10000
batch_size = 10
gamma = 0.95
learning_rate = 0.1
epsilon_decay = 0.09
n_actions = 4  # Number of actions in FrozenLake

In [15]:
negative_reward_enabled = False
is_slippery_enabled = False

In [16]:
custom_map = [
    'SFFF',
    'FHFF',
    'FFHF',
    'HFGF'
]

In [13]:
# Initialize the environment
env = gym.make('FrozenLake-v1', desc=custom_map, is_slippery=is_slippery_enabled)
n_states = env.observation_space.n

# Create the Q-network model
model = keras.Sequential([
    keras.layers.InputLayer(batch_input_shape=(1, n_states)),
    keras.layers.Dense(10, activation='relu'),
    keras.layers.Dense(n_actions, activation='linear')
])
optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
loss_fn = keras.losses.mean_squared_error

In [14]:
# Metrics
total_rewards = []
total_steps = []
success_rate = []

# Function to run episodes and train the agent
for episode in tqdm(range(n_episodes)):
    state = env.reset()
    done = False
    episode_reward = 0
    episode_steps = 0

    while not done:
        # Epsilon-greedy action selection
        epsilon = max(1 - episode * epsilon_decay / (n_episodes - 1), 0.01)
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            q_values = model.predict(state.reshape(1, -1))
            action = np.argmax(q_values[0])

        # Take action
        next_state, reward, done, _ = env.step(action)

        # Modify reward for the hole case if needed
        if negative_reward_enabled:
            if done and reward == 0:  # Optionally penalize the agent for falling in a hole
                reward = -1

        episode_reward += reward
        episode_steps += 1

        # Store in replay buffer
        target = reward if done else reward + gamma * np.max(model.predict(next_state.reshape(1, -1))[0])
        target_f = model.predict(state.reshape(1, -1))
        target_f[0][action] = target
        model.fit(state.reshape(1, -1), target_f, epochs=1, verbose=0)

        state = next_state

    total_rewards.append(episode_reward)
    total_steps.append(episode_steps)
    success_rate.append(int(episode_reward > 0))

    # Print metrics every 100 episodes
    if (episode + 1) % 100 == 0:
        average_reward = sum(total_rewards[-100:]) / 100
        average_steps = sum(total_steps[-100:]) / 100
        success_percentage = sum(success_rate[-100:]) / 100
        print(f"Episode: {episode + 1}, Avg Reward: {average_reward}, Avg Steps: {average_steps}, Success Rate: {success_percentage}")



  0%|          | 0/10000 [00:00<?, ?it/s]


AttributeError: 'int' object has no attribute 'reshape'

In [None]:
# Post training metrics
overall_average_reward = np.mean(total_rewards)
overall_average_steps = np.mean(total_steps)
overall_success_rate = np.mean(success_rate)

print('----------------------------------------------------------')
print("Overall Average reward:", overall_average_reward)
print("Overall Average number of steps:", overall_average_steps)
print("Success rate (%):", overall_success_rate * 100)
print('----------------------------------------------------------')

In [None]:
# Plotting metrics
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
axs[0].plot(total_rewards, 'tab:green')
axs[0].set_title('Reward per Episode')
axs[1].plot(total_steps, 'tab:purple')
axs[1].set_title('Steps per Episode')
plt.show()