In [22]:
import numpy as np
import gym
import random

# Train a tabular Q-learning agent on FrozenLake.
#
# Produces the module-level `q_table` (states x actions) that later cells use.
# The 5-value unpacking of `env.step` below requires the gym>=0.26 API, in
# which `env.reset` returns `(observation, info)` and accepts `seed=`.

# Initialize the FrozenLake environment
env = gym.make('FrozenLake-v1', is_slippery=True)  # Use is_slippery=True for a more challenging environment

# Seed every source of randomness this cell uses so a fresh
# "Restart & Run All" reproduces the same training run:
#   - `random` drives the epsilon test,
#   - the action space has its own RNG for `sample()`,
#   - the environment RNG is seeded through the first `reset` call below.
SEED = 42
random.seed(SEED)
env.action_space.seed(SEED)

# Initialize Q-Table
state_space_size = env.observation_space.n
action_space_size = env.action_space.n
q_table = np.zeros((state_space_size, action_space_size))

# Hyperparameters
alpha = 0.1     # Learning rate
gamma = 0.99    # Discount factor
epsilon = 1.0   # Initial exploration rate
epsilon_decay = 0.995
min_epsilon = 0.01
episodes = 1000
max_steps = 100

try:
    for episode in range(episodes):
        # Seed the environment once; later resets continue the same RNG stream.
        state, _ = env.reset(seed=SEED if episode == 0 else None)
        state = int(state)

        total_reward = 0

        for step in range(max_steps):
            # Validate BEFORE indexing: a negative state would otherwise wrap
            # around silently in `q_table[state]`.
            if not (0 <= state < state_space_size):
                raise ValueError(f"State {state} is out of bounds.")

            # Choose action using epsilon-greedy policy
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()  # Explore
            else:
                action = np.argmax(q_table[state])  # Exploit
            action = int(action)

            # Validate BEFORE stepping so an invalid action never reaches the env.
            if not (0 <= action < action_space_size):
                raise ValueError(f"Action {action} is out of bounds.")

            # Take action and observe the outcome.
            # `terminated`: the MDP reached a terminal state.
            # `truncated`: the episode was cut short (e.g. by a time limit).
            next_state, reward, terminated, truncated, info = env.step(action)
            next_state = int(next_state)
            done = terminated or truncated

            # Update Q-Table (Q-learning). A terminal state has no future
            # value, so bootstrap only when the episode did not terminate;
            # on truncation the next state is still a regular state.
            future_value = 0.0 if terminated else np.max(q_table[next_state])
            td_target = reward + gamma * future_value
            td_error = td_target - q_table[state, action]
            q_table[state, action] += alpha * td_error

            state = next_state
            total_reward += reward

            if done:
                break

        # Decay epsilon
        epsilon = max(min_epsilon, epsilon_decay * epsilon)

        # Print progress. Note: this is the reward of this single episode
        # only, not an average over the last 100 episodes.
        if (episode + 1) % 100 == 0:
            print(f"Episode {episode + 1}/{episodes}, Total Reward: {total_reward}")
finally:
    # Release environment resources even if training raised.
    env.close()

print("Training completed.")


Episode 100/1000, Total Reward: 0.0
Episode 200/1000, Total Reward: 0.0
Episode 300/1000, Total Reward: 0.0
Episode 400/1000, Total Reward: 0.0
Episode 500/1000, Total Reward: 0.0
Episode 600/1000, Total Reward: 0.0
Episode 700/1000, Total Reward: 0.0
Episode 800/1000, Total Reward: 0.0
Episode 900/1000, Total Reward: 0.0
Episode 1000/1000, Total Reward: 0.0
Training completed.


In [19]:
import numpy as np
import gym

# Evaluate the greedy policy derived from `q_table`.
#
# Requires `q_table` to have been created by the training cell in this kernel
# session (run the training cell first). Uses the gym>=0.26 API: `env.reset`
# returns `(observation, info)` and `env.step` returns five values.

# Initialize the FrozenLake environment
env = gym.make('FrozenLake-v1', is_slippery=True)  # Use is_slippery=True if training was done with this setting

# Hyperparameters for evaluation
num_episodes = 100  # Number of episodes for evaluation
EVAL_SEED = 123     # Seeds the environment RNG so the evaluation is reproducible

# Initialize variables for evaluation
total_rewards = []
total_steps = []

try:
    # Fail early with a clear message if the Q-table in the kernel does not
    # match this environment (e.g. stale state from out-of-order execution).
    expected_shape = (int(env.observation_space.n), int(env.action_space.n))
    if np.shape(q_table) != expected_shape:
        raise ValueError(
            f"q_table has shape {np.shape(q_table)}, expected {expected_shape}."
        )

    for episode in range(num_episodes):
        # Seed once; later resets continue the same RNG stream.
        state, _ = env.reset(seed=EVAL_SEED if episode == 0 else None)
        state = int(state)
        episode_reward = 0
        step = 0

        while True:
            # Choose action based on learned Q-table (greedy, no exploration)
            action = int(np.argmax(q_table[state]))

            # Take action and observe the outcome
            next_state, reward, terminated, truncated, info = env.step(action)
            next_state = int(next_state)

            episode_reward += reward
            state = next_state
            step += 1

            # Stop on a terminal state OR when the environment truncates the
            # episode; ignoring `truncated` lets a policy that never reaches a
            # terminal state run past the limit (potentially forever).
            if terminated or truncated:
                break

        total_rewards.append(episode_reward)
        total_steps.append(step)
finally:
    # Always release the environment, even if evaluation raised.
    env.close()

# Print evaluation results
average_reward = np.mean(total_rewards)
average_steps = np.mean(total_steps)

print(f"Evaluation over {num_episodes} episodes:")
print(f"Average Reward: {average_reward:.2f}")
print(f"Average Steps: {average_steps:.2f}")

Evaluation over 100 episodes:
Average Reward: 0.80
Average Steps: 39.83
