In [4]:
# Display the installed gym version. The cells below unpack five values from
# env.step() and index the return value of env.reset(), so they depend on the
# API of the version recorded here.
import gym
gym.__version__
# Version when this notebook was last run: 0.26.2

'0.26.2'

In [5]:
# TEST gym environment
import gym
import numpy as np
import time

# Any environment ID with a discrete action space can be substituted here,
# because the loop below draws actions from range(env.action_space.n).
env = gym.make('FrozenLake-v1', render_mode="human")

action_size = env.action_space.n
print('Number of actions:', action_size)

n_episodes = 1
try:
    for episode in range(n_episodes):
        # reset() returns an (observation, info) pair. Only the first episode
        # is seeded so that later episodes continue the environment's random
        # stream instead of replaying it.
        observation, info = env.reset(seed=42 if episode == 0 else None)

        # step() reports two separate end-of-episode flags: `terminated` (the
        # task itself ended) and `truncated` (a time limit cut the episode
        # short). The episode is over as soon as either one is set.
        terminated = truncated = False
        while not (terminated or truncated):
            action = np.random.randint(0, action_size)
            observation, reward, terminated, truncated, info = env.step(action)
            print('Action:', action)
            print('Observation:', observation)
            print('Reward:', reward)
            env.render()
            time.sleep(0.1)
finally:
    # Release the environment (and its render window) even if a step raises.
    env.close()


Number of actions: 4
Action: 1
Observation: 4
Reward: 0.0
Action: 3
Observation: 4
Reward: 0.0
Action: 0
Observation: 8
Reward: 0.0
Action: 3
Observation: 9
Reward: 0.0
Action: 2
Observation: 5
Reward: 0.0


In [6]:
# Choose an action based on epsilon-greedy strategy
def choose_action(observation, q_table, epsilon):
    """Pick an action for ``observation`` with an epsilon-greedy rule.

    Args:
        observation: Index of the current state; used as ``q_table[observation]``.
        q_table: Table holding one row of per-action Q-values for each state.
        epsilon: Probability of exploring (choosing uniformly at random)
            instead of exploiting the current Q-values.

    Returns:
        The index of the chosen action.
    """
    action_values = q_table[observation]

    if np.random.random() < epsilon:
        # Explore. The number of actions is read from the table row rather
        # than hard-coded, so the random draw can never fall outside the
        # table's action range.
        return np.random.randint(0, len(action_values))

    # Exploit: the action with the highest Q-value (first one on ties).
    return np.argmax(action_values)

# Define a Q-learning training function for a single episode
def q_learning_episode(env, q_table, epsilon=0.2, alpha=0.1, gamma=0.9, max_steps=5000):
    """Run one training episode and update ``q_table`` in place with Q-learning.

    Args:
        env: Environment whose ``reset(seed=...)`` returns a sequence with the
            observation first and whose ``step(action)`` returns
            ``(observation, reward, terminated, truncated, info)``.
        q_table: 2-D array indexed as ``q_table[state, action]``; modified in place.
        epsilon: Exploration probability passed to ``choose_action``.
        alpha: Learning rate of the Q-value update.
        gamma: Discount factor applied to the next state's best Q-value.
        max_steps: Hard cap on the number of steps taken in this episode.

    Returns:
        The same ``q_table`` object that was passed in, after the updates.
    """
    # NOTE: the environment is re-seeded with the same seed on every episode.
    observation = env.reset(seed=42)[0]

    for _step in range(max_steps):
        # Choose action based on the epsilon-greedy policy
        action = choose_action(observation, q_table, epsilon)

        # The environment's own reward is discarded and replaced below.
        new_observation, _env_reward, terminated, truncated, _info = env.step(action)

        # Shaped reward keyed on hard-coded state indices -- presumably the
        # holes (5, 7, 11, 12) and the goal (15) of the default 4x4 map; other
        # maps would need different indices.
        if new_observation in [5, 7, 11, 12]:
            reward = -1
        elif new_observation == 15:
            reward = 10
        else:
            reward = -0.1

        # A terminal state has no future return, so do not bootstrap from it.
        # (For a zero-initialised table this equals the stored value, because
        # rows of terminal states are never written to.)
        future_value = 0.0 if terminated else np.max(q_table[new_observation])

        # Q-learning update: move the estimate toward the one-step target.
        td_target = reward + gamma * future_value
        q_table[observation, action] += alpha * (td_target - q_table[observation, action])

        observation = new_observation

        # Stop on either end-of-episode flag; a truncated environment must be
        # reset before it is stepped again.
        if terminated or truncated:
            break

    return q_table


In [7]:
# Import required libraries
import gym
import numpy as np
import pandas as pd

# Define a function to train a Q-learning agent
def train_agent(n_episodes=200):
    """Train a tabular Q-learning agent on non-slippery FrozenLake.

    Args:
        n_episodes: Number of training episodes to run.

    Returns:
        The learned Q-table as an array of shape
        ``(number_of_states, number_of_actions)``.
    """
    # Create a FrozenLake environment with no slipperiness
    env = gym.make('FrozenLake-v1', is_slippery=False)

    try:
        # Table dimensions come from the environment's discrete spaces.
        number_of_actions = env.action_space.n
        number_of_states = env.observation_space.n

        # Initialize the Q-table with zeros
        q_table = np.zeros((number_of_states, number_of_actions))

        # Print information about the shape of the Q-table
        print('Q Table - SHAPE:', q_table.shape)
        print("----------------")

        # Each episode updates the table and hands it back.
        for _episode in range(n_episodes):
            q_table = q_learning_episode(env, q_table)
    finally:
        # Release the environment even if training raises.
        env.close()

    # Return the final Q-table
    return q_table

# Run training; the learned table is reused by the evaluation cell further down.
q_table = train_agent()

# One labelled column per action index, in column order 0..3
# (TODO confirm the labels against the environment's action encoding).
action_labels = ['Left', 'Down', 'Right', 'Up']
df = pd.DataFrame(data=q_table, columns=action_labels)
df = df.round(decimals=3)

# Persist the rounded table without the row index, three decimals per value.
df.to_csv('q_table.csv', index=False, float_format="%.3f")

# Leave the frame as the cell's last expression so the notebook renders it.
df


Q Table - SHAPE: (16, 4)
----------------


  if not isinstance(terminated, (bool, np.bool8)):


Unnamed: 0,Left,Down,Right,Up
0,1.522,5.48,-0.208,1.521
1,-0.173,-0.469,0.019,-0.165
2,-0.11,0.912,-0.101,-0.105
3,-0.095,-0.344,-0.095,-0.086
4,3.218,6.211,-0.85,0.961
5,0.0,0.0,0.0,0.0
6,-0.344,3.138,-0.271,-0.047
7,0.0,0.0,0.0,0.0
8,3.187,-0.613,7.017,1.44
9,3.506,7.909,2.614,-0.686


In [8]:
# Define a function to evaluate the performance of a Q-learning agent
def evaluate_q_learning(env, q_table):
    """Roll out the greedy policy of ``q_table`` for one episode, logging each step.

    The environment is reset with a fixed seed, stepped with the
    highest-valued action of the current state until the episode ends, and
    then closed.

    Args:
        env: Environment whose ``reset(seed=...)`` returns a sequence with the
            observation first and whose ``step(action)`` returns
            ``(observation, reward, terminated, truncated, info)``. It is
            closed before this function returns.
        q_table: Table indexed as ``q_table[state]`` giving per-action Q-values.

    Returns:
        None. Progress is reported through ``print``.
    """
    steps = 0
    terminated = truncated = False

    try:
        # Reset the environment and set the initial observation
        observation = env.reset(seed=42)[0]

        # Stop on either end-of-episode flag. Checking `truncated` matters
        # here: a greedy policy that never reaches a terminal state would
        # otherwise keep this loop running forever.
        while not (terminated or truncated):
            # Greedy policy: the action with the highest Q-value.
            action = np.argmax(q_table[observation])

            observation, _reward, terminated, truncated, _info = env.step(action)

            # Render the environment
            env.render()

            # Increment the step counter and print information about the current step
            steps += 1
            print('Action: ', action)
            print('Observation: ', observation)
            print('Number of steps: ', steps)
    finally:
        # Close the environment even if a step raises.
        env.close()

# Create an instance of the FrozenLake environment with rendering enabled
env = gym.make('FrozenLake-v1', render_mode="human", is_slippery=False)

# reset() returns an (observation, info) pair; keep only the observation, the
# same way the other reset calls in this notebook do.
observation = env.reset(seed=42)[0]

# Replay the greedy policy of the trained Q-table. The function resets the
# environment itself and closes it when the episode is over.
evaluate_q_learning(env, q_table)


  if not isinstance(terminated, (bool, np.bool8)):


Action:  1
Observation:  4
Number of steps:  1
Action:  1
Observation:  8
Number of steps:  2
Action:  2
Observation:  9
Number of steps:  3
Action:  1
Observation:  13
Number of steps:  4
Action:  2
Observation:  14
Number of steps:  5
Action:  2
Observation:  15
Number of steps:  6
