In [None]:
import gymnasium as gym
import numpy as np
import random
import matplotlib.pyplot as plt
import time

In [None]:
# Question 1

def main(
    learning_rate=0.6,
    discount_rate=0.9,
    epsilon=0.8,
    decay_rate=0,
    num_episodes=1000,
    max_steps=99,
    env=None,
    watch=True,
    seed=None,
):
    """Train a tabular Q-learning agent and optionally replay one greedy episode.

    With the default arguments this reproduces the Question 1 experiment:
    Taxi-v3, learning rate 0.6, discount 0.9 and a constant exploration
    rate of 0.8 (``decay_rate=0`` means epsilon never decays).

    Args:
        learning_rate: Step size of the Q-learning update.
        discount_rate: Discount factor applied to the bootstrapped value.
        epsilon: Initial probability of taking a random action.
        decay_rate: Exponential decay applied to ``epsilon`` per episode
            (``epsilon * exp(-decay_rate * episode)``); 0 keeps it constant.
        num_episodes: Number of training episodes.
        max_steps: Maximum steps per episode (training and replay).
        env: Environment exposing the Gymnasium API with discrete observation
            and action spaces. When ``None`` a Taxi-v3 environment with
            ``render_mode="rgb_array"`` is created. The environment is always
            closed before returning, including one supplied by the caller.
        watch: When true, wait for Enter and then render one greedy episode
            with matplotlib. When false, only training is performed.
        seed: Optional seed. When given, seeds the global ``random`` module,
            the action space and the first environment reset.

    Returns:
        numpy.ndarray: The learned Q-table, shape
        ``(env.observation_space.n, env.action_space.n)``.
    """
    # create Taxi environment (unless the caller supplied one)
    if env is None:
        env = gym.make("Taxi-v3", render_mode="rgb_array")

    try:
        if seed is not None:
            random.seed(seed)
            env.action_space.seed(seed)

        # initialize q-table
        state_size = env.observation_space.n
        action_size = env.action_space.n
        qtable = np.zeros((state_size, action_size))

        # `epsilon` is the configured starting value; this is the live one.
        current_epsilon = epsilon

        # training
        for episode in range(num_episodes):
            # Seed only the first reset so later episodes still differ.
            state, _ = env.reset(seed=seed if episode == 0 else None)

            for _step in range(max_steps):
                # exploration-exploitation tradeoff
                if random.uniform(0, 1) < current_epsilon:
                    action = env.action_space.sample()
                else:
                    action = int(np.argmax(qtable[state, :]))

                # take action and observe reward
                new_state, reward, terminated, truncated, _info = env.step(action)

                # Q-learning update; a terminal transition has no future value.
                future_value = 0.0 if terminated else np.max(qtable[new_state, :])
                td_target = reward + discount_rate * future_value
                qtable[state, action] += learning_rate * (td_target - qtable[state, action])

                state = new_state

                if terminated or truncated:
                    break

            # Decay relative to the configured starting epsilon. The previous
            # formula, exp(-decay_rate * episode), silently replaced the
            # configured value with 1.0 after the first episode.
            current_epsilon = epsilon * np.exp(-decay_rate * episode)

        print(f"Training completed over {num_episodes} episodes")

        if watch:
            input("Press Enter to watch trained agent...")

            # watch trained agent
            state, _ = env.reset()
            rewards = 0

            for s in range(max_steps):
                print("TRAINED AGENT")
                print("Step {}".format(s + 1))

                action = int(np.argmax(qtable[state, :]))
                new_state, reward, terminated, truncated, _info = env.step(action)
                rewards += reward

                time.sleep(0.5)  # Add delay for visualization
                frame = env.render()  # render once; the frame is what gets shown
                plt.imshow(frame)
                plt.axis('off')
                plt.show()

                print(f"score: {rewards}")
                state = new_state

                if terminated or truncated:
                    break

        return qtable
    finally:
        # Release the environment even if training or rendering fails.
        env.close()

# Run the Question 1 experiment when this cell (or script) is executed directly.
if __name__ == "__main__":
    main()

In [None]:
# Question 2

def main(
    learning_rate=0.9,
    discount_rate=0.8,
    epsilon=1.0,
    decay_rate=0,
    num_episodes=1000,
    max_steps=99,
    env=None,
    watch=True,
    seed=None,
):
    """Train a tabular Q-learning agent and optionally replay one greedy episode.

    With the default arguments this reproduces the Question 2 experiment:
    Taxi-v3, learning rate 0.9, discount 0.8 and ``epsilon=1.0`` with
    ``decay_rate=0``, i.e. every training action is sampled at random.

    Args:
        learning_rate: Step size of the Q-learning update.
        discount_rate: Discount factor applied to the bootstrapped value.
        epsilon: Initial probability of taking a random action.
        decay_rate: Exponential decay applied to ``epsilon`` per episode
            (``epsilon * exp(-decay_rate * episode)``); 0 keeps it constant.
        num_episodes: Number of training episodes.
        max_steps: Maximum steps per episode (training and replay).
        env: Environment exposing the Gymnasium API with discrete observation
            and action spaces. When ``None`` a Taxi-v3 environment with
            ``render_mode="rgb_array"`` is created. The environment is always
            closed before returning, including one supplied by the caller.
        watch: When true, wait for Enter and then render one greedy episode
            with matplotlib. When false, only training is performed.
        seed: Optional seed. When given, seeds the global ``random`` module,
            the action space and the first environment reset.

    Returns:
        numpy.ndarray: The learned Q-table, shape
        ``(env.observation_space.n, env.action_space.n)``.
    """
    # create Taxi environment (unless the caller supplied one)
    if env is None:
        env = gym.make("Taxi-v3", render_mode="rgb_array")

    try:
        if seed is not None:
            random.seed(seed)
            env.action_space.seed(seed)

        # initialize q-table
        n_states = env.observation_space.n
        n_actions = env.action_space.n
        qtable = np.zeros((n_states, n_actions))

        # `epsilon` is the configured starting value; this is the live one.
        current_epsilon = epsilon

        # training
        for episode in range(num_episodes):
            # Seed only the first reset so later episodes still differ.
            state, _ = env.reset(seed=seed if episode == 0 else None)

            for _step in range(max_steps):
                # exploration-exploitation tradeoff
                explore = random.uniform(0, 1) < current_epsilon
                if explore:
                    action = env.action_space.sample()
                else:
                    action = int(np.argmax(qtable[state, :]))

                # take action and observe reward
                new_state, reward, terminated, truncated, _info = env.step(action)

                # Q-learning update; a terminal transition has no future value.
                future_value = 0.0 if terminated else np.max(qtable[new_state, :])
                td_target = reward + discount_rate * future_value
                qtable[state, action] += learning_rate * (td_target - qtable[state, action])

                state = new_state

                if terminated or truncated:
                    break

            # Decay relative to the configured starting epsilon so that a
            # starting value other than 1.0 is not silently discarded.
            current_epsilon = epsilon * np.exp(-decay_rate * episode)

        print(f"Training completed over {num_episodes} episodes")

        if watch:
            input("Press Enter to watch trained agent...")

            # watch trained agent
            state, _ = env.reset()
            rewards = 0

            for s in range(max_steps):
                print("TRAINED AGENT")
                print("Step {}".format(s + 1))

                action = int(np.argmax(qtable[state, :]))
                new_state, reward, terminated, truncated, _info = env.step(action)
                rewards += reward

                time.sleep(0.5)  # Add delay for visualization
                frame = env.render()  # render once; the frame is what gets shown
                plt.imshow(frame)
                plt.axis('off')
                plt.show()

                print(f"score: {rewards}")
                state = new_state

                if terminated or truncated:
                    break

        return qtable
    finally:
        # Release the environment even if training or rendering fails.
        env.close()

# Run the Question 2 experiment when this cell (or script) is executed directly.
if __name__ == "__main__":
    main()

In [None]:
# Question 3

def main(
    learning_rate=0.9,
    discount_rate=0.8,
    epsilon=0.1,
    max_visits_per_state=10,
    num_episodes=1000,
    max_steps=99,
    env=None,
    watch=True,
    seed=None,
):
    """Train a Q-learning agent with a per-episode state-visit cap, then optionally replay it.

    With the default arguments this reproduces the Question 3 experiment:
    Taxi-v3, learning rate 0.9, discount 0.8, a constant exploration rate of
    0.1, and a training episode that is cut short as soon as any single
    state has been entered more than ``max_visits_per_state`` times.

    Args:
        learning_rate: Step size of the Q-learning update.
        discount_rate: Discount factor applied to the bootstrapped value.
        epsilon: Constant probability of taking a random action.
        max_visits_per_state: Per-episode limit on visits to one state;
            exceeding it ends the training episode early.
        num_episodes: Number of training episodes.
        max_steps: Maximum steps per episode (training and replay).
        env: Environment exposing the Gymnasium API with discrete observation
            and action spaces. When ``None`` a Taxi-v3 environment with
            ``render_mode="rgb_array"`` is created. The environment is always
            closed before returning, including one supplied by the caller.
        watch: When true, wait for Enter and then render one greedy episode
            with matplotlib. When false, only training is performed.
        seed: Optional seed. When given, seeds the global ``random`` module,
            the action space and the first environment reset.

    Returns:
        numpy.ndarray: The learned Q-table, shape
        ``(env.observation_space.n, env.action_space.n)``.
    """
    # create Taxi environment (unless the caller supplied one)
    if env is None:
        env = gym.make("Taxi-v3", render_mode="rgb_array")

    try:
        if seed is not None:
            random.seed(seed)
            env.action_space.seed(seed)

        # initialize q-table
        state_size = env.observation_space.n
        action_size = env.action_space.n
        qtable = np.zeros((state_size, action_size))

        # training
        for episode in range(num_episodes):
            # Seed only the first reset so later episodes still differ.
            state, _ = env.reset(seed=seed if episode == 0 else None)

            # visit tracker, reset for every episode
            visit_count = {}

            for _step in range(max_steps):
                # track visits to each state
                visit_count[state] = visit_count.get(state, 0) + 1

                # Stuck revisiting the same state: give up on this episode
                # before taking another action.
                if visit_count[state] > max_visits_per_state:
                    break

                # exploration-exploitation tradeoff
                if random.uniform(0, 1) < epsilon:
                    action = env.action_space.sample()
                else:
                    action = int(np.argmax(qtable[state, :]))

                # take action and observe reward
                new_state, reward, terminated, truncated, _info = env.step(action)

                # Q-learning update; a terminal transition has no future value.
                future_value = 0.0 if terminated else np.max(qtable[new_state, :])
                td_target = reward + discount_rate * future_value
                qtable[state, action] += learning_rate * (td_target - qtable[state, action])

                # update to new state
                state = new_state

                if terminated or truncated:
                    break

        print(f"Training completed over {num_episodes} episodes")

        if watch:
            input("Press Enter to watch trained agent...")

            # watch trained agent
            state, _ = env.reset()
            rewards = 0

            for s in range(max_steps):
                print("TRAINED AGENT")
                print("Step {}".format(s + 1))

                # select best action from learned Q-table
                action = int(np.argmax(qtable[state, :]))
                new_state, reward, terminated, truncated, _info = env.step(action)
                rewards += reward

                # render frame
                frame = env.render()
                plt.imshow(frame)
                plt.axis('off')
                plt.show()
                time.sleep(0.5)

                print(f"score: {rewards}")
                state = new_state

                if terminated or truncated:
                    break

        return qtable
    finally:
        # Release the environment even if training or rendering fails.
        env.close()

# Run the Question 3 experiment when this cell (or script) is executed directly.
if __name__ == "__main__":
    main()
