In [53]:
import gymnasium as gym 
import numpy as np
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt

In [54]:
class QLearningCartPole:
    """
    Tabular Q-learning agent for the CartPole problem using Gymnasium.

    Each continuous observation is mapped onto a small grid of buckets, and
    the resulting tuple indexes a NumPy Q-table holding one value per action.
    Learning rate and exploration rate both follow the same logarithmic decay
    over episodes, each clipped to its own configured minimum.
    """

    def __init__(self, buckets=(1, 1, 6, 12), num_episodes=1000, min_lr=0.1, min_epsilon=0.1, discount=1.0, decay=25, render_mode=None):
        """
        Initialize the Q-learning agent.

        Parameters:
        - buckets: Tuple defining the discretization for each state dimension.
        - num_episodes: Number of episodes for training.
        - min_lr: Minimum learning rate.
        - min_epsilon: Minimum exploration rate.
        - discount: Discount factor for future rewards.
        - decay: Rate at which learning and exploration rates decay.
        - render_mode: Optional render mode forwarded to gym.make (e.g. "human").
          The default (None) disables rendering.
        """

        # Setting the agent parameters
        self.buckets = tuple(buckets)  # tuple so it can be concatenated into the Q-table shape
        self.num_episodes = num_episodes
        self.min_lr = min_lr
        self.min_epsilon = min_epsilon
        self.discount = discount
        self.decay = decay

        # Initialize the CartPole environment. Gymnasium fixes the render mode
        # at construction time; with "human" the window is refreshed by
        # step()/reset() themselves, so no explicit render() call is needed.
        self.env = gym.make('CartPole-v1', render_mode=render_mode)

        # Define bounds for discretizing the state space. Components 0 and 2
        # take their limits from the environment; components 1 and 3 are
        # clipped to hand-picked finite ranges instead.
        # np.radians replaces np.math.radians (np.math is gone in NumPy 2.0).
        space = self.env.observation_space
        angular_limit = float(np.radians(50))
        self.upper_bounds = [space.high[0], 0.5, space.high[2], angular_limit]
        self.lower_bounds = [space.low[0], -0.5, space.low[2], -angular_limit]

        # Initialize the Q-table with zeros
        self.Q_table = np.zeros(self.buckets + (self.env.action_space.n,))

        # Start from the episode-0 schedule values so update_q/choose_action
        # are usable before train() has been called.
        self.learning_rate = self.get_learning_rate(0)
        self.epsilon = self.get_epsilon(0)

    def discretize_state(self, obs):
        """
        Convert continuous state to a discretized state.

        Parameters:
        - obs: A sequence of continuous observations (one per state dimension).

        Returns:
        - discretized: A tuple of bucket indices, each clamped to
          [0, buckets[i] - 1].
        """
        discretized = []
        for value, low, high, n_buckets in zip(obs, self.lower_bounds, self.upper_bounds, self.buckets):
            # Fraction of the way from the lower to the upper bound; it may
            # fall outside [0, 1] for out-of-range observations.
            scaling = (value - low) / (high - low)
            index = int(round((n_buckets - 1) * scaling))
            # Clamp so out-of-range observations land in the edge buckets.
            discretized.append(min(n_buckets - 1, max(0, index)))
        return tuple(discretized)

    def choose_action(self, state, epsilon):
        """
        Choose an action based on the current state and epsilon (exploration rate).

        Parameters:
        - state: The current discretized state.
        - epsilon: Current exploration rate.

        Returns:
        - The chosen action: a random sample from the action space with
          probability epsilon, otherwise the greedy action as a plain int.
        """
        if np.random.random() <= epsilon:
            return self.env.action_space.sample()
        return int(np.argmax(self.Q_table[state]))

    def update_q(self, state, action, reward, new_state, terminated=False):
        """
        Update Q-value for the given state and action.

        Parameters:
        - state: The current discretized state.
        - action: The taken action.
        - reward: The received reward.
        - new_state: The new state after taking the action.
        - terminated: True when the transition ended the episode in a terminal
          state; the bootstrap term is then dropped, since no further reward
          can follow. Defaults to False.
        """
        future_value = 0.0 if terminated else np.max(self.Q_table[new_state])
        td_target = reward + self.discount * future_value
        self.Q_table[state][action] += self.learning_rate * (td_target - self.Q_table[state][action])

    def _decayed_rate(self, floor, t):
        """
        Shared schedule for epsilon and learning rate.

        Returns 1 - log10((t + 1) / decay), clipped to the range [floor, 1].
        """
        return float(max(floor, min(1., 1. - np.log10((t + 1) / self.decay))))

    def get_epsilon(self, t):
        """
        Compute epsilon (exploration rate) based on episode number.

        Parameters:
        - t: Current episode number.

        Returns:
        - The exploration rate (epsilon).
        """
        return self._decayed_rate(self.min_epsilon, t)

    def get_learning_rate(self, t):
        """
        Compute learning rate based on episode number.

        Parameters:
        - t: Current episode number.

        Returns:
        - The learning rate.
        """
        return self._decayed_rate(self.min_lr, t)

    def train(self):
        """
        Train the agent using Q-learning.

        Runs num_episodes episodes, updating the Q-table after every step,
        then closes the environment (also on error).
        """
        try:
            for e in range(self.num_episodes):
                self.learning_rate = self.get_learning_rate(e)
                self.epsilon = self.get_epsilon(e)

                # Gymnasium's reset() returns (observation, info).
                obs, _ = self.env.reset()
                current_state = self.discretize_state(obs)
                done = False

                while not done:
                    action = self.choose_action(current_state, self.epsilon)
                    # Gymnasium's step() returns five values; the episode ends
                    # on either termination or time-limit truncation.
                    obs, reward, terminated, truncated, _ = self.env.step(action)
                    new_state = self.discretize_state(obs)
                    self.update_q(current_state, action, reward, new_state, terminated)
                    current_state = new_state
                    done = terminated or truncated

            print("Training finished.")
        finally:
            self.env.close()

if __name__ == "__main__":
    # Entry point when executed as a script: build an agent with the default
    # hyperparameters and run its training loop.
    agent = QLearningCartPole()
    agent.train()


'\nenv = gym.make("CartPole-v1", render_mode="human")\nobservation, info = env.reset(seed=42)\nfor _ in range(500):\n   action = env.action_space.sample()  # this is where you would insert your policy\n   observation, reward, terminated, truncated, info = env.step(action)\n\n   if terminated or truncated:\n      observation, info = env.reset()\n\nenv.close()\n'