In [120]:
import gymnasium as gym 
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.preprocessing import KBinsDiscretizer

In [121]:
from sklearn.preprocessing import KBinsDiscretizer


class QLearningCartPole:
    """
    Q-learning Agent for the CartPole problem using OpenAI Gym.
    """

    def __init__(self, env, buckets=(2, 2, 6, 12), num_episodes=1000, min_lr=0.1, min_epsilon=0.1, discount=1.0, decay=25):
        """
        Initialize the Q-learning agent.

        Parameters:
        - buckets: Tuple defining the discretization for each state dimension.
        - num_episodes: Number of episodes for training.
        - min_lr: Minimum learning rate.
        - min_epsilon: Minimum exploration rate.
        - discount: Discount factor for future rewards.
        - decay: Rate at which learning and exploration rates decay.
        """

        # Setting the agent parameters
        self.buckets = buckets
        self.num_episodes = num_episodes
        self.min_lr = min_lr
        self.min_epsilon = min_epsilon
        self.discount = discount
        self.decay = decay
        self.env = env

        # Define bounds for discretizing the state space
        self.upper_bounds = [self.env.observation_space.high[0], 0.5, self.env.observation_space.high[2], math.radians(50) / 1.]
        self.lower_bounds = [self.env.observation_space.low[0], -0.5, self.env.observation_space.low[2], - math.radians(50) / 1.]

        # Initialize the Q-table with zeros
        self.Q_table = np.zeros(self.buckets + (self.env.action_space.n,))

    def discretize_state(self, obs):
        state, _ = obs 
        _, _, angle, angle_velocity = state
        est = KBinsDiscretizer(n_bins=self.buckets[2:], encode='ordinal', strategy='uniform')
        est.fit([self.lower_bounds[2:], self.upper_bounds[2:]])
        return tuple(map(int, est.transform([[angle, angle_velocity]])[0]))

    """
    OLD METHOD

    def discretize_state(self, obs):
        discretized = list()
        for i in range(len(obs)):
            scaling = (obs[i] + abs(self.lower_bounds[i])) / (self.upper_bounds[i] - self.lower_bounds[i])

            print((self.buckets[i] - 1) * scaling)

            new_obs = float(np.round((self.buckets[i] - 1) * scaling))
            new_obs = min(self.buckets[i] - 1, max(0, new_obs))
            discretized.append(new_obs)
        return tuple(discretized)
    """

    def choose_action(self, state, epsilon):
        """
        Choose an action based on the current state and epsilon (exploration rate).

        Parameters:
        - state: The current discretized state.
        - epsilon: Current exploration rate.

        Returns:
        - The chosen action.
        """
        if np.random.random() <= epsilon:
            return self.env.action_space.sample() 
        else:
            return np.argmax(self.Q_table[state])

    def update_q(self, state, action, reward, new_state):
        """
        Update Q-value for the given state and action.

        Parameters:
        - state: The current discretized state.
        - action: The taken action.
        - reward: The received reward.
        - new_state: The new state after taking the action.
        """
        self.Q_table[state][action] += self.learning_rate * (reward + self.discount * np.max(self.Q_table[new_state]) - self.Q_table[state][action])

    def get_epsilon(self, t):
        """
        Compute epsilon (exploration rate) based on episode number.

        Parameters:
        - t: Current episode number.

        Returns:
        - The exploration rate (epsilon).
        """
        return max(self.min_epsilon, min(1., 1. - np.log10((t + 1) / self.decay)))

    def get_learning_rate(self, t):
        """
        Compute learning rate based on episode number.

        Parameters:
        - t: Current episode number.

        Returns:
        - The learning rate.
        """
        return max(self.min_lr, min(1., 1. - np.log10((t + 1) / self.decay)))

    def train(self):
        """
        Train the agent using Q-learning.
        """
        for e in range(self.num_episodes):
            current_state = self.discretize_state(self.env.reset())

            self.learning_rate = self.get_learning_rate(e)
            self.epsilon = self.get_epsilon(e)
            done = False

            while not done:
                self.env.render()
                action = self.choose_action(current_state, self.epsilon)

                print(self.env.step(action))
                items, dtype, _, _, _ = self.env.step(action)
                print(items)

                obs, reward, done, _ = items
                new_state = self.discretize_state(obs)
                self.update_q(current_state, action, reward, new_state)
                current_state = new_state

        print("Training finished.")
        self.env.close()


In [122]:
env = gym.make('CartPole-v1')
model = QLearningCartPole(env)
scores = model.train()

(array([ 0.02920011,  0.21637517,  0.03990792, -0.25328717], dtype=float32), 1.0, False, False, {})
[ 0.03352761  0.41090524  0.03484217 -0.5331201 ]


  gym.logger.warn(


TypeError: cannot unpack non-iterable numpy.float32 object

In [None]:
plt.plot(scores,  c='blue', label='epochs')
plt.legend()

NameError: name 'scores' is not defined