# In [None]:
import gym
import numpy as np

# Create the CartPole environment
env = gym.make('CartPole-v1')

# Set random seed for reproducibility.
# Older Gym releases expose Env.seed(); newer ones removed it and take the seed
# through reset(seed=...), so fall back to that when seed() is unavailable.
# NOTE: this seeds the environment only; NumPy's global RNG is not seeded here.
if hasattr(env, "seed"):
    env.seed(42)
else:
    env.reset(seed=42)

# Q-learning algorithm
def q_learning(env, num_episodes=1000, learning_rate=0.1, discount_factor=0.99, exploration_prob=0.2):
    num_actions = env.action_space.n
    num_states = 4  # For CartPole-v1, the observation space has 4 features

    # Initialize the Q-table with zeros
    q_table = np.zeros((num_states, num_actions))

    for episode in range(num_episodes):
        state = env.reset()
        done = False

        while not done:
            # Exploration vs Exploitation trade-off
            if np.random.rand() < exploration_prob:
                action = env.action_space.sample()  # Explore - choose a random action
            else:
                action = np.argmax(q_table[state])  # Exploit - choose the best action

            next_state, reward, done, _ = env.step(action)

            # Q-learning update
            q_table[state, action] += learning_rate * (reward + discount_factor * np.max(q_table[next_state]) - q_table[state, action])

            state = next_state

    return q_table

# Main loop to run the Q-learning algorithm
if __name__ == "__main__":
    import numpy as np

    # Run the Q-learning algorithm
    q_table = q_learning(env)

    # Test the learned policy
    state = env.reset()
    done = False
    total_reward = 0

    while not done:
        action = np.argmax(q_table[state])
        state, reward, done, _ = env.step(action)
        total_reward += reward
        env.render()

    print(f"Total reward: {total_reward}")
    env.close()
