In [7]:
!pip install pygame
import gym
import collections
import numpy as np
from operator import itemgetter

def discretize(observation, bins):
    """Map each observation component to the index of the bin it falls in.

    Args:
        observation: Iterable of scalar values, one per dimension.
        bins: Sequence of monotonic 1-D edge arrays; ``bins[i]`` holds the
            edges used for ``observation[i]``.

    Returns:
        A tuple of Python ints, one per component. Each index lies in
        ``[0, len(bins[i]) - 1]`` so it can be used directly to index an
        array whose i-th axis has ``len(bins[i])`` entries.
    """
    indices = []
    for value, edges in zip(observation, bins):
        # np.digitize yields len(edges) for values at or beyond the last
        # edge, which would be one past the end of an axis of size
        # len(edges); fold that overflow bucket into the last valid index.
        last_index = max(len(edges) - 1, 0)
        indices.append(min(int(np.digitize(value, edges)), last_index))
    return tuple(indices)

def choose_action(Q, state, epsilon, env):
    """Pick an action epsilon-greedily.

    Args:
        Q: Table indexed by ``state`` that yields the per-action values.
        state: Index (tuple) identifying the current discrete state.
        epsilon: Probability threshold for taking a random action.
        env: Environment whose ``action_space.sample()`` supplies the
            random action.

    Returns:
        A sampled action when the random draw is below ``epsilon``,
        otherwise the index of the largest value in ``Q[state]``.
    """
    explore = np.random.rand() < epsilon
    if not explore:
        # Greedy branch: exploit the current value estimates.
        return np.argmax(Q[state])
    return env.action_space.sample()


def update_Q(Q, alpha, state, action, reward, gamma, next_state):
    """Apply one Q-learning update to ``Q`` in place.

    Args:
        Q: Value table indexed by ``state + (action,)``.
        alpha: Learning rate applied to the temporal-difference error.
        state: Tuple index of the state the action was taken in.
        action: Index of the action taken.
        reward: Reward received for the transition.
        gamma: Discount factor applied to the best next-state value.
        next_state: Tuple index of the state reached.

    Returns:
        None; the entry ``Q[state + (action,)]`` is overwritten.
    """
    entry = state + (action,)
    current_value = Q[entry]
    # Target is the reward plus the discounted greedy value of the successor.
    td_target = reward + gamma * np.max(Q[next_state])
    td_error = td_target - current_value
    Q[entry] = current_value + alpha * td_error


def train(env, episodes=5000, gamma=0.99, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01, alpha=0.1):
    """Run tabular Q-learning on ``env`` with a discretized 4-component observation.

    The Q-table is initialised with uniform random values in [0, 1). Each
    episode selects actions with ``choose_action``, updates the table with
    ``update_Q`` and decays ``epsilon`` afterwards. Progress is printed every
    100 episodes and training stops early once the mean reward over the last
    100 episodes reaches 195.0.

    Args:
        env: Environment whose ``reset()`` returns ``(observation, info)`` and
            whose ``step(action)`` returns a 5-tuple
            ``(observation, reward, terminated, truncated, info)``.
        episodes: Maximum number of episodes to run.
        gamma: Discount factor.
        epsilon: Initial exploration probability.
        epsilon_decay: Multiplicative decay applied to ``epsilon`` per episode.
        epsilon_min: Lower bound for ``epsilon``.
        alpha: Learning rate.

    Returns:
        None.
    """
    num_bins = [6, 12, 12, 24]  # Number of bin edges per observation component
    bins = [
        np.linspace(-2.4, 2.4, num_bins[0]),
        np.linspace(-5, 5, num_bins[1]),
        np.linspace(-0.418, 0.418, num_bins[2]),
        np.linspace(-5, 5, num_bins[3]),
    ]

    # One table axis per observation component plus a trailing action axis.
    Q = np.random.uniform(low=0, high=1, size=(num_bins + [env.action_space.n]))
    rewards = []

    for episode in range(episodes):
        observation = env.reset()[0]
        state = discretize(tuple(observation), bins)
        total_reward = 0
        done = False

        while not done:
            action = choose_action(Q, state, epsilon, env)
            obs, reward, terminated, truncated, _info = env.step(action)
            # The episode is over on either signal; looking only at
            # `terminated` would keep stepping an env that hit its time limit.
            done = terminated or truncated

            next_state = discretize(obs, bins)
            # A terminated episode has no future return, so suppress the
            # bootstrap term by discounting it to zero. A merely truncated
            # episode still bootstraps from the next state.
            step_gamma = 0.0 if terminated else gamma
            update_Q(Q, alpha, state, action, reward, step_gamma, next_state)
            state = next_state
            total_reward += reward

        rewards.append(total_reward)

        epsilon = max(epsilon_min, epsilon * epsilon_decay)

        recent_mean = np.mean(rewards[-100:])

        # Require a full 100-episode window before declaring success.
        if len(rewards) >= 100 and recent_mean >= 195.0:
            print(f"Solved in {episode + 1} episodes! and avg reward (last 100 episodes): {recent_mean:.2f} ")
            break

        if (episode + 1) % 100 == 0:
            print(f"Episode {episode + 1}, Average Reward (last 100 episodes): {recent_mean:.2f}")

# Build the CartPole-v1 environment and train on it; the environment is
# closed afterwards even if training raises, so its resources are released.
env = gym.make('CartPole-v1')

try:
    train(env)
finally:
    env.close()

Episode 100, Average Reward (last 100 episodes): 25.67
Episode 200, Average Reward (last 100 episodes): 33.08
Episode 300, Average Reward (last 100 episodes): 51.30
Episode 400, Average Reward (last 100 episodes): 103.12
Episode 500, Average Reward (last 100 episodes): 108.07
Episode 600, Average Reward (last 100 episodes): 88.22
Episode 700, Average Reward (last 100 episodes): 84.71
Episode 800, Average Reward (last 100 episodes): 104.64
Episode 900, Average Reward (last 100 episodes): 111.84
Episode 1000, Average Reward (last 100 episodes): 107.19
Episode 1100, Average Reward (last 100 episodes): 102.07
Episode 1200, Average Reward (last 100 episodes): 121.59
Episode 1300, Average Reward (last 100 episodes): 120.53
Episode 1400, Average Reward (last 100 episodes): 120.39
Episode 1500, Average Reward (last 100 episodes): 123.75
Episode 1600, Average Reward (last 100 episodes): 120.23
Episode 1700, Average Reward (last 100 episodes): 110.66
Episode 1800, Average Reward (last 100 episod