In [2]:
import gymnasium as gym 
import os
import numpy as np
import math

In [5]:
environment_name = "Acrobot-v1"
#env = gym.make(environment_name, render_mode='human')
env = gym.make(environment_name)

observation, info = env.reset()


# Initialize Q
space_size = env.observation_space.shape[0]
action_size = env.action_space.n
upper_bounds = [env.observation_space.high[0], env.observation_space.high[1], env.observation_space.high[2], env.observation_space.high[3], env.observation_space.high[4], env.observation_space.high[5]]
lower_bounds = [env.observation_space.low[0], env.observation_space.low[1], env.observation_space.low[2], env.observation_space.low[3], env.observation_space.low[4], env.observation_space.low[5]]
number_bins = 20
Q = np.random.randn(number_bins,number_bins,number_bins,number_bins,number_bins,number_bins,env.action_space.n)
epsilon = 0.15
alpha = 0.2
lambda_ = 0.95

# Select action base on e-greedy
def select_e_gready(state, epsilon):
    if np.random.rand() < epsilon:
        return env.action_space.sample()
    
    return np.argmax(Q[state])


def discretize_state(observation):
    index_1 = np.argmin(np.abs(np.linspace(lower_bounds[0], upper_bounds[0], num=number_bins).tolist()-observation[0]))
    index_2 = np.argmin(np.abs(np.linspace(lower_bounds[1], upper_bounds[1], num=number_bins).tolist()-observation[1]))
    index_3 = np.argmin(np.abs(np.linspace(lower_bounds[2], upper_bounds[2], num=number_bins).tolist()-observation[2]))
    index_4 = np.argmin(np.abs(np.linspace(lower_bounds[3], upper_bounds[3], num=number_bins).tolist()-observation[3]))
    index_5 = np.argmin(np.abs(np.linspace(lower_bounds[4], upper_bounds[4], num=number_bins).tolist()-observation[4]))
    index_6 = np.argmin(np.abs(np.linspace(lower_bounds[5], upper_bounds[5], num=number_bins).tolist()-observation[5]))
    return index_1, index_2, index_3, index_4, index_5, index_6



episodes = 20000
ep_rewards = []
episode_data = 500
ep_rewards_table = {'ep':[], 'avg':[], 'min':[], 'max':[]}

for episode in range(episodes):
    state, info = env.reset()
    score = 0 
    
    d_state = discretize_state(state)
    action = select_e_gready(d_state, epsilon)
    
    while True:
        #action = env.action_space.sample()

        state_prime, reward, terminated, truncated, info = env.step(action)
        if terminated or truncated or score > 500:
            #print('Episode', episode, 'Score', score)
            break
        
        d_state_prime = discretize_state(state_prime)
        action_prime = select_e_gready(d_state_prime, epsilon)
        
       
        Q[d_state+(action,)] += alpha * (reward + lambda_ * Q[d_state_prime+(action_prime,)] - Q[d_state+(action,)])
        
        d_state = d_state_prime
        action = action_prime
        
        score += reward

    ep_rewards.append(score)
    
    if not episode % episode_data:
        avg_reward = sum(ep_rewards[-episode_data:])/len(ep_rewards[-episode_data:])
        ep_rewards_table['ep'].append(episode)
        ep_rewards_table['avg'].append(avg_reward)
        ep_rewards_table['min'].append(min(ep_rewards[-episode_data:]))
        ep_rewards_table['max'].append(max(ep_rewards[-episode_data:]))
        print(f"Episode:{episode} avg:{avg_reward} min:{min(ep_rewards[-episode_data:])} max:{max(ep_rewards[-episode_data:])}")


env.close()

Episode:0 avg:-499.0 min:-499.0 max:-499.0
Episode:500 avg:-491.332 min:-499.0 max:-236.0
Episode:1000 avg:-477.858 min:-499.0 max:-185.0
Episode:1500 avg:-465.054 min:-499.0 max:-178.0
Episode:2000 avg:-449.146 min:-499.0 max:-158.0
Episode:2500 avg:-433.634 min:-499.0 max:-192.0
Episode:3000 avg:-433.45 min:-499.0 max:-197.0
Episode:3500 avg:-416.36 min:-499.0 max:-131.0
Episode:4000 avg:-406.356 min:-499.0 max:-159.0
Episode:4500 avg:-403.25 min:-499.0 max:-172.0
Episode:5000 avg:-396.846 min:-499.0 max:-145.0
Episode:5500 avg:-377.334 min:-499.0 max:-146.0
Episode:6000 avg:-377.632 min:-499.0 max:-149.0
Episode:6500 avg:-365.428 min:-499.0 max:-155.0
Episode:7000 avg:-367.508 min:-499.0 max:-150.0
Episode:7500 avg:-365.518 min:-499.0 max:-136.0
Episode:8000 avg:-362.838 min:-499.0 max:-141.0
Episode:8500 avg:-354.904 min:-499.0 max:-123.0
Episode:9000 avg:-354.678 min:-499.0 max:-145.0


KeyboardInterrupt: 