In [1]:
!pip3 install gymnasium[classic-control]
!pip3 install renderlab
!pip3 install opencv-python
!pip3 install gymnasium[box2d]



In [5]:
from __future__ import annotations
from collections import defaultdict
import pickle
import renderlab as rl
import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from matplotlib.patches import Patch
from tqdm import tqdm

error: XDG_RUNTIME_DIR not set in the environment.
ALSA lib confmisc.c:855:(parse_card) cannot find card '0'
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_card_inum returned error: No such file or directory
ALSA lib confmisc.c:422:(snd_func_concat) error evaluating strings
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory
ALSA lib confmisc.c:1334:(snd_func_refer) error evaluating name
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory
ALSA lib conf.c:5701:(snd_config_expand) Evaluate error: No such file or directory
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM default
ALSA lib confmisc.c:855:(parse_card) cannot find card '0'
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_card_inum returned error: No such file or directory
ALSA lib confmisc.c:422:(snd_func_concat) error evaluating strings
ALSA lib conf.c:5178:(_snd_config_evalu

In [6]:
def zero_factory():
    """Build a fresh all-zero Q-value vector for a state that has not been seen yet.

    The vector has one slot per discrete action of the module-level ``env``, so
    ``env`` must exist by the time this is first called. It is a named
    module-level function because it is handed to ``defaultdict`` as the
    default factory.
    """
    action_count = env.action_space.n
    return np.zeros(action_count)
class Agent:
    """Tabular epsilon-greedy Q-learning agent.

    Q-values are stored in a ``defaultdict`` keyed by hashable observations; a
    key that is looked up for the first time is filled in by ``zero_factory``.
    Random action sampling (and ``zero_factory``) read the module-level ``env``,
    so that name must be defined before the agent is used.
    """

    def __init__(
        self,
        learning_rate: float,
        initial_epsilon: float,
        epsilon_decay: float,
        final_epsilon: float,
        discount_factor: float = 0.95,
    ):
        """Set up an empty Q-table and the learning hyperparameters.

        Args:
            learning_rate: Step size applied to each temporal-difference error.
            initial_epsilon: Starting probability of taking a random action.
            epsilon_decay: Amount subtracted from epsilon by ``decay_epsilon``.
            final_epsilon: Lower bound that epsilon never drops below.
            discount_factor: Weight of the best next-state value in the target.
        """
        self.q_values = defaultdict(zero_factory)

        self.lr = learning_rate
        self.discount_factor = discount_factor

        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon

        # One entry is appended per call to `update`, so this list grows with
        # the total number of environment steps.
        self.training_error = []

    def get_action(self, obs: tuple[int, int]) -> int:
        """Pick an action for ``obs`` with an epsilon-greedy rule.

        With probability ``self.epsilon`` a random action is sampled from the
        global ``env``; otherwise the action with the highest stored Q-value is
        returned. Looking up an unseen ``obs`` inserts a zero vector for it.
        """
        if np.random.random() < self.epsilon:
            return env.action_space.sample()
        return int(np.argmax(self.q_values[obs]))

    def update(
        self,
        obs: tuple[int, int],
        action: int,
        reward: float,
        terminated: bool,
        next_obs: tuple[int, int],
    ):
        """Apply one Q-learning update for the transition and log its TD error.

        Args:
            obs: State the action was taken in.
            action: Index of the action that was taken.
            reward: Reward received for the transition.
            terminated: True when ``next_obs`` is terminal; the bootstrap term
                is then zero.
            next_obs: State reached after the action.
        """
        if terminated:
            # A terminal state contributes no future value. Skipping the lookup
            # also avoids inserting a never-used entry into the defaultdict.
            future_q_value = 0.0
        else:
            future_q_value = np.max(self.q_values[next_obs])
        temporal_difference = (
            reward + self.discount_factor * future_q_value - self.q_values[obs][action]
        )
        self.q_values[obs][action] = (
            self.q_values[obs][action] + self.lr * temporal_difference
        )
        self.training_error.append(temporal_difference)

    def decay_epsilon(self):
        """Lower epsilon by ``self.epsilon_decay``, clamped at ``self.final_epsilon``."""
        # Use the agent's own decay value rather than a same-named global, so an
        # agent restored from a checkpoint behaves the same in any session.
        self.epsilon = max(self.final_epsilon, self.epsilon - self.epsilon_decay)
def save(obj, path):
    """Pickle ``obj`` to ``path`` without clobbering an existing file on failure.

    The data is first written to a sibling ``<path>.tmp`` file and only moved
    over ``path`` once pickling has finished. If pickling raises (for example
    a MemoryError on a very large object), the previous contents of ``path``
    are left untouched and the partial temp file is removed.

    Args:
        obj: Any picklable object.
        path: Destination file path (``str``, ``bytes`` or ``os.PathLike``).

    Raises:
        Whatever ``open`` or ``pickle.dump`` raises; the error is re-raised
        after cleanup.
    """
    import os  # local import: the notebook's import cell does not provide `os`

    target = os.fspath(path)
    suffix = b'.tmp' if isinstance(target, bytes) else '.tmp'
    scratch = target + suffix
    try:
        with open(scratch, 'bw') as f:
            pickle.dump(obj, f)
        # os.replace swaps the finished file into place in a single step.
        os.replace(scratch, target)
    except BaseException:
        # Best-effort cleanup of the partial temp file; the original error is
        # what the caller needs to see, so a failed removal is ignored.
        try:
            os.remove(scratch)
        except OSError:
            pass
        raise
def load(path):
    """Read the file at ``path`` and return the object unpickled from it.

    Unpickling can execute arbitrary code, so only use this on files that
    were produced by a trusted source.
    """
    with open(path, 'br') as checkpoint_file:
        restored = pickle.load(checkpoint_file)
    return restored
def discretize(obs, decimal):
    """Round ``obs`` to ``decimal`` decimal places using ``np.round``.

    Args:
        obs: Value or array of values to round.
        decimal: Number of decimal places to keep.

    Returns:
        The rounded result as produced by ``np.round``.
    """
    rounded = np.round(obs, decimals=decimal)
    return rounded

In [22]:
# --- Environment -------------------------------------------------------------
env = gym.make(
    "LunarLander-v2", render_mode="rgb_array",
    continuous = False,
    gravity = -10.0,
    enable_wind = False,
    wind_power = 15.0,
    turbulence_power = 1.5,
)
# The reward log below reads info['episode']['r'] after each episode; that entry
# is expected to come from this statistics wrapper.
env = gym.wrappers.RecordEpisodeStatistics(env)

# --- Hyperparameters ---------------------------------------------------------
learning_rate = 0.01
n_episodes = 100000
start_epsilon = 1.0
epsilon_decay = start_epsilon / (n_episodes / 2)  # epsilon reaches its floor within the first half of training
final_epsilon = 0.1
num_chunks = 10
# Guard against a zero-length chunk (which would make the modulo below fail)
# when there are fewer episodes than chunks.
chunk_length = max(1, n_episodes // num_chunks)
rewards = np.zeros(chunk_length)  # episode returns of the chunk currently being filled

agent = Agent(
    learning_rate=learning_rate,
    initial_epsilon=start_epsilon,
    epsilon_decay=epsilon_decay,
    final_epsilon=final_epsilon,
)
#agent = load('./checkpoint')

# --- Training ----------------------------------------------------------------
# NOTE(review): the recorded run of this cell stopped with a MemoryError shortly
# after episode 40,000. Both the Q-table (one entry per distinct rounded
# observation) and agent.training_error (one entry per step) grow without bound
# and are pickled in full at every checkpoint.
for episode in tqdm(range(n_episodes)):
    obs, info = env.reset()
    obs = discretize(obs, 1)
    done = False
    while not done:
        action = agent.get_action(tuple(obs))
        next_obs, reward, terminated, truncated, info = env.step(action)
        next_obs = discretize(next_obs, 1)
        agent.update(tuple(obs), action, reward, terminated, tuple(next_obs))
        done = terminated or truncated
        obs = next_obs
    agent.decay_epsilon()
    rewards[episode % chunk_length] = info['episode']['r']
    # Report once the chunk is completely filled, i.e. after its last episode.
    # Checking `episode % chunk_length == 0` would fire one episode late, after
    # slot 0 has already been overwritten by the next chunk.
    if (episode + 1) % chunk_length == 0:
        print(f'Average rewards over the last {chunk_length} iterations {np.average(rewards)}')
        print(f'Max rewards over the last {chunk_length} iterations {np.max(rewards)}')
        save(agent, './checkpoint')

# The loop already checkpoints after the final episode when the chunks divide
# the run evenly; only a trailing partial chunk still needs saving.
if n_episodes % chunk_length != 0:
    save(agent, './checkpoint')

# --- Evaluation / video ------------------------------------------------------
env = rl.RenderFrame(env, './out')
# Evaluate the learned policy greedily: with the training epsilon still active,
# a share of the recorded actions would be random.
training_epsilon = agent.epsilon
agent.epsilon = 0.0
try:
    for episode in range(7):
        obs, info = env.reset()
        obs = discretize(obs, 1)
        done = False
        while not done:
            action = agent.get_action(tuple(obs))
            next_obs, reward, terminated, truncated, info = env.step(action)
            next_obs = discretize(next_obs, 1)
            done = terminated or truncated
            obs = next_obs
finally:
    # Restore the exploration rate so the agent can keep training afterwards.
    agent.epsilon = training_epsilon
env.play()
env.close()

 10%|██████████████▍                                                                                                                                 | 9999/100000 [14:45<2:21:38, 10.59it/s]

Average rewards over the last 10000 iterations -166.80437942905425
Max rewards over the last 10000 iterations 100.43367767333984


 20%|████████████████████████████▌                                                                                                                  | 20000/100000 [33:07<2:35:06,  8.60it/s]

Average rewards over the last 10000 iterations -158.85641777648925
Max rewards over the last 10000 iterations 132.18368530273438


 30%|██████████████████████████████████████████▉                                                                                                    | 30000/100000 [56:29<2:33:26,  7.60it/s]

Average rewards over the last 10000 iterations -171.90407308330535
Max rewards over the last 10000 iterations 107.49858856201172


 40%|████████████████████████████████████████████████████████▍                                                                                    | 40000/100000 [1:25:39<3:22:12,  4.95it/s]

Average rewards over the last 10000 iterations -185.94770835456848
Max rewards over the last 10000 iterations 282.68231201171875


 40%|████████████████████████████████████████████████████████▍                                                                                    | 40000/100000 [1:29:22<2:14:04,  7.46it/s]


MemoryError: 