In [None]:
#!pip install gymnasium
#!pip install git+https://github.com/DLR-RM/stable-baselines3@feat/gymnasium-support

In [None]:
from os import path
import numpy as np

import gymnasium as gym
import matplotlib.pyplot as plt


# Root folder under which the PPO runs in this notebook write their TensorBoard logs.
TENSORBOARD_FOLDER = "tensorboard_reinforcementlearning"

# Load the TensorBoard notebook extension (needed before a %tensorboard magic can be used).
%load_ext tensorboard

<img src="../Bilder/rl_overview.png" alt="Reinforcement Learning" style="width:700px;"/>

# Environment

In [None]:
# Create the CartPole environment in "rgb_array" mode, so that the frame
# returned by render() can be plotted as an image.
env = gym.make(
    "CartPole-v1",
    render_mode="rgb_array",
)

In [None]:
# Start a seeded episode and grab the frame rendered right after the reset.
env.reset(seed=42)
img = env.render()

# Draw the frame on an explicitly created axes object.
fig, ax = plt.subplots()
ax.imshow(img)
plt.show()

In [None]:
# Show the environment's action space as the output of this cell.
env.action_space

In [None]:
# Drive CartPole with random actions in a live window and print the observed
# state on every step.
env = gym.make("CartPole-v1", render_mode="human")

try:
    observation, info = env.reset(seed=42)

    for _ in range(500):
        # Random policy: sample an arbitrary action from the action space.
        action = env.action_space.sample()
        observation, reward, terminated, truncated, info = env.step(action)

        position, velocity, angle, angular_velocity = observation

        # Every label prints its own variable (the two angle fields used to
        # repeat the velocity). end='\r' keeps rewriting the same output line.
        print(
            f"Position: {position:+.2f} | "
            f"Velocity: {velocity:+.2f} | "
            f"Angle: {angle:+.2f} | "
            f"Angular Velocity: {angular_velocity:+.2f}",
            end='\r',
        )

        # Begin a new episode as soon as the current one has ended.
        if terminated or truncated:
            observation, info = env.reset()
finally:
    # Close the environment even if the loop raises or the kernel is interrupted.
    env.close()

# Policies
## Hard-coded policy

In [None]:
def show_one_episode(env, policy, n_max_steps=200, seed=42):
    """Play a single episode of `env`, choosing every action with `policy`.

    Args:
        env: Environment object providing ``reset(seed=...)``, ``render()``,
            ``step(action)`` and ``close()``. ``step`` must return a 5-tuple
            whose third and fourth items are the terminated / truncated flags.
        policy: Callable that maps the current observation to an action.
        n_max_steps: Upper bound on the number of steps played.
        seed: Seed applied to NumPy's global RNG and passed to ``env.reset``.

    Returns:
        None. The environment is closed before returning, also when `policy`
        or the environment raises.
    """
    np.random.seed(seed)
    try:
        obs, info = env.reset(seed=seed)
        for _ in range(n_max_steps):
            # Render once per step; the returned frame is not kept because
            # nothing consumes it.
            env.render()
            action = policy(obs)
            obs, reward, terminated, truncated, info = env.step(action)
            if terminated or truncated:
                break
    finally:
        # Always release the environment, even on an error or an interrupt.
        env.close()

In [None]:
def basic_policy(obs):
    """Hard-coded policy that looks only at the sign of the angle.

    Args:
        obs: Observation with exactly four entries; the third one is the angle.

    Returns:
        0 when the angle is negative, otherwise 1.
    """
    _, _, pole_angle, _ = obs
    if pole_angle < 0:
        return 0
    return 1

In [None]:
# Watch the hard-coded policy play one episode in a live window.
env = gym.make(
    "CartPole-v1",
    render_mode="human",
)
show_one_episode(
    env,
    basic_policy,
    n_max_steps=1000,
)

## Proximal Policy Optimization

In [None]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env


def ppo_cartpole_policy(obs):
    """Return the action the CartPole PPO model predicts for `obs`.

    Reads the module-level `model_cartpole` at call time. Its `predict` call
    yields a pair; only the first item (the action) is passed on.
    """
    prediction = model_cartpole.predict(obs)
    chosen_action, _ = prediction
    return chosen_action


# Vectorized environment made of four CartPole copies.
env = make_vec_env("CartPole-v1", n_envs=4)

# TensorBoard logs of this run go to their own sub-folder.
cartpole_log_dir = path.join(TENSORBOARD_FOLDER, "ppo_CartPole")
model_cartpole = PPO(
    "MlpPolicy",
    env,
    verbose=1,
    tensorboard_log=cartpole_log_dir,
)

# Train the model and keep what learn() returns under the same name.
model_cartpole = model_cartpole.learn(
    total_timesteps=25000,
    progress_bar=True,
)

In [None]:
#%tensorboard --logdir tensorboard_reinforcementlearning/ppo_CartPole

In [None]:
# Watch the PPO policy play one CartPole episode in a live window.
env = gym.make(
    "CartPole-v1",
    render_mode="human",
)
show_one_episode(
    env,
    ppo_cartpole_policy,
    n_max_steps=1000,
)

## CarRacing

In [None]:
# Drive CarRacing with random actions in a live window for 500 steps.
env = gym.make("CarRacing-v2", render_mode="human")

try:
    observation, info = env.reset(seed=42)

    for _ in range(500):
        # Random policy: sample an arbitrary action from the action space.
        action = env.action_space.sample()
        observation, reward, terminated, truncated, info = env.step(action)

        # Begin a new episode as soon as the current one has ended.
        if terminated or truncated:
            observation, info = env.reset()
finally:
    # Close the environment even if the loop raises or the kernel is interrupted.
    env.close()

In [None]:
def ppo_carracing_policy(obs):
    """Return the action the CarRacing PPO model predicts for `obs`.

    Reads the module-level `model_carracing` at call time. Its `predict` call
    yields a pair; only the first item (the action) is passed on.
    """
    prediction = model_carracing.predict(obs)
    chosen_action, _ = prediction
    return chosen_action


# Vectorized environment made of four CarRacing copies.
env = make_vec_env("CarRacing-v2", n_envs=4)

# TensorBoard logs of this run go to their own sub-folder.
carracing_log_dir = path.join(TENSORBOARD_FOLDER, "ppo_CarRacing")
model_carracing = PPO(
    "MlpPolicy",
    env,
    verbose=1,
    tensorboard_log=carracing_log_dir,
)
# Training is disabled here; re-enable the next line to train instead of loading.
# model_carracing = model_carracing.learn(total_timesteps=25000, progress_bar=True)

# Replace the model with whatever load() returns for "model_carracing".
# NOTE(review): presumably a previously saved checkpoint that must exist on disk; confirm.
model_carracing = model_carracing.load("model_carracing")

In [None]:
# Watch the PPO policy play one CarRacing episode in a live window.
env = gym.make(
    "CarRacing-v2",
    render_mode="human",
)
show_one_episode(
    env,
    ppo_carracing_policy,
    n_max_steps=1000,
)

Besseres Modell hier zu finden: https://huggingface.co/meln1k/ppo-CarRacing-v0