<a href="https://colab.research.google.com/github/diamantidisgeorgios/exercise9/blob/main/exercise_9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Install libraries

In [None]:
!pip install swig

In [None]:
!pip install gymnasium

In [None]:
!pip install box2d

In [None]:
!pip install renderlab

#Basic functions

Every gymnasium environment has 3 basic functions: step, reset, and render.
Step: updates an environment with actions.
Reset: Resets the environment to an initial state.
Render: Renders the environment.

#LunarLander-v3 environment
##-Observation space
It is an 8-dimensional vector describing the current state of the lunar lander. It contains: 1. the x coordinate, 2. the y coordinate, 3. the linear velocity in x, 4. the linear velocity in y, 5. its angle, 6. its angular velocity, 7. a boolean indicating whether the left leg is in contact with the ground, 8. a boolean indicating whether the right leg is in contact with the ground.
##-Action space
It tells the lunar lander what to do. It has four discrete actions: 1. Do nothing, 2. Fire left orientation engine, 3. Fire main engine, 4. Fire right orientation engine.
##-Reward Function
After every step a reward is given.

#Create a random agent and visualize its gameplay

In [None]:
import renderlab as rl
import gymnasium as gym

# Build the LunarLander environment and wrap it with renderlab's RenderFrame
# so the episode can be replayed with env.play() once it has finished.
env = rl.RenderFrame(
    gym.make("LunarLander-v3", render_mode="rgb_array"),
    './output',
)

# Play a single episode, drawing every action uniformly at random from the
# action space, until the environment reports termination or truncation.
observation, info = env.reset()
done = False
while not done:
    action = env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated

env.play()


#Run the random agent 5 times and get its mean reward

In [None]:
import numpy as np

# Evaluate the random agent over k complete episodes and report the mean of
# the per-episode returns (the sum of all step rewards within one episode).
env = gym.make("LunarLander-v3", render_mode="rgb_array")
rewards = []
k = 5  # number of evaluation episodes; the later evaluation cells reuse k
for i in range(k):
    # Every episode must start from a freshly reset environment.
    observation, info = env.reset()
    total_reward = 0
    while True:
        action = env.action_space.sample()
        observation, reward, terminated, truncated, info = env.step(action)
        total_reward += reward
        # The episode is over once the environment terminates or truncates.
        if terminated or truncated:
            break
    rewards.append(total_reward)
    print(f"Reward for episode {i}: {total_reward}")

env.close()
print(f"Mean reward: {np.mean(rewards)}")


#Install stable-baselines3

In [None]:
!pip install stable-baselines3

#Use the DQN algorithm

In [None]:
from stable_baselines3 import DQN
import time

# Train a DQN agent with the library's default hyperparameters and record
# how long model construction plus training takes.
dqn_env = gym.make("LunarLander-v3", render_mode="rgb_array")
# perf_counter() is a monotonic clock intended for measuring durations;
# time.time() can jump if the system clock is adjusted during training.
start = time.perf_counter()
dqn_model = DQN("MlpPolicy", dqn_env, verbose=1)
dqn_model.learn(total_timesteps=20000, log_interval=4, progress_bar=True)
dqn_time = time.perf_counter() - start  # elapsed seconds



In [None]:
# Evaluate the trained DQN agent over k episodes (k is set in the random-agent
# evaluation cell) and report the mean of the per-episode returns.
dqn_rewards = []
for i in range(k):
    total_reward = 0
    obs, info = dqn_env.reset()
    while True:
        # deterministic=True makes the agent act greedily on its learned
        # Q-values; the default may still take random exploratory actions,
        # which would add noise to the evaluation.
        action, _states = dqn_model.predict(obs, deterministic=True)
        obs, dqn_reward, terminated, truncated, info = dqn_env.step(action)
        total_reward += dqn_reward

        if terminated or truncated:
            break
    dqn_rewards.append(total_reward)
    print(f"Reward for episode {i}: {total_reward}")

print(f"Mean reward: {np.mean(dqn_rewards)}")


#Use the PPO algorithm

In [None]:
from stable_baselines3 import PPO

# Train a PPO agent with the library's default hyperparameters and record
# how long model construction plus training takes.
ppo_env = gym.make("LunarLander-v3", render_mode="rgb_array")
# perf_counter() is a monotonic clock intended for measuring durations;
# time.time() can jump if the system clock is adjusted during training.
start = time.perf_counter()
ppo_model = PPO("MlpPolicy", ppo_env, verbose=1)
ppo_model.learn(total_timesteps=20000, progress_bar=True)
ppo_time = time.perf_counter() - start  # elapsed seconds

In [None]:
# Evaluate the trained PPO agent over k episodes (k is set in the random-agent
# evaluation cell) and report the mean of the per-episode returns.
ppo_rewards = []
for i in range(k):
    total_reward = 0
    obs, info = ppo_env.reset()
    while True:
        # deterministic=True selects the policy's most likely action instead
        # of sampling from its action distribution, so the evaluation is not
        # blurred by sampling noise.
        action, _states = ppo_model.predict(obs, deterministic=True)
        obs, ppo_reward, terminated, truncated, info = ppo_env.step(action)
        total_reward += ppo_reward

        if terminated or truncated:
            break
    ppo_rewards.append(total_reward)
    print(f"Reward for episode {i}: {total_reward}")

print(f"Mean reward: {np.mean(ppo_rewards)}")

#Plot the time it took to train for each algorithm

In [None]:
import matplotlib.pyplot as plt

# Bar chart comparing the measured training time (in seconds) of DQN and PPO.
fig, ax = plt.subplots()
ax.bar(["DQN", "PPO"], [dqn_time, ppo_time])
ax.set_title("Training time")
ax.set_ylabel("Seconds")
plt.show()

#Plot rewards per episode for each algorithm

In [None]:
# Line plot of the total reward each agent collected in every evaluation episode.
fig, ax = plt.subplots()
for label, episode_rewards in (("DQN", dqn_rewards), ("PPO", ppo_rewards)):
    ax.plot(episode_rewards, label=label)
ax.set_title("Rewards per episode")
ax.set_xlabel("Episode")
ax.set_ylabel("Reward")
ax.legend()
plt.show()

#Redoing the previous steps but with different parameters for the algorithms

In [None]:

# Retrain DQN with explicitly chosen hyperparameters and a longer training
# budget, timing the run, then evaluate the resulting agent over k episodes.
dqn_env = gym.make("LunarLander-v3", render_mode="rgb_array")
# perf_counter() is a monotonic clock intended for measuring durations;
# time.time() can jump if the system clock is adjusted during training.
start = time.perf_counter()
dqn_model = DQN(
    "MlpPolicy",
    dqn_env,
    buffer_size=200_000,
    learning_starts=10_000,
    batch_size=128,
    gamma=0.99,
    train_freq=4,
    target_update_interval=1_000,
)
dqn_model.learn(total_timesteps=500000, log_interval=400, progress_bar=True)
dqn_time = time.perf_counter() - start  # elapsed seconds

dqn_rewards = []
for i in range(k):
    total_reward = 0
    obs, info = dqn_env.reset()
    while True:
        # deterministic=True makes the agent act greedily on its learned
        # Q-values; the default may still take random exploratory actions,
        # which would add noise to the evaluation.
        action, _states = dqn_model.predict(obs, deterministic=True)
        obs, dqn_reward, terminated, truncated, info = dqn_env.step(action)
        total_reward += dqn_reward

        if terminated or truncated:
            break
    dqn_rewards.append(total_reward)
    print(f"Reward for episode {i}: {total_reward}")

print(f"Mean reward: {np.mean(dqn_rewards)}")



In [None]:
# Retrain PPO with explicitly chosen hyperparameters and a longer training
# budget, timing the run, then evaluate the resulting agent over k episodes.
ppo_env = gym.make("LunarLander-v3", render_mode="rgb_array")
# perf_counter() is a monotonic clock intended for measuring durations;
# time.time() can jump if the system clock is adjusted during training.
start = time.perf_counter()
ppo_model = PPO(
    "MlpPolicy",
    ppo_env,
    n_steps=2048,
    batch_size=64,
    learning_rate=3e-4,
    gamma=0.99,
)
ppo_model.learn(total_timesteps=200000, progress_bar=True)
ppo_time = time.perf_counter() - start  # elapsed seconds

ppo_rewards = []
for i in range(k):
    total_reward = 0
    obs, info = ppo_env.reset()
    while True:
        # deterministic=True selects the policy's most likely action instead
        # of sampling from its action distribution, so the evaluation is not
        # blurred by sampling noise.
        action, _states = ppo_model.predict(obs, deterministic=True)
        obs, ppo_reward, terminated, truncated, info = ppo_env.step(action)
        total_reward += ppo_reward

        if terminated or truncated:
            break
    ppo_rewards.append(total_reward)
    print(f"Reward for episode {i}: {total_reward}")

print(f"Mean reward: {np.mean(ppo_rewards)}")

In [None]:
# Bar chart comparing the measured training time (in seconds) of DQN and PPO.
fig, ax = plt.subplots()
ax.bar(["DQN", "PPO"], [dqn_time, ppo_time])
ax.set_title("Training time")
ax.set_ylabel("Seconds")
plt.show()

In [None]:
# Line plot of the total reward each agent collected in every evaluation episode.
fig, ax = plt.subplots()
for label, episode_rewards in (("DQN", dqn_rewards), ("PPO", ppo_rewards)):
    ax.plot(episode_rewards, label=label)
ax.set_title("Rewards per episode")
ax.set_xlabel("Episode")
ax.set_ylabel("Reward")
ax.legend()
plt.show()