# CS5756 Final Project: Safe Reinforcement Learning with Behavioral Cloning

## Setup

In [68]:
# Set Up:
import sys
%load_ext autoreload
%autoreload 2

import numpy as np
import gymnasium as gym
import random
import matplotlib.pyplot as plt
from copy import deepcopy

from torch.utils.data import DataLoader
from torch import nn
import torch
import cv2
from tqdm import tqdm, trange
from SafeLunarLanderWrapper import SafeLunarLanderWrapper
from utils import reseed, visualize, evaluate_policy

from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3 import PPO

from PPOActor import PPOActor

# Notebook-wide random seeds.
#   seed      -> passed to reseed() in the next cell.
#   data_seed -> not used in the cells visible here; presumably seeds data
#                collection further down — TODO confirm.
seed, data_seed = 24, 700

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [69]:
# Set seeds
# Seed the random number generators to ensure reproducibility.
# NOTE(review): reseed is imported from the project-local utils module; which
# RNGs it covers (python `random`, numpy, torch, ...) is not visible here — confirm.
reseed(seed)

In [70]:
# Vectorized LunarLander environments: plain and SafeLunarLanderWrapper-wrapped,
# each in a 10-env and a 1-env flavour (the 10-env ones are handed to PPO for
# training further down).
#
# `seed` is passed explicitly so that make_vec_env seeds every sub-environment
# (action space now, environment state on the first reset). Without it
# make_vec_env leaves the environments unseeded, so rollouts differ from run to
# run even though the notebook sets a global seed.
vec_env_lander_10 = make_vec_env('LunarLander-v2', n_envs=10, seed=seed)
vec_env_lander_1 = make_vec_env('LunarLander-v2', n_envs=1, seed=seed)

vec_env_safe_lander_10 = make_vec_env(
    'LunarLander-v2', n_envs=10, seed=seed, wrapper_class=SafeLunarLanderWrapper
)
vec_env_safe_lander_1 = make_vec_env(
    'LunarLander-v2', n_envs=1, seed=seed, wrapper_class=SafeLunarLanderWrapper
)

In [71]:
# Checkpoint base names: the training cells save the PPO models under these
# names and the evaluation cell loads them back through PPOActor(ckpt=...).
base_ckpt, safe_ckpt = (
    "ppo_base_lunar_lander",
    "ppo_safe_lunar_lander",
)

In [185]:
# Training Expert Policy on Base Environment
# `seed` is passed to PPO so that weight initialisation and rollout sampling are
# seeded by this cell itself and do not depend on whatever RNG state earlier
# (possibly out-of-order) cell executions left in the kernel.
model = PPO("MlpPolicy", vec_env_lander_10, seed=seed, verbose=1)

# Train the model (long-running: one million environment steps)
model.learn(total_timesteps=1_000_000)

# Save the model under the base checkpoint name so it can be reloaded later
model.save(base_ckpt)

Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 92.8     |
|    ep_rew_mean     | -174     |
| time/              |          |
|    fps             | 6456     |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 40960    |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -140        |
| time/                   |             |
|    fps                  | 3388        |
|    iterations           | 2           |
|    time_elapsed         | 24          |
|    total_timesteps      | 81920       |
| train/                  |             |
|    approx_kl            | 0.009322455 |
|    clip_fraction        | 0.0815      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.38       |
|    explained_variance   | 0.00184     |
|    learning

In [None]:
# Training Safe Expert Policy on Safe Environment
# `seed` is passed to PPO so that weight initialisation and rollout sampling are
# seeded by this cell itself and do not depend on whatever RNG state earlier
# (possibly out-of-order) cell executions left in the kernel.
safe_model = PPO("MlpPolicy", vec_env_safe_lander_10, seed=seed, verbose=1)

# Train the model (long-running: one million environment steps)
safe_model.learn(total_timesteps=1_000_000)

# Save the model under the safe checkpoint name so it can be reloaded later
safe_model.save(safe_ckpt)

In [77]:
# Wind-transfer check: roll out both saved experts in LunarLander with wind
# enabled (the training environments above pass no wind settings) to see
# whether what was learned under the safety constraints carries over.

# Wind configuration shared by both evaluation environments.
wind_env_kwargs = {"enable_wind": True, "wind_power": 20.0}

# Plain windy environment for the base expert.
vec_env_base_wind_1 = make_vec_env(
    'LunarLander-v2',
    n_envs=1,
    env_kwargs=wind_env_kwargs,
)

# Windy environment wrapped with the safety wrapper (debug output off) for the
# safe expert.
vec_env__safe_wind_1 = make_vec_env(
    'LunarLander-v2',
    n_envs=1,
    wrapper_class=SafeLunarLanderWrapper,
    env_kwargs=wind_env_kwargs,
    wrapper_kwargs={"debug": False},
)

# Each actor is built from its saved checkpoint and paired with its own environment.
safe_expert = PPOActor(ckpt=safe_ckpt, environment=vec_env__safe_wind_1)
base_expert = PPOActor(ckpt=base_ckpt, environment=vec_env_base_wind_1)

# Evaluate over 10 episodes each; base first, then safe.
base_wind_score = evaluate_policy(
    base_expert, environment=vec_env_base_wind_1, num_episodes=10
)
print("AVERAGE Base environment", base_wind_score)

safe_wind_score = evaluate_policy(
    safe_expert, environment=vec_env__safe_wind_1, num_episodes=10
)
print("AVERAGE Safe environment", safe_wind_score)

# Optional rollout video, left disabled. NOTE(review): `vec_env_1` and `expert`
# are not defined in the cells shown here — substitute an environment/actor from
# this cell before enabling.
# visualize(vec_env_1, algorithm=expert, video_name="test_expert")


100%|██████████| 10/10 [00:01<00:00,  6.59it/s]


AVERAGE Base environment 79.86199188232422


100%|██████████| 10/10 [00:01<00:00,  6.31it/s]

AVERAGE Safe environment 63.97650909423828



