In [47]:
import gymnasium as gym
import highway_env

import torch
import torch.nn as nn
import gpytorch as gpt
import numpy as np
from collections import namedtuple
from tqdm import tqdm_notebook as tqdm
from tqdm import trange

from arch.scripts.record_utils import *

In [48]:
from highway_env.envs.roundabout_env import RoundaboutEnv
from highway_env.vehicle.controller import MDPVehicle

class ContinuousRoundaboutEnv(RoundaboutEnv):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        super().configure({
            "action": {
                "type": "ContinuousAction",
            },
            "observation": {
                "type": "Kinematics"
            }
        })

    def _rewards(self, action: np.ndarray) -> float:
        # we remove the lane_change_reward, which was `action in [0, 2]`

        return {
            "collision_reward": self.vehicle.crashed,
            "high_speed_reward": MDPVehicle.get_speed_index(self.vehicle)
            / (MDPVehicle.DEFAULT_TARGET_SPEEDS.size - 1),
            "on_road_reward": self.vehicle.on_road,
        }

In [49]:
env = ContinuousRoundaboutEnv(render_mode='rgb_array')
env.reset()

env, display = record_videos(env)

env.reset()
done = False
truncated = False
while not done and not truncated:
    action = env.action_space.sample()
    obs, reward, done, truncated, info = env.step(action)
    print("reward: ", reward)
    print("done: ", done)
    print("truncated: ", truncated)
    print("info: ", info)
display.stop()
del display
env.close()
show_videos()

[12/Dec/2023 21:53:04] INFO - Successfully started X with display ":1307".


  logger.warn(


reward:  0.0
done:  False
truncated:  False
info:  {'speed': 10.421455085277548, 'crashed': False, 'action': array([ 0.48429102, -0.88316995], dtype=float32), 'rewards': {'collision_reward': False, 'high_speed_reward': 0.0, 'on_road_reward': False}}
reward:  0.0
done:  False
truncated:  False
info:  {'speed': 11.866518139839155, 'crashed': False, 'action': array([0.2890126, 0.6204504], dtype=float32), 'rewards': {'collision_reward': False, 'high_speed_reward': 0.0, 'on_road_reward': False}}
reward:  0.0
done:  False
truncated:  False
info:  {'speed': 8.615130066871625, 'crashed': False, 'action': array([-0.6502776, -0.6716103], dtype=float32), 'rewards': {'collision_reward': False, 'high_speed_reward': 0.0, 'on_road_reward': False}}
reward:  0.0
done:  False
truncated:  False
info:  {'speed': 10.947243899106953, 'crashed': False, 'action': array([0.46642277, 0.9964922 ], dtype=float32), 'rewards': {'collision_reward': False, 'high_speed_reward': 0.0, 'on_road_reward': False}}
reward:  

                                                              

Moviepy - Done !
Moviepy - video ready /Users/neha/Documents/College/f23/CS378/videos/rl-video-episode-0.mp4
reward:  0.0
done:  True
truncated:  False
info:  {'speed': 5.715337835031261, 'crashed': True, 'action': array([-0.9819391 ,  0.26202238], dtype=float32), 'rewards': {'collision_reward': True, 'high_speed_reward': 0.0, 'on_road_reward': True}}


In [50]:
class RandomNormalNoisyObservation(gym.ObservationWrapper):
    """Adds random Normal noise to the observations of the environment."""

    def __init__(self, env, loc=0.0, scale=0.00005):
        """Initializes the :class:`RandomNormalNoisyObservation` wrapper.

        Args:
            env (gym.Env): The environment to apply the wrapper
            loc (float, optional): Mean ("centre") of the noise distribution.
                Defaults to 0.0.
            scale (float, optional): Standard deviation (spread or "width") of the noise distribution.
                Must be non-negative. Defaults to 0.0001.
        """
        super().__init__(env)
        self.loc = loc
        self.scale = scale

    def observation(self, observation):
        """Returns the potentially modified observation."""
        return observation + np.random.normal(loc=self.loc, scale=self.scale, size=observation.shape)

In [75]:
# !git clone --depth=1 --branch=master https://github.com/liuzuxin/safe-mbrl.git safembrl
# !rm -rf safembrl/.git
# !touch safembrl/__init__.py

In [51]:
mpc_config = {
    'optimizer': 'CEM',
    'horizon': 8,
    'gamma': 0.98,
    'RANDOM': {
        'popsize': 5000,
    },
    'CEM': {
        'popsize': 500,
        'max_iters': 5,
        'num_elites': 10,
        'epsilon': 0.01,
        'alpha': 0.1,
        'init_mean': 0,
        'init_var': 1,
    },

}

cost_config = {
    'model_param': {
        'boosting_type': 'gbdt',
        'learning_rate': 0.3,
        'max_depth': 8,
        'n_estimators': 400,
        'n_jobs': 1,
        'num_leaves': 12,
    },
    'max_ratio': 3,
    'unsafe_buffer_size': 10000,
    'safe_buffer_size': 50000,
    'batch': 2000,
    'save': False,
    'save_folder': None,
    'load': False,
    'load_folder': None,
    'state_size': 25,
    'action_size': 2,
}

dynamic_config = {
    'n_ensembles': 5,
    'data_split': 0.8,
    'n_epochs': 70,
    'activation': 'relu',
    'batch_size': 256,
    'buffer_size': 500000,
    'hidden_sizes': [1024, 1024, 1024],
    'learning_rate': 0.001,
    'test_freq': 5,
    'test_ratio': 0.15,
    'load': False,
    'load_folder': None,
    'save': False,
    'save_folder': None,
}

env = ContinuousRoundaboutEnv()
env = RandomNormalNoisyObservation(env)
env.reset()

from safembrl.mbrl import SafeMPC, RegressionModelEnsemble, CostModel

STATE_DIM = 25
ACTION_DIM = 2

dynamic_model = RegressionModelEnsemble(STATE_DIM+ACTION_DIM, STATE_DIM, config=dynamic_config)
cost_model = CostModel(env, config=cost_config)
mpc_controller = SafeMPC(env, mpc_config, cost_model=cost_model, n_ensembles=dynamic_config['n_ensembles'])

In [52]:
Transition = namedtuple('Transition', ['state', 'action', 'next_state', 'safe_next_state', 'rewards'])

def collect_interaction_data(env, cost_model, dynamics_model, size=1000):
    done = True
    truncated = False
    for i in trange(size, desc="Collecting interaction data"):
        if done or truncated:
            previous_obs, info = env.reset()

        action = env.action_space.sample()
        obs, reward, done, truncated, info = env.step(action)
        safe_next_state = 1 if info['rewards']['on_road_reward'] and not info['rewards']['collision_reward'] else 0
        data = Transition(torch.Tensor(previous_obs),
                                torch.Tensor(action),
                                torch.Tensor(obs),
                                torch.Tensor([safe_next_state]),
                                torch.Tensor([reward]))
        previous_obs = obs

        cost_model.add_data_point(data.next_state, 1 if data.safe_next_state == 0 else 0)
        x, y = np.concatenate((data.state.flatten(), data.action)), data.next_state.flatten()
        dynamics_model.add_data_point(x, y)
        
    return data

env = ContinuousRoundaboutEnv(render_mode='rgb_array')
env = RandomNormalNoisyObservation(env)
env.reset()
collect_interaction_data(env, cost_model, dynamic_model)

Collecting interaction data: 100%|██████████| 1000/1000 [00:17<00:00, 56.56it/s]


Transition(state=tensor([[ 1.0001e+00, -1.5704e-01,  9.9999e-01, -1.0621e-01,  8.3245e-02],
        [ 1.0000e+00,  1.4651e-01,  6.0666e-02,  1.1076e-01,  9.8168e-02],
        [-5.5372e-06, -1.6659e-06,  1.0253e-04,  7.5835e-05,  3.3696e-05],
        [ 7.8970e-05, -5.7181e-05,  3.6129e-05, -2.2790e-05,  2.9797e-05],
        [ 3.1186e-05,  2.4308e-06, -4.4669e-05, -6.8315e-05,  4.0842e-05]]), action=tensor([-0.2454, -0.7050]), next_state=tensor([[ 1.0001e+00, -1.5906e-01,  9.9988e-01,  3.5605e-02,  1.1422e-01],
        [ 1.0000e+00,  1.4902e-01,  1.0000e+00, -3.5159e-02,  7.0790e-02],
        [-4.1030e-05, -8.0414e-05,  4.9228e-06, -2.8079e-05, -1.6980e-05],
        [-7.0110e-06, -6.1815e-05,  2.0318e-05, -4.9781e-05,  7.3847e-05],
        [ 3.0159e-05, -3.0420e-06, -2.8419e-05,  5.9205e-06, -9.7853e-06]]), safe_next_state=tensor([0.]), rewards=tensor([0.]))

In [53]:
dynamic_model.reset_model()
dynamic_model.fit(use_data_buf=True, normalize=True)
cost_model.fit()

return data util  1000
[4/70],loss train m: 0.6256, v: 0.0001, test m: 0.7306, v: 0.0000
[9/70],loss train m: 0.5146, v: 0.0000, test m: 0.6841, v: 0.0000
[14/70],loss train m: 0.4374, v: 0.0001, test m: 0.6781, v: 0.0000
[19/70],loss train m: 0.3819, v: 0.0000, test m: 0.6689, v: 0.0001
[24/70],loss train m: 0.3360, v: 0.0000, test m: 0.6575, v: 0.0000
[29/70],loss train m: 0.2932, v: 0.0000, test m: 0.6611, v: 0.0001
[34/70],loss train m: 0.2553, v: 0.0001, test m: 0.6551, v: 0.0001
[39/70],loss train m: 0.2267, v: 0.0001, test m: 0.6594, v: 0.0002
[44/70],loss train m: 0.2005, v: 0.0001, test m: 0.6655, v: 0.0001
[49/70],loss train m: 0.1815, v: 0.0000, test m: 0.6699, v: 0.0008
[54/70],loss train m: 0.1566, v: 0.0000, test m: 0.6736, v: 0.0006
[59/70],loss train m: 0.1388, v: 0.0000, test m: 0.6729, v: 0.0002
[64/70],loss train m: 0.1243, v: 0.0000, test m: 0.6752, v: 0.0002
[LightGBM] [Info] Number of positive: 773, number of negative: 227
[LightGBM] [Info] Auto-choosing col-wise 

In [55]:
import time
from safembrl.utils.logx import EpochLogger
from safembrl.utils.run_utils import setup_logger_kwargs, combined_shape, load_config, seed_torch

logger_kwargs = setup_logger_kwargs('safembrl', 24, './logs')
logger = EpochLogger(**logger_kwargs)

env = ContinuousRoundaboutEnv()
env = RandomNormalNoisyObservation(env)
env.reset()

start_time = time.time()
# Main loop: collect experience in env and update/log each epoch
total_len = 0 # total interactions
total_epi = 0
for epoch in tqdm(range(5), desc='Epoch'): # update models per epoch
    for test_episode in tqdm(range(30), desc='Collecting episodes'): # collect data for episodes length
        obs, _, = env.reset()
        ep_ret, ep_cost, done = 0, 0, False
        obs = obs.flatten()
        mpc_controller.reset()
        done, truncated = False, False
        # with tqdm(total=11, desc='Collecting interactions') as pbar:
        while not done and not truncated:
            action = np.squeeze(np.array([mpc_controller.act(model=dynamic_model, state=obs)]))
            obs_next, reward, done, truncated, info = env.step(action)
            # print("reward: ", reward)
            # print("done: ", done)
            # print("truncated: ", truncated)
            # print("info: ", info)
            total_len += 1
            ep_ret += reward
            if not truncated and not done:
                x = np.concatenate((obs.flatten(), action))
                y = obs_next.flatten() #- obs
                dynamic_model.add_data_point(x, y)
                cost = 1 if reward == 0 else 0
                ep_cost += cost
                cost_model.add_data_point(obs_next, cost)
            obs = obs_next
                # pbar.update(1)
        logger.store(Rewards=ep_ret, Cost=ep_cost)
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('Episode', total_epi)
        logger.log_tabular('Cost', average_only=True)
        logger.log_tabular('TotalEnvInteracts', total_len)
        logger.log_tabular('Time', time.time()-start_time)
        logger.dump_tabular()
        total_epi += 1
    # training the model
    dynamic_model.fit(use_data_buf=True, normalize=True)
    cost_model.fit()
env.close()

[32;1mLogging data to ./logs/safembrl/safembrl_s24/progress.txt[0m


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for epoch in tqdm(range(5), desc='Epoch'): # update models per epoch


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for test_episode in tqdm(range(30), desc='Collecting episodes'): # collect data for episodes length


Collecting episodes:   0%|          | 0/30 [00:00<?, ?it/s]

---------------------------------------
|             Epoch |               0 |
|           Episode |               0 |
|              Cost |               5 |
| TotalEnvInteracts |              11 |
|              Time |            11.4 |
---------------------------------------
---------------------------------------
|             Epoch |               0 |
|           Episode |               1 |
|              Cost |               5 |
| TotalEnvInteracts |              22 |
|              Time |            22.5 |
---------------------------------------
---------------------------------------
|             Epoch |               0 |
|           Episode |               2 |
|              Cost |               7 |
| TotalEnvInteracts |              33 |
|              Time |            33.8 |
---------------------------------------
---------------------------------------
|             Epoch |               0 |
|           Episode |               3 |
|              Cost |               2 |


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for test_episode in tqdm(range(30), desc='Collecting episodes'): # collect data for episodes length


Collecting episodes:   0%|          | 0/30 [00:00<?, ?it/s]

---------------------------------------
|             Epoch |               1 |
|           Episode |              30 |
|              Cost |               5 |
| TotalEnvInteracts |             258 |
|              Time |             275 |
---------------------------------------
---------------------------------------
|             Epoch |               1 |
|           Episode |              31 |
|              Cost |               0 |
| TotalEnvInteracts |             262 |
|              Time |             280 |
---------------------------------------
---------------------------------------
|             Epoch |               1 |
|           Episode |              32 |
|              Cost |               2 |
| TotalEnvInteracts |             273 |
|              Time |             292 |
---------------------------------------
---------------------------------------
|             Epoch |               1 |
|           Episode |              33 |
|              Cost |               2 |


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for test_episode in tqdm(range(30), desc='Collecting episodes'): # collect data for episodes length


Collecting episodes:   0%|          | 0/30 [00:00<?, ?it/s]

---------------------------------------
|             Epoch |               2 |
|           Episode |              60 |
|              Cost |               4 |
| TotalEnvInteracts |             573 |
|              Time |             624 |
---------------------------------------
---------------------------------------
|             Epoch |               2 |
|           Episode |              61 |
|              Cost |               2 |
| TotalEnvInteracts |             584 |
|              Time |             636 |
---------------------------------------
---------------------------------------
|             Epoch |               2 |
|           Episode |              62 |
|              Cost |               5 |
| TotalEnvInteracts |             595 |
|              Time |             648 |
---------------------------------------
---------------------------------------
|             Epoch |               2 |
|           Episode |              63 |
|              Cost |               5 |


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for test_episode in tqdm(range(30), desc='Collecting episodes'): # collect data for episodes length


Collecting episodes:   0%|          | 0/30 [00:00<?, ?it/s]

---------------------------------------
|             Epoch |               3 |
|           Episode |              90 |
|              Cost |               1 |
| TotalEnvInteracts |             902 |
|              Time |             990 |
---------------------------------------
---------------------------------------
|             Epoch |               3 |
|           Episode |              91 |
|              Cost |               0 |
| TotalEnvInteracts |             913 |
|              Time |           1e+03 |
---------------------------------------
---------------------------------------
|             Epoch |               3 |
|           Episode |              92 |
|              Cost |               3 |
| TotalEnvInteracts |             924 |
|              Time |        1.02e+03 |
---------------------------------------
---------------------------------------
|             Epoch |               3 |
|           Episode |              93 |
|              Cost |               2 |


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for test_episode in tqdm(range(30), desc='Collecting episodes'): # collect data for episodes length


Collecting episodes:   0%|          | 0/30 [00:00<?, ?it/s]

---------------------------------------
|             Epoch |               4 |
|           Episode |             120 |
|              Cost |               0 |
| TotalEnvInteracts |        1.23e+03 |
|              Time |        1.38e+03 |
---------------------------------------
---------------------------------------
|             Epoch |               4 |
|           Episode |             121 |
|              Cost |               1 |
| TotalEnvInteracts |        1.24e+03 |
|              Time |        1.39e+03 |
---------------------------------------
---------------------------------------
|             Epoch |               4 |
|           Episode |             122 |
|              Cost |               1 |
| TotalEnvInteracts |        1.25e+03 |
|              Time |        1.41e+03 |
---------------------------------------
---------------------------------------
|             Epoch |               4 |
|           Episode |             123 |
|              Cost |               0 |


In [57]:
env = ContinuousRoundaboutEnv(render_mode='rgb_array')
env = RandomNormalNoisyObservation(env)
env.reset()

env, display = record_videos(env)

obs, _ = env.reset()
done = False
truncated = False
while not done and not truncated:
    action = mpc_controller.act(dynamic_model, obs.flatten())
    obs, reward, done, truncated, info = env.step(action)

display.stop()
del display
env.close()

show_videos()

[12/Dec/2023 22:55:46] INFO - Successfully started X with display ":1313".


  logger.warn(


Moviepy - Building video /Users/neha/Documents/College/f23/CS378/videos/rl-video-episode-0.mp4.
Moviepy - Writing video /Users/neha/Documents/College/f23/CS378/videos/rl-video-episode-0.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /Users/neha/Documents/College/f23/CS378/videos/rl-video-episode-0.mp4
