# Parking with Hindsight Experience Replay

##  Warming up
We start with a few useful installs and imports:

In [None]:
#@title Install environment and agent
# !pip install highway-env
# TODO: we use the bleeding edge version because the current stable version does not support the latest gym>=0.21 versions. Revert back to stable at the next SB3 release.
# !pip install git+https://github.com/DLR-RM/stable-baselines3

# Environment
import gymnasium as gym

# Agent
from stable_baselines3 import HerReplayBuffer, SAC

In [None]:
#@title Import helpers for visualization of episodes
import sys
from tqdm.auto import trange
# Optional one-time setup commands, currently disabled.
# !pip install tensorboardx gym pyvirtualdisplay
# !apt-get install -y xvfb ffmpeg
# !git clone https://github.com/Farama-Foundation/HighwayEnv.git 2> /dev/null
# Put the project's scripts directory at the front of the import search path.
# This must run before the imports below, which are resolved after it.
# NOTE(review): this is an absolute, machine-specific path; it has to be
# adjusted before the notebook can run on another machine.
sys.path.insert(0, '/Users/anmol/githubRepos/CS269-Parking/scripts')
import highway_env

# NOTE(review): `utils` is presumably the helper module inside the scripts
# directory added above - confirm.
from utils import record_videos, show_videos

In [None]:
#@title Tensorboard - click the refresh button once training is running

# Load the TensorBoard notebook extension, then open it on the `logs` directory.
%load_ext tensorboard
%tensorboard --logdir logs

In [None]:
# Diagnostic cell: report which highway_env installation was imported and the
# speed limit constant exposed by its Vehicle class.
from highway_env.vehicle.kinematics import Vehicle

for label, value in (
    ("highway_env module file:", highway_env.__file__),
    ("Vehicle.MAX_SPEED:", Vehicle.MAX_SPEED),
):
    print(label, value)

In [None]:
# Complete configuration dictionary handed to `gym.make(..., config=...)`
# when the parking environment is created.
import numpy as np

parking_config = dict(
    # --- Observation ---
    observation=dict(
        type="KinematicsGoal",
        features=["x", "y", "vx", "vy", "cos_h", "sin_h"],
        scales=[100, 100, 5, 5, 1, 1],
        normalize=False,
    ),

    # --- Action ---
    action=dict(
        type="ContinuousAction",
        acceleration_range=(-2, 2),
        speed_range=(-5, 5),
    ),

    # --- Reward ---
    # One weight per observation feature, in order: x, y, vx, vy, cos_h, sin_h.
    reward_weights=[1, 0.5, 0.01, 0.01, 1, 1],
    success_goal_reward=0.15,
    collision_reward=-5,

    # --- Vehicle control ---
    steering_range=np.deg2rad(60),  # 60 degrees converted to radians

    # --- Simulation ---
    simulation_frequency=15,  # Hz
    policy_frequency=5,       # Hz
    # Episode length limit.
    # NOTE(review): the original comment said "steps"; the unit depends on the
    # installed highway-env version - confirm.
    duration=40,

    # --- Rendering ---
    screen_width=600,
    screen_height=300,
    centering_position=[0.5, 0.5],
    scaling=7,
    show_trajectories=True,

    # --- Environment setup ---
    controlled_vehicles=1,   # number of vehicles to control
    vehicles_count=12,       # number of parked vehicles (obstacles)
    add_walls=True,          # whether to add boundary walls

    # --- Additional parameters from AbstractEnv ---
    offscreen_rendering=False,
    manual_control=False,
    real_time_rendering=False,
)

In [None]:
# Sanity check: roll out one episode with randomly sampled actions, record it
# through the `record_videos` wrapper, then display the recording.
env = gym.make("parking-v0", render_mode="rgb_array", config=parking_config)
env = record_videos(env)
env.reset()
done = truncated = False
# `env.step` returns separate `done` and `truncated` flags. Stop on either
# one; checking only `done` keeps stepping an episode that has already been
# truncated.
while not (done or truncated):
    action = env.action_space.sample()
    obs, reward, done, truncated, info = env.step(action)
env.close()
show_videos()

In [None]:
#@title Training

LEARNING_STEPS = 1e6 # @param {type: "number"}

# Training environment, built from the shared configuration dictionary.
env = gym.make("parking-v0", config=parking_config)

# Replay-buffer options for the from-scratch constructor kept below for
# reference; the checkpoint-loading path does not pass them anywhere.
her_kwargs = {"n_sampled_goal": 4, "goal_selection_strategy": "future"}

# Reference: constructor for training a fresh SAC + HER agent (disabled).
# model = SAC('MultiInputPolicy', env, replay_buffer_class=HerReplayBuffer,
#             replay_buffer_kwargs=her_kwargs, verbose=1,
#             tensorboard_log="logs",
#             buffer_size=int(1e6),
#             learning_rate=1e-3,
#             gamma=0.95, batch_size=1024, tau=0.05,
#             policy_kwargs=dict(net_arch=[512, 512, 512]),
#             learning_starts=1000,
#             )

# Resume from a saved checkpoint and attach the environment created above.
# NOTE(review): absolute, machine-specific checkpoint path.
model = SAC.load(
    "/Users/anmol/githubRepos/CS269-Parking/scripts/model_20251129_005021_herRevParkEmpty.zip",
    env=env,
)
# Override the `learning_starts` value stored on the loaded model.
model.learning_starts = 10000
# LEARNING_STEPS is a float (1e6), so convert it to int for the step budget.
model.learn(total_timesteps=int(LEARNING_STEPS))


In [None]:
# model = SAC.load("/Users/anmol/githubRepos/CS269-Parking/scripts/model_20251129_005021_herRevParkEmpty.zip",env=env)
# loaded params
# print(model.learning_rate)
# print(model.gamma)
# print(model.batch_size)
# print(model.tau)
# print(model.train_freq)
# print(model.policy_kwargs)

# updating params
# model.learning_rate = 3e-4
# model.gamma = 0.98
# model.batch_size = 1024
# model.tau = 0.05

# updated params
# print(model.learning_rate)
# print(model.gamma)
# print(model.batch_size)
# print(model.tau)
# print(model.train_freq)
# print(model.policy_kwargs)

In [None]:
#@title Visualize a few episodes
from tqdm.auto import trange

N_EPISODES = 10  # @param {type: "integer"}

# Build the evaluation environment from the same `parking_config` that the
# training cell passes to `gym.make`. Without `config=`, the policy would be
# run in the environment's default configuration instead of the one it was
# trained in.
env = gym.make('parking-v0', render_mode='rgb_array', config=parking_config)
env = record_videos(env)
for episode in trange(N_EPISODES, desc="Test episodes"):
    obs, info = env.reset()
    done = truncated = False
    # Run the deterministic policy until the episode ends, by either flag.
    while not (done or truncated):
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, truncated, info = env.step(action)
env.close()
show_videos()