In [None]:
from selenium import webdriver
import gymnasium as gym

import time
import os

%cd ..\RL
%pip install -e .
%load_ext autoreload
%autoreload 2

options = webdriver.ChromeOptions()
options.add_argument("--headless")  # No need for pyvirtualdisplay
options.add_argument("--window-size=1400,900")

driver = webdriver.Chrome(options=options)

In [2]:
from BC.infrastructure.notebook_util import *
from BC.scripts.bc_dagger_runner import run_training_loop

In [3]:
# CartPole-v1 driven by random actions, then shown as a video.
# wrap_env / show_video are project helpers (presumably they record and
# display the rollout; confirm in BC.infrastructure.notebook_util).
env = wrap_env(gym.make("CartPole-v1", render_mode="rgb_array"))  # Needed for rendering

env.reset()
for _ in range(100):
    # Take random actions
    random_action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(random_action)
    if not (terminated or truncated):
        continue
    # Episode finished: start a new one and keep stepping.
    obs, _ = env.reset()

env.close()

print('Loading video...')
show_video(env)

Loading video...
video/CartPole-v1*.mp4


In [5]:
# Ant-v5 driven by random actions, then shown as a video.
# wrap_env / show_video are project helpers (presumably they record and
# display the rollout; confirm in BC.infrastructure.notebook_util).
env = wrap_env(gym.make("Ant-v5", render_mode="rgb_array"))  # Needed for rendering

env.reset()
for _ in range(100):
    # Take random actions
    random_action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(random_action)
    if not (terminated or truncated):
        continue
    # Episode finished: start a new one and keep stepping.
    obs, _ = env.reset()

env.close()

print('Loading video...')
show_video(env)

Loading video...
video/Ant-v5*.mp4


In [8]:
# HalfCheetah-v5 driven by random actions, then shown as a video.
# wrap_env / show_video are project helpers (presumably they record and
# display the rollout; confirm in BC.infrastructure.notebook_util).
env = wrap_env(gym.make("HalfCheetah-v5", render_mode="rgb_array"))  # Needed for rendering

env.reset()
for _ in range(100):
    # Take random actions
    random_action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(random_action)
    if not (terminated or truncated):
        continue
    # Episode finished: start a new one and keep stepping.
    obs, _ = env.reset()

env.close()

print('Loading video...')
show_video(env)

Loading video...
video/HalfCheetah-v5*.mp4


## Behavior Cloning

In [4]:
class Args:
    """Behavior-cloning run configuration, readable by attribute or by key.

    Every setting below is a class-level default. ``args[key]`` and
    ``args[key] = value`` are forwarded to ``getattr`` / ``setattr``, so the
    object can be used where dictionary-style access is expected. Values
    assigned on an instance are stored on that instance only and leave the
    class defaults untouched.
    """

    def __getitem__(self, key):
        """Return the setting named ``key``; AttributeError if it is absent."""
        return getattr(self, key)

    def __setitem__(self, key, val):
        """Store ``val`` under the setting name ``key`` on this instance."""
        setattr(self, key, val)

    # expert data
    expert_policy_file: str = 'BC/policies/experts/Ant.pkl'
    expert_data: str = 'BC/expert_data/expert_data_Ant-v4.pkl'
    env_name: str = 'Ant-v4'  # ['Ant-v4', 'Walker2d-v4', 'HalfCheetah-v4', 'Hopper-v4']
    exp_name: str = 'bc_ant'
    do_dagger: bool = False
    ep_len: int = 1000
    save_params: bool = False

    num_agent_train_steps_per_iter: int = 1000
    n_iter: int = 1

    # batches & buffers
    batch_size_initial: int = 2000
    batch_size: int = 1000
    eval_batch_size: int = 1000
    train_batch_size: int = 100
    max_replay_buffer_size: int = 1000_000

    # MLP
    n_layers: int = 2
    size: int = 64
    learning_rate: float = 5e-3

    # logging
    video_log_freq: int = 5
    scalar_log_freq: int = 1

    # gpu & run-time settings
    no_gpu: bool = False
    which_gpu: int = 0
    seed: int = 42

# Instantiate the run configuration declared by the Args class.
args = Args()

# vars(args) is the instance's own attribute dict: a live view of the
# per-instance overrides, not a copy of the class-level defaults.
params = vars(args)

# The run mode picks the log-directory prefix and constrains n_iter.
if args.do_dagger:
    logdir_prefix = 'dagger_'
    assert args.n_iter > 1, "DAgger requires multiple iterations"
else:
    logdir_prefix = 'behavior_cloning_'
    assert args.n_iter == 1, "Without DAgger, only one iteration is allowed"

# Log directory layout: data/<prefix><exp_name>_<env_name>_<timestamp>
data_path = 'data'
logdir = logdir_prefix + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
logdir = os.path.join(data_path, logdir)
# Stored through Args.__setitem__ so the write is explicit; `params` still
# reflects it because it aliases the same instance dict.
args['logdir'] = logdir
# One call creates `data` and the run directory; exist_ok replaces the
# separate exists() checks and removes the check-then-create race.
os.makedirs(logdir, exist_ok=True)

In [5]:
# Show the log directory that the configuration cell stored on args.
print(args.logdir)

data\behavior_cloning_bc_ant_Ant-v4_02-02-2025_02-20-08


In [6]:
# Start training with the behavior-cloning configuration (do_dagger=False, n_iter=1).
run_training_loop(args)

###########################
logging outputs to  data\behavior_cloning_bc_ant_Ant-v4_02-02-2025_02-20-08
###########################
Using CPU


  logger.deprecation(


Loading expert policy from... BC/policies/experts/Ant.pkl
obs (1, 111) (1, 111)
Expert policy loaded.


********** Iteration 0 ************

Collecting data to be used for training...

Training agent...

Logging training statistics...

Collecting video data...

Collecting data for eval metrics...
	Eval_AverageReturn: 4346.91796875
	Eval_StdReturn: 0.0
	Eval_MaxReturn: 4346.91796875
	Eval_MinReturn: 4346.91796875
	Eval_AverageEpLen: 1000.0
	Train_AverageReturn: 4681.891673935816
	Train_StdReturn: 30.70862278765526
	Train_MaxReturn: 4712.600296723471
	Train_MinReturn: 4651.18305114816
	Train_AverageEpLen: 1000.0
	Training Loss: -23.961570739746094
	Train_EnvstepsSoFar: 0
	TimeSinceStart: 30.374686241149902
	Initial_DataCollection_AverageReturn: 4681.891673935816
Done logging...




In [19]:
class Args:
    """DAgger run configuration, readable by attribute or by key.

    Every setting below is a class-level default. ``args[key]`` and
    ``args[key] = value`` are forwarded to ``getattr`` / ``setattr``, so the
    object can be used where dictionary-style access is expected. Values
    assigned on an instance are stored on that instance only and leave the
    class defaults untouched.
    """

    def __getitem__(self, key):
        """Return the setting named ``key``; AttributeError if it is absent."""
        return getattr(self, key)

    def __setitem__(self, key, val):
        """Store ``val`` under the setting name ``key`` on this instance."""
        setattr(self, key, val)

    # expert data
    expert_policy_file: str = 'BC/policies/experts/Ant.pkl'
    expert_data: str = 'BC/expert_data/expert_data_Ant-v4.pkl'
    env_name: str = 'Ant-v4'  # ['Ant-v4', 'Walker2d-v4', 'HalfCheetah-v4', 'Hopper-v4']
    exp_name: str = 'bc_ant'
    do_dagger: bool = True
    ep_len: int = 1000
    save_params: bool = False

    num_agent_train_steps_per_iter: int = 1000
    n_iter: int = 5

    # batches & buffers
    batch_size_initial: int = 2000
    batch_size: int = 1000
    eval_batch_size: int = 1000
    train_batch_size: int = 100
    max_replay_buffer_size: int = 1000_000

    # MLP
    n_layers: int = 2
    size: int = 64
    learning_rate: float = 5e-3

    # logging
    video_log_freq: int = 5
    scalar_log_freq: int = 1

    # gpu & run-time settings
    no_gpu: bool = True
    which_gpu: int = 0
    seed: int = 42

# Instantiate the run configuration declared by the Args class.
args = Args()

# vars(args) is the instance's own attribute dict: a live view of the
# per-instance overrides, not a copy of the class-level defaults.
params = vars(args)

# The run mode picks the log-directory prefix and constrains n_iter.
if args.do_dagger:
    logdir_prefix = 'dagger_'
    assert args.n_iter > 1, "DAgger requires multiple iterations"
else:
    logdir_prefix = 'behavior_cloning_'
    assert args.n_iter == 1, "Without DAgger, only one iteration is allowed"

# Log directory layout: data/<prefix><exp_name>_<env_name>_<timestamp>
data_path = 'data'
logdir = logdir_prefix + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
logdir = os.path.join(data_path, logdir)
# Stored through Args.__setitem__ so the write is explicit; `params` still
# reflects it because it aliases the same instance dict.
args['logdir'] = logdir
# One call creates `data` and the run directory; exist_ok replaces the
# separate exists() checks and removes the check-then-create race.
os.makedirs(logdir, exist_ok=True)

In [20]:
# Start training with the DAgger configuration (do_dagger=True, n_iter=5).
run_training_loop(args)

###########################
logging outputs to  data\dagger_bc_ant_Ant-v4_02-02-2025_02-49-48
###########################
Using CPU
Loading expert policy from... BC/policies/experts/Ant.pkl
obs (1, 111) (1, 111)
Expert policy loaded.


********** Iteration 0 ************

Collecting data to be used for training...

Collecting initial video data...
(18, 3, 250, 250)
(16, 3, 250, 250)

Collecting initial eval metrics...
	Eval_AverageReturn: -175.1470184326172
	Eval_StdReturn: 140.67811584472656
	Eval_MaxReturn: -41.085819244384766
	Eval_MinReturn: -688.044189453125
	Eval_AverageEpLen: 51.75
	Train_AverageReturn: 4681.891673935816
	Train_StdReturn: 30.70862278765526
	Train_MaxReturn: 4712.600296723471
	Train_MinReturn: 4651.18305114816
	Train_AverageEpLen: 1000.0
	Train_EnvstepsSoFar: 0
	TimeSinceStart: 0.9910755157470703
Done logging...



Training agent...

Logging training statistics...

Collecting video data...
(1000, 3, 250, 250)
(1000, 3, 250, 250)

Collecting data for eval metrics.

In [18]:
# Load the TensorBoard notebook extension and open it on the ./data log directory.
%load_ext tensorboard
%tensorboard --logdir ./data

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 10500), started 0:22:54 ago. (Use '!kill 10500' to kill it.)