In [None]:
import datetime
from ray import train, tune, air
from ray.rllib.algorithms.sac import SACConfig
from common.envUtils import *

# Bind the datetime *class* explicitly. This cell's `import datetime` yields the
# module, which has no `now()`; the previous `datetime.now()` call only worked
# if the wildcard import from common.envUtils (or leftover kernel state)
# happened to rebind `datetime` to the class.
from datetime import datetime as _datetime_cls

# Every run writes into its own timestamped results directory.
TASK = "Reach_"
experiment_name = TASK + _datetime_cls.now().strftime("%Y-%m-%d_%H-%M-%S")
LOGDIR = f"/home/ey/rl/src/rlreach2/rlreach/ray/db/ray_results/{experiment_name}"

# SAC configuration for ReachEnvGym. The environment receives the run's log
# directory through env_config.
config = (
    SACConfig()
    .environment(
        env=ReachEnvGym,
        env_config={"log_dir": LOGDIR},
    )
    .training(
        initial_alpha=0.2,
        actor_lr=1e-3,
        critic_lr=1e-3,
        alpha_lr=1e-3,
        target_entropy="auto",
        n_step=1,
        tau=0.005,
        train_batch_size=128,
        target_network_update_freq=1,
        replay_buffer_config={
            "type": "EpisodeReplayBuffer",
            "capacity": 1000000,
        },
        num_steps_sampled_before_learning_starts=1000,
        model={
            "fcnet_hiddens": [512, 512],
            "fcnet_activation": "relu",
            "post_fcnet_hiddens": [],
            "post_fcnet_activation": None,
            "post_fcnet_weights_initializer": "orthogonal_",
            "post_fcnet_weights_initializer_config": {"gain": 0.01},
        },
    )
    # NOTE(review): `num_cpus_per_worker` / `num_learner_workers` look like the
    # older spellings of these options; confirm against the installed Ray
    # version before renaming them.
    .resources(
        num_gpus=0.25,      # or 0.25, depending on the machine configuration
        num_cpus_per_worker=1,
        num_learner_workers=1,
    )
    .framework("torch")
    .reporting(
        metrics_num_episodes_for_smoothing=5,
        min_sample_timesteps_per_iteration=1000,
    )
    .evaluation(
        evaluation_interval=1,
        evaluation_num_env_runners=1,
        evaluation_config={"seed": 22},
    )
    .env_runners(
        num_env_runners=6,             # number of processes
        num_envs_per_env_runner=1,     # number of environments
        # gym_env_vectorize_mode="ASYNC"
    )
)


# Launch the run through Ray Tune: results are stored under LOGDIR/reach, a
# checkpoint is requested every 10 iterations and at the end, and the stop
# criterion is keyed on the evaluation mean episode return (18000.0).
# (The `tunner` spelling is kept because other cells may refer to this name.)
tunner = tune.Tuner(
    trainable=config.algo_class,
    param_space=config,
    run_config=train.RunConfig(
        name="reach",
        storage_path=LOGDIR,
        log_to_file=True,
        checkpoint_config=air.CheckpointConfig(
            checkpoint_frequency=10,
            checkpoint_at_end=True,
        ),
        stop={"evaluation/env_runners/episode_return_mean": 18000.0}
    ),
)

results = tunner.fit()

In [1]:
# Scratch demo: slice assignment overwrites the first three items of l1 with
# the last three items of l2; l2 itself is left untouched.
l2 = [111, 222, 333, 444, 555, 666]
l1 = [123, 234, 456, 789, 910, 101]

tail_of_l2 = l2[-3:]
l1[0:3] = tail_of_l2

# One list per line, in the same order as before.
print(l1, l2, sep="\n")

[444, 555, 666, 789, 910, 101]
[111, 222, 333, 444, 555, 666]


In [None]:
from pathlib import Path
import gymnasium as gym
import numpy as np 
import torch
from ray.rllib.core.rl_module import RLModule
from ray.rllib.algorithms.sac import SACConfig
from ray.rllib.models.torch.torch_distributions import TorchDiagGaussian
from common.envUtils import *

# This cell never imports `datetime`, so the previous `datetime.now()` call
# depended on an earlier cell or on the wildcard import from common.envUtils
# having bound that name to the datetime class. Import the class explicitly so
# the cell also runs on a fresh kernel.
from datetime import datetime as _datetime_cls

# Timestamped run directory (not used further in this cell; kept so the
# notebook globals stay the same as before).
TASK = "Reach_"
experiment_name = TASK + _datetime_cls.now().strftime("%Y-%m-%d_%H-%M-%S")
LOGDIR = f"/home/ey/rl/src/rlreach2/rlreach/ray/db/ray_results/{experiment_name}"

# Restore only the policy's RLModule from the saved checkpoint's
# learner_group/learner/rl_module/default_policy sub-directory.
checkpoint_path = "/home/ey/rl/src/rlreach2/rlreach/ray/db/ray_results/Reach_2025-08-09_16-32-52/reach/SAC_ReachEnvGym_714b7_00000_0_2025-08-09_16-32-53/checkpoint_000283"
rl_module = RLModule.from_checkpoint(
    Path(checkpoint_path)
    / "learner_group"
    / "learner"
    / "rl_module"
    / "default_policy"
)

env = make_reach_env()
obs, info = env.reset()

# print(obs.dtype)
# Cast the observation to float32 and add a leading batch dimension of size 1.
obs_batch = torch.from_numpy(obs.astype(np.float32)).unsqueeze(0)
model_outputs = rl_module.forward_inference({"obs": obs_batch})
print(model_outputs)

# Build the module's inference action distribution from its outputs and sample
# one action from it.
logits = model_outputs["action_dist_inputs"]
dist_class = rl_module.get_inference_action_dist_cls()
dist = dist_class.from_logits(logits)
action_sample = dist.sample()

# Drop the batch dimension and convert to a float32 NumPy array for env.step().
action = action_sample.squeeze(0).detach().numpy().astype(np.float32)
print(action_sample)
print(action)
obs, reward, terminated, truncated, info = env.step(action)

print(f"obs:{obs}")
print(f"reward:{reward}")
print(f"terminated:{terminated}")
print(f"truncated:{truncated}")
print(f"info:{info}")


In [None]:
import datetime
from ray import train, tune, air
from ray.rllib.algorithms.sac import SACConfig
from common.envUtils import *

# Bind the datetime *class* explicitly. This cell's `import datetime` yields the
# module, which has no `now()`; the previous `datetime.now()` call only worked
# if the wildcard import from common.envUtils (or leftover kernel state)
# happened to rebind `datetime` to the class.
from datetime import datetime as _datetime_cls

# Every run writes into its own timestamped results directory.
TASK = "Reach_"
experiment_name = TASK + _datetime_cls.now().strftime("%Y-%m-%d_%H-%M-%S")
LOGDIR = f"/home/ey/rl/src/rlreach2/rlreach/ray/db/ray_results/{experiment_name}"

# SAC configuration for ReachEnvSimpleGym. The environment receives the run's
# log directory through env_config.
config = (
    SACConfig()
    .environment(
        env=ReachEnvSimpleGym,
        env_config={"log_dir": LOGDIR},
    )
    .training(
        initial_alpha=0.2,
        actor_lr=1e-4,
        critic_lr=1e-4,
        alpha_lr=1e-4,
        target_entropy="auto",
        n_step=1,
        tau=0.005,
        train_batch_size=128,
        target_network_update_freq=1,
        # The former "learning_starts": 1000 entry was dropped from this dict:
        # it duplicated num_steps_sampled_before_learning_starts below and is
        # not present in the sibling training cells' buffer configs.
        replay_buffer_config={
            "type": "EpisodeReplayBuffer",
            "capacity": 1000000,
        },
        num_steps_sampled_before_learning_starts=1000,
        model={
            "fcnet_hiddens": [512, 512],
            "fcnet_activation": "relu",
            "post_fcnet_hiddens": [],
            "post_fcnet_activation": None,
            "post_fcnet_weights_initializer": "orthogonal_",
            "post_fcnet_weights_initializer_config": {"gain": 0.01},
        },
    )
    # NOTE(review): `num_cpus_per_worker` / `num_learner_workers` look like the
    # older spellings of these options; confirm against the installed Ray
    # version before renaming them.
    .resources(
        num_gpus=0.25,      # or 0.25, depending on the machine configuration
        num_cpus_per_worker=1,
        num_learner_workers=1,
    )
    .framework("torch")
    .reporting(
        metrics_num_episodes_for_smoothing=5,
        min_sample_timesteps_per_iteration=1000,
    )
    .evaluation(
        evaluation_interval=1,
        evaluation_num_env_runners=1,
        evaluation_config={"seed": 42},
    )
    .env_runners(
        num_env_runners=6,             # number of processes
        num_envs_per_env_runner=1,     # number of environments
        # gym_env_vectorize_mode="ASYNC"
    )
)


# Launch the run through Ray Tune: results are stored under LOGDIR/reach, a
# checkpoint is requested every 10 iterations and at the end, and the stop
# criterion is keyed on the evaluation mean episode return (18000.0).
# (The `tunner` spelling is kept because other cells may refer to this name.)
tunner = tune.Tuner(
    trainable=config.algo_class,
    param_space=config,
    run_config=train.RunConfig(
        name="reach",
        storage_path=LOGDIR,
        log_to_file=True,
        checkpoint_config=air.CheckpointConfig(
            checkpoint_frequency=10,
            checkpoint_at_end=True,
        ),
        stop={"evaluation/env_runners/episode_return_mean": 18000.0}
    ),
)

results = tunner.fit()

In [None]:
import datetime
from ray import train, tune, air
from ray.rllib.algorithms.sac import SACConfig
from common.envUtils import *

# Bind the datetime *class* explicitly. This cell's `import datetime` yields the
# module, which has no `now()`; the previous `datetime.now()` call only worked
# if the wildcard import from common.envUtils (or leftover kernel state)
# happened to rebind `datetime` to the class.
from datetime import datetime as _datetime_cls

# Every run writes into its own timestamped results directory.
TASK = "Reach_"
experiment_name = TASK + _datetime_cls.now().strftime("%Y-%m-%d_%H-%M-%S")
LOGDIR = f"/home/ey/rl/src/rlreach2/rlreach/ray/db/ray_results/{experiment_name}"

# SAC configuration for ReachEnvHERGym. The environment receives the run's log
# directory through env_config.
config = (
    SACConfig()
    .environment(
        env=ReachEnvHERGym,
        env_config={"log_dir": LOGDIR},
    )
    .training(
        initial_alpha=0.2,
        actor_lr=1e-4,
        critic_lr=1e-4,
        alpha_lr=1e-4,
        target_entropy="auto",
        n_step=1,
        tau=0.005,
        train_batch_size=128,
        target_network_update_freq=1,
        replay_buffer_config={
            "type": "EpisodeReplayBuffer",
            "capacity": 1000000,
        },
        num_steps_sampled_before_learning_starts=1000,
        model={
            "fcnet_hiddens": [512, 512],
            "fcnet_activation": "relu",
            "post_fcnet_hiddens": [],
            "post_fcnet_activation": None,
            "post_fcnet_weights_initializer": "orthogonal_",
            "post_fcnet_weights_initializer_config": {"gain": 0.01},
        },
    )
    # NOTE(review): `num_cpus_per_worker` / `num_learner_workers` look like the
    # older spellings of these options; confirm against the installed Ray
    # version before renaming them.
    .resources(
        num_gpus=0.25,      # or 0.25, depending on the machine configuration
        num_cpus_per_worker=1,
        num_learner_workers=1,
    )
    .framework("torch")
    .reporting(
        metrics_num_episodes_for_smoothing=5,
        min_sample_timesteps_per_iteration=1000,
    )
    .evaluation(
        evaluation_interval=1,
        evaluation_num_env_runners=1,
        evaluation_config={"seed": 42},
    )
    .env_runners(
        num_env_runners=6,             # number of processes
        num_envs_per_env_runner=1,     # number of environments
        # gym_env_vectorize_mode="ASYNC"
    )
)


# Launch the run through Ray Tune: results are stored under LOGDIR/reach, a
# checkpoint is requested every 10 iterations and at the end, and the stop
# criterion is keyed on the evaluation mean episode return (18000.0).
# (The `tunner` spelling is kept because other cells may refer to this name.)
tunner = tune.Tuner(
    trainable=config.algo_class,
    param_space=config,
    run_config=train.RunConfig(
        name="reach",
        storage_path=LOGDIR,
        log_to_file=True,
        checkpoint_config=air.CheckpointConfig(
            checkpoint_frequency=10,
            checkpoint_at_end=True,
        ),
        stop={"evaluation/env_runners/episode_return_mean": 18000.0}
    ),
)

results = tunner.fit()

0,1
Current time:,2025-08-29 21:20:01
Running for:,05:38:27.95
Memory:,13.4/15.3 GiB

Trial name,status,loc,iter,total time (s),num_training_step_calls_per_iteration,num_env_steps_sampled_lifetime
SAC_ReachEnvHERGym_95b0e_00000,RUNNING,127.0.1.1:26461,417,20207.9,200,417000


[36m(pid=26461)[0m E0000 00:00:1756453294.304231   26461 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=26461)[0m E0000 00:00:1756453294.307803   26461 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(pid=26461)[0m W0000 00:00:1756453294.317266   26461 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
[36m(pid=26461)[0m W0000 00:00:1756453294.317288   26461 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
[36m(pid=26461)[0m W0000 00:00:1756453294.317290   26461 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
[36m(pid=26461)

[36m(SingleAgentEnvRunner pid=26592)[0m [chatbus_10] 共享内存不存在，创建成功
[36m(SingleAgentEnvRunner pid=26589)[0m [chatbus_8] 共享内存已存在，连接成功


[36m(SingleAgentEnvRunner pid=26590)[0m Exception raised in creation task: The actor died because of an error raised in its creation task, [36mray::SingleAgentEnvRunner.__init__()[39m (pid=26590, ip=127.0.1.1, actor_id=3119fbc62d947db566ebaf7001000000, repr=<ray.rllib.env.single_agent_env_runner.SingleAgentEnvRunner object at 0x7ce17820c2b0>)
[36m(SingleAgentEnvRunner pid=26590)[0m   File "/home/ey/anaconda3/envs/rlreach310/lib/python3.10/multiprocessing/shared_memory.py", line 104, in __init__
[36m(SingleAgentEnvRunner pid=26590)[0m     self._fd = _posixshmem.shm_open(
[36m(SingleAgentEnvRunner pid=26590)[0m FileNotFoundError: [Errno 2] No such file or directory: '/chatbus_8'
[36m(SingleAgentEnvRunner pid=26590)[0m 
[36m(SingleAgentEnvRunner pid=26590)[0m During handling of the above exception, another exception occurred:
[36m(SingleAgentEnvRunner pid=26590)[0m 
[36m(SingleAgentEnvRunner pid=26590)[0m [36mray::SingleAgentEnvRunner.__init__()[39m (pid=26590, ip=127.

[36m(SingleAgentEnvRunner pid=27006)[0m [chatbus_11] 共享内存不存在，创建成功[32m [repeated 3x across cluster][0m
[36m(SingleAgentEnvRunner pid=26593)[0m [chatbus_8] 共享内存已存在，连接成功


[36m(pid=27073)[0m E0000 00:00:1756453313.053189   27073 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=27073)[0m E0000 00:00:1756453313.056679   27073 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(pid=27073)[0m W0000 00:00:1756453313.066198   27073 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.[32m [repeated 4x across cluster][0m
[36m(SingleAgentEnvRunner pid=27006)[0m [1m[32m[robosuite INFO] [0mLoading controller configuration from: ./common/reachController.json (composite_controller_factory.py:121)[32m [repeated 2x across cluster][0m
[36m(SingleAgentEnvRunner pid=27006)[0m   gym.logger.warn([32m [repeated 2x across cluster][0m
[36m(_WrappedExecutable pid=27073)[0m Setting up proc