In [None]:
import datetime
from ray import train, tune, air
from ray.rllib.algorithms.sac import SACConfig
from common.envUtils import *

# Bind the datetime *class* explicitly. The cell-level `import datetime` only
# provides the module, which has no `now()`; without this the line below works
# only if `common.envUtils` happens to re-export the class via its star import.
from datetime import datetime as _datetime

# Prefix identifying the task; combined with a timestamp to name each run.
TASK = "Reach_"
# Unique run name, e.g. "Reach_2025-08-13_09-48-54" (local wall-clock time).
experiment_name = TASK + _datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# Directory for this run; passed below to the env config and to Tune storage.
LOGDIR = f"/home/ey/rl/src/rlreach2/rlreach/ray/db/ray_results/{experiment_name}"

# SAC configuration for ReachEnvGym, assembled one builder step at a time.

# HER wrapper settings nested inside the replay-buffer config.
# NOTE(review): confirm that the installed RLlib version actually consumes
# "wrap_buffer"/"wrapped_buffer" and provides a
# "HindsightExperienceReplayBuffer"; unknown keys may be silently ignored.
her_wrapper_settings = {
    "type": "HindsightExperienceReplayBuffer",
    "replay_mode": "independent",
    "her_strategy": "future",  # options: future, final, episode
    "replay_k": 4,  # number of HER samples generated per transition
    "goal_fn": None,  # optional custom goal-extraction function
}

replay_buffer_settings = {
    "type": "EpisodeReplayBuffer",
    "capacity": 1000000,
    "learning_starts": 1000,
    # HER-specific parameters
    "replay_mode": "independent",
    "replay_sequence_length": 1,
    "replay_burn_in": 0,
    "replay_zero_init_states": False,
    "storage_unit": "episodes",
    # Key part: HER wrapper configuration
    "wrap_buffer": True,
    "wrapped_buffer": her_wrapper_settings,
}

# Network layout: two 512-unit ReLU layers and no post-FC stack.
model_settings = {
    "fcnet_hiddens": [512, 512],
    "fcnet_activation": "relu",
    "post_fcnet_hiddens": [],
    "post_fcnet_activation": None,
    "post_fcnet_weights_initializer": "orthogonal_",
    "post_fcnet_weights_initializer_config": {"gain": 0.01},
}

config = SACConfig()
config = config.environment(
    env=ReachEnvGym,
    env_config={"log_dir": LOGDIR},
)
config = config.training(
    initial_alpha=0.2,
    actor_lr=1e-4,
    critic_lr=1e-4,
    alpha_lr=1e-4,
    target_entropy="auto",
    n_step=1,
    tau=0.005,
    train_batch_size=128,
    target_network_update_freq=1,
    replay_buffer_config=replay_buffer_settings,
    num_steps_sampled_before_learning_starts=1000,
    model=model_settings,
)
config = config.resources(
    num_gpus=0.25,  # fraction of a GPU; adjust to the machine's configuration
    num_cpus_per_worker=1,
    num_learner_workers=1,
)
config = config.framework("torch")
config = config.reporting(
    metrics_num_episodes_for_smoothing=5,
    min_sample_timesteps_per_iteration=1000,
)
config = config.evaluation(
    evaluation_interval=1,
    evaluation_num_env_runners=1,
    evaluation_config={"seed": 42},
)
config = config.env_runners(
    num_env_runners=5,  # number of env-runner processes
    num_envs_per_env_runner=1,  # number of environments per runner
    # gym_env_vectorize_mode="ASYNC" is intentionally left disabled
)


# Checkpoint every 10 training iterations and once more when the run ends.
checkpoint_settings = air.CheckpointConfig(
    checkpoint_frequency=10,
    checkpoint_at_end=True,
)

# Run-level settings: results are stored under LOGDIR, and the run stops once
# the reported evaluation mean episode return reaches the threshold below.
run_settings = train.RunConfig(
    name="reach",
    storage_path=LOGDIR,
    log_to_file=True,
    checkpoint_config=checkpoint_settings,
    stop={"evaluation/env_runners/episode_return_mean": 12000.0},
)

# Build the Tune experiment around the algorithm class chosen by the config.
tunner = tune.Tuner(
    trainable=config.algo_class,
    param_space=config,
    run_config=run_settings,
)

# Blocks until the experiment finishes (or a stop criterion is met).
results = tunner.fit()

0,1
Current time:,2025-08-14 08:59:11
Running for:,18:58:39.64
Memory:,13.2/15.3 GiB

Trial name,status,loc,iter,total time (s),num_training_step_ca lls_per_iteration,num_env_steps_sample d_lifetime
SAC_ReachEnvGym_d1f2c_00000,RUNNING,192.168.202.162:6776,1502,67633.7,167,1505000.0


[36m(pid=6776)[0m E0000 00:00:1755064832.440782    6776 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=6776)[0m E0000 00:00:1755064832.444170    6776 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(pid=6776)[0m W0000 00:00:1755064832.453127    6776 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
[36m(pid=6776)[0m W0000 00:00:1755064832.453145    6776 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
[36m(pid=6776)[0m W0000 00:00:1755064832.453147    6776 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
[36m(pid=6776)[0m W0

[36m(SingleAgentEnvRunner pid=6891)[0m [chatbus_1] 共享内存不存在，创建成功
[36m(SingleAgentEnvRunner pid=6895)[0m [chatbus_1] 共享内存已存在，连接成功


[36m(SingleAgentEnvRunner pid=6891)[0m   gym.logger.warn(
[36m(SingleAgentEnvRunner pid=6891)[0m   gym.logger.warn(
[36m(SAC pid=6776)[0m [2025-08-13 14:00:44,693 E 6776 6776] core_worker.cc:2740: Actor with class name: 'SingleAgentEnvRunner' and ID: '0c4fd437f9ce492ad442d2b701000000' has constructor arguments in the object store and max_restarts > 0. If the arguments in the object store go out of scope or are lost, the actor restart will fail. See https://github.com/ray-project/ray/issues/53727 for more details.
[36m(pid=7302)[0m E0000 00:00:1755064845.464260    7302 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=7302)[0m E0000 00:00:1755064845.467624    7302 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(pid=7302)[0m W0000 00:00:1755064845.476245    7302 computation_placer.cc:17

[36m(SingleAgentEnvRunner pid=7302)[0m [chatbus_3] 共享内存不存在，创建成功[32m [repeated 2x across cluster][0m
[36m(SingleAgentEnvRunner pid=6901)[0m [chatbus_1] 共享内存已存在，连接成功[32m [repeated 3x across cluster][0m


[36m(pid=7372)[0m E0000 00:00:1755064851.882582    7372 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=7372)[0m E0000 00:00:1755064851.886191    7372 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(pid=7372)[0m W0000 00:00:1755064851.895047    7372 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.[32m [repeated 4x across cluster][0m
[36m(SingleAgentEnvRunner pid=7302)[0m [1m[32m[robosuite INFO] [0mLoading controller configuration from: ./common/reachController.json (composite_controller_factory.py:121)[32m [repeated 2x across cluster][0m
[36m(SingleAgentEnvRunner pid=7302)[0m   gym.logger.warn([32m [repeated 2x across cluster][0m
[36m(_WrappedExecutable pid=7372)[0m Setting up process gr

In [None]:
from pathlib import Path
import gymnasium as gym
import numpy as np 
import torch
from ray.rllib.core.rl_module import RLModule
from ray.rllib.algorithms.sac import SACConfig
from ray.rllib.models.torch.torch_distributions import TorchDiagGaussian
from common.envUtils import *

# This cell never imports `datetime`, so bind the class explicitly instead of
# relying on leftover kernel state or on whatever `common.envUtils` re-exports.
from datetime import datetime as _datetime

# Prefix identifying the task; combined with a timestamp to name each run.
TASK = "Reach_"
# Unique run name, e.g. "Reach_2025-08-13_09-48-54" (local wall-clock time).
experiment_name = TASK + _datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# Run directory derived from the experiment name.
LOGDIR = f"/home/ey/rl/src/rlreach2/rlreach/ray/db/ray_results/{experiment_name}"

# Checkpoint of a previous SAC run from which the trained module is restored.
checkpoint_path = "/home/ey/rl/src/rlreach2/rlreach/ray/db/ray_results/Reach_2025-08-09_16-32-52/reach/SAC_ReachEnvGym_714b7_00000_0_2025-08-09_16-32-53/checkpoint_000283"

# The RLModule state sits in a nested sub-directory of the checkpoint.
rl_module_dir = Path(checkpoint_path).joinpath(
    "learner_group",
    "learner",
    "rl_module",
    "default_policy",
)
rl_module = RLModule.from_checkpoint(rl_module_dir)

# Create the environment and fetch the initial observation.
env = make_reach_env()
obs, info = env.reset()

# Cast the observation to float32 and add a leading batch dimension (B=1).
obs_float32 = obs.astype(np.float32)
obs_batch = torch.from_numpy(obs_float32).unsqueeze(0)

# Forward pass through the restored module in inference mode.
model_outputs = rl_module.forward_inference({"obs": obs_batch})
print(model_outputs)

# Build the action distribution from the module outputs and sample from it.
logits = model_outputs["action_dist_inputs"]
dist_class = rl_module.get_inference_action_dist_cls()
dist = dist_class.from_logits(logits)
action_sample = dist.sample()

# Drop the batch dimension and convert the sample to a float32 numpy array.
unbatched_sample = action_sample.squeeze(0).detach()
action = unbatched_sample.numpy().astype(np.float32)
print(action_sample)
print(action)

# Apply the sampled action for a single environment step and show the result.
obs, reward, terminated, truncated, info = env.step(action)

print(f"obs:{obs}")
print(f"reward:{reward}")
print(f"terminated:{terminated}")
print(f"truncated:{truncated}")
print(f"info:{info}")


In [None]:
import datetime
from ray import train, tune, air
from ray.rllib.algorithms.sac import SACConfig
from common.envUtils import *

# Bind the datetime *class* explicitly. The cell-level `import datetime` only
# provides the module, which has no `now()`; without this the line below works
# only if `common.envUtils` happens to re-export the class via its star import.
from datetime import datetime as _datetime

# Prefix identifying the task; combined with a timestamp to name each run.
TASK = "Reach_"
# Unique run name, e.g. "Reach_2025-08-13_09-48-54" (local wall-clock time).
experiment_name = TASK + _datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# Directory for this run; passed below to the env config and to Tune storage.
LOGDIR = f"/home/ey/rl/src/rlreach2/rlreach/ray/db/ray_results/{experiment_name}"

# SAC configuration for ReachEnvSimpleGym, assembled one builder step at a time.

# HER wrapper settings nested inside the replay-buffer config.
# NOTE(review): confirm that the installed RLlib version actually consumes
# "wrap_buffer"/"wrapped_buffer" and provides a
# "HindsightExperienceReplayBuffer"; unknown keys may be silently ignored.
her_wrapper_settings = {
    "type": "HindsightExperienceReplayBuffer",
    "replay_mode": "independent",
    "her_strategy": "future",  # options: future, final, episode
    "replay_k": 4,  # number of HER samples generated per transition
    "goal_fn": None,  # optional custom goal-extraction function
}

replay_buffer_settings = {
    "type": "EpisodeReplayBuffer",
    "capacity": 1000000,
    "learning_starts": 1000,
    # HER-specific parameters
    "replay_mode": "independent",
    "replay_sequence_length": 1,
    "replay_burn_in": 0,
    "replay_zero_init_states": False,
    "storage_unit": "episodes",
    # Key part: HER wrapper configuration
    "wrap_buffer": True,
    "wrapped_buffer": her_wrapper_settings,
}

# Network layout: two 512-unit ReLU layers and no post-FC stack.
model_settings = {
    "fcnet_hiddens": [512, 512],
    "fcnet_activation": "relu",
    "post_fcnet_hiddens": [],
    "post_fcnet_activation": None,
    "post_fcnet_weights_initializer": "orthogonal_",
    "post_fcnet_weights_initializer_config": {"gain": 0.01},
}

config = SACConfig()
config = config.environment(
    env=ReachEnvSimpleGym,
    env_config={"log_dir": LOGDIR},
)
config = config.training(
    initial_alpha=0.2,
    actor_lr=1e-4,
    critic_lr=1e-4,
    alpha_lr=1e-4,
    target_entropy="auto",
    n_step=1,
    tau=0.005,
    train_batch_size=128,
    target_network_update_freq=1,
    replay_buffer_config=replay_buffer_settings,
    num_steps_sampled_before_learning_starts=1000,
    model=model_settings,
)
config = config.resources(
    num_gpus=0.25,  # fraction of a GPU; adjust to the machine's configuration
    num_cpus_per_worker=1,
    num_learner_workers=1,
)
config = config.framework("torch")
config = config.reporting(
    metrics_num_episodes_for_smoothing=5,
    min_sample_timesteps_per_iteration=1000,
)
config = config.evaluation(
    evaluation_interval=1,
    evaluation_num_env_runners=1,
    evaluation_config={"seed": 42},
)
config = config.env_runners(
    num_env_runners=6,  # number of env-runner processes
    num_envs_per_env_runner=1,  # number of environments per runner
    # gym_env_vectorize_mode="ASYNC" is intentionally left disabled
)


# Checkpoint every 10 training iterations and once more when the run ends.
checkpoint_settings = air.CheckpointConfig(
    checkpoint_frequency=10,
    checkpoint_at_end=True,
)

# Run-level settings: results are stored under LOGDIR, and the run stops once
# the reported evaluation mean episode return reaches the threshold below.
run_settings = train.RunConfig(
    name="reach",
    storage_path=LOGDIR,
    log_to_file=True,
    checkpoint_config=checkpoint_settings,
    stop={"evaluation/env_runners/episode_return_mean": 10000.0},
)

# Build the Tune experiment around the algorithm class chosen by the config.
tunner = tune.Tuner(
    trainable=config.algo_class,
    param_space=config,
    run_config=run_settings,
)

# Blocks until the experiment finishes (or a stop criterion is met).
results = tunner.fit()

0,1
Current time:,2025-08-13 09:48:54
Running for:,00:00:45.48
Memory:,13.2/15.3 GiB

Trial name,status,loc
SAC_ReachEnvSimpleGym_90614_00000,RUNNING,192.168.202.162:310693


[36m(pid=310693)[0m E0000 00:00:1755049690.077845  310693 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=310693)[0m E0000 00:00:1755049690.081397  310693 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(pid=310693)[0m W0000 00:00:1755049690.091002  310693 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
[36m(pid=310693)[0m W0000 00:00:1755049690.091028  310693 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
[36m(pid=310693)[0m W0000 00:00:1755049690.091030  310693 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
[36m(pid=31

[36m(SingleAgentEnvRunner pid=310808)[0m [chatbus_3] 共享内存不存在，创建成功
[36m(SingleAgentEnvRunner pid=310811)[0m [chatbus_3] 共享内存已存在，连接成功


[36m(SingleAgentEnvRunner pid=310810)[0m Exception raised in creation task: The actor died because of an error raised in its creation task, [36mray::SingleAgentEnvRunner.__init__()[39m (pid=310810, ip=192.168.202.162, actor_id=6251ab3e8e9da41fb9f4973201000000, repr=<ray.rllib.env.single_agent_env_runner.SingleAgentEnvRunner object at 0x7dec5e7242e0>)
[36m(SingleAgentEnvRunner pid=310810)[0m   File "/home/ey/anaconda3/envs/rlreach310/lib/python3.10/multiprocessing/shared_memory.py", line 104, in __init__
[36m(SingleAgentEnvRunner pid=310810)[0m     self._fd = _posixshmem.shm_open(
[36m(SingleAgentEnvRunner pid=310810)[0m FileNotFoundError: [Errno 2] No such file or directory: '/chatbus_3'
[36m(SingleAgentEnvRunner pid=310810)[0m 
[36m(SingleAgentEnvRunner pid=310810)[0m During handling of the above exception, another exception occurred:
[36m(SingleAgentEnvRunner pid=310810)[0m 
[36m(SingleAgentEnvRunner pid=310810)[0m [36mray::SingleAgentEnvRunner.__init__()[39m (pid

[36m(SingleAgentEnvRunner pid=311140)[0m [chatbus_5] 共享内存不存在，创建成功[32m [repeated 2x across cluster][0m
[36m(SingleAgentEnvRunner pid=310809)[0m [chatbus_3] 共享内存已存在，连接成功


[36m(pid=311210)[0m E0000 00:00:1755049708.914570  311210 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=311210)[0m E0000 00:00:1755049708.918280  311210 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(pid=311210)[0m W0000 00:00:1755049708.927947  311210 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.[32m [repeated 4x across cluster][0m
[36m(SingleAgentEnvRunner pid=311140)[0m [1m[32m[robosuite INFO] [0mLoading controller configuration from: ./common/reachController.json (composite_controller_factory.py:121)[32m [repeated 2x across cluster][0m
[36m(SingleAgentEnvRunner pid=311140)[0m   gym.logger.warn([32m [repeated 2x across cluster][0m
[36m(_WrappedExecutable pid=311210)[0m Setting u