# 0.Register
How `env_cfg` and `agent_cfg` are passed through this framework

- 首先要有个概念：我们的env_cfg继承了DirectRLEnvCfg，agent_cfg继承了RslRlOnPolicyRunnerCfg，也就继承了一些方法（函数）和一些默认属性property。当然也有些方法只有声明，没有实现implement，例如 `_get_observations` 方法就是在我们自己的环境类（继承自DirectRLEnv，例如下文的ZbotBEnv）中具体实现的，而不是在env_cfg中实现的。

- 然后，由env_cfg创建的env又通过RslRlVecEnvWrapper()进行了wrap
- 最后传入了OnPolicyRunner()

scripts/rsl_rl/train.py
```python
@hydra_task_config(args_cli.task, "rsl_rl_cfg_entry_point")
def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agent_cfg: RslRlOnPolicyRunnerCfg):
    # ...
    # create isaac environment
    env = gym.make(args_cli.task, cfg=env_cfg, render_mode="rgb_array" if args_cli.video else None)
    # ...
    # wrap around environment for rsl-rl
    env = RslRlVecEnvWrapper(env)
    # create runner from rsl-rl
    runner = OnPolicyRunner(env, agent_cfg.to_dict(), log_dir=log_dir, device=agent_cfg.device)
    # ...
    # run training
    runner.learn(num_learning_iterations=agent_cfg.max_iterations, init_at_random_ep_len=True)
    # close the simulator
    env.close()
```

↓decorator 负责解析并提供 env_cfg 和 agent_cfg 

```python
def hydra_task_config(task_name: str, agent_cfg_entry_point: str) -> Callable:
    #...
    env_cfg, agent_cfg = register_task_to_hydra(task_name, agent_cfg_entry_point)
    #...
```

↓

```python
def register_task_to_hydra(
    task_name: str, agent_cfg_entry_point: str
) -> tuple[ManagerBasedRLEnvCfg | DirectRLEnvCfg, dict]:
    # ...
    env_cfg = load_cfg_from_registry(task_name, "env_cfg_entry_point")
    agent_cfg = load_cfg_from_registry(task_name, agent_cfg_entry_point)
    # ...
```

↓

```python
def load_cfg_from_registry(task_name: str, entry_point_key: str) -> dict | object:
    """It supports both YAML and Python configuration files.
    If the entry point is a YAML file, it is parsed into a dictionary.
    If the entry point is a Python class, it is instantiated and returned."""
    # obtain the configuration entry point
    cfg_entry_point = gym.spec(task_name).kwargs.get(entry_point_key)
    # 如果 cfg_entry_point 是一个以 .yaml 结尾的字符串
    # 如果 cfg_entry_point 是可调用的（例如是一个函数或类）
    # 如果 cfg_entry_point 是字符串（格式为 "module_name:attr_name"）
    # ...
```

e.g. 例如：
```python
import gymnasium as gym
from . import agents
from .zbot6b_env_v0 import ZbotBEnv, ZbotBEnvCfg
##
# Register Gym environments.
##
gym.register(
    id="Zbot-6b-walking-v0",
    entry_point="Zbot.tasks.moving.zbot6b_direct:ZbotBEnv",
    disable_env_checker=True,
    kwargs={
        "env_cfg_entry_point": ZbotBEnvCfg, 
        "rsl_rl_cfg_entry_point": f"{agents.__name__}.rsl_rl_ppo_cfg:ZbotSBFlatPPORunnerCfg",
    },
)
```

# 1.log_dir

scripts/rsl_rl/train.py
```python
def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agent_cfg: RslRlOnPolicyRunnerCfg):
    # ...
    # specify directory for logging experiments
    log_root_path = os.path.join("logs", "rsl_rl", agent_cfg.experiment_name)
    log_root_path = os.path.abspath(log_root_path)
    print(f"[INFO] Logging experiment in directory: {log_root_path}")
    # specify directory for logging runs: {time-stamp}_{run_name}
    log_dir = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    if agent_cfg.run_name:
        log_dir += f"_{agent_cfg.run_name}"
    log_dir = os.path.join(log_root_path, log_dir)
    # ...
```

# 2.OnPolicyRunner
`runner = OnPolicyRunner(env, agent_cfg.to_dict(), log_dir=log_dir, device=agent_cfg.device)`

## 2.1

```python
class RslRlOnPolicyRunnerCfg:
    """Configuration of the runner for on-policy algorithms."""

    seed: int = 42
    """The seed for the experiment. Default is 42."""

    device: str = "cuda:0"
    """The device for the rl-agent. Default is cuda:0."""
```

所以 agent_cfg.device = "cuda:0"

## 2.2
line 29 `obs, extras = self.env.get_observations()`：其中的 `get_observations()` 方法并不是环境本身定义的，而是因为train.py中 `env = RslRlVecEnvWrapper(env)` 对环境进行了wrap，由wrapper提供的。

```python
# ~/IsaacLab/source/extensions/omni.isaac.lab_tasks/omni/isaac/lab_tasks/utils/wrappers/rsl_rl/vecenv_wrapper.py
class RslRlVecEnvWrapper(VecEnv):
    # ...
    def get_observations(self) -> tuple[torch.Tensor, dict]:
        """Returns the current observations of the environment."""
        if hasattr(self.unwrapped, "observation_manager"):
            obs_dict = self.unwrapped.observation_manager.compute()
        else:
            obs_dict = self.unwrapped._get_observations()
        return obs_dict["policy"], {"observations": obs_dict}
    # ...
```

e.g. 例如：
```python
class ZbotBEnv(DirectRLEnv):
    cfg: ZbotBEnvCfg
    def __init__(self, cfg: ZbotBEnvCfg, render_mode: str | None = None, **kwargs):
        super().__init__(cfg, render_mode, **kwargs)
        # ...
    def _get_observations(self) -> dict:
        obs = torch.cat(
            (
                self.body_quat[:,0].reshape(self.scene.cfg.num_envs, -1),
                self.body_quat[:,3].reshape(self.scene.cfg.num_envs, -1),
                self.body_quat[:,6].reshape(self.scene.cfg.num_envs, -1),
                self._commands,
                self.joint_vel,
                self.joint_pos,
                # 4*(3)+3+6+6
            ),
            dim=-1,
        )
        observations = {"policy": obs}
        return observations
```

## 2.3
line 112 `obs, rewards, dones, infos = self.env.step(actions)`使用四个变量接收返回值，也是因为`RslRlVecEnvWrapper()`：wrapper把 terminated 和 truncated 合并成了一个 dones。

原本有5个返回值
```python
class DirectRLEnv(gym.Env):
    # ...
    def step(self, action: torch.Tensor) -> VecEnvStepReturn:
        """Returns:
            A tuple containing the observations, rewards, resets (terminated and truncated) and extras.
        """
        # ...
        # return observations, rewards, resets and extras
        return self.obs_buf, self.reward_buf, self.reset_terminated, self.reset_time_outs, self.extras
    # ...
```

```python
class RslRlVecEnvWrapper(VecEnv):
    #...
    def step(self, actions: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, dict]:
        # record step information
        obs_dict, rew, terminated, truncated, extras = self.env.step(actions)
        # compute dones for compatibility with RSL-RL
        dones = (terminated | truncated).to(dtype=torch.long)
        # move extra observations to the extras dict
        obs = obs_dict["policy"]
        extras["observations"] = obs_dict
        # move time out information to the extras dict
        # this is only needed for infinite horizon tasks
        if not self.unwrapped.cfg.is_finite_horizon:
            extras["time_outs"] = truncated

        # return the step information
        return obs, rew, dones, extras
    # ...
```

# 3.Extras & Tensorboard logging

```python
class our_env_cfg(DirectRLEnvCfg):
    # ...
```

```python
class DirectRLEnv(gym.Env):
    # ...
    def __init__(self, cfg: DirectRLEnvCfg, render_mode: str | None = None, **kwargs):
        # ...
        # allocate dictionary to store metrics
        self.extras = {}
        # ...
    def reset(self, seed: int | None = None, options: dict[str, Any] | None = None) -> tuple[VecEnvObs, dict]:
        # ...
        # reset state of scene
        indices = torch.arange(self.num_envs, dtype=torch.int64, device=self.device)
        self._reset_idx(indices)  # 传入indices有点奇怪，不知道哪里会使用这个reset函数，后面发现被RslRlVecEnvWrapper()重写了
        # ...
        # return observations
        return self._get_observations(), self.extras
    def step(self, action: torch.Tensor) -> VecEnvStepReturn:
        """Returns:
            A tuple containing the observations, rewards, resets (terminated and truncated) and extras.
        """
        # ...
        # return observations, rewards, resets and extras
        return self.obs_buf, self.reward_buf, self.reset_terminated, self.reset_time_outs, self.extras
    # ...
    def _reset_idx(self, env_ids: Sequence[int]):
        """Reset environments based on specified indices.
        """
        self.scene.reset(env_ids)
        # ...
    # ...
```

```python
# ~/IsaacLab/source/extensions/omni.isaac.lab_tasks/omni/isaac/lab_tasks/utils/wrappers/rsl_rl/vecenv_wrapper.py
class RslRlVecEnvWrapper(VecEnv):
    # ...
    def reset(self) -> tuple[torch.Tensor, dict]:  # noqa: D102  # 忽略缺少公共方法文档字符串的警告
        # reset the environment
        obs_dict, _ = self.env.reset()  # 前面定义的extra被丢弃了
        # return observations
        return obs_dict["policy"], {"observations": obs_dict}
    #...

```

`new_ids = (dones > 0).nonzero(as_tuple=False)`，`as_tuple=False`表示返回结果是一个二维张量，而不是一个元组。

对于一维张量，nonzero 返回的张量的形状是 (N, 1)，其中 N 是非零元素的数量。
```python
print(cur_reward_sum.shape)  # torch.Size([64])
print(new_ids.shape)  # torch.Size([2, 1])
print(new_ids)  # tensor([[1], [3]], device='cuda:0')
print(cur_reward_sum[new_ids].shape)  # torch.Size([2, 1])  # 升维了
```

#### 高级索引（Advanced Indexing）

高级索引允许你使用整数数组来选择张量中的元素。高级索引的行为与基本切片（basic slicing）不同：对一维张量使用一个整数索引张量时，它会返回一个与**索引数组形状相同的新张量**（对多维张量，结果形状为索引数组的形状再接上未被索引的其余维度）。

In [4]:
```python
import torch
# 初始化 A 和 new_ids
A = torch.randn(64, dtype=torch.float, device='cuda')
print(A.shape)  # 输出: torch.Size([64])
new_ids = torch.tensor([[1], [3], [6], [8]], dtype=torch.long, device='cuda')
# 计算 A[new_ids]
result = A[new_ids]
print(result)
print(result.shape)  # 输出: torch.Size([4, 1])

new_ids_2 = torch.tensor([[1, 3], [6, 8], [6, 8], [6, 8]], dtype=torch.long, device='cuda')
result_2 = A[new_ids_2]
print(result_2)
print(result_2.shape)  # 输出: torch.Size([4, 2])
```

输出：
```text
torch.Size([64])
tensor([[-2.1058],
        [ 0.1667],
        [ 1.7387],
        [-0.5361]], device='cuda:0')
torch.Size([4, 1])
tensor([[-2.1058,  0.1667],
        [ 1.7387, -0.5361],
        [ 1.7387, -0.5361],
        [ 1.7387, -0.5361]], device='cuda:0')
torch.Size([4, 2])
```


In [1]:
```python
from collections import deque
import statistics

rewbuffer = deque(maxlen=100)
rewbuffer.extend([1, 1, 1, 1])
rewbuffer.extend([1, 1, 1, 1, 1])
rewbuffer.extend([1, 1, 1])
print(rewbuffer)
statistics.mean(rewbuffer)
```

输出：
```text
deque([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], maxlen=100)


1
```

### 迭代器对象和可迭代对象的区别

在 Python 中，**迭代器对象** 和 **可迭代对象** 是两个不同的概念。

#### 1. 可迭代对象（Iterable）
- **定义**：可迭代对象是实现了 `__iter__()` 方法的对象。这个方法返回一个迭代器对象。
- **特点**：
  - 可以通过 `for` 循环遍历。
  - 可以传递给内置函数 `iter()` 来获取迭代器对象。

- **常见类型**：
  - 列表 (`list`)
  - 元组 (`tuple`)
  - 字符串 (`str`)
  - 字典 (`dict`)
  - 集合 (`set`)
  - 文件对象
  - 自定义的可迭代类

**示例**：
```python
# 列表是一个可迭代对象
my_list = [1, 2, 3]
for item in my_list:
    print(item)

# 使用 iter() 获取迭代器对象
iterator = iter(my_list)  # 大多数情况下，iter(my_list) 等价于 my_list.__iter__()
print(next(iterator))  # 输出 1
print(next(iterator))  # 输出 2
```

#### 2. 迭代器对象（Iterator）
- **定义**：迭代器对象是实现了 `__next__()` 方法的对象。每次调用 `next()` 函数时，它会返回下一个值，直到没有更多元素为止，此时会抛出 `StopIteration` 异常。

- **特点**：
  - 必须实现 `__iter__()` 方法，返回自身（即 `self`）。
  - 每次调用 `next()` 返回序列中的下一个值。
  - 一旦遍历结束，再次调用 `next()` 会抛出 `StopIteration` 异常。
- **常见类型**：
  - 由 `iter()` 函数从可迭代对象生成的对象。
  - 生成器（Generator）

**示例**：
```python
# 创建一个迭代器对象
my_iterator = iter([1, 2, 3])

# 使用 next() 获取下一个值
print(next(my_iterator))  # 输出 1
print(next(my_iterator))  # 输出 2
print(next(my_iterator))  # 输出 3
# print(next(my_iterator))  # 抛出 StopIteration 异常
```

#### 3. 区别与联系
- **可迭代对象** 是可以被迭代的对象，但不一定能直接调用 `next()` 方法。
- **迭代器对象** 是专门用于迭代的对象，可以直接调用 `next()` 方法。
- **关系**：每个可迭代对象都可以通过 `iter()` 函数转换为迭代器对象。迭代器对象本身也是一个可迭代对象，因为它实现了 `__iter__()` 方法并返回自身。

#### 4. 特殊情况：`if iter(data) is data`
- **解释**：这行代码检查 `data` 是否既是可迭代对象又是迭代器对象，并且 `iter(data)` 返回的就是 `data` 本身。
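- **应用场景**：通常用于判断 `data` 是否已经是迭代器对象，而不是普通的可迭代对象。迭代器只能被遍历一次，也不支持 `len()`；如果函数需要多次遍历数据或获取长度，就可以先做这个判断，并在 `data` 是迭代器时把它转换为列表（例如 `if iter(data) is data: data = list(data)`）。反过来，在某些情况下，你可能希望传入的是普通的可迭代对象，从而避免重复创建迭代器对象。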

h