In [1]:
import numpy as np
from pathlib import Path
import random
import json

# Locate the dataset directory inside the local LeRobot cache.
#
# Names defined for later cells:
#   lerobot_cache   - root of the local LeRobot cache
#   dataset_repo_id - fallback "<user>/<dataset>" location inside the cache
#   dataset_dirs    - every candidate directory found (possibly empty)
#   dataset_path    - the directory that later cells will read from
lerobot_cache = Path.home() / ".cache" / "huggingface" / "lerobot"
dataset_repo_id = "your_hf_username/pick_blue_bottle_libero_downsample4x"

# Substrings that mark a sub-directory as a candidate in the fallback search.
NAME_KEYWORDS = ("pick", "blue", "bottle")
# Cap on how many non-matching cache directories are listed; all are still searched.
MAX_LISTED_DIRS = 10

print("=" * 80)
print("查找数据集")
print("=" * 80)
print(f"LeRobot 缓存目录: {lerobot_cache}")
print(f"目录存在: {lerobot_cache.exists()}")

dataset_dirs = []
if lerobot_cache.exists():
    # glob() yields entries in arbitrary order and may match plain files;
    # keep directories only and sort so the selection is reproducible.
    dataset_dirs = sorted(
        p for p in lerobot_cache.glob("*/*pick_blue_bottle*") if p.is_dir()
    )
    if not dataset_dirs:
        # No direct hit: walk every "<user>/<dataset>" directory and keep the
        # ones whose name contains any keyword.
        all_dirs = sorted(d for d in lerobot_cache.iterdir() if d.is_dir())
        print(f"\n找到 {len(all_dirs)} 个目录:")
        for position, top_dir in enumerate(all_dirs):
            matches = sorted(
                sd
                for sd in top_dir.iterdir()
                if sd.is_dir()
                and any(word in sd.name.lower() for word in NAME_KEYWORDS)
            )
            # Limit the listing noise, but never hide a directory that has a match.
            if position < MAX_LISTED_DIRS or matches:
                print(f"  - {top_dir.name}")
            for match in matches:
                dataset_dirs.append(match)
                print(f"    -> {match.name}")

if dataset_dirs:
    dataset_path = dataset_dirs[0]
    print(f"\n✅ 找到数据集: {dataset_path}")
    if len(dataset_dirs) > 1:
        # Make an ambiguous choice visible instead of silently taking one.
        print(f"⚠️  共有 {len(dataset_dirs)} 个候选目录，已使用排序后的第一个:")
        for candidate in dataset_dirs:
            print(f"    - {candidate}")
else:
    # Nothing found (or the cache is missing): fall back to the default location.
    dataset_path = lerobot_cache / dataset_repo_id
    print(f"\n使用默认路径: {dataset_path}")

print(f"\n数据集路径: {dataset_path}")
print(f"路径存在: {dataset_path.exists()}")

if not dataset_path.exists():
    print("\n❌ 数据集路径不存在，请检查路径或先运行转换脚本")
    print(f"   预期路径: {dataset_path}")


查找数据集
LeRobot 缓存目录: /home/wyz/.cache/huggingface/lerobot
目录存在: True

✅ 找到数据集: /home/wyz/.cache/huggingface/lerobot/your_hf_username/pick_blue_bottle_libero_downsample4x_no_nan

数据集路径: /home/wyz/.cache/huggingface/lerobot/your_hf_username/pick_blue_bottle_libero_downsample4x_no_nan
路径存在: True


In [2]:
# 尝试多种方式加载数据集
import h5py
import pyarrow.parquet as pq

# Try several strategies, in order, to load the dataset found at `dataset_path`.
#
# Relies on `dataset_path`, `json` and `pq` (pyarrow.parquet) from earlier in
# the notebook. Names defined for later cells:
#   dataset      - list of parquet file paths, or a HuggingFace dataset object
#   dataset_type - "parquet" or "huggingface"
#   total_frames - number of rows/frames across the dataset
print("\n" + "=" * 80)
print("尝试加载数据集")
print("=" * 80)

dataset = None
dataset_type = None
total_frames = 0


def _count_parquet_rows(files):
    """Return the total row count of `files` using only parquet footer metadata.

    Avoids loading the column data of every file just to count rows.
    """
    return sum(pq.read_metadata(str(path)).num_rows for path in files)


# Strategy 1: any parquet file below the dataset directory.
# Sorted so that file order is reproducible (glob order is arbitrary).
parquet_files = sorted(dataset_path.glob("**/*.parquet"))
if parquet_files:
    print(f"\n✅ 找到 {len(parquet_files)} 个 parquet 文件")
    try:
        # Read the first file in full to show its structure.
        parquet_file = parquet_files[0]
        print(f"  读取: {parquet_file.name}")
        table = pq.read_table(parquet_file)
        print(f"  列: {table.column_names}")
        print(f"  行数: {len(table)}")

        # Total rows across all files, taken from metadata only.
        total_frames = _count_parquet_rows(parquet_files)
        print(f"  总帧数: {total_frames}")

        # Use the parquet files themselves as the data source.
        dataset = parquet_files
        dataset_type = "parquet"

    except Exception as e:
        # Best effort: report the problem and fall through to the next strategy.
        print(f"  ⚠️  读取 parquet 失败: {e}")

# Strategy 2: a `data` sub-directory, with an optional meta.json next to it.
if dataset is None:
    data_dir = dataset_path / "data"
    if data_dir.exists():
        print(f"\n✅ 找到 data 目录: {data_dir}")
        # Pick up the frame count from meta.json when it is available.
        meta_file = dataset_path / "meta.json"
        if meta_file.exists():
            with open(meta_file, 'r', encoding="utf-8") as f:
                meta = json.load(f)
            print(f"  元数据: {list(meta.keys())}")
            if 'total_frames' in meta:
                total_frames = meta['total_frames']
                print(f"  总帧数: {total_frames}")

        # Look for parquet files directly inside the data directory.
        parquet_files = sorted(data_dir.glob("*.parquet"))
        if parquet_files:
            dataset = parquet_files
            dataset_type = "parquet"
            print(f"  找到 {len(parquet_files)} 个 parquet 文件")
            if not total_frames:
                # meta.json gave no count; try the parquet metadata instead so
                # later cells do not see a loaded dataset with zero frames.
                try:
                    total_frames = _count_parquet_rows(parquet_files)
                    print(f"  总帧数: {total_frames}")
                except Exception as e:
                    print(f"  ⚠️  读取 parquet 失败: {e}")

# Strategy 3: let the HuggingFace `datasets` library load the directory.
if dataset is None:
    try:
        # Imported lazily so the notebook still runs without `datasets`
        # when one of the parquet strategies succeeds.
        from datasets import load_from_disk
        hf_dataset = load_from_disk(str(dataset_path))
        dataset = hf_dataset
        dataset_type = "huggingface"
        total_frames = len(hf_dataset)
        print("\n✅ 使用 HuggingFace datasets 加载成功")
        print(f"  总帧数: {total_frames}")
        print(f"  特征: {list(hf_dataset.features.keys())}")
    except Exception as e:
        print(f"\n❌ HuggingFace datasets 加载失败: {e}")

# Every strategy failed: stop here rather than continue with no data.
if dataset is None:
    raise FileNotFoundError(f"无法加载数据集，请检查路径: {dataset_path}")

print(f"\n✅ 数据集加载成功 (类型: {dataset_type})")
print(f"  总帧数: {total_frames}")



尝试加载数据集

✅ 找到 1 个 parquet 文件
  读取: episode_000000.parquet
  列: ['image', 'wrist_image', 'state', 'actions', 'timestamp', 'frame_index', 'episode_index', 'index', 'task_index']
  行数: 47
  总帧数: 47

✅ 数据集加载成功 (类型: parquet)
  总帧数: 47


随机抽取 10 个样本
样本索引: [1, 5, 6, 7, 8, 14, 15, 17, 34, 40]



读取样本数据...
  索引     1: state=shape=(8,), action=shape=(8,)
  索引     5: state=shape=(8,), action=shape=(8,)
  索引     6: state=shape=(8,), action=shape=(8,)
  索引     7: state=shape=(8,), action=shape=(8,)
  索引     8: state=shape=(8,), action=shape=(8,)
  索引    14: state=shape=(8,), action=shape=(8,)
  索引    15: state=shape=(8,), action=shape=(8,)
  索引    17: state=shape=(8,), action=shape=(8,)
  索引    34: state=shape=(8,), action=shape=(8,)
  索引    40: state=shape=(8,), action=shape=(8,)

✅ 成功读取 10 个样本


保存数据到: sampled_state_action.txt
✅ 数据已保存到: sampled_state_action.txt
   文件大小: 8.00 KB


样本 1 的完整数据

样本索引: 1

--------------------------------------------------------------------------------
State (8维):
--------------------------------------------------------------------------------
Shape: (8,)
完整数据:
  [   -1.877049,    -1.267971,     1.561950,    -2.262064, 
       0.473075,     0.660839,    -0.478947,     0.000000]

各维度说明:
  维度 0-6: 右臂关节位置 (7维)
  维度 7:   夹爪状态 (原始值，来自 /gripper/feedback_R 第一维)

各维度数值:
  维度 0 (关节 1):    -1.877049 rad
  维度 1 (关节 2):    -1.267971 rad
  维度 2 (关节 3):     1.561950 rad
  维度 3 (关节 4):    -2.262064 rad
  维度 4 (关节 5):     0.473075 rad
  维度 5 (关节 6):     0.660839 rad
  维度 6 (关节 7):    -0.478947 rad
  维度 7 (夹爪状态):     0.000000

统计信息:
  Min:  -2.262064
  Max:  1.561950
  Mean: -0.398771
  Std:  1.240234

--------------------------------------------------------------------------------
Action (7维):
--------------------------------------------------------------------------------
Shape: (8,)
完整数据:
  [    0.041484,     0.000205,    -0.025672,    -0.008512, 

In [6]:
# Print a short preview of the first three entries of `samples`:
# the sample's dataset index, plus the shape and leading values of its
# state and action arrays (either may be None, in which case it is skipped).
separator = "=" * 80
print("\n" + separator)
print("样本预览（前3个）:")
print(separator)

for sample_no, record in enumerate(samples[:3], start=1):
    idx, state, action = record['index'], record['state'], record['action']

    print(f"\n样本 {sample_no} (索引: {idx}):")

    if state is not None:
        # Show at most the first five values.
        state_head = state[:5] if len(state) >= 5 else state
        print(f"  State: shape={state.shape}, 前5个值={state_head}")

    if action is not None:
        action_head = action[:5] if len(action) >= 5 else action
        print(f"  Action: shape={action.shape}, 前5个值={action_head}")



样本预览（前3个）:

样本 1 (索引: 1):
  State: shape=(8,), 前5个值=[-1.87704861 -1.26797056  1.56194961 -2.26206422  0.47307548]
  Action: shape=(8,), 前5个值=[ 0.04148385  0.00020473 -0.02567201 -0.00851239  0.0267422 ]

样本 2 (索引: 5):
  State: shape=(8,), 前5个值=[-1.84782875 -1.26513577  1.55723953 -2.19229388  0.48032188]
  Action: shape=(8,), 前5个值=[ 0.09608921  0.01855343 -0.05517669  0.29414982 -0.00068849]

样本 3 (索引: 6):
  State: shape=(8,), 前5个值=[-1.82891464 -1.26391304  1.54785204 -2.14456463  0.47791913]
  Action: shape=(8,), 前5个值=[ 0.20930164  0.0090403  -0.09230933  0.44075486 -0.02045408]
