In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.signal import welch
import os

# --- Sampling rate and per-cycle timing ---
FS = 1000  # sampling rate in Hz
CYCLE_DURATION = 10  # length of one cycle, in seconds
SKIP_SECONDS = 4  # leading seconds of every cycle that are skipped
USE_SECONDS = 6  # seconds of every cycle that are kept
SKIP_SAMPLES = SKIP_SECONDS * FS  # 4000 samples skipped per cycle
USE_SAMPLES = USE_SECONDS * FS  # 6000 usable samples per cycle
CYCLE_SAMPLES = CYCLE_DURATION * FS  # 10000 samples per full cycle

# --- Sliding-window layout ---
WINDOW_SIZE = 200  # samples per window (200 ms at FS = 1000 Hz)
STEP_SIZE = 100  # hop between window starts (100 ms at FS = 1000 Hz)
# Number of windows that fit into the usable part of one cycle
NUM_WINDOWS = 1 + (USE_SAMPLES - WINDOW_SIZE) // STEP_SIZE

# Names of the 15 features, in the row order used by the feature matrix
FEATURE_NAMES = [
    "VAR", "MAV", "RMS", "SDV", "AAC",
    "MAX", "SSC", "ZCR", "KUR", "SKW",
    "WWL", "SSI", "LGD", "ARC", "MFR",
]

def extract_features(window):
    """
    计算单个窗口的 15 种特征，每个通道分别计算。
    输入：
        - window: (200, num_channels) 的 NumPy 数组
    输出：
        - features: (15, num_channels) 的 NumPy 数组
    """
    num_channels = window.shape[1]
    features = np.zeros((15, num_channels))

    for ch in range(num_channels):
        signal = window[:, ch]

        # 时域特征
        features[0, ch] = np.var(signal)  # 方差 VAR
        features[1, ch] = np.mean(np.abs(signal))  # 平均绝对值 MAV
        features[2, ch] = np.sqrt(np.mean(signal**2))  # 均方根 RMS
        features[3, ch] = np.std(signal)  # 标准差 SDV
        features[4, ch] = np.mean(np.abs(np.diff(signal)))  # 平均绝对变化率 AAC
        features[5, ch] = np.max(signal)  # 最大值 MAX
        features[6, ch] = np.sum(np.diff(signal) > 0)  # 符号变化数 SSC
        features[7, ch] = np.sum(np.diff(np.sign(signal)) != 0)  # 过零率 ZCR
        features[8, ch] = stats.kurtosis(signal)  # 峭度 KUR
        features[9, ch] = stats.skew(signal)  # 偏度 SKW
        features[10, ch] = np.sum(np.abs(np.diff(signal)))  # 波形长度 WWL
        features[11, ch] = np.sum(signal ** 2)  # 积分平方 SSI
        features[12, ch] = np.log10(np.mean(signal**2) + 1e-10)  # 对数能量 LGD

        # 频域特征
        freqs, psd = welch(signal, fs=FS, nperseg=200)
        features[13, ch] = np.sum(psd)  # 频谱面积 ARC
        features[14, ch] = np.sum(freqs * psd) / (np.sum(psd) + 1e-10)  # 频率质心 MFR

    return features

def process_emg_file(file_path):
    """
    Process one CSV file and return its feature array and labels.

    The first CSV column is treated as the time column and dropped; every
    remaining column is one channel. Only complete cycles of CYCLE_SAMPLES
    rows are used (a trailing partial cycle is ignored). Within each cycle
    the first SKIP_SAMPLES rows are skipped and the following USE_SAMPLES
    rows are cut into sliding windows of WINDOW_SIZE rows with a hop of
    STEP_SIZE rows.

    Args:
        file_path: Path to the CSV file. The part of the file name before
            the first dot must be an integer; it is used as the label.

    Returns:
        features_array: NumPy array of shape
            (num_cycles, num_windows, 15, num_channels). The shape is kept
            even when the file holds no complete cycle (num_cycles == 0),
            so results from several files can still be stacked.
        labels: NumPy array of shape (num_cycles,) filled with the integer
            label parsed from the file name.

    Raises:
        ValueError: If no integer label can be parsed from the file name.
    """
    raw_data = pd.read_csv(file_path)
    num_channels = raw_data.shape[1] - 1  # drop the time column
    print(f"Processing {file_path}, Detected {num_channels} channels")

    # os.path.basename copes with the platform's path separator, unlike a
    # manual split on "/" (which breaks on Windows-style paths).
    stem = os.path.basename(file_path).split(".")[0]
    try:
        label = int(stem)
    except ValueError as err:
        raise ValueError(
            f"Cannot derive an integer label from file name {file_path!r}"
        ) from err

    # Number of complete cycles; each one always contains the full
    # SKIP_SAMPLES + USE_SAMPLES span, so no padding is ever needed.
    num_cycles = len(raw_data) // CYCLE_SAMPLES
    window_starts = range(0, USE_SAMPLES - WINDOW_SIZE + 1, STEP_SIZE)
    samples = raw_data.values[:, 1:]  # all rows, time column removed

    print(
        "Segments shape: "
        f"{(num_cycles, len(window_starts), WINDOW_SIZE, num_channels)}"
    )

    # Compute features window by window instead of first materialising
    # every raw window in memory.
    cycle_features = []
    for i in range(num_cycles):
        start_idx = i * CYCLE_SAMPLES + SKIP_SAMPLES
        segment = samples[start_idx:start_idx + USE_SAMPLES]
        cycle_features.append([
            extract_features(segment[j:j + WINDOW_SIZE])
            for j in window_starts
        ])

    # The explicit reshape is a no-op for non-empty results and gives the
    # empty result the documented 4-D shape.
    features_batches = np.array(cycle_features, dtype=float).reshape(
        num_cycles, len(window_starts), len(FEATURE_NAMES), num_channels
    )

    return features_batches, np.full(num_cycles, label)

def process_emg_folder(input_folder, output_folder):
    """
    Process every EMG CSV file in a folder and save the features and labels.

    Each file whose name ends with ".csv" is passed to process_emg_file; the
    per-file results are stacked along the first axis and written to
    ``feature_matrix.npy`` and ``labels.npy`` inside ``output_folder``
    (created if it does not exist).

    Args:
        input_folder: Folder containing the input CSV files.
        output_folder: Folder that receives the two ``.npy`` files.

    Raises:
        ValueError: If ``input_folder`` contains no ".csv" files.
    """
    os.makedirs(output_folder, exist_ok=True)

    csv_files = sorted(f for f in os.listdir(input_folder) if f.endswith(".csv"))
    if not csv_files:
        # Fail with a clear message instead of the opaque error np.vstack
        # raises when given an empty list.
        raise ValueError(f"No CSV files found in {input_folder!r}")

    all_features, all_labels = [], []
    for file_name in csv_files:
        features, labels = process_emg_file(os.path.join(input_folder, file_name))
        all_features.append(features)
        all_labels.append(labels)

    # Stack the per-file results along the first axis and save them
    all_features = np.vstack(all_features)  # (total_cycles, num_windows, 15, num_channels)
    all_labels = np.hstack(all_labels)  # (total_cycles,)

    np.save(os.path.join(output_folder, "feature_matrix.npy"), all_features)
    np.save(os.path.join(output_folder, "labels.npy"), all_labels)

    print(f"Feature extraction complete! Shape: {all_features.shape}")
    print(f"Labels saved: {all_labels.shape}")

# **Run the processing**
input_folder = "data/data_1_sensor_3_classes/"  # folder holding the input data
output_folder = "processed_data"  # folder where the processed arrays are stored
process_emg_folder(input_folder, output_folder)


In [None]:
import numpy as np
import os
from sklearn.model_selection import train_test_split

def load_data(data_folder, test_size=0.2, random_state=42):
    """
    Load `feature_matrix.npy` and `labels.npy` and split them into train/test sets.

    Args:
        data_folder: Folder that contains the two `.npy` files.
        test_size: Forwarded to train_test_split as `test_size` (default 0.2).
        random_state: Forwarded to train_test_split as `random_state`
            (default 42).

    Returns:
        Tuple (X_train, X_test, y_train, y_test) produced by train_test_split,
        called with `stratify` set to the loaded labels.

    Raises:
        FileNotFoundError: If either `.npy` file is missing from `data_folder`.
    """
    # Resolve both input paths up front
    feature_path, label_path = (
        os.path.join(data_folder, file_name)
        for file_name in ("feature_matrix.npy", "labels.npy")
    )

    # Guard clause: both files must be present before anything is loaded
    if not (os.path.exists(feature_path) and os.path.exists(label_path)):
        raise FileNotFoundError("特征文件或标签文件未找到，请检查路径！")

    features = np.load(feature_path)
    labels = np.load(label_path)

    # Report what was loaded
    print(f"Loaded features from {feature_path}, shape: {features.shape}")
    print(f"Loaded labels from {label_path}, shape: {labels.shape}")

    # Stratified split on the labels
    split = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels,
    )
    X_train, X_test, y_train, y_test = split

    # Report the resulting partition sizes
    print(f"Training set: X_train: {X_train.shape}, y_train: {y_train.shape}")
    print(f"Testing set: X_test: {X_test.shape}, y_test: {y_test.shape}")

    return X_train, X_test, y_train, y_test

# **Usage example**
# NOTE(review): the extraction step in this file saves feature_matrix.npy and
# labels.npy under "processed_data", while this cell loads them from
# "windowed_data" — confirm that this folder is the intended one.
data_folder = "windowed_data"
X_train, X_test, y_train, y_test = load_data(data_folder)
