In [25]:
import pandas as pd
import numpy as np
import os

# Load the raw recording
file_path = 'emg_data_20250219_160546_3_10.csv'  # replace with your own file path
raw_data = pd.read_csv(file_path)

# Recording layout, in seconds
fs = 1000  # sampling rate (Hz)
cycle_duration = 10  # length of one cycle
skip_seconds = 4  # leading part of each cycle that is discarded
use_seconds = 6  # part of each cycle that is kept

# The same layout expressed in samples: 10000, 4000 and 6000
cycle_samples, skip_samples, use_samples = (
    fs * seconds for seconds in (cycle_duration, skip_seconds, use_seconds)
)

# Make sure the output folder exists
output_folder = 'segmented_data'
os.makedirs(output_folder, exist_ok=True)

# Only complete 10-second cycles are processed
num_cycles = len(raw_data) // cycle_samples
segments = []  # one (use_samples, n_columns) array per cycle

for i in range(num_cycles):
    # Kept part of cycle i: begins after the skipped seconds and spans use_samples rows
    start_idx = skip_samples + cycle_samples * i
    end_idx = use_samples + start_idx

    if end_idx <= len(raw_data):
        # Enough rows are available: plain slice
        segment = raw_data.iloc[start_idx:end_idx].values
        padded_rows = 0
    else:
        # Ran past the end of the data: take what is left and zero-pad the remainder
        segment = raw_data.iloc[start_idx:].values
        pad_size = use_samples - len(segment)
        segment = np.pad(segment, ((0, pad_size), (0, 0)), mode='constant', constant_values=0)
        padded_rows = pad_size

    segments.append(segment)

    # Optional per-segment CSV export (currently disabled)
    # pd.DataFrame(segment).to_csv(f"{output_folder}/segment_{i+1}.csv", index=False, header=False)

    print(f"Segment {i+1} saved: Rows {start_idx} to {end_idx} (Padded: {padded_rows} rows)")

# Stack into a single array of shape (num_segments, 6000, n_columns)
segments_array = np.array(segments)

# Persist the stacked segments as a .npy file
np.save(f"{output_folder}/segments.npy", segments_array)

print(f"Total segments saved: {num_cycles}")
print(f"Final NumPy Shape: {segments_array.shape}")  # sanity check on the result shape


Segment 1 saved: Rows 4000 to 10000 (Padded: 0 rows)
Segment 2 saved: Rows 14000 to 20000 (Padded: 0 rows)
Segment 3 saved: Rows 24000 to 30000 (Padded: 0 rows)
Total segments saved: 3
Final NumPy Shape: (3, 6000, 2)


In [28]:
import pandas as pd
import numpy as np
import os

# Load the raw recording
file_path =r'data/emg_data_20250219_160546_3_10.csv'  # replace with your own file path
raw_data = pd.read_csv(file_path)

# Recording layout, in seconds
fs = 1000  # sampling rate (Hz)
cycle_duration = 10  # length of one cycle
skip_seconds = 4  # leading part of each cycle that is discarded
use_seconds = 6  # part of each cycle that is kept

# The same layout expressed in samples: 10000, 4000 and 6000
cycle_samples, skip_samples, use_samples = (
    fs * seconds for seconds in (cycle_duration, skip_seconds, use_seconds)
)

# Sliding-window settings (at 1000 Hz one sample is one millisecond)
window_size = 200  # 200 samples per window
step_size = 100  # hop of 100 samples between window starts
num_windows = (use_samples - window_size) // step_size + 1  # windows that fit in one kept segment

# Make sure the output folder exists
output_folder = 'segmented_data'
os.makedirs(output_folder, exist_ok=True)

# Only complete 10-second cycles are processed
num_cycles = len(raw_data) // cycle_samples
segments = []  # one (num_windows, window_size, n_columns) array per cycle

for i in range(num_cycles):
    # Kept part of cycle i: begins after the skipped seconds and spans use_samples rows
    start_idx = skip_samples + cycle_samples * i
    end_idx = use_samples + start_idx

    if end_idx <= len(raw_data):
        # Enough rows are available: plain slice
        segment = raw_data.iloc[start_idx:end_idx].values
        padded_rows = 0
    else:
        # Ran past the end of the data: take what is left and zero-pad the remainder
        segment = raw_data.iloc[start_idx:].values
        pad_size = use_samples - len(segment)
        segment = np.pad(segment, ((0, pad_size), (0, 0)), mode='constant', constant_values=0)
        padded_rows = pad_size

    # Cut the segment into overlapping windows of window_size rows, step_size apart
    windows = []
    for j in range(0, use_samples - window_size + 1, step_size):
        windows.append(segment[j : j + window_size])

    segments.append(np.array(windows))

    print(f"Segment {i+1} processed: Rows {start_idx} to {end_idx} (Padded: {padded_rows} rows)")

# Stack into an array of shape (num_segments, num_windows, 200, n_columns)
segments_array = np.array(segments)

# Persist the windowed segments as a .npy file
np.save(f"{output_folder}/windowed_segments.npy", segments_array)

# Report the final shape
print(f"Final shape: {segments_array.shape}")


Segment 1 processed: Rows 4000 to 10000 (Padded: 0 rows)
Segment 2 processed: Rows 14000 to 20000 (Padded: 0 rows)
Segment 3 processed: Rows 24000 to 30000 (Padded: 0 rows)
Final shape: (3, 59, 200, 2)


In [31]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.signal import welch
import os

# Load the sliding-window array from disk
input_npy_path = "segmented_data/windowed_segments.npy"
windowed_segments = np.load(input_npy_path)  # shape (3, 59, 200, 2) in the recorded run

# Unpack the four array dimensions; the last one is used as the channel count.
# NOTE(review): no column is dropped here, so if the CSV's first column is a
# timestamp it is counted (and later featurized) as a channel -- confirm.
num_batches, num_windows, window_size, num_channels = windowed_segments.shape
print(f"Detected {num_batches} batches, {num_windows} windows per batch, {num_channels} channels.")

# Create the output folder for the feature files
output_folder = "windowed_data"
os.makedirs(output_folder, exist_ok=True)

# 定义特征提取函数
def extract_features(segment, fs=1000, nperseg=200):
    """
    Compute 15 time- and frequency-domain features for each channel of one window.

    Parameters
    ----------
    segment : np.ndarray
        2-D array of shape (num_samples, num_channels).
    fs : float, optional
        Sampling rate in Hz passed to ``scipy.signal.welch`` (default 1000).
    nperseg : int, optional
        Welch segment length (default 200). It is clipped to the window length
        so short windows do not trigger SciPy's "nperseg > input length" warning.

    Returns
    -------
    np.ndarray
        Array of shape (15, num_channels). Row order:
        VAR, MAV, RMS, SDV, AAC, MAX, SSC, ZCR, KUR, SKW, WWL, SSI, LGD, ARC, MFR.
    """
    features = []

    for channel in range(segment.shape[1]):
        signal = segment[:, channel]

        # Shared intermediates, computed once per channel
        delta = np.diff(signal)  # first difference
        abs_delta = np.abs(delta)
        mean_square = np.mean(signal ** 2)

        # --- Time-domain features ---
        VAR = np.var(signal)  # variance
        MAV = np.mean(np.abs(signal))  # mean absolute value
        RMS = np.sqrt(mean_square)  # root mean square
        SDV = np.std(signal)  # standard deviation
        AAC = np.mean(abs_delta)  # average amplitude change
        MAX = np.max(signal)  # maximum value
        # Slope sign changes: adjacent differences with opposite signs.
        # (The previous `sum(diff > 0)` counted rising steps, not sign changes.)
        SSC = np.sum(delta[:-1] * delta[1:] < 0)
        # Zero crossings via changes of np.sign; a sample that is exactly zero
        # between two opposite-signed samples contributes two counts.
        ZCR = np.sum(np.diff(np.sign(signal)) != 0)
        KUR = stats.kurtosis(signal)  # kurtosis (SciPy default: Fisher / excess)
        SKW = stats.skew(signal)  # skewness
        WWL = np.sum(abs_delta)  # waveform length
        SSI = np.sum(signal ** 2)  # simple square integral
        LGD = np.log10(mean_square + 1e-10)  # log energy; epsilon avoids log10(0)

        # --- Frequency-domain features ---
        freqs, psd = welch(signal, fs=fs, nperseg=min(nperseg, len(signal)))
        ARC = np.sum(psd)  # sum of the PSD bins
        MFR = np.sum(freqs * psd) / (ARC + 1e-10)  # spectral centroid; epsilon avoids 0/0

        features.append([VAR, MAV, RMS, SDV, AAC, MAX, SSC, ZCR, KUR, SKW, WWL, SSI, LGD, ARC, MFR])

    return np.array(features).T  # (num_channels, 15) -> (15, num_channels)

# Compute the feature matrix for every window of every batch
features_batches = []
# One label per batch, numbered 1..num_batches. Derived from the loaded array
# instead of a hard-coded [1, 2, 3] so labels always match the batch count.
labels = np.arange(1, num_batches + 1)

for batch_idx in range(num_batches):
    batch_features = []  # features of all windows in the current batch
    for window_idx in range(num_windows):
        window = windowed_segments[batch_idx, window_idx]  # one window: (window_size, num_channels)
        features = extract_features(window)  # (15, num_channels)
        batch_features.append(features)

    features_batches.append(np.array(batch_features))

# Stack to shape (num_batches, num_windows, 15, num_channels)
features_array = np.array(features_batches)

# Save the feature matrix and the labels
np.save(os.path.join(output_folder, "feature_matrix.npy"), features_array)
np.save(os.path.join(output_folder, "labels.npy"), labels)

# Report what was written
print(f"Feature extraction complete! Shape: {features_array.shape}")
print(f"Labels saved: {labels}")
print(f"Feature matrix saved at: {output_folder}/feature_matrix.npy")
print(f"Labels saved at: {output_folder}/labels.npy")


Detected 3 batches, 59 windows per batch, 2 channels.
Feature extraction complete! Shape: (3, 59, 15, 2)
Labels saved: [1 2 3]
Feature matrix saved at: windowed_data/feature_matrix.npy
Labels saved at: windowed_data/labels.npy


In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.signal import welch
import os

def process_emg_folder(input_folder, output_folder):
    """
    Turn every CSV recording in ``input_folder`` into windowed EMG features.

    For each file, three 10-second cycles are read. In each cycle the first
    4 seconds are skipped and the next 6 seconds (6000 rows at 1000 Hz) are
    kept, zero-padded if the file ends early. Each kept segment is cut into
    200-row windows with a 100-row step, the first column is dropped (it is
    treated as the time column), and ``extract_features`` is called on every
    window. Cycle ``i`` of every file gets label ``i + 1``.

    The stacked features are written to ``<output_folder>/feature_matrix.npy``
    and the labels to ``<output_folder>/labels.npy``.

    Parameters
    ----------
    input_folder : str
        Folder that is scanned (non-recursively) for files ending in ``.csv``.
    output_folder : str
        Folder for the two ``.npy`` files; created if missing.

    Raises
    ------
    ValueError
        If ``input_folder`` contains no ``.csv`` file, or if a file produces a
        feature array whose per-segment shape differs from earlier files.
    """
    # Recording layout
    fs = 1000  # sampling rate (Hz)
    cycle_duration = 10  # seconds per cycle
    skip_seconds = 4  # seconds skipped at the start of each cycle
    use_seconds = 6  # seconds kept from each cycle

    cycle_samples = fs * cycle_duration  # 10000
    skip_samples = fs * skip_seconds  # 4000
    use_samples = fs * use_seconds  # 6000

    # Sliding-window settings
    window_size = 200  # rows per window
    step_size = 100  # rows between consecutive window starts
    window_starts = range(0, use_samples - window_size + 1, step_size)

    num_cycles = 3  # every file is split into a fixed 3 segments

    # Fail early, and before creating anything on disk, when there is nothing
    # to process; otherwise np.vstack([]) raises an unhelpful error at the end.
    csv_files = sorted(f for f in os.listdir(input_folder) if f.endswith(".csv"))
    if not csv_files:
        raise ValueError(f"No CSV files found in {input_folder!r}; nothing to process.")

    os.makedirs(output_folder, exist_ok=True)

    all_features = []  # one (num_cycles, num_windows, 15, num_channels) array per file
    all_labels = []

    for file_idx, file_name in enumerate(csv_files):
        file_path = os.path.join(input_folder, file_name)
        print(f"Processing {file_name} ({file_idx+1}/{len(csv_files)})")

        raw_data = pd.read_csv(file_path)

        # The first column is treated as time; the rest are signal channels
        num_channels = raw_data.shape[1] - 1
        print(f"Detected {num_channels} channels (excluding time column).")

        segments = []
        labels = []

        for i in range(num_cycles):
            start_idx = i * cycle_samples + skip_samples
            end_idx = start_idx + use_samples

            # iloc slicing clips at the end of the frame, so a short file just
            # yields fewer rows; the missing rows are filled with zeros.
            segment = raw_data.iloc[start_idx:end_idx].values
            pad_size = use_samples - len(segment)
            if pad_size > 0:
                print(f"  Warning: segment {i+1} of {file_name} is short; zero-padded {pad_size} of {use_samples} rows.")
                segment = np.pad(segment, ((0, pad_size), (0, 0)), mode='constant', constant_values=0)

            # Overlapping windows, with the time column (column 0) removed
            windows = [segment[j:j + window_size, 1:] for j in window_starts]
            segments.append(np.array(windows))

            labels.append(i + 1)  # segment 1 -> 1, segment 2 -> 2, segment 3 -> 3

        segments_array = np.array(segments)  # (num_cycles, num_windows, 200, num_channels)

        # Features for every window of every segment
        features_array = np.array([
            [extract_features(window) for window in batch]
            for batch in segments_array
        ])  # (num_cycles, num_windows, 15, num_channels)

        # All files must agree on everything but the leading axis to be stacked
        if all_features and features_array.shape[1:] != all_features[0].shape[1:]:
            raise ValueError(
                f"{file_name} produced features of shape {features_array.shape}, "
                f"incompatible with earlier files {all_features[0].shape}."
            )

        all_features.append(features_array)
        all_labels.extend(labels)

    # Concatenate along the segment axis: (total segments, num_windows, 15, num_channels)
    all_features = np.vstack(all_features)
    all_labels = np.array(all_labels)  # (total segments,)

    np.save(os.path.join(output_folder, "feature_matrix.npy"), all_features)
    np.save(os.path.join(output_folder, "labels.npy"), all_labels)

    print(f"Feature extraction complete! Shape: {all_features.shape}")
    print(f"Labels saved: {all_labels.shape}")
    print(f"Feature matrix saved at: {output_folder}/feature_matrix.npy")
    print(f"Labels saved at: {output_folder}/labels.npy")

def extract_features(segment, fs=1000, nperseg=200):
    """
    Compute 15 time- and frequency-domain features for each channel of one window.

    Parameters
    ----------
    segment : np.ndarray
        2-D array of shape (num_samples, num_channels).
    fs : float, optional
        Sampling rate in Hz passed to ``scipy.signal.welch`` (default 1000).
    nperseg : int, optional
        Welch segment length (default 200). It is clipped to the window length
        so short windows do not trigger SciPy's "nperseg > input length" warning.

    Returns
    -------
    np.ndarray
        Array of shape (15, num_channels). Row order:
        VAR, MAV, RMS, SDV, AAC, MAX, SSC, ZCR, KUR, SKW, WWL, SSI, LGD, ARC, MFR.
    """
    features = []

    for channel in range(segment.shape[1]):
        signal = segment[:, channel]

        # Shared intermediates, computed once per channel
        delta = np.diff(signal)  # first difference
        abs_delta = np.abs(delta)
        mean_square = np.mean(signal ** 2)

        # --- Time-domain features ---
        VAR = np.var(signal)  # variance
        MAV = np.mean(np.abs(signal))  # mean absolute value
        RMS = np.sqrt(mean_square)  # root mean square
        SDV = np.std(signal)  # standard deviation
        AAC = np.mean(abs_delta)  # average amplitude change
        MAX = np.max(signal)  # maximum value
        # Slope sign changes: adjacent differences with opposite signs.
        # (The previous `sum(diff > 0)` counted rising steps, not sign changes.)
        SSC = np.sum(delta[:-1] * delta[1:] < 0)
        # Zero crossings via changes of np.sign; a sample that is exactly zero
        # between two opposite-signed samples contributes two counts.
        ZCR = np.sum(np.diff(np.sign(signal)) != 0)
        KUR = stats.kurtosis(signal)  # kurtosis (SciPy default: Fisher / excess)
        SKW = stats.skew(signal)  # skewness
        WWL = np.sum(abs_delta)  # waveform length
        SSI = np.sum(signal ** 2)  # simple square integral
        LGD = np.log10(mean_square + 1e-10)  # log energy; epsilon avoids log10(0)

        # --- Frequency-domain features ---
        freqs, psd = welch(signal, fs=fs, nperseg=min(nperseg, len(signal)))
        ARC = np.sum(psd)  # sum of the PSD bins
        MFR = np.sum(freqs * psd) / (ARC + 1e-10)  # spectral centroid; epsilon avoids 0/0

        features.append([VAR, MAV, RMS, SDV, AAC, MAX, SSC, ZCR, KUR, SKW, WWL, SSI, LGD, ARC, MFR])

    return np.array(features).T  # (num_channels, 15) -> (15, num_channels)

# Run the pipeline: read the recordings from input_folder, write the .npy files to output_folder
input_folder = r"E:\MSC\Spring\AML\GestureLink\Data_grove"
output_folder = "windowed_data"
process_emg_folder(input_folder=input_folder, output_folder=output_folder)


Processing sensor_data_20250228_164403.csv (1/10)
Detected 7 channels (excluding time column).
Processing sensor_data_20250228_164644.csv (2/10)
Detected 7 channels (excluding time column).
Processing sensor_data_20250228_164741.csv (3/10)
Detected 7 channels (excluding time column).
Processing sensor_data_20250228_164835.csv (4/10)
Detected 7 channels (excluding time column).
Processing sensor_data_20250228_164937.csv (5/10)
Detected 7 channels (excluding time column).
Processing sensor_data_20250228_165210.csv (6/10)
Detected 7 channels (excluding time column).
Processing sensor_data_20250228_165306.csv (7/10)
Detected 7 channels (excluding time column).
Processing sensor_data_20250228_165502.csv (8/10)
Detected 7 channels (excluding time column).
Processing sensor_data_20250228_165752.csv (9/10)
Detected 7 channels (excluding time column).
Processing sensor_data_20250228_165916.csv (10/10)
Detected 7 channels (excluding time column).
Feature extraction complete! Shape: (30, 59, 15, 

In [2]:
import numpy as np
import os
from sklearn.model_selection import train_test_split

def load_data(data_folder, test_size=0.2, random_state=42):
    """
    Load ``feature_matrix.npy`` and ``labels.npy`` and split them into train and test sets.

    Parameters
    ----------
    data_folder : str
        Folder that contains the two ``.npy`` files.
    test_size : float or int, optional
        Forwarded to ``train_test_split`` (default 0.2, i.e. 20% test data).
    random_state : int, optional
        Seed forwarded to ``train_test_split`` for a reproducible split.

    Returns
    -------
    X_train, X_test : np.ndarray
        Feature arrays, split along the first axis.
    y_train, y_test : np.ndarray
        Matching label arrays.

    Raises
    ------
    FileNotFoundError
        If either ``.npy`` file is missing from ``data_folder``.
    """
    feature_path = os.path.join(data_folder, "feature_matrix.npy")
    label_path = os.path.join(data_folder, "labels.npy")

    if not os.path.exists(feature_path) or not os.path.exists(label_path):
        raise FileNotFoundError("特征文件或标签文件未找到，请检查路径！")

    X = np.load(feature_path)
    y = np.load(label_path)

    # Basic information about the loaded arrays
    print(f"Loaded features from {feature_path}, shape: {X.shape}")
    print(f"Loaded labels from {label_path}, shape: {y.shape}")

    # Honour the caller's test_size; the split used to hard-code 0.2 and
    # silently ignore the argument.
    # A stratified variant (stratify=y) was tried and is left disabled.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Report the resulting split
    print(f"Training set: X_train: {X_train.shape}, y_train: {y_train.shape}")
    print(f"Testing set: X_test: {X_test.shape}, y_test: {y_test.shape}")

    return X_train, X_test, y_train, y_test

# Example usage: load the saved features and labels and split them into train/test sets
data_folder = "windowed_data"
X_train, X_test, y_train, y_test = load_data(data_folder=data_folder)



Loaded features from windowed_data\feature_matrix.npy, shape: (45, 59, 15, 1)
Loaded labels from windowed_data\labels.npy, shape: (45,)
Training set: X_train: (36, 59, 15, 1), y_train: (36,)
Testing set: X_test: (9, 59, 15, 1), y_test: (9,)
