In [2]:
import pandas as pd
import numpy as np


# 定义处理单个文件的函数
def process_file(file_path, window_size=4):
    data = pd.read_csv(file_path)

    # 获取所有特征，删除Date列
    features = data.drop(columns=["Date"]).values
    targets = data["CGM (mg / dl)"].values

    X = []
    y = []

    for i in range(len(data) - 2 * window_size):
        X.append(features[i : i + window_size])  # 保持时间窗口内的特征维度
        y.append(
            targets[i + window_size : i + 2 * window_size]
        )  # 目标是下四个时间点的血糖值

    return np.array(X), np.array(y)


# 测试处理单个文件
file_path = "dataset/T1DM/1001_0_20210730.csv"
X, y = process_file(file_path)

X.shape, y.shape, X[:1], y[:1]

((650, 4, 15),
 (650, 4),
 array([[[ 1.13400000e+02,  0.00000000e+00,  3.00000000e-01,
           0.00000000e+00, -9.44089020e-01, -3.29690645e-01,
           1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
           0.00000000e+00,  1.00000000e+01,  3.52800000e+02,
           3.63955000e+02,  1.15311000e+02,  4.07000000e+01],
         [ 1.24200000e+02,  0.00000000e+00,  3.00000000e-01,
           0.00000000e+00, -9.63630453e-01, -2.67238376e-01,
           1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
           0.00000000e+00,  1.00000000e+01,  3.52800000e+02,
           3.63955000e+02,  1.15311000e+02,  4.07000000e+01],
         [ 1.29600000e+02,  0.00000000e+00,  3.00000000e-01,
           6.65100000e+01, -9.79045472e-01, -2.03641751e-01,
           1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
           0.00000000e+00,  1.00000000e+01,  3.52800000e+02,
           3.63955000e+02,  1.15311000e+02,  4.07000000e+01],
         [ 1.42200000e+02,  0.00000000e+00,  3.00000000e

In [4]:
import pandas as pd
import numpy as np
import os


# 定义处理单个文件的函数
def process_file(file_path, window_size=4):
    data = pd.read_csv(file_path)

    # 获取所有特征，删除Date列
    features = data.drop(columns=["Date"]).values
    targets = data["CGM (mg / dl)"].values

    X = []
    y = []

    for i in range(len(data) - 2 * window_size):
        X.append(features[i : i + window_size])  # 保持时间窗口内的特征维度
        y.append(
            targets[i + window_size : i + 2 * window_size]
        )  # 目标是下四个时间点的血糖值

    return np.array(X), np.array(y)


# 文件夹路径
directory_paths = ["dataset/T1DM", "dataset/T2DM"]

# 处理所有CSV文件并组合结果
all_X = []
all_y = []

# 用于存储每个文件夹中的数据量
folder_data_counts = {}

for directory_path in directory_paths:
    folder_X = []
    folder_y = []
    for filename in os.listdir(directory_path):
        if filename.endswith(".csv"):
            file_path = os.path.join(directory_path, filename)
            try:
                X, y = process_file(file_path)
                folder_X.append(X)
                folder_y.append(y)
            except Exception as e:
                print(f"Error processing file {filename} in {directory_path}: {e}")

    # 记录每个文件夹中的数据量
    folder_X = np.concatenate(folder_X, axis=0)
    folder_y = np.concatenate(folder_y, axis=0)
    folder_data_counts[directory_path] = len(folder_X)

    all_X.append(folder_X)
    all_y.append(folder_y)

all_X = np.concatenate(all_X, axis=0)
all_y = np.concatenate(all_y, axis=0)

# 显示结果数据的形状
print("All X shape:", all_X.shape)
print("All y shape:", all_y.shape)

# 检查前几个样本
print("Sample X:", all_X[:1])
print("Sample y:", all_y[:1])

# 计算并显示各文件夹数据的比重
total_data_count = len(all_X)
for folder, count in folder_data_counts.items():
    proportion = count / total_data_count
    print(f"Data proportion from {folder}: {proportion:.2%}")

All X shape: (127170, 4, 15)
All y shape: (127170, 4)
Sample X: [[[ 1.13400000e+02  0.00000000e+00  3.00000000e-01  0.00000000e+00
   -9.44089020e-01 -3.29690645e-01  1.00000000e+00  0.00000000e+00
    0.00000000e+00  0.00000000e+00  1.00000000e+01  3.52800000e+02
    3.63955000e+02  1.15311000e+02  4.07000000e+01]
  [ 1.24200000e+02  0.00000000e+00  3.00000000e-01  0.00000000e+00
   -9.63630453e-01 -2.67238376e-01  1.00000000e+00  0.00000000e+00
    0.00000000e+00  0.00000000e+00  1.00000000e+01  3.52800000e+02
    3.63955000e+02  1.15311000e+02  4.07000000e+01]
  [ 1.29600000e+02  0.00000000e+00  3.00000000e-01  6.65100000e+01
   -9.79045472e-01 -2.03641751e-01  1.00000000e+00  0.00000000e+00
    0.00000000e+00  0.00000000e+00  1.00000000e+01  3.52800000e+02
    3.63955000e+02  1.15311000e+02  4.07000000e+01]
  [ 1.42200000e+02  0.00000000e+00  3.00000000e-01  0.00000000e+00
   -9.90268069e-01 -1.39173101e-01  1.00000000e+00  0.00000000e+00
    0.00000000e+00  0.00000000e+00  1.00000

In [2]:
import pandas as pd
import numpy as np
import os


# 定义处理单个文件的函数
def process_file(file_path, window_size=4):
    data = pd.read_csv(file_path)

    # 获取所有特征，删除Date列
    features = data.drop(columns=["Date"]).values
    targets = data["CGM (mg / dl)"].values

    X = []
    y = []

    for i in range(len(data) - 2 * window_size):
        X.append(features[i : i + window_size])  # 保持时间窗口内的特征维度
        y.append(
            targets[i + window_size : i + 2 * window_size]
        )  # 目标是下四个时间点的血糖值

    return np.array(X), np.array(y)


# 文件夹路径
directory_paths = ["dataset/T1DM", "dataset/T2DM"]

# 处理所有CSV文件并组合结果
all_X = []
all_y = []

# 用于存储每个文件夹中的数据量
folder_data_counts = {}

for directory_path in directory_paths:
    folder_X = []
    folder_y = []
    for filename in os.listdir(directory_path):
        if filename.endswith(".csv"):
            file_path = os.path.join(directory_path, filename)
            try:
                X, y = process_file(file_path)
                folder_X.append(X)
                folder_y.append(y)
            except Exception as e:
                print(f"Error processing file {filename} in {directory_path}: {e}")

    # 合并文件夹内的所有数据
    folder_X = np.concatenate(folder_X, axis=0)
    folder_y = np.concatenate(folder_y, axis=0)

    # 对T1DM数据进行随机上采样2.5倍
    if "T1DM" in directory_path:
        upsample_indices = np.random.choice(
            len(folder_X), size=int(len(folder_X) * 2.5), replace=True
        )
        folder_X = folder_X[upsample_indices]
        folder_y = folder_y[upsample_indices]

    # 对T2DM数据进行随机下采样40%
    if "T2DM" in directory_path:
        sample_indices = np.random.choice(
            len(folder_X), size=int(len(folder_X) * 0.4), replace=False
        )
        folder_X = folder_X[sample_indices]
        folder_y = folder_y[sample_indices]

    # 记录每个文件夹中的数据量（采样后）
    folder_data_counts[directory_path] = len(folder_X)

    all_X.append(folder_X)
    all_y.append(folder_y)

all_X = np.concatenate(all_X, axis=0)
all_y = np.concatenate(all_y, axis=0)

# 显示结果数据的形状
print("All X shape:", all_X.shape)
print("All y shape:", all_y.shape)

# 检查前几个样本
print("Sample X:", all_X[:1])
print("Sample y:", all_y[:1])

# 计算并显示各文件夹数据的比重（采样后）
total_data_count = len(all_X)
for folder, count in folder_data_counts.items():
    proportion = count / total_data_count
    print(f"Data proportion from {folder}: {proportion:.2%}")

All X shape: (83558, 4, 15)
All y shape: (83558, 4)
Sample X: [[[ 1.51200000e+02  0.00000000e+00  9.00000000e-01  0.00000000e+00
    8.92978943e-01 -4.50098441e-01  0.00000000e+00  0.00000000e+00
    0.00000000e+00  2.00000000e+00  2.60000000e+01  1.81800000e+02
    7.54710000e+02  6.94050000e+01  1.96000000e+01]
  [ 1.42200000e+02  0.00000000e+00  6.00000000e-01  0.00000000e+00
    8.61629160e-01 -5.07538363e-01  0.00000000e+00  0.00000000e+00
    0.00000000e+00  2.00000000e+00  2.60000000e+01  1.81800000e+02
    7.54710000e+02  6.94050000e+01  1.96000000e+01]
  [ 1.31400000e+02  0.00000000e+00  6.00000000e-01  0.00000000e+00
    8.26589749e-01 -5.62804928e-01  0.00000000e+00  0.00000000e+00
    0.00000000e+00  2.00000000e+00  2.60000000e+01  1.81800000e+02
    7.54710000e+02  6.94050000e+01  1.96000000e+01]
  [ 1.13400000e+02  0.00000000e+00  6.00000000e-01  0.00000000e+00
    7.88010754e-01 -6.15661475e-01  0.00000000e+00  0.00000000e+00
    0.00000000e+00  2.00000000e+00  2.6000000