# In [20]:
import os

import numpy as np
import pandas as pd


# Encode minute-of-day values as a pair of cyclical (sine/cosine) features.
def time_to_cyclical(minute_series, max_val=1440):  # 1440 = minutes in one day
    """Return the sine and cosine encoding of *minute_series*.

    Each value is scaled to an angle of ``2 * pi * value / max_val`` so that
    one full period of ``max_val`` corresponds to one turn of the unit circle.

    Args:
        minute_series: Scalar or array-like supporting NumPy arithmetic
            (the script passes a pandas Series of minutes since midnight).
        max_val: Length of one full period, in the same unit as
            ``minute_series``.

    Returns:
        A ``(sin_vals, cos_vals)`` tuple with the same shape as the input.
    """
    angle = 2 * np.pi * minute_series / max_val
    return np.sin(angle), np.cos(angle)

def solve1(name):
    """Build per-patient feature CSVs for the ``Shanghai_{name}`` cohort.

    For every patient listed in ``data/Shanghai_{name}_Summary.csv`` the
    patient's time-series file ``data/Shanghai_{name}/<id>.csv`` is loaded and
    augmented with:

    * ``Minute_sin`` / ``Minute_cos``: cyclical encoding of the time of day
      taken from the ``Date`` column;
    * one column per individually tracked drug, plus ``medium_corr_drugs``
      (the row-wise sum of the medium-correlation drug columns), both read
      from ``data/hypoglycemic_agents.csv``;
    * the patient's values for a fixed set of summary columns, repeated on
      every row.

    The ``CSII - bolus insulin (Novolin R, IU)`` column is removed and the
    result is written to ``dataset/{name}/<id>.csv`` (the directory is created
    if it does not exist).

    Patients without a row in the drug file get no drug columns at all, and
    drug names that are not columns of the drug file are ignored.

    Args:
        name: Cohort suffix used to build the input and output paths
            (the script calls this with ``"T1DM"`` and ``"T2DM"``).

    Raises:
        FileNotFoundError: If one of the input CSV files does not exist.
        KeyError: If a required column (``Patient Number``, ``Date``, the CSII
            bolus column, or one of the summary columns) is missing.
    """
    # Summary columns copied onto every row of a patient's time series.
    selected_columns = [
        "Duration of Diabetes (years)",
        "Fasting Plasma Glucose (mg/dl)",
        "2-hour Postprandial Insulin (pmol/L)",
        "HbA1c (mmol/mol)",
        "Glycated Albumin (%)",
    ]

    # Drug -> correlation score. Only used to select the "medium" group below.
    # Keys must match the column names of the drug file exactly.
    drug_correlations = {
        "Humulin R": 1.365444158221848,
        "insulin aspart 70/30": 1.06324690222164,
        "voglibose": 0.946377183963179,
        "metformin": 0.6986628850317034,
        "Novolin R": 0.6854082306114344,
        "sitagliptin": 0.6852370154880306,
        "insulin degludec": 0.5551849984711914,
        "insulin glargine": 0.5544288133650825,
        "insulin glarigine": 0.5064889597678641,
        "Gansulin R": 0.4889905571280143,
        "glimepiride": 0.45700893221455063,
        "insulin aspart 50/50": 0.45700893221455063,
        "Novolin 30R": 0.44981075394747994,
        "Novolin 50R": 0.43604410870946597,
        "pioglitazone": 0.43580275030128934,
        "insulin glulisine": 0.4347913458122733,
        "insulin detemir": 0.41448751289298663,
        "dapagliflozin": 0.3989705959581452,
        "canagliflozin": 0.39839570463199087,
        "repaglinide": 0.3973211454109254,
        "insulin aspart": 0.37804370240755825,
        "Gansulin 40R": 0.36299685165116846,
        "acarbose": 0.3312078141826219,
        "Humulin 70/30": 0.2605611291192308,
        "gliquidone": 0.15197177803775078,
        "gliclazide": 0.1351941319739795,
        "liraglutide": 0.12751629718275428,
    }

    # Read the cohort summary (one row per patient) and the drug feature table.
    data_summary = pd.read_csv(f"data/Shanghai_{name}_Summary.csv")
    drug_data = pd.read_csv("data/hypoglycemic_agents.csv")

    # Everything below is loop-invariant, so resolve it once up front.
    available_drugs = set(drug_data.columns)

    # Drugs that each get their own feature column (if the drug file has them).
    individual_drugs = [
        drug
        for drug in ("Humulin R", "insulin aspart 70/30", "voglibose")
        if drug in available_drugs
    ]

    # Drugs with a correlation in [0.5, 0.75) are summed into a single feature.
    # Restricting to existing columns avoids a KeyError on a missing drug.
    medium_corr_drugs = [
        drug
        for drug, corr in drug_correlations.items()
        if 0.5 <= corr < 0.75 and drug in available_drugs
    ]

    # to_csv does not create missing directories.
    output_dir = f"dataset/{name}"
    os.makedirs(output_dir, exist_ok=True)

    for id_value in data_summary["Patient Number"]:
        # Load this patient's time series.
        data = pd.read_csv(f"data/Shanghai_{name}/{id_value}.csv")

        # Derive minutes since midnight and encode them cyclically.
        data["Date"] = pd.to_datetime(data["Date"])
        minute_of_day = data["Date"].dt.hour * 60 + data["Date"].dt.minute
        data["Minute_sin"], data["Minute_cos"] = time_to_cyclical(minute_of_day)

        # The CSII bolus insulin column is not kept in the output.
        data = data.drop(columns=["CSII - bolus insulin (Novolin R, IU)"])

        # Drug usage for the current patient; may be empty if the patient is
        # not listed in the drug file, in which case no drug columns are added.
        patient_drug_data = drug_data[drug_data["Patient Number"] == id_value]
        if not patient_drug_data.empty:
            for drug in individual_drugs:
                data[drug] = patient_drug_data[drug].values[0]
            data["medium_corr_drugs"] = (
                patient_drug_data[medium_corr_drugs].sum(axis=1).values[0]
            )

        # Broadcast the patient's summary values onto every row. A matching
        # row always exists because id_value comes from data_summary itself.
        summary_row = data_summary.loc[
            data_summary["Patient Number"] == id_value, selected_columns
        ].iloc[0]
        for col in selected_columns:
            data[col] = summary_row[col]

        # Persist the augmented time series.
        data.to_csv(f"{output_dir}/{id_value}.csv", index=False)


# Run the feature-building pipeline for each cohort in turn.
for cohort in ("T1DM", "T2DM"):
    solve1(cohort)