In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.signal import find_peaks, peak_prominences
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import lightgbm as lgb

In [2]:
df = pd.read_csv(r'./features.csv', sep='\t', encoding='utf-8')

In [3]:
df = df.dropna().drop(columns=['Unnamed: 0']).reset_index(drop=True)

In [4]:
df.columns

Index(['calories2hr', 'protein2hr', 'sugar2hr', 'carbs2hr', 'calories8hr',
       'protein8hr', 'sugar8hr', 'carbs8hr', 'calories24hr', 'protein24hr',
       'sugar24hr', 'carbs24hr', 'eat', 'eatcnt2hr', 'eatcnt8hr', 'eatcnt24hr',
       'eatmean2hr', 'eatmean8hr', 'eatmean24hr', 'acc_mean', 'acc_std',
       'acc_min', 'acc_max', 'acc_q1g', 'acc_q3g', 'acc_skew', 'acc_mean_2hrs',
       'acc_max_2hrs', 'eda_mean', 'eda_std', 'eda_min', 'eda_max', 'eda_q1g',
       'eda_q3g', 'eda_skew', 'PeakEDA', 'PeakEDA2hr_sum', 'PeakEDA2hr_mean',
       'hr_mean', 'hr_std', 'hr_min', 'hr_max', 'hr_q1g', 'hr_q3g', 'hr_skew',
       'maxHRV', 'minHRV', 'medianHRV', 'meanHRV', 'SDNN', 'NN50', 'pNN50',
       'RMSSD', 'temp_mean', 'temp_std', 'temp_min', 'temp_max', 'temp_q1g',
       'temp_q3g', 'temp_skew', 'Activity_bouts', 'Activity24', 'Activity1hr',
       'Minfrommid', 'Hourfrommid', 'glucose', 'datetime', 'ID', 'HbA1c',
       'Biological Sex', 'WakeTime', 'label'],
      dtype='object')

In [5]:
df.head(30)

Unnamed: 0,calories2hr,protein2hr,sugar2hr,carbs2hr,calories8hr,protein8hr,sugar8hr,carbs8hr,calories24hr,protein24hr,...,Activity1hr,Minfrommid,Hourfrommid,glucose,datetime,ID,HbA1c,Biological Sex,WakeTime,label
0,0.0,0.0,0.0,0.0,638.0,17.9,30.7,44.4,1812.0,110.3,...,8.0,1048,17,95.0,2020-02-14 17:28:30,1.0,5.5,FEMALE,0.0,PersNorm
1,0.0,0.0,0.0,0.0,638.0,17.9,30.7,44.4,1812.0,110.3,...,7.0,1053,18,95.0,2020-02-14 17:33:30,1.0,5.5,FEMALE,0.0,PersNorm
2,0.0,0.0,0.0,0.0,358.0,13.9,8.7,14.4,1812.0,110.3,...,7.0,1058,18,95.0,2020-02-14 17:38:30,1.0,5.5,FEMALE,0.0,PersNorm
3,0.0,0.0,0.0,0.0,358.0,13.9,8.7,14.4,1812.0,110.3,...,6.0,1063,18,95.0,2020-02-14 17:43:30,1.0,5.5,FEMALE,0.0,PersNorm
4,0.0,0.0,0.0,0.0,358.0,13.9,8.7,14.4,1812.0,110.3,...,5.0,1069,18,92.0,2020-02-14 17:48:31,1.0,5.5,FEMALE,0.0,PersNorm
5,0.0,0.0,0.0,0.0,358.0,13.9,8.7,14.4,1812.0,110.3,...,5.0,1073,18,88.0,2020-02-14 17:53:30,1.0,5.5,FEMALE,0.0,PersNorm
6,0.0,0.0,0.0,0.0,358.0,13.9,8.7,14.4,1812.0,110.3,...,4.0,1078,18,84.0,2020-02-14 17:58:30,1.0,5.5,FEMALE,0.0,PersLow
7,0.0,0.0,0.0,0.0,358.0,13.9,8.7,14.4,1356.0,94.3,...,3.0,1083,18,85.0,2020-02-14 18:03:30,1.0,5.5,FEMALE,0.0,PersNorm
8,0.0,0.0,0.0,0.0,358.0,13.9,8.7,14.4,1356.0,94.3,...,2.0,1088,18,89.0,2020-02-14 18:08:30,1.0,5.5,FEMALE,0.0,PersNorm
9,0.0,0.0,0.0,0.0,358.0,13.9,8.7,14.4,1356.0,94.3,...,3.0,1093,18,93.0,2020-02-14 18:13:30,1.0,5.5,FEMALE,0.0,PersNorm


#### HRV를 통해 당뇨병 발병 가능성 체크
##### Random forest 방법을 이용하여 예측

In [6]:
features = df[['maxHRV', 'minHRV', 'medianHRV', 'meanHRV']]
target = df['label']

le = LabelEncoder()
target_encoded = le.fit_transform(target)

X_train, X_test, y_train, y_test = train_test_split(features, target_encoded, test_size=0.2, random_state=42)

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

predictions_proba = model.predict_proba(X_test)

accuracy = accuracy_score(y_test, model.predict(X_test))    # 올바르게 예측한 케이스의 비율
roc_auc = roc_auc_score(y_test, predictions_proba, multi_class='ovr', average='macro')  # 1에 가까울 수록 모델 성능 좋음. 이진 분류

print(f"Accuracy: {accuracy : .3f}")
print(f"ROC AUC Score: {roc_auc : .3f}")

Accuracy:  0.712
ROC AUC Score:  0.673


##### LightGBM을 사용해서 예측

In [7]:
features = df[['maxHRV', 'minHRV', 'medianHRV', 'meanHRV']]
target = df['label']

le = LabelEncoder()
target_encoded = le.fit_transform(target)

X_train, X_test, y_train, y_test = train_test_split(features, target_encoded, test_size=0.2, random_state=42)

# LightGBM 데이터셋 생성
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test)

params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': len(set(target_encoded)),
    'metric': 'multi_logloss',
    'learning_rate': 0.1,
    'num_leaves': 31,
    'max_depth': -1,
    'verbose': -1
}

# (훈련 라운드의 수를 1000번으로 한 이유: 더 많이 돌면 과적합이 될까봐.)
model = lgb.train(params, train_data, valid_sets=[valid_data], num_boost_round=1000)

# 예측 (확률 최대값을 가진 클래스 인덱스를 예측으로 선택)
y_pred = [np.argmax(line) for line in model.predict(X_test, num_iteration=model.best_iteration)]

accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, model.predict(X_test, num_iteration=model.best_iteration), multi_class='ovr')

print(f"Accuracy: {accuracy : .3f}")
print(f"ROC AUC Score: {roc_auc : .3f}")

Accuracy:  0.724
ROC AUC Score:  0.667
