# Baseline

1. raw data 모든 컬럼(int,float형 데이터) 사용 (이상치 제거X)
2. 라벨 인코딩
3. cross validation

# Import

In [None]:
import pandas as pd
import numpy as np
from collections import Counter
import warnings
warnings.filterwarnings(action='ignore')

from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score

# Random Seed 설정

In [96]:
RANDOM_SEED = 2025

In [97]:
def downcast(df, verbose=True):
    start_mem = df.memory_usage().sum() / 1024**2  # 1MB = 1024**2 Byte
    for col in df.columns:
        dtype_name = df[col].dtype.name
        if dtype_name == "object":
            pass
        elif dtype_name == "bool":
            df[col] = df[col].astype("int8")
        elif dtype_name.startswith("int") or (df[col].round() == df[col]).all():
            df[col] = pd.to_numeric(df[col], downcast="integer")
        else:
            df[col] = pd.to_numeric(df[col], downcast="float")
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print("{:.1f}% 압축됨".format(100 * (start_mem - end_mem) / start_mem))

    return df

# Data Load

- csv 파일명을 수정해야함 (training_activity -> train_activity, val_activity -> validation_activity)

In [98]:
def load_data(base_path: str, dataset_type: str):
    """ 데이터를 불러오는 함수 """
    label_path = f"{base_path}/{dataset_type}/label/"
    raw_path = f"{base_path}/{dataset_type}/raw/"
    
    label = pd.read_csv(f"{label_path}1.걸음걸이/{dataset_type}_label.csv")
    activity = pd.read_csv(f"{raw_path}{dataset_type}_activity.csv")
    sleep = pd.read_csv(f"{raw_path}{dataset_type}_sleep.csv")
    
    return label, activity, sleep

In [99]:
BASE_PATH = "../data"
train_label, train_activity, train_sleep = load_data(BASE_PATH, "train")
test_label, test_activity, test_sleep = load_data(BASE_PATH, "validation")

# Feature Engineering

In [100]:
def preprocess_and_drop_features(df):
    """ 데이터 전처리 및 불필요한 피처 제거 """
    # 전처리 후 이메일
    emails = df["EMAIL"]
    result_df = df.select_dtypes(include=[np.number])
    result_df = result_df[result_df.columns.difference(['DIAG_NM'])]
    
    print(f"전처리 후 피처 수: {len(result_df.columns)}")
    print(f"전처리 후 사용되는 피처: {result_df.columns}")
    
    return result_df, emails

In [None]:
def merge_and_process_data(activity_df, sleep_df, label_df):
    """ 이메일 기준으로 데이터 병합 및 레이블 생성 """
    df = pd.concat([activity_df, sleep_df.drop(columns=['EMAIL'], errors='ignore')], axis=1)
    label_df = label_df.rename(columns={"SAMPLE_EMAIL": "EMAIL"})
    df = df.merge(label_df, how='inner', on='EMAIL')
    df = downcast(df)
    
    X, emails = preprocess_and_drop_features(df)
    y = df.iloc[:,-1]
    
    print(f'X shape: {X.shape}, y shape: {y.shape}')
    
    return X, y, emails

In [102]:
X_train, y_train, train_emails = merge_and_process_data(train_activity, train_sleep, train_label)

56.7% 압축됨
전처리 후 피처 수: 51
전처리 후 사용되는 피처: Index(['activity_average_met', 'activity_cal_active', 'activity_cal_total',
       'activity_daily_movement', 'activity_high', 'activity_inactive',
       'activity_inactivity_alerts', 'activity_low', 'activity_medium',
       'activity_met_min_high', 'activity_met_min_inactive',
       'activity_met_min_low', 'activity_met_min_medium', 'activity_non_wear',
       'activity_rest', 'activity_score', 'activity_score_meet_daily_targets',
       'activity_score_move_every_hour', 'activity_score_recovery_time',
       'activity_score_stay_active', 'activity_score_training_frequency',
       'activity_score_training_volume', 'activity_steps', 'activity_total',
       'sleep_awake', 'sleep_breath_average', 'sleep_deep', 'sleep_duration',
       'sleep_efficiency', 'sleep_hr_average', 'sleep_hr_lowest',
       'sleep_is_longest', 'sleep_light', 'sleep_midpoint_at_delta',
       'sleep_midpoint_time', 'sleep_onset_latency', 'sleep_period_id',
       'slee

# 모델 생성

In [119]:
# 모델 학습 및 평가
models = {
    "Lasso Regression": LogisticRegression(penalty='l1', solver='liblinear', multi_class='ovr'),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_SEED),
    "Support Vector Machine": SVC(kernel='linear', probability=True, random_state=RANDOM_SEED),
    "Gradient Boosting": GradientBoostingClassifier(random_state=RANDOM_SEED),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_SEED, n_jobs=-1),
}

# 모델 학습 및 성능 검증

## Standard Scaler

In [120]:
def scale_features(X):    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)  # 표준화를 적용
    return scaler, X_scaled

In [121]:
scaler, X_train_scaled = scale_features(X_train)

## Label Encoding

In [122]:
def encode_label(y):
    """ 클래스 레이블을 숫자로 변환 """
    label_encoder = {"CN": 0, "MCI": 1, "Dem": 2}
    return y.map(label_encoder)

In [123]:
y_train_encoded = encode_label(y_train)

## StratifiedGroupKFold 설정

In [124]:
N_SPLITS = 3
sgkf = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)

# train/val 인덱스 저장
sgkf_splits = []
for train_idx, val_idx in sgkf.split(X_train_scaled, y_train_encoded, train_emails):
    sgkf_splits.append((train_idx, val_idx))

## 일별 성능 평가 함수

In [None]:
def evaluate_daily_performance(y_true, y_pred):
    """
    일별 성능 평가
    
    Args:
        y_true (pd.Series): 실제 레이블
        y_pred (np.ndarray): 예측 레이블
    
    Returns:
        dict: Accuracy, F1-score, 그리고 Classification Report 데이터프레임
    """
    daily_acc = accuracy_score(y_true, y_pred)
    daily_f1 = f1_score(y_true, y_pred, average="weighted")
    
    # Classification Report 생성
    report_daily = classification_report(y_true, y_pred, output_dict=True)
    report_daily_df = pd.DataFrame(report_daily).T
    
    return {
        "accuracy": daily_acc,
        "f1_score": daily_f1,
        "classification_report": report_daily_df
    }

## 사람별 성능 평가 함수

In [None]:
def evaluate_person_performance(val_emails, y_true, y_pred):
    """
    사람별 성능 평가 (EMAIL 단위 그룹화)
    
    Args:
        val_emails (pd.Series): EMAIL 정보
        y_true (pd.Series): 실제 레이블
        y_pred (np.ndarray): 예측 레이블
    
    Returns:
        dict: Accuracy, F1-score, 그리고 Classification Report 데이터프레임
    """
    val_data = pd.DataFrame({"EMAIL": val_emails, "label": y_true, "pred": y_pred})
    grouped = val_data.groupby("EMAIL")

    email_true_labels = []
    email_pred_labels = []

    for email, group in grouped:
        # 실제 레이블: 해당 EMAIL의 모든 레이블 중 첫 번째 값 (모든 데이터가 동일하다고 가정)
        true_label = group["label"].iloc[0]
        email_true_labels.append(true_label)

        # 예측 레이블: 해당 EMAIL의 예측값 중 최빈값 (빈도 동일 시 큰 값 선택)
        pred_counter = Counter(group["pred"])
        most_common = pred_counter.most_common()  # [(label1, count1), (label2, count2), ...]
        max_count = most_common[0][1]  # 최빈값의 빈도수
        candidates = [label for label, count in most_common if count == max_count]  # 빈도가 같은 후보들
        most_common_label = max(candidates)  # 빈도가 동일한 경우 가장 큰 값 선택
        email_pred_labels.append(most_common_label)

    person_acc = accuracy_score(email_true_labels, email_pred_labels)
    person_f1 = f1_score(email_true_labels, email_pred_labels, average="weighted")
    
    # Classification Report 생성
    report_person = classification_report(email_true_labels, email_pred_labels, output_dict=True)
    report_person_df = pd.DataFrame(report_person).T

    return {
        "accuracy": person_acc,
        "f1_score": person_f1,
        "classification_report": report_person_df
    }

In [127]:
final_models = {} # 최종 모델 저장

for name, model in models.items():
    
    daily_acc_scores = []  # 일별 Accuracy 저장
    daily_f1_scores = []   # 일별 F1-score 저장
    person_acc_scores = []  # 사람별 Accuracy 저장
    person_f1_scores = []   # 사람별 F1-score 저장
    class_report_list_daily = []  # 일별 Classification Report 저장
    class_report_list_person = []  # 사람별 Classification Report 저장
    
    for fold_idx, (train_idx, val_idx) in enumerate(sgkf_splits):
        X_train_fold, X_val_fold = X_train_scaled[train_idx], X_train_scaled[val_idx]
        y_train_fold, y_val_fold = y_train_encoded.iloc[train_idx], y_train_encoded.iloc[val_idx]
        val_emails = train_emails.iloc[val_idx]  # Validation 데이터의 EMAIL 컬럼

        # 모델 학습 및 예측
        model.fit(X_train_fold, y_train_fold)
        y_pred = model.predict(X_val_fold)
        
        # === 일별 성능 평가 ===
        daily_result = evaluate_daily_performance(y_val_fold, y_pred)
        daily_acc_scores.append(daily_result["accuracy"])
        daily_f1_scores.append(daily_result["f1_score"])
        class_report_list_daily.append(daily_result["classification_report"])

        # === 사람별 성능 평가 ===
        person_result = evaluate_person_performance(val_emails, y_val_fold, y_pred)
        person_acc_scores.append(person_result["accuracy"])
        person_f1_scores.append(person_result["f1_score"])
        class_report_list_person.append(person_result["classification_report"])

    # === 평균 계산 및 결과 출력 ===

    # 일별 평균 Classification Report 계산
    avg_class_report_daily = pd.concat(class_report_list_daily).groupby(level=0).mean()

    # 사람별 평균 Classification Report 계산
    avg_class_report_person = pd.concat(class_report_list_person).groupby(level=0).mean()

    print("###", f"Model: {name}", "###")

    print("\n=== Daily Performance ===")
    print(f"Average Daily Accuracy: {np.mean(daily_acc_scores):.2f}")
    print(f"Average Daily F1-score: {np.mean(daily_f1_scores):.2f}")
    print("=== Average Daily Classification Report ===")
    print(avg_class_report_daily.round(2))

    print("\n=== Person-Level Performance ===")
    print(f"Average Person-Level Accuracy: {np.mean(person_acc_scores):.2f}")
    print(f"Average Person-Level F1-score: {np.mean(person_f1_scores):.2f}")
    print("=== Average Person-Level Classification Report ===")
    print(avg_class_report_person.round(2))

    print("=" * 60)

    final_models[name] = model

### Model: Lasso Regression ###

=== Daily Performance ===
Average Daily Accuracy: 0.54
Average Daily F1-score: 0.50
=== Average Daily Classification Report ===
              precision  recall  f1-score  support
0                  0.60    0.77      0.67  1927.00
1                  0.35    0.22      0.27  1117.67
2                  0.21    0.06      0.09   190.33
accuracy           0.54    0.54      0.54     0.54
macro avg          0.39    0.35      0.35  3235.00
weighted avg       0.50    0.54      0.50  3235.00

=== Person-Level Performance ===
Average Person-Level Accuracy: 0.57
Average Person-Level F1-score: 0.51
=== Average Person-Level Classification Report ===
              precision  recall  f1-score  support
0                  0.61    0.86      0.71    28.33
1                  0.35    0.17      0.23    15.67
2                  0.00    0.00      0.00     3.00
accuracy           0.57    0.57      0.57     0.57
macro avg          0.32    0.34      0.32    47.00
weighted avg       

In [128]:
# cv 는 학습에 안 쓰이는 폴드가 있으므로 전체 데이터로 학습을 다시 진행
def learn_model(name, model, X, y):
    model.fit(X, y)
    return name, model

for name, model in models.items():
    name_, model_ = learn_model(name, model, X_train_scaled, y_train_encoded)
    final_models[name] = model_

# Test dataset 예측 결과

In [129]:
X_test, y_test, test_emails = merge_and_process_data(test_activity, test_sleep, test_label)

57.8% 압축됨
전처리 후 피처 수: 51
전처리 후 사용되는 피처: Index(['activity_average_met', 'activity_cal_active', 'activity_cal_total',
       'activity_daily_movement', 'activity_high', 'activity_inactive',
       'activity_inactivity_alerts', 'activity_low', 'activity_medium',
       'activity_met_min_high', 'activity_met_min_inactive',
       'activity_met_min_low', 'activity_met_min_medium', 'activity_non_wear',
       'activity_rest', 'activity_score', 'activity_score_meet_daily_targets',
       'activity_score_move_every_hour', 'activity_score_recovery_time',
       'activity_score_stay_active', 'activity_score_training_frequency',
       'activity_score_training_volume', 'activity_steps', 'activity_total',
       'sleep_awake', 'sleep_breath_average', 'sleep_deep', 'sleep_duration',
       'sleep_efficiency', 'sleep_hr_average', 'sleep_hr_lowest',
       'sleep_is_longest', 'sleep_light', 'sleep_midpoint_at_delta',
       'sleep_midpoint_time', 'sleep_onset_latency', 'sleep_period_id',
       'slee

## Train data, Test data Exclusivity 확인

In [130]:
set(train_emails) & set(test_emails)

set()

In [131]:
X_test_scaled = scaler.transform(X_test)
y_test_encoded = encode_label(y_test)

In [132]:
for name, model in final_models.items():
    print("###", f"Model: {name}", "###")
    
    y_test_pred = model.predict(X_test_scaled)
    
    # === 일별 성능 평가 ===
    daily_result = evaluate_daily_performance(y_test_encoded, y_test_pred)

    # 일별 성능 지표 추출
    daily_acc = daily_result["accuracy"]
    daily_f1 = daily_result["f1_score"]
    daily_report = daily_result["classification_report"]

    # === 사람별 성능 평가 ===
    person_result = evaluate_person_performance(test_emails, y_test_encoded, y_test_pred)

    # 사람별 성능 지표 추출
    person_acc = person_result["accuracy"]
    person_f1 = person_result["f1_score"]
    person_report = person_result["classification_report"]

    # 결과 출력
    print("\n=== Daily Performance ===")
    print(f"Test Accuracy (by Day): {daily_acc:.2f}")
    print(f"Test F1-score (by Day): {daily_f1:.2f}")
    print("Test Classification Report (by Day):\n", daily_report.round(2))

    print("\n=== Person-Level Performance ===")
    print(f"Test Accuracy (by EMAIL): {person_acc:.2f}")
    print(f"Test F1-score (by EMAIL): {person_f1:.2f}")
    print("Test Classification Report (by EMAIL):\n", person_report.round(2))
    
    print("=" * 60)

### Model: Lasso Regression ###

=== Daily Performance ===
Test Accuracy (by Day): 0.67
Test F1-score (by Day): 0.67
Test Classification Report (by Day):
               precision  recall  f1-score  support
0                  0.79    0.81      0.80  1956.00
1                  0.08    0.11      0.10   308.00
2                  0.85    0.23      0.37   214.00
accuracy           0.67    0.67      0.67     0.67
macro avg          0.57    0.38      0.42  2478.00
weighted avg       0.70    0.67      0.67  2478.00

=== Person-Level Performance ===
Test Accuracy (by EMAIL): 0.73
Test F1-score (by EMAIL): 0.70
Test Classification Report (by EMAIL):
               precision  recall  f1-score  support
0                  0.79    0.88      0.84    26.00
1                  0.00    0.00      0.00     4.00
2                  1.00    0.33      0.50     3.00
accuracy           0.73    0.73      0.73     0.73
macro avg          0.60    0.41      0.45    33.00
weighted avg       0.72    0.73      0.70    3