In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from xgboost import XGBClassifier

# ============================================================
# 1. Data path configuration
#    - Only the filtered CSV file path below needs to be adjusted.
# ============================================================
# Resolve paths relative to this file when __file__ is defined; otherwise
# (e.g. an interactive session) fall back to the current working directory.
if "__file__" in globals():
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
else:
    BASE_DIR = os.getcwd()

# Change only the file name here so it matches the actual filtered CSV.
DATA_PATH = os.path.join(
    BASE_DIR,
    "datasets",
    "filtered_251127_labeled_NH-001_301A.csv",
)

print(f"[INFO] Loading filtered dataset from: {DATA_PATH}")
df = pd.read_csv(DATA_PATH)

# Quick look at what was loaded: dimensions first, then the leading rows.
print(f"[INFO] Loaded shape: {df.shape}")
print(df.head())

# ============================================================
# 2. Split features / labels
#    - If the label column has a different name, adjust it here only.
# ============================================================
if "label" in df.columns:
    # Every column except "label" is treated as a feature.
    X = df.drop(columns=["label"])
    # Labels are cast to int (the original note lists the values as 0/1/2).
    y = df["label"].astype(int)
else:
    raise ValueError("CSV에 'label' 컬럼이 없다. 라벨 컬럼 이름을 확인해라.")

print(f"[INFO] Features shape: {X.shape}, Labels shape: {y.shape}")
# Per-class sample counts, ordered by label value.
print("[INFO] Label distribution:", y.value_counts().sort_index(), sep="\n")

# ============================================================
# 3. Train / Val / Test split (80 / 10 / 10)
# ============================================================
# Stage 1: hold out 20% of the rows, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Stage 2: halve the held-out 20% into validation and test parts,
# again stratified on the label.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

print("\n[INFO] Split shapes:")
print("  X_train:", X_train.shape, " y_train:", y_train.shape)
print("  X_val  :", X_val.shape, " y_val  :", y_val.shape)
print("  X_test :", X_test.shape, " y_test :", y_test.shape)

# ============================================================
# 4. Scaling (StandardScaler)
# ============================================================
# The scaler is fitted on the training split only; the same fitted
# transform is then applied to all three splits and cast to float32.
scaler = StandardScaler().fit(X_train)
X_train_2d, X_val_2d, X_test_2d = (
    scaler.transform(part).astype(np.float32)
    for part in (X_train, X_val, X_test)
)

print("\n[INFO] Scaled shapes:")
print("  X_train_2d:", X_train_2d.shape)
print("  X_val_2d  :", X_val_2d.shape)
print("  X_test_2d :", X_test_2d.shape)

# ============================================================
# 5. Define and train the XGBoost model
#    - Keep the hyperparameters as close as possible to the ones used
#      in the earlier notebook.
# ============================================================
# The class count is taken from the labels present in the training split.
num_classes = len(np.unique(y_train))
print(f"\n[INFO] Number of classes: {num_classes}")

xgb_model = XGBClassifier(
    # Task definition
    objective="multi:softprob",
    num_class=num_classes,
    eval_metric="mlogloss",
    # Ensemble size and tree shape
    n_estimators=300,
    max_depth=4,
    learning_rate=0.1,
    # Row / column subsampling
    subsample=0.9,
    colsample_bytree=0.9,
    # Seed and parallelism
    random_state=42,
    n_jobs=-1,
)

print("\n[INFO] Training XGBoost on filtered dataset...")
# Train and validation splits are both passed as eval_set;
# verbose=False keeps the fit call from printing evaluation output.
xgb_model.fit(
    X_train_2d,
    y_train,
    verbose=False,
    eval_set=[(X_train_2d, y_train), (X_val_2d, y_val)],
)

print("[INFO] Training done.")

# ============================================================
# 6. Evaluation (Train / Val / Test)
# ============================================================
def eval_split(name, X_split, y_split, model=None):
    """Print accuracy, classification report and confusion matrix for a split.

    Args:
        name: Label shown in the ``=== name ===`` banner (e.g. "Train").
        X_split: Feature matrix handed to the model's ``predict``.
        y_split: Ground-truth labels corresponding to ``X_split``.
        model: Fitted estimator exposing ``predict``. When omitted, the
            module-level ``xgb_model`` is used, so existing three-argument
            calls behave exactly as before.

    Returns:
        None. All results are written to stdout.
    """
    if model is None:
        # Looked up at call time so the default follows the global model.
        model = xgb_model

    y_pred = model.predict(X_split)
    acc = accuracy_score(y_split, y_pred)

    print(f"\n=== {name} ===")
    print(f"Accuracy: {acc:.4f}")
    print("Classification report:")
    print(classification_report(y_split, y_pred, digits=4))
    print("Confusion matrix:")
    print(confusion_matrix(y_split, y_pred))

# Report every split with the same routine, in Train -> Val -> Test order.
for _split_args in (
    ("Train", X_train_2d, y_train),
    ("Val", X_val_2d, y_val),
    ("Test", X_test_2d, y_test),
):
    eval_split(*_split_args)

# ============================================================
# 7. (Optional) Save the model / scaler
# ============================================================
# Uncomment the lines below to save them if needed
# from joblib import dump
# model_path  = os.path.join(BASE_DIR, "xgb_filtered_model.joblib")
# scaler_path = os.path.join(BASE_DIR, "xgb_filtered_scaler.joblib")
# dump(xgb_model, model_path)
# dump(scaler, scaler_path)
# print(f"\n[INFO] Saved model to: {model_path}")
# print(f"[INFO] Saved scaler to: {scaler_path}")


[INFO] Loading filtered dataset from: /home/gyuha1118/venvs/capstone/code/datasets/filtered_251127_labeled_NH-001_301A.csv
[INFO] Loaded shape: (4777, 21)
   ESP32-1_mean  ESP32-1_std  ESP32-1_min  ESP32-1_max  ESP32-1_median  \
0        33.875     8.025629         15.0         38.0            38.0   
1        37.625     1.060660         35.0         38.0            38.0   
2        38.000     0.000000         38.0         38.0            38.0   
3        37.875     0.353553         37.0         38.0            38.0   
4        37.625     0.517549         37.0         38.0            38.0   

   ESP32-2_mean  ESP32-2_std  ESP32-2_min  ESP32-2_max  ESP32-2_median  ...  \
0        43.500     7.091242         27.0         47.0            47.0  ...   
1        46.625     0.744024         45.0         47.0            47.0  ...   
2        46.000     2.138090         41.0         47.0            47.0  ...   
3        45.750     2.121320         41.0         47.0            47.0  ...   
4    