In [19]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score
import zipfile

# Unpack the motor-data archive into a local working folder.
zip_file_path = '/content/motor_data.zip'
destination_folder = './motor_data'

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(destination_folder)

# Build the paths of the two class folders inside the extracted tree.
# NOTE(review): the extra 'motor_data' component presumes the archive itself
# contains a top-level 'motor_data' directory — confirm against the zip layout.
motor_data_path = os.path.join(destination_folder, 'motor_data')
normal_dir = os.path.join(motor_data_path, 'normal')
abnormal_dir = os.path.join(motor_data_path, 'abnormal')

# Statistical feature extraction
def extract_statistical_features(data, slice_size=256):
    """Compute mean, variance and RMS for each full window of a signal.

    The signal is cut into consecutive, non-overlapping windows of
    ``slice_size`` samples along its first axis. A trailing remainder that
    does not fill a whole window is discarded.

    Args:
        data: Array-like of numeric samples (list, tuple or ndarray).
        slice_size: Number of samples per window; must be positive.

    Returns:
        A float64 array of shape ``(n_windows, 3)`` whose columns are
        ``[mean, variance, rms]``. When the input holds fewer than
        ``slice_size`` samples the result has shape ``(0, 3)``.

    Raises:
        ValueError: If ``slice_size`` is not positive.
    """
    if slice_size <= 0:
        raise ValueError(f"slice_size must be a positive integer, got {slice_size}")

    # Work in float64 so squaring narrow integer inputs (e.g. int16 sensor
    # counts) cannot wrap around, and so plain Python lists are accepted.
    samples = np.asarray(data, dtype=np.float64)

    n_windows = len(samples) // slice_size
    if n_windows == 0:
        return np.empty((0, 3))

    # Drop the incomplete tail, then give every window its own row so the
    # statistics can be computed for all windows at once.
    windows = samples[:n_windows * slice_size].reshape(n_windows, -1)

    mean = windows.mean(axis=1)
    variance = windows.var(axis=1)  # population variance (ddof=0), as np.var
    rms = np.sqrt(np.mean(np.square(windows), axis=1))  # root mean square

    return np.column_stack((mean, variance, rms))

# Data loading and statistical feature extraction
def data_loading_and_feature_extraction(normal_dir, abnormal_dir, slice_size=256):
    """Read every vibration CSV under both class folders and build features.

    Each entry of ``normal_dir`` (label 0) and ``abnormal_dir`` (label 1) is
    treated as one motor folder; CSV files are read from its ``time_vib``
    sub-folder in sorted order. The second column of each CSV is passed to
    ``extract_statistical_features`` and every resulting row is stored with
    the class label. Motor folders without ``time_vib``, non-CSV files and
    empty CSVs are skipped. Any error raised while handling a file is
    printed and that file is skipped.

    Args:
        normal_dir: Folder whose motors receive label 0.
        abnormal_dir: Folder whose motors receive label 1.
        slice_size: Window length forwarded to the feature extractor.

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays with one entry per
        extracted window.
    """
    collected_features = []
    collected_labels = []

    for class_id, class_dir in enumerate((normal_dir, abnormal_dir)):
        for motor_name in sorted(os.listdir(class_dir)):
            vib_dir = os.path.join(class_dir, motor_name, 'time_vib')
            if not os.path.exists(vib_dir):
                continue

            csv_names = [name for name in sorted(os.listdir(vib_dir)) if name.endswith('.csv')]
            for csv_name in csv_names:
                file_path = os.path.join(vib_dir, csv_name)
                try:
                    frame = pd.read_csv(file_path)
                    if frame.empty:
                        continue

                    # Column index 1 holds the amplitude samples.
                    amplitude = frame.iloc[:, 1].values
                    window_features = extract_statistical_features(amplitude, slice_size)

                    collected_features.extend(window_features)
                    collected_labels.extend([class_id] * len(window_features))
                except Exception as e:
                    # Best effort: report the failing file and keep going.
                    print(f"Error processing {file_path}: {e}")

    return np.array(collected_features), np.array(collected_labels)

# Load all CSV files and build the feature/label arrays from 256-sample windows.
data, labels = data_loading_and_feature_extraction(normal_dir, abnormal_dir, slice_size=256)

# Train/test split by motor
def split_train_test_by_motor(data, labels, train_motors, test_motors, motor_count=20, slice_per_motor=None):
    """Split rows into train and test sets by 1-based motor number.

    Motor ``m`` is taken to own the consecutive rows
    ``[(m - 1) * slice_per_motor, m * slice_per_motor)`` of ``data`` and
    ``labels``, so the split is only meaningful when every motor contributes
    the same number of rows, stored in motor order.

    Args:
        data: Row-indexable feature collection.
        labels: Labels aligned with ``data``.
        train_motors: Iterable of motor numbers for the training set.
        test_motors: Iterable of motor numbers for the test set.
        motor_count: Total number of motors; used only to derive
            ``slice_per_motor`` when that argument is ``None``.
        slice_per_motor: Rows per motor. Defaults to
            ``len(data) // motor_count``.

    Returns:
        Tuple ``(train_data, train_labels, test_data, test_labels)`` of
        NumPy arrays.
    """
    rows_per_motor = slice_per_motor
    if rows_per_motor is None:
        rows_per_motor = len(data) // motor_count  # rows owned by each motor

    def gather(motors):
        # Walk the motors once, collecting their rows and labels together.
        rows, tags = [], []
        for motor_id in motors:
            window = slice((motor_id - 1) * rows_per_motor, motor_id * rows_per_motor)
            rows.extend(data[window])
            tags.extend(labels[window])
        return np.array(rows), np.array(tags)

    train_rows, train_tags = gather(train_motors)
    test_rows, test_tags = gather(test_motors)

    return (train_rows, train_tags, test_rows, test_tags)

# Choose which motors go to the training set and which to the test set.
train_motors = range(1, 16)  # motors 1-15: train
test_motors = range(16, 21)  # motors 16-20: test

# NOTE(review): the split indexes rows by position, so it relies on every
# motor contributing the same number of feature rows — confirm for this dataset.
train_data, train_labels, test_data, test_labels = split_train_test_by_motor(
    data, labels, train_motors, test_motors, motor_count=20
)

# Standardise the features. The scaler is fitted on the training rows only
# and then applied unchanged to the test rows.
scaler = StandardScaler()
train_data = scaler.fit_transform(train_data)
test_data = scaler.transform(test_data)


# Configure the XGBoost classifier.
model = XGBClassifier(
    n_estimators=100,            # number of trees
    max_depth=6,                 # maximum tree depth
    learning_rate=0.1,           # learning rate
    subsample=0.8,               # row sampling ratio
    colsample_bytree=0.8,        # feature sampling ratio
    eval_metric='logloss',       # evaluation metric (set here on the estimator)
)

# Train the model.
# NOTE(review): the second eval_set entry is the test split, so the
# 'validation_1' values in the training log are test-set log loss.
model.fit(
    train_data,
    train_labels,
    eval_set=[(train_data, train_labels), (test_data, test_labels)],  # train and validation sets to monitor
    verbose=True                      # print the metric after each boosting round
)

# Predict labels for the test split.
predictions = model.predict(test_data)

# Evaluate performance on the test split.
accuracy = accuracy_score(test_labels, predictions)
f1 = f1_score(test_labels, predictions)
recall = recall_score(test_labels, predictions)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Recall: {recall:.4f}")



[0]	validation_0-logloss:0.55016	validation_1-logloss:0.91134
[1]	validation_0-logloss:0.48237	validation_1-logloss:0.78589
[2]	validation_0-logloss:0.42709	validation_1-logloss:0.68808
[3]	validation_0-logloss:0.38092	validation_1-logloss:0.60905
[4]	validation_0-logloss:0.34214	validation_1-logloss:0.54431
[5]	validation_0-logloss:0.30873	validation_1-logloss:0.49030
[6]	validation_0-logloss:0.28009	validation_1-logloss:0.44486
[7]	validation_0-logloss:0.25506	validation_1-logloss:0.40652
[8]	validation_0-logloss:0.23326	validation_1-logloss:0.37345
[9]	validation_0-logloss:0.21425	validation_1-logloss:0.34481
[10]	validation_0-logloss:0.19760	validation_1-logloss:0.32000
[11]	validation_0-logloss:0.18285	validation_1-logloss:0.29877
[12]	validation_0-logloss:0.16986	validation_1-logloss:0.28038
[13]	validation_0-logloss:0.15812	validation_1-logloss:0.26446
[14]	validation_0-logloss:0.14772	validation_1-logloss:0.25035
[15]	validation_0-logloss:0.13866	validation_1-logloss:0.23802
[1