In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ---------------------------------------------------------------------------
# Load the FD001 text files and build per-row RUL labels for train and test.
# ---------------------------------------------------------------------------

# Directory that holds the raw dataset files.
data_path = './dataset/'

# Column names for easy indexing: 2 id columns, 3 operational settings, 21 sensors.
index_names = ['unit_number', 'time_in_cycles']
setting_names = ['operational_setting1', 'operational_setting2', 'operational_setting3']
sensor_names = ['s_{}'.format(i) for i in range(1,22)]
col_names = index_names + setting_names + sensor_names

# The files are whitespace-separated and have no header row.
train_FD001 = pd.read_csv((data_path+'train_FD001.txt'), sep=r'\s+', header=None, names=col_names)
RUL_FD001 = pd.read_csv((data_path+'RUL_FD001.txt'), sep=r'\s+', header=None, names=['RUL'])
test_FD001 = pd.read_csv((data_path+'test_FD001.txt'), sep=r'\s+', header=None, names=col_names)

# --- Training labels --------------------------------------------------------
# Last observed cycle of every unit.
max_cycles = train_FD001.groupby('unit_number')['time_in_cycles'].max().reset_index()
max_cycles.columns = ['unit_number', 'max_cycles']
# Attach each unit's last cycle to all of its rows.
train = train_FD001.merge(max_cycles[['unit_number', 'max_cycles']], on='unit_number', how='left')
# RUL of a row = last cycle of its unit minus the row's current cycle.
train['RUL'] = train['max_cycles'] - train['time_in_cycles']

# --- Test labels ------------------------------------------------------------
# Last observed cycle of every test unit.
max_cycles_test = test_FD001.groupby('unit_number')['time_in_cycles'].max().reset_index()
max_cycles_test.columns = ['unit_number', 'max_cycles']
# The addition below pairs rows through the default 0..n-1 index of both
# frames, i.e. row k of the RUL file is matched with the k-th unit_number in
# sorted order. A length mismatch would silently yield NaN labels, so fail loudly.
if len(max_cycles_test) != len(RUL_FD001):
    raise ValueError(
        'RUL file has {} rows but the test set has {} units'.format(
            len(RUL_FD001), len(max_cycles_test)
        )
    )
# Total life of a test unit = last observed cycle + RUL given for that unit.
max_cycles_test['max_life'] = max_cycles_test['max_cycles'] + RUL_FD001['RUL']
# Attach each unit's total life to all of its test rows.
test = test_FD001.merge(max_cycles_test[['unit_number', 'max_life']], on='unit_number', how='left')
# RUL of a row = total life of its unit minus the row's current cycle.
test['RUL'] = test['max_life'] - test['time_in_cycles']

# Keep real copies of the full frames (a bare assignment would only alias them).
train_copy = train.copy()
test_copy = test.copy()

# Align dtypes: these two sensor columns are cast to float64.
# (s_18 is cast here and then dropped below along with the other unused columns.)
train = train.astype({'s_17': 'float64', 's_18': 'float64'})
test = test.astype({'s_17': 'float64', 's_18': 'float64'})

# Drop predictors with zero standard deviation, columns that are not used as
# features, and columns the correlation analysis found negligible.
delete_train_columns = ['unit_number', 'time_in_cycles', 'max_cycles', 'operational_setting1', 'operational_setting2', 'operational_setting3', 's_1', 's_5', 's_6', 's_10', 's_16', 's_18', 's_19']
train = train.drop(columns=delete_train_columns)

delete_test_columns = ['unit_number', 'time_in_cycles', 'max_life', 'operational_setting1', 'operational_setting2', 'operational_setting3', 's_1', 's_5', 's_6', 's_10', 's_16', 's_18', 's_19']
test = test.drop(columns=delete_test_columns)

# Split each frame into the target (RUL) and the feature matrix.
y_train = train['RUL']
X_train = train.drop('RUL', axis=1)

y_test = test['RUL']
X_test = test.drop('RUL', axis=1)

print('X_train size', X_train.shape, ' y_train size', y_train.shape)
print('X_test size', X_test.shape, ' y_test size', y_test.shape)

X_train size (20631, 14)  y_train size (20631,)
X_test size (13096, 14)  y_test size (13096,)


In [26]:
from sklearn.preprocessing import StandardScaler

# Z-score standardization.
# The scaler learns mean/std from the training features only; the test features
# are then transformed with those same statistics. Refitting on the test set
# would scale the two sets differently and leak test statistics into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [37]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score

# Scoring function: asymmetric_scoring weights early and late predictions
# differently (a1 for early predictions, a2 for late predictions).
def asymmetric_scoring(y_true, y_pred, a1=10, a2=13):
    """
    Asymmetric exponential score, summed over all predictions.

    The error is ``y_pred - y_true``. A negative error (prediction below the
    true value, i.e. early) contributes ``exp(-error / a1) - 1``; any other
    error contributes ``exp(error / a2) - 1``. A perfect prediction scores 0.

    y_true: true RUL value(s); scalar, list, numpy array or pandas Series
    y_pred: predicted RUL value(s), same length as y_true
    a1: scale applied to early predictions
    a2: scale applied to late predictions

    Returns the sum of the per-element scores.
    """
    # Coerce to float arrays so plain lists work and so two pandas Series are
    # compared by position rather than being aligned on their index labels.
    errors = np.asarray(y_pred, dtype=float) - np.asarray(y_true, dtype=float)
    scores = np.where(errors < 0, np.exp(-errors / a1) - 1, np.exp(errors / a2) - 1)
    return np.sum(scores)

# evaluate_algorithm accumulates the asymmetric score over several UUTs.
def evaluate_algorithm(y_true_all, y_pred_all, a1=10, a2=13):
    """
    Total asymmetric score over paired true/predicted RUL entries.

    y_true_all: iterable of true RUL values (one entry per UUT)
    y_pred_all: iterable of predicted RUL values (one entry per UUT)
    a1: scale applied to early predictions
    a2: scale applied to late predictions

    The two iterables are walked in lockstep with zip, so iteration stops at
    the shorter one. Returns the sum of asymmetric_scoring over all pairs
    (0 when there are no pairs).
    """
    return sum(
        asymmetric_scoring(actual, predicted, a1, a2)
        for actual, predicted in zip(y_true_all, y_pred_all)
    )

# Candidate hyperparameter values sampled by the randomized search.
param_dist = dict(
    n_estimators=[100, 200, 300, 500],
    learning_rate=[0.01, 0.03, 0.1, 0.2],
    max_depth=[4, 6, 8, 10],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
)

# Base XGBoost regressor with a fixed seed.
xgb_model = XGBRegressor(random_state=42)

# Randomized search: 10 sampled candidates, 3-fold CV, ranked by negative MSE.
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Report the best hyperparameter combination found.
print("Best parameters found: ", random_search.best_params_)

# Predict the test set with the best estimator from the search.
best_xgb = random_search.best_estimator_
pred_xgb = best_xgb.predict(X_test)

# Evaluate: asymmetric score (summed pair by pair), MSE, MAPE and R2.
total_score = evaluate_algorithm(y_test, pred_xgb)
mse_xgb = mean_squared_error(y_test, pred_xgb)
mape_xgb = mean_absolute_percentage_error(y_test, pred_xgb)
r2_xgb = r2_score(y_test, pred_xgb)

print(f"XGBoost 최적화 후 Score: {total_score}")
print(f"XGBoost 최적화 후 MSE: {mse_xgb}")
print(f"XGBoost 최적화 후 MAPE: {mape_xgb}")
print(f"XGBoost 최적화 후 R2: {r2_xgb}")

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters found:  {'subsample': 0.6, 'n_estimators': 300, 'max_depth': 6, 'learning_rate': 0.01, 'colsample_bytree': 1.0}
XGBoost 최적화 후 Score: 9414289728.272392
XGBoost 최적화 후 MSE: 2085.989113273502
XGBoost 최적화 후 MAPE: 0.2832160319363842
XGBoost 최적화 후 R2: 0.4003000855445862
[CV] END colsample_bytree=0.8, learning_rate=0.2, max_depth=4, n_estimators=500, subsample=0.8; total time=   0.6s
[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=6, n_estimators=300, subsample=0.6; total time=   1.0s
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=8, n_estimators=200, subsample=1.0; total time=   1.6s
[CV] END colsample_bytree=0.8, learning_rate=0.2, max_depth=4, n_estimators=500, subsample=0.8; total time=   0.6s
[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=6, n_estimators=300, subsample=0.6; total time=   1.1s
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=8, n_estimators=200,