<a href="https://colab.research.google.com/github/erika0915/pattern-recognition/blob/main/05_15_%ED%95%99%EC%8A%B5%EB%AA%A8%EB%8D%B8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split

from google.colab import drive
drive.mount('/content/drive')

X_train_final = pd.read_csv('/content/drive/MyDrive/패턴인식/X_train_final.csv')
y_train_final = pd.read_csv('/content/drive/MyDrive/패턴인식/y_train_final.csv')
X_val = pd.read_csv('/content/drive/MyDrive/패턴인식/X_val.csv')
y_val = pd.read_csv('/content/drive/MyDrive/패턴인식/y_val.csv')
X_test_scaled = pd.read_csv('/content/drive/MyDrive/패턴인식/X_test_scaled.csv')

# y를 Series로 변환
y_train_final = y_train_final.squeeze()
y_val = y_val.squeeze()

print("데이터 불러오기 완료!")

Mounted at /content/drive
데이터 불러오기 완료!


# 튜닝 전 모델

## 1. LightGDM


In [None]:
# LightGBM 모델 생성 및 학습
lgbm_model = LGBMClassifier(
    random_state=42,
    n_estimators=100,
    learning_rate=0.1
)
lgbm_model.fit(X_train_final, y_train_final)

# 검증 데이터 예측 및 평가
y_val_pred_lgbm = lgbm_model.predict(X_val)
y_val_proba_lgbm = lgbm_model.predict_proba(X_val)[:, 1]

val_acc_lgbm = accuracy_score(y_val, y_val_pred_lgbm)
val_f1_lgbm = f1_score(y_val, y_val_pred_lgbm)
val_auc_lgbm = roc_auc_score(y_val, y_val_proba_lgbm)
val_avg_lgbm = (val_acc_lgbm + val_f1_lgbm + val_auc_lgbm) / 3

print("\n LightGBM 검증 성능")
print(f" Accuracy: {val_acc_lgbm:.4f}")
print(f" F1 Score: {val_f1_lgbm:.4f}")
print(f" AUC: {val_auc_lgbm:.4f}")
print(f" 최종 평균 점수: {val_avg_lgbm:.4f}")

# 테스트 데이터 예측
test_preds = lgbm_model.predict(X_test_scaled)
unique, counts = np.unique(test_preds, return_counts=True)
pred_counts = dict(zip(unique, counts))

print("\n Test 데이터 예측 결과:")
print(f"Class 0 개수: {pred_counts.get(0, 0)}")
print(f"Class 1 개수: {pred_counts.get(1, 0)}")

[LightGBM] [Info] Number of positive: 8803, number of negative: 8957
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006236 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8344
[LightGBM] [Info] Number of data points in the train set: 17760, number of used features: 55
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495664 -> initscore=-0.017343
[LightGBM] [Info] Start training from score -0.017343

 LightGBM 검증 성능
 Accuracy: 0.6610
 F1 Score: 0.6577
 AUC: 0.7238
 최종 평균 점수: 0.6808

 Test 데이터 예측 결과:
Class 0 개수: 4881
Class 1 개수: 4634


## 2. 랜덤포레스트

In [None]:
# RandomForest 모델 생성 및 학습
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=20,
    random_state=42,
    n_jobs=-1
)
rf_model.fit(X_train_final, y_train_final)

# 검증 데이터 예측 및 평가
y_val_pred_rf = rf_model.predict(X_val)
y_val_proba_rf = rf_model.predict_proba(X_val)[:, 1]

val_acc_rf = accuracy_score(y_val, y_val_pred_rf)
val_f1_rf = f1_score(y_val, y_val_pred_rf)
val_auc_rf = roc_auc_score(y_val, y_val_proba_rf)
val_avg_rf = (val_acc_rf + val_f1_rf + val_auc_rf) / 3

print("\n RandomForest 검증 성능")
print(f" Accuracy: {val_acc_rf:.4f}")
print(f" F1 Score: {val_f1_rf:.4f}")
print(f" AUC: {val_auc_rf:.4f}")
print(f" 최종 평균 점수: {val_avg_rf:.4f}")

# 테스트 데이터 예측
test_preds_rf = rf_model.predict(X_test_scaled)
unique_rf, counts_rf = np.unique(test_preds_rf, return_counts=True)
pred_counts_rf = dict(zip(unique_rf, counts_rf))

print("\n Test 데이터 예측 결과:")
print(f"Class 0 개수: {pred_counts_rf.get(0, 0)}")
print(f"Class 1 개수: {pred_counts_rf.get(1, 0)}")


 RandomForest 검증 성능
 Accuracy: 0.6637
 F1 Score: 0.6647
 AUC: 0.7218
 최종 평균 점수: 0.6834

 Test 데이터 예측 결과:
Class 0 개수: 4773
Class 1 개수: 4742


# 튜닝 모델
1. LightGBM 튜닝
- `n_estimators`, `learning_rate`, `max_depth`, `num_leaves`, `min_child_samples`, `subsample` 6개 하이퍼파라미터를 랜덤하게 조합해서 실험
- RandomizedSearchCV를 사용해서 20번(=n_iter=20) 조합 시도함
- 3-fold 교차검증으로 각각의 조합 성능 평가 (ROC AUC 기준)
- 제일 좋은 조합(best_params_)을 가진 best_lgbm 모델을 얻음.

2. RandomForest
- 튜닝 없이 기본 모델을 사용함

3. 평가
</br>
각각 best_lgbm, 기본 RandomRorest 모델을 검증 데이터(X_val, y_val)에서 평가
Accuracy, F1 score, AUC를 계산하고, 이 세 개를 평균(Avg) 내서 최종 점수를 출력함.

In [3]:
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# LightGBM 튜닝할 파라미터 범위 정의
lgbm_params = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [5, 7, 9, 12],
    'num_leaves': [20, 31, 50],
    'min_child_samples': [10, 20, 30],
    'subsample': [0.6, 0.8, 1.0],
}

# LightGBM RandomizedSearchCV 설정
lgbm_random_search = RandomizedSearchCV(
    LGBMClassifier(random_state=42),
    param_distributions=lgbm_params,
    n_iter=20,
    scoring='roc_auc',
    cv=3,
    random_state=42,
    verbose=2,
    n_jobs=-1
)

# LightGBM 튜닝 및 최적 모델 저장
lgbm_random_search.fit(X_train_final, y_train_final)
best_lgbm = lgbm_random_search.best_estimator_
print("Best Parameters (LightGBM):", lgbm_random_search.best_params_)

# RandomForest 기본 모델 사용
best_rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=20,
    random_state=42,
    n_jobs=-1
)
best_rf.fit(X_train_final, y_train_final)

# 최적 모델 평가 함수 정의
def evaluate(model, X_val, y_val, model_name="Model"):
    y_pred = model.predict(X_val)
    y_proba = model.predict_proba(X_val)[:, 1]

    acc = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    auc = roc_auc_score(y_val, y_proba)
    avg = (acc + f1 + auc) / 3

    print(f"\n{model_name} 검증 성능")
    print(f" Accuracy: {acc:.4f}")
    print(f" F1 Score: {f1:.4f}")
    print(f" AUC: {auc:.4f}")
    print(f" 최종 평균 점수: {avg:.4f}")

# LightGBM 평가
evaluate(best_lgbm, X_val, y_val, model_name="최적 LGBM")

# RandomForest 평가
evaluate(best_rf, X_val, y_val, model_name="기본 RandomForest")


Fitting 3 folds for each of 20 candidates, totalling 60 fits
[LightGBM] [Info] Number of positive: 8803, number of negative: 8957
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018779 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8344
[LightGBM] [Info] Number of data points in the train set: 17760, number of used features: 55
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495664 -> initscore=-0.017343
[LightGBM] [Info] Start training from score -0.017343
Best Parameters (LightGBM): {'subsample': 0.8, 'num_leaves': 20, 'n_estimators': 300, 'min_child_samples': 30, 'max_depth': 7, 'learning_rate': 0.05}

최적 LGBM 검증 성능
 Accuracy: 0.6617
 F1 Score: 0.6561
 AUC: 0.7240
 최종 평균 점수: 0.6806

기본 RandomForest 검증 성능
 Accuracy: 0.6637
 F1 Score: 0.6647
 AUC: 0.7218
 최종 평균 점수: 0.6834


# Soft Voting

In [5]:
from sklearn.ensemble import VotingClassifier

# Soft Voting 앙상블 모델 생성
voting_clf = VotingClassifier(
    estimators=[('lgbm', best_lgbm), ('rf', best_rf)],
    voting='soft',  # 확률(Soft) Voting
    n_jobs=-1
)

# 앙상블 모델 학습
voting_clf.fit(X_train_final, y_train_final)

# 검증 데이터 예측
y_val_proba_voting = voting_clf.predict_proba(X_val)[:, 1]
y_val_pred_voting = (y_val_proba_voting >= 0.5).astype(int)  # 기본 임계값 0.5

# 검증 데이터 성능 평가
voting_acc = accuracy_score(y_val, y_val_pred_voting)
voting_f1 = f1_score(y_val, y_val_pred_voting)
voting_auc = roc_auc_score(y_val, y_val_proba_voting)
voting_avg = (voting_acc + voting_f1 + voting_auc) / 3

print("\n Soft Voting 앙상블 검증 성능")
print(f" Accuracy: {voting_acc:.4f}")
print(f" F1 Score: {voting_f1:.4f}")
print(f" AUC: {voting_auc:.4f}")
print(f" 최종 평균 점수: {voting_avg:.4f}")

# 테스트 데이터 예측
test_proba_voting = voting_clf.predict_proba(X_test_scaled)[:, 1]
test_pred_voting = (test_proba_voting >= 0.5).astype(int)

# Test 데이터 예측 분포 확인
unique_test, counts_test = np.unique(test_pred_voting, return_counts=True)
pred_counts_test = dict(zip(unique_test, counts_test))

print("\n Test 데이터 예측 결과 (Soft Voting)")
print(f"Class 0 개수: {pred_counts_test.get(0, 0)}")
print(f"Class 1 개수: {pred_counts_test.get(1, 0)}")



 Soft Voting 앙상블 검증 성능
 Accuracy: 0.6696
 F1 Score: 0.6668
 AUC: 0.7281
 최종 평균 점수: 0.6882

 Test 데이터 예측 결과 (Soft Voting)
Class 0 개수: 4830
Class 1 개수: 4685


# Threshold 튜닝 (최적 임계값 찾기)


In [6]:
from sklearn.metrics import precision_recall_curve

# Threshold 튜닝 함수 정의
def find_best_threshold(y_true, y_probs):
    precisions, recalls, thresholds = precision_recall_curve(y_true, y_probs)
    f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-6)  # 작은 수 더해서 0으로 나누기 방지
    best_idx = np.argmax(f1_scores)
    best_threshold = thresholds[best_idx]
    best_f1 = f1_scores[best_idx]
    print(f"\n Best Threshold 찾기 완료")
    print(f" 최적 Threshold: {best_threshold:.4f}")
    print(f" 최적 F1 Score: {best_f1:.4f}")
    return best_threshold

# Voting 모델의 검증 데이터 확률 기반으로 최적 Threshold 찾기
best_threshold = find_best_threshold(y_val, y_val_proba_voting)

# 최적 Threshold를 적용해서 재예측
y_val_pred_voting_best = (y_val_proba_voting >= best_threshold).astype(int)

# 최적 Threshold 적용 후 성능 재평가
voting_acc_best = accuracy_score(y_val, y_val_pred_voting_best)
voting_f1_best = f1_score(y_val, y_val_pred_voting_best)
voting_auc_best = roc_auc_score(y_val, y_val_proba_voting)
voting_avg_best = (voting_acc_best + voting_f1_best + voting_auc_best) / 3

print("\n 최적 Threshold 적용 Soft Voting 검증 성능")
print(f" Accuracy: {voting_acc_best:.4f}")
print(f" F1 Score: {voting_f1_best:.4f}")
print(f" AUC: {voting_auc_best:.4f}")
print(f" 최종 평균 점수: {voting_avg_best:.4f}")

# 테스트 데이터 예측 - 최적 threshold 사용
test_pred_voting_best = (test_proba_voting >= best_threshold).astype(int)

# Test 데이터 예측 분포 확인
unique_test_best, counts_test_best = np.unique(test_pred_voting_best, return_counts=True)
pred_counts_test_best = dict(zip(unique_test_best, counts_test_best))

print("\n Test 데이터 예측 결과 (최적 Threshold 적용)")
print(f"Class 0 개수: {pred_counts_test_best.get(0, 0)}")
print(f"Class 1 개수: {pred_counts_test_best.get(1, 0)}")


 Best Threshold 찾기 완료
 최적 Threshold: 0.3192
 최적 F1 Score: 0.6956

 최적 Threshold 적용 Soft Voting 검증 성능
 Accuracy: 0.5993
 F1 Score: 0.6956
 AUC: 0.7281
 최종 평균 점수: 0.6744

 Test 데이터 예측 결과 (최적 Threshold 적용)
Class 0 개수: 1781
Class 1 개수: 7734
