# 인공지능과 기계학습 11주차 과제

신용카드 고객 데이터를 이용하여 연체 여부 예측하기

## 과제 내용
- 데이터 전처리 및 분할
- 앙상블 기반 모델 파이프라인 구성 및 교차검증
- RandomForest 기반 특징 중요도 분석
- XGBClassifier 하이퍼파라미터 튜닝 및 테스트셋 성능 평가


## 1. 데이터 로드 & 전처리

In [1]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import (VotingClassifier, BaggingClassifier, RandomForestClassifier,
                              AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import accuracy_score, classification_report

In [3]:
# 데이터 로드

data = pd.read_csv('/Users/dankim/AIML/UCI_Credit_Card.csv')

In [4]:
# 데이터 전처리

data = data.drop(columns=['ID']) # ID 컬럼 제거

# 특징과 타겟 분리 (타겟은 `default.payment.next.month`)
X = data.drop(columns=['default.payment.next.month'])
y = data['default.payment.next.month']
print(X.info())
print('\n타겟 분포:\n', y.value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 23 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   LIMIT_BAL  30000 non-null  float64
 1   SEX        30000 non-null  int64  
 2   EDUCATION  30000 non-null  int64  
 3   MARRIAGE   30000 non-null  int64  
 4   AGE        30000 non-null  int64  
 5   PAY_0      30000 non-null  int64  
 6   PAY_2      30000 non-null  int64  
 7   PAY_3      30000 non-null  int64  
 8   PAY_4      30000 non-null  int64  
 9   PAY_5      30000 non-null  int64  
 10  PAY_6      30000 non-null  int64  
 11  BILL_AMT1  30000 non-null  float64
 12  BILL_AMT2  30000 non-null  float64
 13  BILL_AMT3  30000 non-null  float64
 14  BILL_AMT4  30000 non-null  float64
 15  BILL_AMT5  30000 non-null  float64
 16  BILL_AMT6  30000 non-null  float64
 17  PAY_AMT1   30000 non-null  float64
 18  PAY_AMT2   30000 non-null  float64
 19  PAY_AMT3   30000 non-null  float64
 20  PAY_AM

## 2. 데이터 분할

In [5]:
# train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

## 3. 파이프라인 구성

In [6]:
# 범주형 / 수치형 변수 분리
categorical_features = ['SEX', 'EDUCATION', 'MARRIAGE']
numeric_features = [col for col in X.columns if col not in categorical_features]

# 전처리 파이프라인 구성
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

# 기본 하이퍼파라미터 모델로 구성
models = {
    'Voting': VotingClassifier([
        ('lr', LogisticRegression(max_iter=1000)),
        ('rf', RandomForestClassifier()),
        ('gb', GradientBoostingClassifier())
    ], voting='soft'),

    'Bagging': BaggingClassifier(DecisionTreeClassifier()),

    'RandomForest': RandomForestClassifier(),

    'AdaBoost': AdaBoostClassifier(DecisionTreeClassifier()),

    'GradientBoosting': GradientBoostingClassifier(),

    'XGBoost': XGBClassifier(),  # 하이퍼파라미터 없이 기본 모델

    'Stacking': StackingClassifier(
        estimators=[
            ('ada', AdaBoostClassifier()),
            ('rf', RandomForestClassifier()),
            ('xgb', XGBClassifier())
        ],
        final_estimator=LogisticRegression(max_iter=1000)
    )
}

# 교차검증으로 성능 평가
for name, model in models.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')
    print(f'{name} 평균 정확도: {np.mean(scores):.4f}')

Voting 평균 정확도: 0.8199
Bagging 평균 정확도: 0.8034
RandomForest 평균 정확도: 0.8155
AdaBoost 평균 정확도: 0.7648
GradientBoosting 평균 정확도: 0.8208
XGBoost 평균 정확도: 0.8117
Stacking 평균 정확도: 0.8208


## 4. 특징 중요도 확인

In [7]:
# 특징 중요도 확인 (RandomForestClassifier)
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])
rf_pipeline.fit(X_train, y_train)
feature_names_cat = rf_pipeline.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(categorical_features)
all_features = np.concatenate([numeric_features, feature_names_cat])
importances = rf_pipeline.named_steps['classifier'].feature_importances_

feature_importances = pd.DataFrame({'Feature': all_features, 'Importance': importances})
print(feature_importances.sort_values(by='Importance', ascending=False).head(10))

      Feature  Importance
2       PAY_0    0.095963
1         AGE    0.064626
0   LIMIT_BAL    0.058387
8   BILL_AMT1    0.057245
9   BILL_AMT2    0.052433
10  BILL_AMT3    0.050371
14   PAY_AMT1    0.049938
13  BILL_AMT6    0.049040
11  BILL_AMT4    0.048856
12  BILL_AMT5    0.048590


## 5. 하이퍼파라미터 튜닝

In [8]:
# XGBClassifier 하이퍼파라미터 튜닝
xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(eval_metric='logloss'))
])
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [3, 5, 7, 9],
    'classifier__learning_rate': [0.001, 0.01, 0.1],
    'classifier__colsample_bytree': [0.6, 0.8, 1.0]
}
random_search = RandomizedSearchCV( # 각 모델의 성능을 탐색
    xgb_pipeline, param_distributions=param_grid, n_iter=50,
    scoring='accuracy', cv=5, verbose=1, random_state=42, n_jobs=-1
)
random_search.fit(X_train, y_train)
print('최적 하이퍼파라미터:', random_search.best_params_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
최적 하이퍼파라미터: {'classifier__n_estimators': 200, 'classifier__max_depth': 3, 'classifier__learning_rate': 0.1, 'classifier__colsample_bytree': 0.8}


## 6. 최적 모델 평가

In [9]:
# 최종 성능 평가 (테스트 세트에 대한)
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
print('test 정확도:', accuracy_score(y_test, y_pred))
print('\n분류 성능 리포트:\n', classification_report(y_test, y_pred))

test 정확도: 0.8198888888888889

분류 성능 리포트:
               precision    recall  f1-score   support

           0       0.84      0.95      0.89      7040
           1       0.66      0.35      0.46      1960

    accuracy                           0.82      9000
   macro avg       0.75      0.65      0.68      9000
weighted avg       0.80      0.82      0.80      9000

