- 1차 모델 : 7가지 결함 분류
- RandomOverSampler로 소수 데이터 증강
- XGB 모델 구축

# Library

In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# EDA
from sklearn.model_selection import train_test_split

# Model
from xgboost import XGBClassifier

# Score
from sklearn.metrics import classification_report

# Class Imbalance import
from imblearn.over_sampling import RandomOverSampler

# Hyperparameter Tuning
from sklearn.model_selection import RandomizedSearchCV
import tqdm

In [24]:
# Pin NumPy's global random state so NumPy-driven randomness is repeatable.
seed = 1
np.random.seed(seed=seed)

# Load Data

In [25]:
# Read the raw dataset from disk; the trailing expression displays its
# (rows, columns) shape as the cell output.
data_path = 'data/multi_classification_data.csv'
df_org = pd.read_csv(data_path)
df_org.shape

(1941, 34)

In [26]:
# Drop the 13 columns located at positional indices 14 through 26
# (0-based) of the raw frame.
columns_to_drop = list(range(14, 27))
df = df_org.drop(columns=df_org.columns[columns_to_drop])

# Append the Empty_Index column taken from the raw frame as the last column
# (presumably it is one of the columns dropped above - confirm).
df = pd.concat([df, df_org[['Empty_Index']]], axis='columns')

In [27]:
encoding_list = ['TypeOfSteel_A300', 'TypeOfSteel_A400']

# Collapse the two indicator columns into one integer code: for every row,
# take the name of the column holding the row maximum and replace it with
# that name's position in encoding_list (A300 -> 0, A400 -> 1).
steel_column_per_row = df[encoding_list].idxmax(axis=1)
df['Type_of_Steel'] = steel_column_per_row.apply(encoding_list.index)

# The source indicator columns are no longer needed once encoded.
df = df.drop(columns=encoding_list)

# Show the encoded column.
print(df['Type_of_Steel'])

0       0
1       0
2       0
3       1
4       1
       ..
1936    1
1937    1
1938    1
1939    1
1940    0
Name: Type_of_Steel, Length: 1941, dtype: int64


In [28]:
# Encode the fault-type indicator columns into a single integer label.
# The label of a row is the position (0-6) in target_list of the column
# holding that row's maximum value.
target_list = [
    'Pastry',
    'Z_Scratch',
    'K_Scatch',
    'Stains',
    'Dirtiness',
    'Bumps',
    'Other_Faults',
]
fault_column_per_row = df[target_list].idxmax(axis=1)
df['Target'] = fault_column_per_row.apply(target_list.index)

# Remove the indicator columns now that they are folded into 'Target'.
df = df.drop(columns=target_list)

In [29]:
# Shuffle all rows: sampling 100% of the frame without replacement returns
# every row in a random order (fixed by random_state for repeatability).
df = df.sample(
    frac=1,
    random_state=42,
)
print(df['Target'])

1605    6
1502    6
70      0
976     5
1052    5
       ..
1130    5
1294    6
860     4
1459    6
1126    5
Name: Target, Length: 1941, dtype: int64


### 데이터셋 분리 (train / val / test)

In [30]:
# Separate the label column from the feature columns.

target = 'Target'
y = df[target]
x = df.drop(columns=[target])

print(f'x shape : {x.shape}')
print(f'y shape : {y.shape}')

x shape : (1941, 14)
y shape : (1941,)


In [31]:
# Split the data into train / validation / test sets.
# Step 1 holds out 20% of all rows as the test set; step 2 holds out 20% of
# the remainder as the validation set.
# NOTE(review): neither split passes stratify=, so class proportions may
# differ between the subsets - consider stratifying on y.

x_rest, x_test, y_rest, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1
)
x_train, x_val, y_train, y_val = train_test_split(
    x_rest, y_rest, test_size=0.2, random_state=1
)

print(f'train data : x{x_train.shape}, y{y_train.shape}')
print(f'val data : x{x_val.shape}, y{y_val.shape}')
print(f'test data : x{x_test.shape}, y{y_test.shape}')

train data : x(1241, 14), y(1241,)
val data : x(311, 14), y(311,)
test data : x(389, 14), y(389,)


# Modeling

In [48]:
# Baseline: an XGBoost classifier with library-default hyperparameters,
# trained on the (not yet resampled) training split.
xgb = XGBClassifier()
xgb.fit(X=x_train, y=y_train)

In [49]:
# Per-class precision / recall / F1 of the baseline model on the validation set.
y_val_pred = xgb.predict(x_val)
val_report = classification_report(y_true=y_val, y_pred=y_val_pred)
print(val_report)

              precision    recall  f1-score   support

           0       0.82      0.50      0.62        28
           1       0.91      0.91      0.91        22
           2       1.00      0.95      0.97        58
           3       1.00      1.00      1.00        12
           4       1.00      0.88      0.93         8
           5       0.84      0.75      0.79        75
           6       0.73      0.89      0.80       108

    accuracy                           0.84       311
   macro avg       0.90      0.84      0.86       311
weighted avg       0.85      0.84      0.83       311



## Class Imbalance 해결

In [34]:
# Resample the training split only (validation and test stay untouched) with
# RandomOverSampler, then refit the same classifier object on the result.
# The refit replaces the baseline fit held by `xgb`.
o_sampler = RandomOverSampler(random_state=1)
resampled = o_sampler.fit_resample(x_train, y_train)
x_train_o, y_train_o = resampled
xgb.fit(X=x_train_o, y=y_train_o)

In [35]:
# Validation-set report for the model refit on the oversampled training data.
y_val_pred = xgb.predict(x_val)
val_report = classification_report(y_true=y_val, y_pred=y_val_pred)
print(val_report)

              precision    recall  f1-score   support

           0       0.87      0.46      0.60        28
           1       0.91      0.91      0.91        22
           2       1.00      0.95      0.97        58
           3       1.00      1.00      1.00        12
           4       0.89      1.00      0.94         8
           5       0.80      0.79      0.79        75
           6       0.77      0.88      0.82       108

    accuracy                           0.84       311
   macro avg       0.89      0.86      0.86       311
weighted avg       0.85      0.84      0.84       311



- sampling 후에도 전체 accuracy는 0.84로 변화 없음
- 다만 macro recall이 0.84 → 0.86으로 개선됨 (precision은 클래스별로 증감이 엇갈려 macro 기준 0.90 → 0.89)

# 하이퍼파라미터 튜닝

In [60]:
# Candidate values for each hyperparameter to be tuned.
param_dist = dict(
    learning_rate=np.linspace(0.01, 0.3, num=10),  # 10 evenly spaced values
    max_depth=np.arange(10, 50),                   # integers 10..49
    min_child_weight=np.arange(1, 6),              # integers 1..5
    n_estimators=np.arange(10, 201, 50),           # 10, 60, 110, 160
)

# Randomized search: 50 sampled configurations, 3-fold CV scored by accuracy,
# using all available cores.
# NOTE(review): the search is fitted on the already-oversampled training data,
# so rows duplicated by RandomOverSampler can land in both the training and
# the validation part of a fold, which may inflate the CV score. Consider
# resampling inside each fold (e.g. via imblearn.pipeline.Pipeline).
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=50,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(x_train_o, y_train_o)

# Report the best configuration found.
print("Best hyperparameters found: ", random_search.best_params_)


Best hyperparameters found:  {'n_estimators': 110, 'min_child_weight': 2, 'max_depth': 31, 'learning_rate': 0.20333333333333334}


In [62]:
# Validation-set report for the best estimator found by the randomized search.
best_model = random_search.best_estimator_
y_val_pred = best_model.predict(x_val)
val_report = classification_report(y_true=y_val, y_pred=y_val_pred)
print(val_report)

              precision    recall  f1-score   support

           0       0.79      0.54      0.64        28
           1       0.91      0.91      0.91        22
           2       1.00      0.95      0.97        58
           3       0.92      1.00      0.96        12
           4       0.89      1.00      0.94         8
           5       0.77      0.72      0.74        75
           6       0.76      0.87      0.81       108

    accuracy                           0.83       311
   macro avg       0.86      0.85      0.85       311
weighted avg       0.83      0.83      0.83       311



# 최종 모델 평가

In [63]:
# Final evaluation: report of the tuned model on the held-out test set.
y_test_pred = best_model.predict(x_test)
test_report = classification_report(y_true=y_test, y_pred=y_test_pred)
print(test_report)

              precision    recall  f1-score   support

           0       0.60      0.56      0.58        27
           1       0.90      0.98      0.94        46
           2       0.96      0.98      0.97        89
           3       0.92      1.00      0.96        11
           4       0.80      0.73      0.76        11
           5       0.69      0.59      0.64        81
           6       0.70      0.74      0.72       124

    accuracy                           0.79       389
   macro avg       0.79      0.80      0.79       389
weighted avg       0.78      0.79      0.78       389



# 모델 저장

In [64]:
import joblib

# Persist the tuned estimator (`best_model`), i.e. the model whose validation
# and test reports are printed in the preceding cells. The previous code
# pickled `xgb`, the untuned classifier, so the saved artifact did not match
# the model that was actually evaluated as the final one.
joblib.dump(best_model, 'steelplate_model1.pkl')

['steelplate_model1.pkl']