- 1, 2차 모델의 분류 결과 중 class 5, 6에 해당하는 데이터를 처리
- 3차 모델 : class 5 또는 class 6으로 분류
- sampling을 적용하면 오히려 성능이 떨어지므로 사용하지 않음
- RandomForest 모델 구축

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# EDA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.decomposition import PCA

# Model
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, recall_score, precision_score, f1_score

# Class Imbalance import
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTETomek

# Hyperparameter Tuning
from sklearn.model_selection import RandomizedSearchCV
from tqdm import tqdm

In [2]:
# Fix the seed of NumPy's global random generator for reproducibility.
seed: int = 1
np.random.seed(seed)

In [3]:
# Load the raw dataset from disk and show its (rows, columns) shape.
data_path = 'data/multi_classification_data.csv'
df_org = pd.read_csv(data_path)
df_org.shape

(1941, 34)

In [4]:
encoding_list = ['TypeOfSteel_A300', 'TypeOfSteel_A400']

# Collapse the two steel-type flag columns into a single integer code:
# for each row take the name of the column holding the largest value, then
# replace that name by its position in encoding_list (A300 -> 0, A400 -> 1).
steel_type_name = df_org[encoding_list].idxmax(axis=1)
df_org['Type_of_Steel'] = steel_type_name.apply(encoding_list.index)

# The original flag columns are no longer needed once they are encoded.
df_org = df_org.drop(columns=encoding_list)

# Show the encoded column.
print(df_org['Type_of_Steel'])

0       0
1       0
2       0
3       1
4       1
       ..
1936    1
1937    1
1938    1
1939    1
1940    0
Name: Type_of_Steel, Length: 1941, dtype: int64


In [5]:
target_list = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

# Collapse the per-fault flag columns into a single integer label:
# for each row take the name of the column holding the largest value, then
# replace that name by its position in target_list (0 .. 6).
fault_name = df_org[target_list].idxmax(axis=1)
df_org['Target'] = fault_name.apply(target_list.index)

# The original flag columns are no longer needed once they are encoded.
df_org = df_org.drop(columns=target_list)

# Show the encoded label column.
print(df_org['Target'])

0       0
1       0
2       0
3       0
4       0
       ..
1936    6
1937    6
1938    6
1939    6
1940    6
Name: Target, Length: 1941, dtype: int64


In [7]:
# Prepare the data without the index / log-derived feature columns.
# Unlike the first-stage model, 'Empty_Index' is not kept here because
# keeping it lowered performance.
#
# The columns are listed by name instead of by hard-coded position (12..24):
# the positions are only right after the two TypeOfSteel columns have been
# removed, so a name-based drop raises a KeyError instead of silently
# removing the wrong features when the column layout differs.
index_log_columns = [
    'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index',
    'Edges_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'LogOfAreas',
    'Log_X_Index', 'Log_Y_Index', 'Orientation_Index', 'Luminosity_Index',
    'SigmoidOfAreas',
]

# Positions of those columns in df_org, derived from the names (kept under
# the original variable name so later code relying on it still works).
columns_to_drop = [df_org.columns.get_loc(name) for name in index_log_columns]
print(df_org.columns[columns_to_drop])

# Drop the index / log columns.
df = df_org.drop(columns=index_log_columns)

# Keep only the rows whose 'Target' is 5 or 6.
df = df[df['Target'].isin([5, 6])]

Index(['Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index',
       'Edges_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'LogOfAreas',
       'Log_X_Index', 'Log_Y_Index', 'Orientation_Index', 'Luminosity_Index',
       'SigmoidOfAreas'],
      dtype='object')


In [8]:
# Shuffle all rows (sampling 100% of them without replacement, fixed seed).
df = df.sample(
    frac=1,
    random_state=42,
)

### 데이터셋 분리 (train / val / test)

In [9]:
# Separate the label column from the feature columns.
target = 'Target'
y = df[target]
x = df.drop(columns=target)

In [10]:
## Split the dataset (train / val / test)

# First hold out 10% of all rows as the test set ...
x_rest, x_test, y_rest, y_test = train_test_split(
    x, y, test_size=0.1, random_state=1
)
# ... then hold out 10% of the remainder as the validation set.
x_train, x_val, y_train, y_val = train_test_split(
    x_rest, y_rest, test_size=0.1, random_state=1
)

# Report the shape of every split.
for split_name, split_x, split_y in (
    ('train', x_train, y_train),
    ('val', x_val, y_val),
    ('test', x_test, y_test),
):
    print(f'{split_name} data : x{split_x.shape}, y{split_y.shape}')


train data : x(870, 13), y(870,)
val data : x(97, 13), y(97,)
test data : x(108, 13), y(108,)


# Modeling

In [13]:
# Baseline model: a random forest with default hyperparameters and a fixed
# random_state, fitted on the training split.
rf = RandomForestClassifier(random_state=1)
rf.fit(X=x_train, y=y_train)

In [14]:
# Score the baseline forest on the validation split.
y_val_pred = rf.predict(x_val)
val_report = classification_report(y_true=y_val, y_pred=y_val_pred)
print(val_report)

              precision    recall  f1-score   support

           5       0.80      0.76      0.78        37
           6       0.85      0.88      0.87        60

    accuracy                           0.84        97
   macro avg       0.83      0.82      0.82        97
weighted avg       0.83      0.84      0.83        97



# 하이퍼파라미터 튜닝

In [32]:
# Hyperparameters to tune and the candidate values for each of them.
param_dist = dict(
    n_estimators=np.arange(50, 201, 10),
    max_depth=np.arange(3, 15),
    min_samples_split=np.arange(2, 11),
    min_samples_leaf=np.arange(1, 11),
)

# Randomized search: 50 sampled candidates, 3-fold cross-validation on the
# training split, scored by accuracy, using all available cores.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=50,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(x_train, y_train)

# Print the best hyperparameter combination that was found.
print("Best hyperparameters found: ", random_search.best_params_)

Best hyperparameters found:  {'n_estimators': 140, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': 10}


In [33]:
# Score the tuned model (best estimator of the random search) on the
# validation split.
best_model = random_search.best_estimator_
y_val_pred = best_model.predict(x_val)
val_report = classification_report(y_true=y_val, y_pred=y_val_pred)
print(val_report)

              precision    recall  f1-score   support

           5       0.79      0.73      0.76        37
           6       0.84      0.88      0.86        60

    accuracy                           0.82        97
   macro avg       0.82      0.81      0.81        97
weighted avg       0.82      0.82      0.82        97



# 최종 모델 평가

In [None]:
# Score the tuned model on the held-out test split.
# NOTE(review): this evaluates `best_model`, while the model written to disk
# at the end of the notebook is `rf` — confirm the reported test score
# belongs to the model that is actually shipped.
y_test_pred = best_model.predict(x_test)
test_report = classification_report(y_true=y_test, y_pred=y_test_pred)
print(test_report)

# 모델 저장

In [34]:
import joblib

# Persist the fitted model to disk.
# NOTE(review): this saves `rf` (the untuned forest), whereas the final
# test-set evaluation uses `best_model` from the random search — confirm
# which of the two is meant to be shipped.
model_path = 'steelplate_model3.pkl'
joblib.dump(rf, model_path)

['steelplate_model3.pkl']