- Trial 15, 16 모델 합치기

In [80]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# EDA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.decomposition import PCA

# Model
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, recall_score, precision_score, f1_score

# Class Imbalance import
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE

# Hyperparameter Tuning
from sklearn.model_selection import RandomizedSearchCV
from tqdm import tqdm

In [81]:
# Seed 고정
seed = 1
np.random.seed(seed)

In [82]:
df_org = pd.read_csv('data/mulit_classification_data.csv')
df_org.shape

(1941, 34)

In [83]:
# 14번째부터 27번째 컬럼 삭제
columns_to_drop = [i for i in range(14, 27)]
df = df_org.drop(columns=df_org.columns[columns_to_drop])
df

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,42,50,270900,270944,267,17,44,24220,76,108,...,1,0,80,1,0,0,0,0,0,0
1,645,651,2538079,2538108,108,10,30,11397,84,123,...,1,0,80,1,0,0,0,0,0,0
2,829,835,1553913,1553931,71,8,19,7972,99,125,...,1,0,100,1,0,0,0,0,0,0
3,853,860,369370,369415,176,13,45,18996,99,126,...,0,1,290,1,0,0,0,0,0,0
4,1289,1306,498078,498335,2409,60,260,246930,37,126,...,0,1,185,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1936,249,277,325780,325796,273,54,22,35033,119,141,...,0,1,40,0,0,0,0,0,0,1
1937,144,175,340581,340598,287,44,24,34599,112,133,...,0,1,40,0,0,0,0,0,0,1
1938,145,174,386779,386794,292,40,22,37572,120,140,...,0,1,40,0,0,0,0,0,0,1
1939,137,170,422497,422528,419,97,47,52715,117,140,...,0,1,40,0,0,0,0,0,0,1


In [84]:
df = pd.concat([df, df_org[['Empty_Index']]], axis=1)
df

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,TypeOfSteel_A400,Steel_Plate_Thickness,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults,Empty_Index
0,42,50,270900,270944,267,17,44,24220,76,108,...,0,80,1,0,0,0,0,0,0,0.2415
1,645,651,2538079,2538108,108,10,30,11397,84,123,...,0,80,1,0,0,0,0,0,0,0.3793
2,829,835,1553913,1553931,71,8,19,7972,99,125,...,0,100,1,0,0,0,0,0,0,0.3426
3,853,860,369370,369415,176,13,45,18996,99,126,...,1,290,1,0,0,0,0,0,0,0.4413
4,1289,1306,498078,498335,2409,60,260,246930,37,126,...,1,185,1,0,0,0,0,0,0,0.4486
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1936,249,277,325780,325796,273,54,22,35033,119,141,...,1,40,0,0,0,0,0,0,1,0.3906
1937,144,175,340581,340598,287,44,24,34599,112,133,...,1,40,0,0,0,0,0,0,1,0.4554
1938,145,174,386779,386794,292,40,22,37572,120,140,...,1,40,0,0,0,0,0,0,1,0.3287
1939,137,170,422497,422528,419,97,47,52715,117,140,...,1,40,0,0,0,0,0,0,1,0.5904


In [85]:
encoding_list = ['TypeOfSteel_A300', 'TypeOfSteel_A400']

# 오디날 인코딩 수행
df['Type_of_Steel'] = df[encoding_list].idxmax(axis=1).apply(lambda x: encoding_list.index(x))

# 오디날 인코딩된 컬럼 삭제
df = df.drop(encoding_list, axis=1)

# 결과 출력
print(df['Type_of_Steel'])

0       0
1       0
2       0
3       1
4       1
       ..
1936    1
1937    1
1938    1
1939    1
1940    0
Name: Type_of_Steel, Length: 1941, dtype: int64


In [86]:
target_list = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

# 오디날 인코딩 수행
df['Target'] = df[target_list].idxmax(axis=1).apply(lambda x: target_list.index(x))

# 오디날 인코딩된 컬럼 삭제
df = df.drop(target_list, axis=1)

# 결과 출력
print(df['Target'])

0       0
1       0
2       0
3       0
4       0
       ..
1936    6
1937    6
1938    6
1939    6
1940    6
Name: Target, Length: 1941, dtype: int64


In [87]:
# 행들을 뒤죽박죽으로 섞기
df = df.sample(frac=1, random_state=42)
print(df['Target'])

1605    6
1502    6
70      0
976     5
1052    5
       ..
1130    5
1294    6
860     4
1459    6
1126    5
Name: Target, Length: 1941, dtype: int64


### 데이터셋 분리 (train / val / test)

In [88]:
# target / feature 분리

target = 'Target'
x = df.drop(target, axis=1)
y = df[target]

print(f'x shape : {x.shape}')
print(f'y shape : {y.shape}')

x shape : (1941, 14)
y shape : (1941,)


In [89]:
## 데이터셋 분리 (train / val / test)# train / val / test 분리

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=1)

print(f'train data : x{x_train.shape}, y{y_train.shape}')
print(f'val data : x{x_val.shape}, y{y_val.shape}')
print(f'test data : x{x_test.shape}, y{y_test.shape}')

train data : x(1241, 14), y(1241,)
val data : x(311, 14), y(311,)
test data : x(389, 14), y(389,)


## Modeling (1차)

In [90]:
o_sampler = RandomOverSampler(random_state=1)
x_train_o, y_train_o = o_sampler.fit_resample(x_train, y_train)

In [91]:
xgb = XGBClassifier()
xgb.fit(x_train_o, y_train_o)

In [92]:
# 컨퓨전 리포트 출력
y_val_pred1 = xgb.predict(x_val)
report1 = classification_report(y_val, y_val_pred1)
print(report1)

              precision    recall  f1-score   support

           0       0.87      0.46      0.60        28
           1       0.91      0.91      0.91        22
           2       1.00      0.95      0.97        58
           3       1.00      1.00      1.00        12
           4       0.89      1.00      0.94         8
           5       0.80      0.79      0.79        75
           6       0.77      0.88      0.82       108

    accuracy                           0.84       311
   macro avg       0.89      0.86      0.86       311
weighted avg       0.85      0.84      0.84       311



In [96]:
# x_val.index와 y_val_pred1을 하나의 데이터프레임으로 만들어주기
result_df1 = pd.concat([pd.DataFrame(y_val_pred1, columns=['Pred'])], axis=1)
result_df1 = result_df1.set_index(x_val.index)

print(result_df1)

      Pred
1539     6
656      2
1408     6
69       5
1038     5
...    ...
1464     5
1721     6
768      3
534      2
163      1

[311 rows x 1 columns]


# Modeling 2차

In [97]:
# 'Target' 컬럼이 5 또는 6인 행만 추출
df_56 = df[(df['Target'] == 5) | (df['Target'] == 6)]

# 'Empty_Index' 컬럼 드롭하기
df_56 = df_56.drop('Empty_Index', axis=1)
df_56

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,Steel_Plate_Thickness,Type_of_Steel,Target
1605,1117,1154,1194359,1194414,1055,69,56,110262,72,125,1687,80,0,6
1502,1572,1610,337559,337645,1570,176,128,128770,29,111,1692,300,1,6
976,1317,1324,2166062,2166078,78,12,16,9381,110,132,1352,40,1,5
1052,386,394,1304617,1304626,51,10,9,5795,100,133,1624,70,0,5
1202,227,256,1593586,1593597,126,51,36,15951,112,148,1368,80,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,355,378,2820561,2820612,631,46,51,68349,91,133,1364,40,1,5
1130,1192,1205,521912,521925,101,22,13,11601,108,125,1356,40,1,5
1294,14,24,149048,149070,150,12,22,17566,99,140,1687,200,1,6
1459,220,229,3094414,3094424,54,10,10,6091,104,124,1692,70,1,6


In [98]:
target = 'Target'

x_56 = df_56.drop(target, axis=1)
y_56 = df_56[target]

x_train_56, x_test_56, y_train_56, y_test_56 = train_test_split(x_56, y_56, test_size=0.1, random_state=1)
x_train_56, x_val_56, y_train_56, y_val_56 = train_test_split(x_train_56, y_train_56, test_size=0.1, random_state=1)

print(f'train data : x{x_train_56.shape}, y{y_train_56.shape}')
print(f'val data : x{x_val_56.shape}, y{y_val_56.shape}')
print(f'test data : x{x_test_56.shape}, y{y_test_56.shape}')

train data : x(870, 13), y(870,)
val data : x(97, 13), y(97,)
test data : x(108, 13), y(108,)


In [99]:
from imblearn.combine import SMOTETomek

x_train_tomek, y_train_tomek = SMOTETomek().fit_resample(x_train_56, y_train_56) #데이터 증강
print(x_train_tomek.shape, y_train_tomek.shape)

(848, 13) (848,)


In [100]:
# RandomForestClassifier
randomforest = RandomForestClassifier()
randomforest.fit(x_train_tomek, y_train_tomek)

In [101]:
# 컨퓨전 리포트 출력
y_val_pred2 = randomforest.predict(x_val_56)
report2 = classification_report(y_val_56, y_val_pred2)
print(report2)

              precision    recall  f1-score   support

           5       0.65      0.59      0.62        37
           6       0.76      0.80      0.78        60

    accuracy                           0.72        97
   macro avg       0.70      0.70      0.70        97
weighted avg       0.72      0.72      0.72        97



# 모델 합치기

In [102]:
x_val['Pred'] = y_val_pred1
x_val

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,Steel_Plate_Thickness,Empty_Index,Type_of_Steel,Pred
1539,507,521,835980,835993,121,18,17,15526,119,141,1360,180,0.3352,1,6
656,39,216,2201440,2201499,5933,312,167,614522,38,125,1356,40,0.4319,1,2
1408,1068,1077,188817,188826,60,12,11,6735,99,127,1687,70,0.2593,0,6
69,1610,1618,1944129,1944138,56,12,9,4674,71,101,1656,100,0.2222,0,5
1038,1474,1489,1248833,1248844,114,18,12,13680,101,141,1694,60,0.3091,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1464,642,658,96252,96269,194,17,17,21009,77,134,1694,80,0.2868,0,5
1721,1370,1379,980597,980617,113,13,20,11635,94,116,1387,40,0.3722,1,6
768,805,813,273908,273912,16,8,4,2121,114,148,1358,50,0.5000,1,3
534,129,157,86408,86427,276,39,26,33858,115,135,1362,40,0.4812,1,2


In [103]:
# 모델의 예측 결과가 5 또는 6인 인덱스 추출
selected_rows = x_val[(x_val['Pred'] == 5) | (x_val['Pred'] == 6)]
selected_rows

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,Steel_Plate_Thickness,Empty_Index,Type_of_Steel,Pred
1539,507,521,835980,835993,121,18,17,15526,119,141,1360,180,0.3352,1,6
1408,1068,1077,188817,188826,60,12,11,6735,99,127,1687,70,0.2593,0,6
69,1610,1618,1944129,1944138,56,12,9,4674,71,101,1656,100,0.2222,0,5
1038,1474,1489,1248833,1248844,114,18,12,13680,101,141,1694,60,0.3091,0,5
1576,0,21,881709,881856,1949,68,162,144252,56,99,1687,80,0.3686,0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,451,465,2550845,2551201,2705,213,359,197957,28,126,1364,80,0.4573,0,6
989,1287,1296,1695433,1695447,86,16,14,9848,100,133,1652,69,0.3175,0,5
1804,1057,1066,945549,945557,52,11,8,5035,81,126,1360,300,0.2778,0,6
1464,642,658,96252,96269,194,17,17,21009,77,134,1694,80,0.2868,0,5


In [104]:
selected_rows.drop(['Pred', 'Empty_Index'], axis=1, inplace=True)
selected_rows

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_rows.drop(['Pred', 'Empty_Index'], axis=1, inplace=True)


Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,Steel_Plate_Thickness,Type_of_Steel
1539,507,521,835980,835993,121,18,17,15526,119,141,1360,180,1
1408,1068,1077,188817,188826,60,12,11,6735,99,127,1687,70,0
69,1610,1618,1944129,1944138,56,12,9,4674,71,101,1656,100,0
1038,1474,1489,1248833,1248844,114,18,12,13680,101,141,1694,60,0
1576,0,21,881709,881856,1949,68,162,144252,56,99,1687,80,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,451,465,2550845,2551201,2705,213,359,197957,28,126,1364,80,0
989,1287,1296,1695433,1695447,86,16,14,9848,100,133,1652,69,0
1804,1057,1066,945549,945557,52,11,8,5035,81,126,1360,300,0
1464,642,658,96252,96269,194,17,17,21009,77,134,1694,80,0


In [105]:
selected_rows.index

Index([1539, 1408,   69, 1038, 1576, 1296, 1617, 1112,  985, 1281,
       ...
       1614, 1198, 1419, 1832, 1285,  140,  989, 1804, 1464, 1721],
      dtype='int64', length=198)

In [106]:
# 컨퓨전 리포트 출력
y_val_pred3 = randomforest.predict(selected_rows)
y_val_pred3

array([6, 6, 6, 5, 6, 6, 6, 5, 5, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 5,
       5, 6, 6, 6, 6, 6, 6, 6, 5, 5, 6, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6,
       5, 5, 6, 6, 6, 5, 6, 6, 5, 6, 6, 5, 6, 5, 5, 5, 6, 5, 6, 6, 5, 6,
       5, 6, 6, 5, 5, 6, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 5, 6, 5,
       6, 6, 5, 5, 5, 5, 5, 6, 6, 5, 5, 6, 5, 5, 6, 5, 6, 5, 6, 5, 5, 6,
       6, 5, 5, 6, 6, 6, 5, 5, 5, 6, 6, 5, 5, 6, 6, 5, 5, 6, 6, 5, 5, 6,
       6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 5, 5, 5, 6, 6, 6, 5, 5, 5, 6, 6, 6,
       5, 6, 5, 6, 5, 6, 5, 6, 6, 6, 5, 6, 6, 5, 5, 5, 5, 6, 6, 6, 5, 6,
       6, 5, 6, 5, 6, 5, 5, 5, 6, 6, 5, 5, 6, 5, 6, 6, 6, 6, 5, 6, 5, 6],
      dtype=int64)

In [112]:
result_df1

Unnamed: 0,Pred
1539,6
656,2
1408,6
69,5
1038,5
...,...
1464,5
1721,6
768,3
534,2


In [114]:
for idx, pred in zip(selected_rows.index, y_val_pred3) :
    result_df1.loc[idx, 'Pred'] = pred
result_df1['Pred']

1539    6
656     2
1408    6
69      6
1038    5
       ..
1464    5
1721    6
768     3
534     2
163     1
Name: Pred, Length: 311, dtype: int64

In [117]:
report_final = classification_report(y_val, result_df1['Pred'])
print(report_final)

              precision    recall  f1-score   support

           0       0.87      0.46      0.60        28
           1       0.91      0.91      0.91        22
           2       1.00      0.95      0.97        58
           3       1.00      1.00      1.00        12
           4       0.89      1.00      0.94         8
           5       0.82      0.89      0.85        75
           6       0.81      0.87      0.84       108

    accuracy                           0.86       311
   macro avg       0.90      0.87      0.87       311
weighted avg       0.87      0.86      0.86       311

