In [None]:
import os
import warnings
import random
from pprint import pprint

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from tqdm import tqdm

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import (
    RandomForestClassifier,
    BaggingClassifier,
    VotingClassifier,
    GradientBoostingClassifier,
)

from xgboost import XGBClassifier, plot_importance as plot_xgb_importance
from lightgbm import LGBMClassifier, plot_importance as plot_lgb_importance
from catboost import CatBoostClassifier

from sklearn.model_selection import (
    train_test_split,
    KFold,
    cross_val_score,
)
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    classification_report,
    roc_auc_score,
    log_loss,
)

from bayes_opt import BayesianOptimization

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
warnings.filterwarnings("ignore", category=FutureWarning)


### datawig로 증강한 데이터 부르기

In [None]:
train = pd.read_csv("datawig_train.csv")
test = pd.read_csv("datawig_test.csv")

In [None]:
# 비정상/정상 비율
sum(train['target'] == 'AbNormal')/ sum(train['target'] == 'Normal') 

0.0615892651221302

### 칼럼 정리

In [None]:
# 사용 안하는 칼럼

delete_col = ['Wip Line_Dam', 'Process Desc._Dam', 'Insp. Seq No._Dam', 'Insp Judge Code_Dam', 'CURE END POSITION X Unit Time_Dam', 'CURE END POSITION X Judge Value_Dam', 'CURE END POSITION Z Unit Time_Dam', 'CURE END POSITION Z Judge Value_Dam', 'CURE END POSITION Θ Unit Time_Dam', 'CURE END POSITION Θ Judge Value_Dam', 'CURE SPEED Unit Time_Dam', 'CURE SPEED Judge Value_Dam', 'CURE STANDBY POSITION X Collect Result_Dam', 'CURE STANDBY POSITION X Unit Time_Dam', 'CURE STANDBY POSITION X Judge Value_Dam', 'CURE STANDBY POSITION Z Collect Result_Dam', 'CURE STANDBY POSITION Z Unit Time_Dam', 'CURE STANDBY POSITION Z Judge Value_Dam', 'CURE STANDBY POSITION Θ Collect Result_Dam', 'CURE STANDBY POSITION Θ Unit Time_Dam', 'CURE STANDBY POSITION Θ Judge Value_Dam', 'CURE START POSITION X Unit Time_Dam', 'CURE START POSITION X Judge Value_Dam', 'CURE START POSITION Z Collect Result_Dam', 'CURE START POSITION Z Unit Time_Dam', 'CURE START POSITION Z Judge Value_Dam', 'CURE START POSITION Θ Unit Time_Dam', 'CURE START POSITION Θ Judge Value_Dam', 'DISCHARGED SPEED OF RESIN Unit Time_Dam', 'DISCHARGED SPEED OF RESIN Judge Value_Dam', 'DISCHARGED TIME OF RESIN(Stage1) Unit Time_Dam', 'DISCHARGED TIME OF RESIN(Stage1) Judge Value_Dam', 'DISCHARGED TIME OF RESIN(Stage2) Unit Time_Dam', 'DISCHARGED TIME OF RESIN(Stage2) Judge Value_Dam', 'DISCHARGED TIME OF RESIN(Stage3) Unit Time_Dam', 'DISCHARGED TIME OF RESIN(Stage3) Judge Value_Dam', 'Dispense Volume(Stage1) Unit Time_Dam', 'Dispense Volume(Stage1) Judge Value_Dam', 'Dispense Volume(Stage2) Unit Time_Dam', 'Dispense Volume(Stage2) Judge Value_Dam', 'Dispense Volume(Stage3) Unit Time_Dam', 'Dispense Volume(Stage3) Judge Value_Dam', 'HEAD NORMAL COORDINATE X AXIS(Stage1) Unit Time_Dam', 'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam', 'HEAD NORMAL COORDINATE X AXIS(Stage2) Unit Time_Dam', 'HEAD NORMAL COORDINATE X AXIS(Stage2) Judge Value_Dam', 'HEAD NORMAL COORDINATE X AXIS(Stage3) Unit Time_Dam', 'HEAD NORMAL COORDINATE X AXIS(Stage3) Judge Value_Dam', 'HEAD NORMAL COORDINATE Y AXIS(Stage1) Unit Time_Dam', 'HEAD NORMAL COORDINATE Y AXIS(Stage1) Judge Value_Dam', 'HEAD NORMAL COORDINATE Y AXIS(Stage2) Unit Time_Dam', 'HEAD NORMAL COORDINATE Y AXIS(Stage2) Judge Value_Dam', 'HEAD NORMAL COORDINATE Y AXIS(Stage3) Unit Time_Dam', 'HEAD NORMAL COORDINATE Y AXIS(Stage3) Judge Value_Dam', 'HEAD NORMAL COORDINATE Z AXIS(Stage1) Unit Time_Dam', 'HEAD NORMAL COORDINATE Z AXIS(Stage1) Judge Value_Dam', 'HEAD NORMAL COORDINATE Z AXIS(Stage2) Unit Time_Dam', 'HEAD NORMAL COORDINATE Z AXIS(Stage2) Judge Value_Dam', 'HEAD NORMAL COORDINATE Z AXIS(Stage3) Unit Time_Dam', 'HEAD NORMAL COORDINATE Z AXIS(Stage3) Judge Value_Dam', 'HEAD Standby Position X Unit Time_Dam', 'HEAD Standby Position X Judge Value_Dam', 'HEAD Standby Position Y Unit Time_Dam', 'HEAD Standby Position Y Judge Value_Dam', 'HEAD Standby Position Z Unit Time_Dam', 'HEAD Standby Position Z Judge Value_Dam', 'Head Clean Position X Unit Time_Dam', 'Head Clean Position X Judge Value_Dam', 'Head Clean Position Y Unit Time_Dam', 'Head Clean Position Y Judge Value_Dam', 'Head Clean Position Z Unit Time_Dam', 'Head Clean Position Z Judge Value_Dam', 'Head Purge Position X Unit Time_Dam', 'Head Purge Position X Judge Value_Dam', 'Head Purge Position Y Unit Time_Dam', 'Head Purge Position Y Judge Value_Dam', 'Head Purge Position Z Unit Time_Dam', 'Head Purge Position Z Judge Value_Dam', 'Head Zero Position X Unit Time_Dam', 'Head Zero Position X Judge Value_Dam', 'Head Zero Position Y Unit Time_Dam', 'Head Zero Position Y Judge Value_Dam', 'Head Zero Position Z Unit Time_Dam', 'Head Zero Position Z Judge Value_Dam', 'Machine Tact time Unit Time_Dam', 'Machine Tact time Judge Value_Dam', 'PalletID Unit Time_Dam', 'PalletID Judge Value_Dam', 'Production Qty Unit Time_Dam', 'Production Qty Judge Value_Dam', 'Receip No Unit Time_Dam', 'Receip No Judge Value_Dam', 'Stage1 Circle1 Distance Speed Unit Time_Dam', 'Stage1 Circle1 Distance Speed Judge Value_Dam', 'Stage1 Circle2 Distance Speed Unit Time_Dam', 'Stage1 Circle2 Distance Speed Judge Value_Dam', 'Stage1 Circle3 Distance Speed Unit Time_Dam', 'Stage1 Circle3 Distance Speed Judge Value_Dam', 'Stage1 Circle4 Distance Speed Unit Time_Dam', 'Stage1 Circle4 Distance Speed Judge Value_Dam', 'Stage1 Line1 Distance Speed Unit Time_Dam', 'Stage1 Line1 Distance Speed Judge Value_Dam', 'Stage1 Line2 Distance Speed Unit Time_Dam', 'Stage1 Line2 Distance Speed Judge Value_Dam', 'Stage1 Line3 Distance Speed Unit Time_Dam', 'Stage1 Line3 Distance Speed Judge Value_Dam', 'Stage1 Line4 Distance Speed Unit Time_Dam', 'Stage1 Line4 Distance Speed Judge Value_Dam', 'Stage2 Circle1 Distance Speed Unit Time_Dam', 'Stage2 Circle1 Distance Speed Judge Value_Dam', 'Stage2 Circle2 Distance Speed Unit Time_Dam', 'Stage2 Circle2 Distance Speed Judge Value_Dam', 'Stage2 Circle3 Distance Speed Unit Time_Dam', 'Stage2 Circle3 Distance Speed Judge Value_Dam', 'Stage2 Circle4 Distance Speed Unit Time_Dam', 'Stage2 Circle4 Distance Speed Judge Value_Dam', 'Stage2 Line1 Distance Speed Unit Time_Dam', 'Stage2 Line1 Distance Speed Judge Value_Dam', 'Stage2 Line2 Distance Speed Unit Time_Dam', 'Stage2 Line2 Distance Speed Judge Value_Dam', 'Stage2 Line3 Distance Speed Unit Time_Dam', 'Stage2 Line3 Distance Speed Judge Value_Dam', 'Stage2 Line4 Distance Speed Unit Time_Dam', 'Stage2 Line4 Distance Speed Judge Value_Dam', 'Stage3 Circle1 Distance Speed Unit Time_Dam', 'Stage3 Circle1 Distance Speed Judge Value_Dam', 'Stage3 Circle2 Distance Speed Unit Time_Dam', 'Stage3 Circle2 Distance Speed Judge Value_Dam', 'Stage3 Circle3 Distance Speed Unit Time_Dam', 'Stage3 Circle3 Distance Speed Judge Value_Dam', 'Stage3 Circle4 Distance Speed Unit Time_Dam', 'Stage3 Circle4 Distance Speed Judge Value_Dam', 'Stage3 Line1 Distance Speed Unit Time_Dam', 'Stage3 Line1 Distance Speed Judge Value_Dam', 'Stage3 Line2 Distance Speed Unit Time_Dam', 'Stage3 Line2 Distance Speed Judge Value_Dam', 'Stage3 Line3 Distance Speed Unit Time_Dam', 'Stage3 Line3 Distance Speed Judge Value_Dam', 'Stage3 Line4 Distance Speed Unit Time_Dam', 'Stage3 Line4 Distance Speed Judge Value_Dam', 'THICKNESS 1 Unit Time_Dam', 'THICKNESS 1 Judge Value_Dam', 'THICKNESS 2 Unit Time_Dam', 'THICKNESS 2 Judge Value_Dam', 'THICKNESS 3 Unit Time_Dam', 'THICKNESS 3 Judge Value_Dam', 'WorkMode Unit Time_Dam', 'WorkMode Judge Value_Dam', 'GMES_ORIGIN_INSP_JUDGE_CODE Unit Time_AutoClave', 'DISCHARGED SPEED OF RESIN Unit Time_Fill1', 'DISCHARGED SPEED OF RESIN Judge Value_Fill1', 'DISCHARGED TIME OF RESIN(Stage1) Unit Time_Fill1', 'DISCHARGED TIME OF RESIN(Stage1) Judge Value_Fill1', 'DISCHARGED TIME OF RESIN(Stage2) Unit Time_Fill1', 'DISCHARGED TIME OF RESIN(Stage2) Judge Value_Fill1', 'DISCHARGED TIME OF RESIN(Stage3) Unit Time_Fill1', 'DISCHARGED TIME OF RESIN(Stage3) Judge Value_Fill1', 'Dispense Volume(Stage1) Unit Time_Fill1', 'Dispense Volume(Stage1) Judge Value_Fill1', 'Dispense Volume(Stage2) Unit Time_Fill1', 'Dispense Volume(Stage2) Judge Value_Fill1', 'Dispense Volume(Stage3) Unit Time_Fill1', 'Dispense Volume(Stage3) Judge Value_Fill1', 'HEAD NORMAL COORDINATE X AXIS(Stage1) Unit Time_Fill1', 'HEAD NORMAL COORDINATE X AXIS(Stage2) Unit Time_Fill1', 'HEAD NORMAL COORDINATE X AXIS(Stage2) Judge Value_Fill1', 'HEAD NORMAL COORDINATE X AXIS(Stage3) Unit Time_Fill1', 'HEAD NORMAL COORDINATE X AXIS(Stage3) Judge Value_Fill1', 'HEAD NORMAL COORDINATE Y AXIS(Stage1) Unit Time_Fill1', 'HEAD NORMAL COORDINATE Y AXIS(Stage1) Judge Value_Fill1', 'HEAD NORMAL COORDINATE Y AXIS(Stage2) Unit Time_Fill1', 'HEAD NORMAL COORDINATE Y AXIS(Stage2) Judge Value_Fill1', 'HEAD NORMAL COORDINATE Y AXIS(Stage3) Unit Time_Fill1', 'HEAD NORMAL COORDINATE Y AXIS(Stage3) Judge Value_Fill1', 'HEAD NORMAL COORDINATE Z AXIS(Stage1) Unit Time_Fill1', 'HEAD NORMAL COORDINATE Z AXIS(Stage1) Judge Value_Fill1', 'HEAD NORMAL COORDINATE Z AXIS(Stage2) Unit Time_Fill1', 'HEAD NORMAL COORDINATE Z AXIS(Stage2) Judge Value_Fill1', 'HEAD NORMAL COORDINATE Z AXIS(Stage3) Unit Time_Fill1', 'HEAD NORMAL COORDINATE Z AXIS(Stage3) Judge Value_Fill1', 'HEAD Standby Position X Unit Time_Fill1', 'HEAD Standby Position X Judge Value_Fill1', 'HEAD Standby Position Y Unit Time_Fill1', 'HEAD Standby Position Y Judge Value_Fill1', 'HEAD Standby Position Z Unit Time_Fill1', 'HEAD Standby Position Z Judge Value_Fill1', 'Head Clean Position X Unit Time_Fill1', 'Head Clean Position X Judge Value_Fill1', 'Head Clean Position Y Unit Time_Fill1', 'Head Clean Position Y Judge Value_Fill1', 'Head Clean Position Z Unit Time_Fill1', 'Head Clean Position Z Judge Value_Fill1', 'Head Purge Position X Unit Time_Fill1', 'Head Purge Position X Judge Value_Fill1', 'Head Purge Position Y Unit Time_Fill1', 'Head Purge Position Y Judge Value_Fill1', 'Head Purge Position Z Unit Time_Fill1', 'Head Purge Position Z Judge Value_Fill1', 'Machine Tact time Unit Time_Fill1', 'Machine Tact time Judge Value_Fill1', 'PalletID Unit Time_Fill1', 'PalletID Judge Value_Fill1', 'Production Qty Unit Time_Fill1', 'Production Qty Judge Value_Fill1', 'Receip No Unit Time_Fill1', 'Receip No Judge Value_Fill1', 'WorkMode Unit Time_Fill1', 'WorkMode Judge Value_Fill1', 'CURE END POSITION X Unit Time_Fill2', 'CURE END POSITION X Judge Value_Fill2', 'CURE END POSITION Z Unit Time_Fill2', 'CURE END POSITION Z Judge Value_Fill2', 'CURE END POSITION Θ Unit Time_Fill2', 'CURE END POSITION Θ Judge Value_Fill2', 'CURE SPEED Unit Time_Fill2', 'CURE SPEED Judge Value_Fill2', 'CURE STANDBY POSITION X Unit Time_Fill2', 'CURE STANDBY POSITION X Judge Value_Fill2', 'CURE STANDBY POSITION Z Unit Time_Fill2', 'CURE STANDBY POSITION Z Judge Value_Fill2', 'CURE STANDBY POSITION Θ Unit Time_Fill2', 'CURE STANDBY POSITION Θ Judge Value_Fill2', 'CURE START POSITION X Unit Time_Fill2', 'CURE START POSITION X Judge Value_Fill2', 'CURE START POSITION Z Unit Time_Fill2', 'CURE START POSITION Z Judge Value_Fill2', 'CURE START POSITION Θ Unit Time_Fill2', 'CURE START POSITION Θ Judge Value_Fill2', 'DISCHARGED SPEED OF RESIN Unit Time_Fill2', 'DISCHARGED SPEED OF RESIN Judge Value_Fill2', 'DISCHARGED TIME OF RESIN(Stage1) Unit Time_Fill2', 'DISCHARGED TIME OF RESIN(Stage1) Judge Value_Fill2', 'DISCHARGED TIME OF RESIN(Stage2) Unit Time_Fill2', 'DISCHARGED TIME OF RESIN(Stage2) Judge Value_Fill2', 'DISCHARGED TIME OF RESIN(Stage3) Unit Time_Fill2', 'DISCHARGED TIME OF RESIN(Stage3) Judge Value_Fill2', 'Dispense Volume(Stage1) Unit Time_Fill2', 'Dispense Volume(Stage1) Judge Value_Fill2', 'Dispense Volume(Stage2) Unit Time_Fill2', 'Dispense Volume(Stage2) Judge Value_Fill2', 'Dispense Volume(Stage3) Unit Time_Fill2', 'Dispense Volume(Stage3) Judge Value_Fill2', 'HEAD NORMAL COORDINATE X AXIS(Stage1) Unit Time_Fill2', 'HEAD NORMAL COORDINATE X AXIS(Stage2) Unit Time_Fill2', 'HEAD NORMAL COORDINATE X AXIS(Stage2) Judge Value_Fill2', 'HEAD NORMAL COORDINATE X AXIS(Stage3) Unit Time_Fill2', 'HEAD NORMAL COORDINATE X AXIS(Stage3) Judge Value_Fill2', 'HEAD NORMAL COORDINATE Y AXIS(Stage1) Unit Time_Fill2', 'HEAD NORMAL COORDINATE Y AXIS(Stage1) Judge Value_Fill2', 'HEAD NORMAL COORDINATE Y AXIS(Stage2) Unit Time_Fill2', 'HEAD NORMAL COORDINATE Y AXIS(Stage2) Judge Value_Fill2', 'HEAD NORMAL COORDINATE Y AXIS(Stage3) Unit Time_Fill2', 'HEAD NORMAL COORDINATE Y AXIS(Stage3) Judge Value_Fill2', 'HEAD NORMAL COORDINATE Z AXIS(Stage1) Unit Time_Fill2', 'HEAD NORMAL COORDINATE Z AXIS(Stage1) Judge Value_Fill2', 'HEAD NORMAL COORDINATE Z AXIS(Stage2) Unit Time_Fill2', 'HEAD NORMAL COORDINATE Z AXIS(Stage2) Judge Value_Fill2', 'HEAD NORMAL COORDINATE Z AXIS(Stage3) Unit Time_Fill2', 'HEAD NORMAL COORDINATE Z AXIS(Stage3) Judge Value_Fill2', 'HEAD Standby Position X Unit Time_Fill2', 'HEAD Standby Position X Judge Value_Fill2', 'HEAD Standby Position Y Unit Time_Fill2', 'HEAD Standby Position Y Judge Value_Fill2', 'HEAD Standby Position Z Unit Time_Fill2', 'HEAD Standby Position Z Judge Value_Fill2', 'Head Clean Position X Unit Time_Fill2', 'Head Clean Position X Judge Value_Fill2', 'Head Clean Position Y Unit Time_Fill2', 'Head Clean Position Y Judge Value_Fill2', 'Head Clean Position Z Unit Time_Fill2', 'Head Clean Position Z Judge Value_Fill2', 'Head Purge Position X Unit Time_Fill2', 'Head Purge Position X Judge Value_Fill2', 'Head Purge Position Y Unit Time_Fill2', 'Head Purge Position Y Judge Value_Fill2', 'Head Purge Position Z Unit Time_Fill2', 'Head Purge Position Z Judge Value_Fill2', 'Machine Tact time Unit Time_Fill2', 'Machine Tact time Judge Value_Fill2', 'PalletID Unit Time_Fill2', 'PalletID Judge Value_Fill2', 'Production Qty Unit Time_Fill2', 'Production Qty Judge Value_Fill2', 'Receip No Unit Time_Fill2', 'Receip No Judge Value_Fill2', 'WorkMode Unit Time_Fill2', 'WorkMode Judge Value_Fill2', 'Stage1 Circle3 Distance Speed Collect Result_Dam', 'Stage1 Circle4 Distance Speed Collect Result_Dam', 'Stage1 Line3 Distance Speed Collect Result_Dam', 'Stage2 Circle3 Distance Speed Collect Result_Dam', 'Stage2 Circle4 Distance Speed Collect Result_Dam', 'Stage2 Line1 Distance Speed Collect Result_Dam', 'Stage3 Circle3 Distance Speed Collect Result_Dam', 'Stage3 Circle4 Distance Speed Collect Result_Dam', 'Stage3 Line3 Distance Speed Collect Result_Dam', 'Wip Line_AutoClave', 'Process Desc._AutoClave', 'Equipment_AutoClave', 'Model.Suffix_AutoClave', 'Workorder_AutoClave', 'Insp. Seq No._AutoClave', 'Insp Judge Code_AutoClave', '1st Pressure Judge Value_AutoClave', '2nd Pressure Judge Value_AutoClave', '3rd Pressure Judge Value_AutoClave', 'GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave', 'GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave', 'Wip Line_Fill1', 'Process Desc._Fill1', 'Model.Suffix_Fill1', 'Workorder_Fill1', 'Insp. Seq No._Fill1', 'Insp Judge Code_Fill1', 'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1', 'Wip Line_Fill2', 'Process Desc._Fill2', 'Model.Suffix_Fill2', 'Workorder_Fill2', 'Insp. Seq No._Fill2', 'Insp Judge Code_Fill2', 'CURE END POSITION Θ Collect Result_Fill2', 'CURE STANDBY POSITION X Collect Result_Fill2', 'CURE STANDBY POSITION Θ Collect Result_Fill2', 'CURE START POSITION Z Collect Result_Fill2', 'CURE START POSITION Θ Collect Result_Fill2', 'DISCHARGED SPEED OF RESIN Collect Result_Fill2', 'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill2', 'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill2', 'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill2', 'Dispense Volume(Stage1) Collect Result_Fill2', 'Dispense Volume(Stage2) Collect Result_Fill2', 'Dispense Volume(Stage3) Collect Result_Fill2', 'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2', 'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2', 'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2', 'Head Clean Position Z Collect Result_Fill2', 'Head Purge Position Y Collect Result_Fill2']

# numerical category 정리
num_col = ['CURE SPEED Collect Result_Dam', 'DISCHARGED SPEED OF RESIN Collect Result_Dam', 'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam', 'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam', 'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam', 'Dispense Volume(Stage1) Collect Result_Dam', 'Dispense Volume(Stage2) Collect Result_Dam', 'Dispense Volume(Stage3) Collect Result_Dam', 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam', 'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam', 'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam', 'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam', 'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam', 'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam', 'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam', 'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam', 'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam', 'HEAD Standby Position X Collect Result_Dam', 'HEAD Standby Position Y Collect Result_Dam', 'HEAD Standby Position Z Collect Result_Dam', 'Head Clean Position X Collect Result_Dam', 'Head Clean Position Y Collect Result_Dam', 'Head Clean Position Z Collect Result_Dam', 'Head Purge Position X Collect Result_Dam', 'Head Purge Position Y Collect Result_Dam', 'Head Purge Position Z Collect Result_Dam', 'Head Zero Position X Collect Result_Dam', 'Head Zero Position Y Collect Result_Dam', 'Head Zero Position Z Collect Result_Dam', 'Machine Tact time Collect Result_Dam', 'Stage1 Circle1 Distance Speed Collect Result_Dam', 'Stage1 Circle2 Distance Speed Collect Result_Dam', 'Stage1 Line1 Distance Speed Collect Result_Dam', 'Stage1 Line2 Distance Speed Collect Result_Dam', 'Stage1 Line4 Distance Speed Collect Result_Dam', 'Stage2 Circle1 Distance Speed Collect Result_Dam', 'Stage2 Circle2 Distance Speed Collect Result_Dam', 'Stage2 Line2 Distance Speed Collect Result_Dam', 'Stage2 Line3 Distance Speed Collect Result_Dam', 'Stage2 Line4 Distance Speed Collect Result_Dam', 'Stage3 Circle1 Distance Speed Collect Result_Dam', 'Stage3 Circle2 Distance Speed Collect Result_Dam', 'Stage3 Line1 Distance Speed Collect Result_Dam', 'Stage3 Line2 Distance Speed Collect Result_Dam', 'Stage3 Line4 Distance Speed Collect Result_Dam', 'THICKNESS 1 Collect Result_Dam', 'THICKNESS 2 Collect Result_Dam', 'THICKNESS 3 Collect Result_Dam', '1st Pressure Collect Result_AutoClave', '1st Pressure 1st Pressure Unit Time_AutoClave', '2nd Pressure Collect Result_AutoClave', '2nd Pressure Unit Time_AutoClave', '3rd Pressure Collect Result_AutoClave', '3rd Pressure Unit Time_AutoClave', 'Chamber Temp. Collect Result_AutoClave', 'Chamber Temp. Unit Time_AutoClave', 'DISCHARGED SPEED OF RESIN Collect Result_Fill1', 'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1', 'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1', 'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1', 'Dispense Volume(Stage1) Collect Result_Fill1', 'Dispense Volume(Stage2) Collect Result_Fill1', 'Dispense Volume(Stage3) Collect Result_Fill1', 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1', 'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1', 'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1', 'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1', 'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1', 'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1', 'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1', 'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1', 'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1', 'HEAD Standby Position X Collect Result_Fill1', 'HEAD Standby Position Y Collect Result_Fill1', 'HEAD Standby Position Z Collect Result_Fill1', 'Head Clean Position X Collect Result_Fill1', 'Head Clean Position Y Collect Result_Fill1', 'Head Clean Position Z Collect Result_Fill1', 'Head Purge Position X Collect Result_Fill1', 'Head Purge Position Y Collect Result_Fill1', 'Head Purge Position Z Collect Result_Fill1', 'Machine Tact time Collect Result_Fill1', 'CURE SPEED Collect Result_Fill2', 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2', 'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2', 'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2', 'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2', 'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2', 'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2', 'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill2', 'HEAD Standby Position Y Collect Result_Fill2', 'HEAD Standby Position Z Collect Result_Fill2', 'Head Clean Position X Collect Result_Fill2', 'Head Clean Position Y Collect Result_Fill2', 'Head Purge Position X Collect Result_Fill2', 'Head Purge Position Z Collect Result_Fill2', 'Machine Tact time Collect Result_Fill2', 'HEAD Standby Position X Collect Result_Fill2']

# category 칼럼럼
cat_col = ['Model.Suffix_Dam', 'Workorder_Dam', 'WorkMode Collect Result_Dam', 'Chamber Temp. Judge Value_AutoClave', 'WorkMode Collect Result_Fill1', 'WorkMode Collect Result_Fill2']

# 애매한거매한거
cat_or_num_col = ['Production Qty Collect Result_Dam', 'Production Qty Collect Result_Fill1', 'Production Qty Collect Result_Fill2', 'Receip No Collect Result_Dam', 'Receip No Collect Result_Fill1', 'Receip No Collect Result_Fill2', 'Equipment_Dam', 'Equipment_Fill1', 'Equipment_Fill2']
mid_col = ['CURE END POSITION X Collect Result_Dam', 'CURE END POSITION Z Collect Result_Dam', 'CURE END POSITION Θ Collect Result_Dam', 'CURE START POSITION X Collect Result_Dam', 'CURE START POSITION Θ Collect Result_Dam', 'PalletID Collect Result_Dam', 'PalletID Collect Result_Fill1', 'CURE END POSITION X Collect Result_Fill2', 'CURE END POSITION Z Collect Result_Fill2', 'CURE STANDBY POSITION Z Collect Result_Fill2', 'CURE START POSITION X Collect Result_Fill2', 'PalletID Collect Result_Fill2']

In [19]:
cat_col

['Model.Suffix_Dam',
 'Workorder_Dam',
 'WorkMode Collect Result_Dam',
 'Chamber Temp. Judge Value_AutoClave',
 'WorkMode Collect Result_Fill1',
 'WorkMode Collect Result_Fill2']

In [None]:
pd.set_option('display.max_rows', 10)

a = train[cat_or_num_col]

# Equipment 이름들 정규화
a.loc[a[a['Equipment_Dam'] == 'Dam dispenser #1'].index,'Equipment_Dam'] = '#1'
a.loc[a[a['Equipment_Dam'] == 'Dam dispenser #2'].index,'Equipment_Dam'] = '#2'

a.loc[a[a['Equipment_Fill1'] == 'Fill1 dispenser #1'].index, 'Equipment_Fill1' ]= '#1'
a.loc[a[a['Equipment_Fill1'] == 'Fill1 dispenser #2'].index, 'Equipment_Fill1'] = '#2'

a.loc[a[a['Equipment_Fill2'] == 'Fill2 dispenser #1'].index, 'Equipment_Fill2'] = '#1'
a.loc[a[a['Equipment_Fill2'] == 'Fill2 dispenser #2'].index, 'Equipment_Fill2' ]= '#2'

a['Production Qty new'] = '0'
a['Receip new'] = '0'
a['Equipment new'] = '0'


# Dam / Fill1 / Fill2 간의 데이터 일치 여부 체크하고
# 일치하면 flag를 1로 설정
# 전처리 결과 일치하면 불량이었음 
index = a[(a['Production Qty Collect Result_Dam'] == a['Production Qty Collect Result_Fill1']) & (a['Production Qty Collect Result_Fill1'] == a['Production Qty Collect Result_Fill2'])].index
a.loc[index, 'Production Qty new'] = '1'

index = a[(a['Receip No Collect Result_Dam'] == a['Receip No Collect Result_Fill1']) & (a['Receip No Collect Result_Fill1'] == a['Receip No Collect Result_Fill2'])].index
a.loc[index, 'Receip new'] = '1'

index = a[(a['Equipment_Dam'] == a['Equipment_Fill1']) & (a['Equipment_Fill1'] == a['Equipment_Fill2'])].index
a.loc[index, 'Equipment new'] = '1'

a

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['Production Qty new'] = '0'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['Receip new'] = '0'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['Equipment new'] = '0'


Unnamed: 0,Production Qty Collect Result_Dam,Production Qty Collect Result_Fill1,Production Qty Collect Result_Fill2,Receip No Collect Result_Dam,Receip No Collect Result_Fill1,Receip No Collect Result_Fill2,Equipment_Dam,Equipment_Fill1,Equipment_Fill2,Production Qty new,Receip new,Equipment new
0,7,7,7,127,127,127,#1,#1,#1,1,1,1
1,185,185,185,1,1,1,#1,#1,#1,1,1,1
2,10,10,10,73,73,73,#2,#2,#2,1,1,1
3,268,268,268,1,1,1,#2,#2,#2,1,1,1
4,121,121,121,1,1,1,#1,#1,#1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
40501,318,318,318,1,1,1,#1,#1,#1,1,1,1
40502,14,14,14,197,197,197,#2,#2,#2,1,1,1
40503,1,1,1,27,27,27,#1,#1,#1,1,1,1
40504,117,117,117,1,1,1,#2,#2,#2,1,1,1


In [22]:
a['9개 total'] = "0"
index = a[(a['Production Qty new'] == '1') &(a['Receip new'] == '1') &(a['Equipment new'] =='1')].index
a.loc[index, '9개 total'] = '1'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['9개 total'] = "0"


### 9개 total이 1인 것만 남기고 나머지 row는 버림
### 위에서 그 버리는 칼럼은abnormal이라고 가정함

In [23]:
np.unique(a['9개 total'],return_counts= True)

(array(['0', '1'], dtype=object), array([   93, 40413]))

In [24]:
train.drop(cat_or_num_col,axis = 1, inplace = True)
train

Unnamed: 0,Model.Suffix_Dam,Workorder_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,...,HEAD Standby Position Y Collect Result_Fill2,HEAD Standby Position Z Collect Result_Fill2,Head Clean Position X Collect Result_Fill2,Head Clean Position Y Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,WorkMode Collect Result_Fill2,target
0,AJX75334505,4F1XA938-1,240.0,2.5,-90,100,1030,-90,16,14.9,...,270,50,-10,119,91.8,50,114.612,19.9,1,Normal
1,AJX75334505,3KPM0016-2,240.0,2.5,-90,70,1030,-90,10,21.3,...,50,-10,119,50,270.0,85,19.600,7.0,0,Normal
2,AJX75334501,4E1X9167-1,1000.0,12.5,90,85,280,90,16,14.7,...,270,50,-10,119,91.8,50,114.612,19.8,1,Normal
3,AJX75334501,3K1X0057-1,1000.0,12.5,90,70,280,90,10,21.3,...,50,-10,119,50,270.0,85,19.900,12.0,0,Normal
4,AJX75334501,3HPM0007-1,240.0,2.5,-90,70,1030,-90,10,9.7,...,50,-10,119,50,270.0,85,19.700,8.0,0,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,AJX75334501,3J1XF434-2,240.0,2.5,-90,70,1030,-90,10,17.0,...,50,-10,119,50,270.0,85,19.200,1.0,0,Normal
40502,AJX75334501,4E1XC796-1,1000.0,12.5,90,100,280,90,16,14.9,...,270,50,-10,119,91.8,50,114.612,20.5,1,Normal
40503,AJX75334501,4C1XD438-1,240.0,2.5,-90,100,1030,-90,16,14.2,...,270,50,-10,119,91.8,50,85.000,19.7,1,Normal
40504,AJX75334501,3I1XA258-1,1000.0,12.5,90,70,280,90,10,9.7,...,50,-10,119,50,270.0,85,20.100,13.0,0,Normal


In [25]:
train['cat_or_num_col 9개 total'] = a['9개 total']

In [26]:
pd.set_option('display.max_rows', 10)

a = test[cat_or_num_col]

a.loc[a[a['Equipment_Dam'] == 'Dam dispenser #1'].index,'Equipment_Dam'] = '#1'
a.loc[a[a['Equipment_Dam'] == 'Dam dispenser #2'].index,'Equipment_Dam'] = '#2'

a.loc[a[a['Equipment_Fill1'] == 'Fill1 dispenser #1'].index, 'Equipment_Fill1' ]= '#1'
a.loc[a[a['Equipment_Fill1'] == 'Fill1 dispenser #2'].index, 'Equipment_Fill1'] = '#2'

a.loc[a[a['Equipment_Fill2'] == 'Fill2 dispenser #1'].index, 'Equipment_Fill2'] = '#1'
a.loc[a[a['Equipment_Fill2'] == 'Fill2 dispenser #2'].index, 'Equipment_Fill2' ]= '#2'

a['Production Qty new'] = '0'
a['Receip new'] = '0'
a['Equipment new'] = '0'

index = a[(a['Production Qty Collect Result_Dam'] == a['Production Qty Collect Result_Fill1']) & (a['Production Qty Collect Result_Fill1'] == a['Production Qty Collect Result_Fill2'])].index
a.loc[index, 'Production Qty new'] = '1'

index = a[(a['Receip No Collect Result_Dam'] == a['Receip No Collect Result_Fill1']) & (a['Receip No Collect Result_Fill1'] == a['Receip No Collect Result_Fill2'])].index
a.loc[index, 'Receip new'] = '1'

index = a[(a['Equipment_Dam'] == a['Equipment_Fill1']) & (a['Equipment_Fill1'] == a['Equipment_Fill2'])].index
a.loc[index, 'Equipment new'] = '1'

a

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['Production Qty new'] = '0'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['Receip new'] = '0'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['Equipment new'] = '0'


Unnamed: 0,Production Qty Collect Result_Dam,Production Qty Collect Result_Fill1,Production Qty Collect Result_Fill2,Receip No Collect Result_Dam,Receip No Collect Result_Fill1,Receip No Collect Result_Fill2,Equipment_Dam,Equipment_Fill1,Equipment_Fill2,Production Qty new,Receip new,Equipment new
0,195,195,195,1,1,1,#2,#2,#2,1,1,1
1,14,14,14,256,256,256,#2,#2,#2,1,1,1
2,98,98,98,1,1,1,#1,#1,#1,1,1,1
3,14,14,14,0,0,0,#2,#2,#2,1,1,1
4,1,1,1,215,215,215,#1,#1,#1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
17356,14,14,14,131,131,131,#2,#2,#2,1,1,1
17357,12,12,12,279,279,279,#2,#2,#2,1,1,1
17358,4,4,4,66,66,66,#1,#1,#1,1,1,1
17359,117,117,117,1,1,1,#1,#1,#1,1,1,1


In [27]:
a['9개 total'] = "0"
index = a[(a['Production Qty new'] == '1') &(a['Receip new'] == '1') &(a['Equipment new'] =='1')].index
a.loc[index, '9개 total'] = '1'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['9개 total'] = "0"


In [28]:
np.unique(a['9개 total'],return_counts= True)

(array(['0', '1'], dtype=object), array([   28, 17333]))

In [29]:
test.drop(cat_or_num_col,axis = 1, inplace = True)
test

Unnamed: 0,Set ID,Model.Suffix_Dam,Workorder_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,...,HEAD Standby Position Y Collect Result_Fill2,HEAD Standby Position Z Collect Result_Fill2,Head Clean Position X Collect Result_Fill2,Head Clean Position Y Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,WorkMode Collect Result_Fill2,target
0,0001be084fbc4aaa9d921f39e595961b,AJX75334501,3J1XF767-1,1000.0,12.5,90,70,280,90,10,...,50,-10,119,50,270.0,85,19.8,13.0,0,
1,0005bbd180064abd99e63f9ed3e1ac80,AJX75334501,4B1XD472-2,1000.0,12.5,90,70,280,90,16,...,270,50,-10,119,91.8,50,85.0,19.8,1,
2,000948934c4140d883d670adcb609584,AJX75334501,3H1XE355-1,240.0,2.5,-90,70,1030,-90,10,...,50,-10,119,50,270.0,85,19.7,1.0,0,
3,000a6bfd02874c6296dc7b2e9c5678a7,AJX75334501,3L1XA128-1,1000.0,12.5,90,70,280,90,10,...,270,50,-10,119,91.8,50,85.0,20.0,1,
4,0018e78ce91343678716e2ea27a51c95,AJX75334501,4A1XA639-1,240.0,2.5,-90,70,1030,-90,16,...,270,50,-10,119,91.8,50,85.0,19.8,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17356,ffea508b59934d689b540f95eb3fa730,AJX75334501,3K1XB597-1,1000.0,12.5,90,70,280,90,10,...,270,50,-10,119,91.8,50,85.0,19.5,1,
17357,ffed8923c8a448a98afc641b770be153,AJX75334501,4A1XB974-1,1000.0,12.5,90,70,280,90,16,...,270,50,-10,119,91.8,50,85.0,19.8,1,
17358,fff1e73734da40adbe805359b3efb462,AJX75334501,3L1XA998-1,240.0,2.5,-90,70,1030,-90,16,...,270,50,-10,119,91.8,50,85.0,20.5,1,
17359,fff8e38bdd09470baf95f71e92075dec,AJX75334501,3F1XC376-1,240.0,2.5,-90,70,1030,-90,10,...,50,-10,119,50,270.0,85,18.9,1.0,0,


In [30]:
test['cat_or_num_col 9개 total'] = a['9개 total']

In [31]:
pd.set_option('display.max_rows', 10)
test_index = pd.read_csv("data/test.csv")
a = test_index[cat_or_num_col]

a.loc[a[a['Equipment_Dam'] == 'Dam dispenser #1'].index,'Equipment_Dam'] = '#1'
a.loc[a[a['Equipment_Dam'] == 'Dam dispenser #2'].index,'Equipment_Dam'] = '#2'

a.loc[a[a['Equipment_Fill1'] == 'Fill1 dispenser #1'].index, 'Equipment_Fill1' ]= '#1'
a.loc[a[a['Equipment_Fill1'] == 'Fill1 dispenser #2'].index, 'Equipment_Fill1'] = '#2'

a.loc[a[a['Equipment_Fill2'] == 'Fill2 dispenser #1'].index, 'Equipment_Fill2'] = '#1'
a.loc[a[a['Equipment_Fill2'] == 'Fill2 dispenser #2'].index, 'Equipment_Fill2' ]= '#2'

a['Production Qty new'] = '0'
a['Receip new'] = '0'
a['Equipment new'] = '0'

index = a[(a['Production Qty Collect Result_Dam'] == a['Production Qty Collect Result_Fill1']) & (a['Production Qty Collect Result_Fill1'] == a['Production Qty Collect Result_Fill2'])].index
a.loc[index, 'Production Qty new'] = '1'

index = a[(a['Receip No Collect Result_Dam'] == a['Receip No Collect Result_Fill1']) & (a['Receip No Collect Result_Fill1'] == a['Receip No Collect Result_Fill2'])].index
a.loc[index, 'Receip new'] = '1'

index = a[(a['Equipment_Dam'] == a['Equipment_Fill1']) & (a['Equipment_Fill1'] == a['Equipment_Fill2'])].index
a.loc[index, 'Equipment new'] = '1'

a
a['9개 total'] = "0"
index = a[(a['Production Qty new'] == '1') &(a['Receip new'] == '1') &(a['Equipment new'] =='1')].index
a.loc[index, '9개 total'] = '1'
test_index['cat_or_num_col 9개 total'] = a['9개 total']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['Production Qty new'] = '0'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['Receip new'] = '0'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['Equipment new'] = '0'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See 

In [32]:
test_index_1 = test_index[test_index['cat_or_num_col 9개 total'] == '1'].index
test_index_0 = test_index[test_index['cat_or_num_col 9개 total'] == '0'].index

In [33]:
train = train[train['cat_or_num_col 9개 total'] == '1']
train.drop('cat_or_num_col 9개 total', axis = 1, inplace = True)

# test = test[test['cat_or_num_col 9개 total'] == '1']
test.drop('cat_or_num_col 9개 total', axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.drop('cat_or_num_col 9개 total', axis = 1, inplace = True)


In [34]:
train['target'] = train['target'].map({'Normal': 0, 'AbNormal': 1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['target'] = train['target'].map({'Normal': 0, 'AbNormal': 1})


In [35]:
columns_to_encode = cat_col
for column in columns_to_encode:
    train[column] = train[column].astype('category')
    test[column] = test[column].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[column] = train[column].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[column] = train[column].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[column] = train[column].astype('category')
A value is trying to be set on a copy of a slice from a Da

### 타겟인코딩

In [None]:
from category_encoders import TargetEncoder

# 타겟 인코딩에 사용할 스무딩 값 설정
smoothing_value = 1 

# 타겟 인코더를 각 컬럼에 적용하기 위한 함수
def target_encode(train_df, test_df, columns, target, smoothing):
    encoders = {column: TargetEncoder(smoothing=smoothing) for column in columns}
    train_encoded = train_df.copy()
    test_encoded = test_df.copy()

    for column in columns:
        # 학습 데이터셋에 대해 fit_transform을 사용하여 타겟 인코딩 적용
        train_encoded[column] = encoders[column].fit_transform(train_df[column], target)
        # 테스트 데이터셋에 대해 transform을 사용하여 타겟 인코딩 적용
        test_encoded[column] = encoders[column].transform(test_df[column])

    return train_encoded, test_encoded, encoders


# 타겟 변수 (예: 'target_column')
target_column = 'target'

# train, test 데이터셋에 타겟 인코딩 적용
train_encoded, test_encoded, encoders = target_encode(train, test, columns_to_encode, train[target_column], smoothing=smoothing_value)

# 결과 확인
print("Train encoded shape:", train_encoded.shape)
print("Test encoded shape:", test_encoded.shape)

train = train_encoded
test = test_encoded

Train encoded shape: (40413, 117)
Test encoded shape: (17361, 118)


In [37]:
set(train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'])

{162.4, 548.5, 549.0, 549.5, 550.0, 550.3}

### X와 Y로 나누기

In [39]:
X = train[train.columns.drop('target')]
Y = train['target']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42, shuffle=True)

In [41]:
# 양성 클래스와 음성 클래스의 비율 계산
ratio = float(np.sum(y_train == 0)) / np.sum(y_train == 1)
ratio

16.80286343612335

### optuna로 튜닝

In [47]:
import optuna
from sklearn.metrics import precision_score
# Optuna를 위한 목적 함수
def objective(trial):
    # 탐색할 하이퍼파라미터 설정
    param = {
        'random_state': 42,
        'scale_pos_weight': trial.suggest_loguniform('scale_pos_weight', 7.0, ratio),
        'n_estimators': trial.suggest_int('n_estimators', 300, 500),
        'alpha': trial.suggest_loguniform('alpha', 1e-5, 1e-1),
        'gamma': trial.suggest_loguniform('gamma', 1e-6, 1e-1),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 12)
    }

    # 모델 생성 및 학습
    model = xgb.XGBClassifier(**param)
    model.fit(X_train, y_train)

    # 예측 및 평가
    y_pred = model.predict(X_test)
#     score = f1_score(y_test, y_pred, average='binary')
    score = precision_score(y_test, y_pred, average='binary')
    return score

# Optuna Study 생성 및 최적화
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=40)

# 최적화 결과 출력
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2024-08-28 09:35:46,211] A new study created in memory with name: no-name-1b192797-fe76-4faf-9118-d55b989d22b9
[I 2024-08-28 09:35:48,234] Trial 0 finished with value: 0.1381345926800472 and parameters: {'scale_pos_weight': 9.03528025742961, 'n_estimators': 344, 'alpha': 0.06314483929915776, 'gamma': 0.004593140997967299, 'learning_rate': 0.034456107078653694, 'max_depth': 7}. Best is trial 0 with value: 0.1381345926800472.
[I 2024-08-28 09:35:50,077] Trial 1 finished with value: 0.1391509433962264 and parameters: {'scale_pos_weight': 8.952175714473691, 'n_estimators': 334, 'alpha': 8.579447152721227e-05, 'gamma': 1.821962334579512e-05, 'learning_rate': 0.01707021966533026, 'max_depth': 8}. Best is trial 1 with value: 0.1391509433962264.
[I 2024-08-28 09:35:51,634] Trial 2 finished with value: 0.1669793621013133 and parameters: {'scale_pos_weight': 7.239038189331825, 'n_estimators': 331, 'alpha': 0.004044892593847148, 'gamma': 1.506003550104982e-06, 'learning_rate': 0.03400876870906

[I 2024-08-28 09:36:53,011] Trial 26 finished with value: 0.18552036199095023 and parameters: {'scale_pos_weight': 11.729601434827986, 'n_estimators': 447, 'alpha': 0.02008813138152552, 'gamma': 0.00017165782572959736, 'learning_rate': 0.28101438655737904, 'max_depth': 11}. Best is trial 25 with value: 0.2169811320754717.
[I 2024-08-28 09:36:56,403] Trial 27 finished with value: 0.20270270270270271 and parameters: {'scale_pos_weight': 13.683157418622288, 'n_estimators': 428, 'alpha': 0.03394954742397184, 'gamma': 0.0004988766070345967, 'learning_rate': 0.2748489131124962, 'max_depth': 11}. Best is trial 25 with value: 0.2169811320754717.
[I 2024-08-28 09:37:00,418] Trial 28 finished with value: 0.2019704433497537 and parameters: {'scale_pos_weight': 13.740027568727726, 'n_estimators': 462, 'alpha': 0.08808525275850723, 'gamma': 0.0005319183266650835, 'learning_rate': 0.2798009954471244, 'max_depth': 12}. Best is trial 25 with value: 0.2169811320754717.
[I 2024-08-28 09:37:04,559] Trial

Number of finished trials: 40
Best trial: {'scale_pos_weight': 14.2230571558265, 'n_estimators': 483, 'alpha': 0.04324930583795401, 'gamma': 0.002207235960952834, 'learning_rate': 0.22134203422750065, 'max_depth': 12}


In [32]:
# Best_trial_018 = {'n_estimators': 310, 'alpha': 0.056331161219739995, 'gamma': 2.9365899535459714e-05, 'learning_rate': 0.2989289093582146, 'max_depth': 11}
Best_trial = {'scale_pos_weight': 9.799746216485042, 'n_estimators': 309, 'alpha': 0.00010349643471716144, 'gamma': 6.570549981132866e-05, 'learning_rate': 0.010234773354621975, 'max_depth': 3}

In [42]:
Best_trial_0822_try3 =  {'scale_pos_weight': 9.118845704035804, 'n_estimators': 458, 'alpha': 1.827286698803764e-05, 'gamma': 2.623730892138876e-05, 'learning_rate': 0.011809660887222096, 'max_depth': 3}
xg_0828_try3 = {'scale_pos_weight': 8.078123868948868, 'n_estimators': 419, 'alpha': 1.6252435590611052e-05, 'gamma': 7.505320542375082e-06, 'learning_rate': 0.03587938181514458, 'max_depth': 4}
xg_0828_try3_precision = {'scale_pos_weight': 14.2230571558265, 'n_estimators': 483, 'alpha': 0.04324930583795401, 'gamma': 0.002207235960952834, 'learning_rate': 0.22134203422750065, 'max_depth': 12}

In [43]:
model_xgb_pre = xgb.XGBClassifier(random_state=42,
                              scale_pos_weight=xg_0828_try3_precision['scale_pos_weight'],
                              max_depth = xg_0828_try3_precision['max_depth'],
                              n_estimators= xg_0828_try3_precision['n_estimators'],
                              alpha = xg_0828_try3_precision['alpha'],
                              gamma = xg_0828_try3_precision['gamma'],
                              learning_rate = xg_0828_try3_precision['learning_rate'],
                             ) 

model_xgb_pre.fit(X_train, y_train)


In [44]:
y_pred_xgb = model_xgb_pre.predict(X_test)

accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Accuracy: {accuracy_xgb}")
print(f1_score(y_test, y_pred_xgb, average='binary'))

XGBoost Accuracy: 0.9305950760856118
0.1461187214611872


In [45]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[0, 1])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[0, 1])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[0, 1])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [46]:
pred_xgb = model_xgb_pre.predict(X_test.fillna(0))
get_clf_eval(y_test, pred_xgb)

오차행렬:
 [[7474  168]
 [ 393   48]]

정확도: 0.9306
정밀도: 0.2222
재현율: 0.1088
F1: 0.1461


In [47]:
np.unique(y_test,return_counts = True)

(array([0, 1]), array([7642,  441]))

In [48]:
# 예측에 필요한 데이터 분리
x_test = test.drop(["target"], axis=1)

In [49]:
test_pred_xgb = model_xgb_pre.predict(x_test.drop(['Set ID'],axis = 1))
# sum(test_pred) # True로 예측된 개수

In [50]:
np.unique(test_pred_xgb,return_counts = True)

(array([0, 1]), array([16889,   472]))

In [51]:
test_pred_labels_xgb = np.where(test_pred_xgb == 1, 'AbNormal', 'Normal')

In [52]:
print(len(test_index_0))
print(len(test_index_1))

28
17333


In [53]:
np.unique(test_pred_labels_xgb,return_counts = True)

(array(['AbNormal', 'Normal'], dtype='<U8'), array([  472, 16889]))

In [54]:
np.unique(test_pred_labels_xgb[test_index_0],return_counts = True)

(array(['AbNormal', 'Normal'], dtype='<U8'), array([ 7, 21]))

In [113]:
#제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub_xgb = pd.read_csv("submission.csv")
df_sub_xgb["target"] = test_pred_labels_xgb
df_sub_xgb.loc[test_index_0 , 'target'] = 'AbNormal'
# df_sub.loc[test_index_1 , 'target'] = test_pred_labels

In [114]:
np.unique(df_sub_xgb['target'],return_counts = True)

(array(['AbNormal', 'Normal'], dtype=object), array([  493, 16868]))

In [115]:
np.unique(df_sub_xgb['target'],return_counts = True)[1][0]/np.unique(df_sub_xgb['target'],return_counts = True)[1][1]

0.029226938581930283

In [116]:
index = [181,498,679,1510,1739,3055, 3687, 4618, 5311, 5702, 5886, 7075, 8354, 8414, 8898, 9043, 10188, 10191, 10345, 10424,10948, 14807, 15456, 16876]
np.unique(df_sub_xgb.loc[index, 'target'],return_counts= True)

(array(['AbNormal', 'Normal'], dtype=object), array([21,  3]))

재현율 기준으로 학습

In [67]:
import optuna
from sklearn.metrics import recall_score
# Optuna를 위한 목적 함수
def objective(trial):
    # 탐색할 하이퍼파라미터 설정
    param = {
        'random_state': 42,
        'scale_pos_weight': trial.suggest_loguniform('scale_pos_weight', 7.0, ratio),
        'n_estimators': trial.suggest_int('n_estimators', 300, 500),
        'alpha': trial.suggest_loguniform('alpha', 1e-5, 1e-1),
        'gamma': trial.suggest_loguniform('gamma', 1e-6, 1e-1),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 12)
    }

    # 모델 생성 및 학습
    model = xgb.XGBClassifier(**param)
    model.fit(X_train, y_train)

    # 예측 및 평가
    y_pred = model.predict(X_test)
#     score = f1_score(y_test, y_pred, average='binary')
    score = recall_score(y_test, y_pred, average='binary')
    return score

# Optuna Study 생성 및 최적화
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=40)

# 최적화 결과 출력
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2024-08-28 09:40:25,649] A new study created in memory with name: no-name-fe3fc95f-6546-4a6f-9ce0-0b266972717a
[I 2024-08-28 09:40:28,632] Trial 0 finished with value: 0.09070294784580499 and parameters: {'scale_pos_weight': 9.508359687055346, 'n_estimators': 307, 'alpha': 0.03865194826156447, 'gamma': 0.0036531093635890914, 'learning_rate': 0.2887692341606431, 'max_depth': 12}. Best is trial 0 with value: 0.09070294784580499.
[I 2024-08-28 09:40:30,053] Trial 1 finished with value: 0.48072562358276644 and parameters: {'scale_pos_weight': 12.469278946694942, 'n_estimators': 447, 'alpha': 4.632964084669322e-05, 'gamma': 0.03741868368261565, 'learning_rate': 0.03332860429563729, 'max_depth': 4}. Best is trial 1 with value: 0.48072562358276644.
[I 2024-08-28 09:40:31,814] Trial 2 finished with value: 0.14512471655328799 and parameters: {'scale_pos_weight': 9.231715679221162, 'n_estimators': 420, 'alpha': 0.000534565060352818, 'gamma': 5.805482343268823e-05, 'learning_rate': 0.231565994

[I 2024-08-28 09:41:09,267] Trial 26 finished with value: 0.5941043083900227 and parameters: {'scale_pos_weight': 15.762450638694801, 'n_estimators': 403, 'alpha': 4.367393433284759e-05, 'gamma': 0.0005673237459184727, 'learning_rate': 0.022637065054695076, 'max_depth': 5}. Best is trial 11 with value: 0.6689342403628118.
[I 2024-08-28 09:41:10,733] Trial 27 finished with value: 0.5374149659863946 and parameters: {'scale_pos_weight': 12.996838567558648, 'n_estimators': 379, 'alpha': 1.8684107523282107e-05, 'gamma': 0.0023804058940057275, 'learning_rate': 0.010192478771065062, 'max_depth': 4}. Best is trial 11 with value: 0.6689342403628118.
[I 2024-08-28 09:41:11,954] Trial 28 finished with value: 0.6485260770975056 and parameters: {'scale_pos_weight': 14.792350193109487, 'n_estimators': 351, 'alpha': 0.0008389459450750309, 'gamma': 0.00019368179141412295, 'learning_rate': 0.014248381593542395, 'max_depth': 3}. Best is trial 11 with value: 0.6689342403628118.
[I 2024-08-28 09:41:14,572

Number of finished trials: 40
Best trial: {'scale_pos_weight': 15.935168396078456, 'n_estimators': 368, 'alpha': 6.81898623573073e-05, 'gamma': 0.0014254318465384535, 'learning_rate': 0.011357141123460132, 'max_depth': 3}


In [59]:
recall0828try3 =  {'scale_pos_weight': 15.935168396078456, 'n_estimators': 368, 'alpha': 6.81898623573073e-05, 'gamma': 0.0014254318465384535, 'learning_rate': 0.011357141123460132, 'max_depth': 3}

In [60]:
model_xgb = xgb.XGBClassifier(random_state=42,
                              scale_pos_weight=recall0828try3['scale_pos_weight'],
                              max_depth = recall0828try3['max_depth'],
                              n_estimators= recall0828try3['n_estimators'],
                              alpha = recall0828try3['alpha'],
                              gamma = recall0828try3['gamma'],
                              learning_rate = recall0828try3['learning_rate'],
                             ) 

model_xgb.fit(X_train, y_train)


In [61]:
y_pred_xgb = model_xgb.predict(X_test)

accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Accuracy: {accuracy_xgb}")
print(f1_score(y_test, y_pred_xgb, average='binary'))

XGBoost Accuracy: 0.6639861437585055
0.17846339987900786


In [62]:
pred_xgb = model_xgb.predict(X_test.fillna(0))
get_clf_eval(y_test, pred_xgb)

오차행렬:
 [[5072 2570]
 [ 146  295]]

정확도: 0.6640
정밀도: 0.1030
재현율: 0.6689
F1: 0.1785


In [63]:
# 예측에 필요한 데이터 분리
x_test = test.drop(["target"], axis=1)
test_pred_xgb = model_xgb.predict(x_test.drop(['Set ID'],axis = 1))
# sum(test_pred) # True로 예측된 개수
print(np.unique(test_pred_xgb,return_counts = True))
test_pred_labels_xgb = np.where(test_pred_xgb == 1, 'AbNormal', 'Normal')
print(np.unique(test_pred_labels_xgb,return_counts = True))

print(np.unique(test_pred_labels_xgb[test_index_0],return_counts = True))
#제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub_xgb = pd.read_csv("submission.csv")
df_sub_xgb["target"] = test_pred_labels_xgb
df_sub_xgb.loc[test_index_0 , 'target'] = 'AbNormal'
# df_sub.loc[test_index_1 , 'target'] = test_pred_labels
print(np.unique(df_sub_xgb['target'],return_counts = True))
print(np.unique(df_sub_xgb['target'],return_counts = True)[1][0]/np.unique(df_sub_xgb['target'],return_counts = True)[1][1])
index = [181,498,679,1510,1739,3055, 3687, 4618, 5311, 5702, 5886, 7075, 8354, 8414, 8898, 9043, 10188, 10191, 10345, 10424,10948, 14807, 15456, 16876]
print(np.unique(df_sub_xgb.loc[index, 'target'],return_counts= True))

(array([0, 1]), array([10943,  6418]))
(array(['AbNormal', 'Normal'], dtype='<U8'), array([ 6418, 10943]))
(array(['AbNormal', 'Normal'], dtype='<U8'), array([21,  7]))
(array(['AbNormal', 'Normal'], dtype=object), array([ 6425, 10936]))
0.5875091441111924
(array(['AbNormal', 'Normal'], dtype=object), array([21,  3]))


f1기준 학습

In [64]:
import optuna
from sklearn.metrics import recall_score
# Optuna를 위한 목적 함수
def objective(trial):
    # 탐색할 하이퍼파라미터 설정
    param = {
        'random_state': 42,
        'scale_pos_weight': trial.suggest_loguniform('scale_pos_weight', 7.0, ratio),
        'n_estimators': trial.suggest_int('n_estimators', 300, 500),
        'alpha': trial.suggest_loguniform('alpha', 1e-5, 1e-1),
        'gamma': trial.suggest_loguniform('gamma', 1e-6, 1e-1),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 12)
    }

    # 모델 생성 및 학습
    model = xgb.XGBClassifier(**param)
    model.fit(X_train, y_train)

    # 예측 및 평가
    y_pred = model.predict(X_test)
    score = f1_score(y_test, y_pred, average='binary')
#     score = recall_score(y_test, y_pred, average='binary')
    return score

# Optuna Study 생성 및 최적화
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=40)

# 최적화 결과 출력
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

  from .autonotebook import tqdm as notebook_tqdm
[I 2024-08-28 11:41:18,427] A new study created in memory with name: no-name-e40fb3e6-8f6b-4188-88b9-e95bc659d46e
[I 2024-08-28 11:41:19,961] Trial 0 finished with value: 0.20630861040068202 and parameters: {'scale_pos_weight': 7.8494106136285104, 'n_estimators': 416, 'alpha': 0.07590894635454885, 'gamma': 1.0081135420365447e-06, 'learning_rate': 0.019740391405386182, 'max_depth': 3}. Best is trial 0 with value: 0.20630861040068202.
[I 2024-08-28 11:41:21,076] Trial 1 finished with value: 0.17473118279569894 and parameters: {'scale_pos_weight': 15.973056908926122, 'n_estimators': 315, 'alpha': 3.347941159403046e-05, 'gamma': 1.2261877797450193e-06, 'learning_rate': 0.026136318250832798, 'max_depth': 5}. Best is trial 0 with value: 0.20630861040068202.
[I 2024-08-28 11:41:22,113] Trial 2 finished with value: 0.1813765182186235 and parameters: {'scale_pos_weight': 8.516529606882882, 'n_estimators': 343, 'alpha': 1.3881709489090154e-05, 'g

KeyboardInterrupt: 

In [65]:
f1_0828_try3 = {'scale_pos_weight': 8.86280621729308, 'n_estimators': 316, 'alpha': 6.918323050761552e-05, 'gamma': 2.7181228798013043e-05, 'learning_rate': 0.015474277477706829, 'max_depth': 4}

In [66]:
model_xgb_f1 = xgb.XGBClassifier(random_state=42,
                              scale_pos_weight=f1_0828_try3['scale_pos_weight'],
                              max_depth = f1_0828_try3['max_depth'],
                              n_estimators= f1_0828_try3['n_estimators'],
                              alpha = f1_0828_try3['alpha'],
                              gamma = f1_0828_try3['gamma'],
                              learning_rate = f1_0828_try3['learning_rate'],
                             ) 

model_xgb_f1.fit(X_train, y_train)


In [67]:
y_pred_xgb = model_xgb_f1.predict(X_test)

accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Accuracy: {accuracy_xgb}")
print(f1_score(y_test, y_pred_xgb, average='binary'))
pred_xgb = model_xgb_f1.predict(X_test.fillna(0))
get_clf_eval(y_test, pred_xgb)

XGBoost Accuracy: 0.8540145985401459
0.20162381596752368
오차행렬:
 [[6754  888]
 [ 292  149]]

정확도: 0.8540
정밀도: 0.1437
재현율: 0.3379
F1: 0.2016


In [68]:
# 예측에 필요한 데이터 분리
x_test = test.drop(["target"], axis=1)
test_pred_xgb = model_xgb_f1.predict(x_test.drop(['Set ID'],axis = 1))
# sum(test_pred) # True로 예측된 개수
print(np.unique(test_pred_xgb,return_counts = True))
test_pred_labels_xgb = np.where(test_pred_xgb == 1, 'AbNormal', 'Normal')
print(np.unique(test_pred_labels_xgb,return_counts = True))

print(np.unique(test_pred_labels_xgb[test_index_0],return_counts = True))
#제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub_xgb = pd.read_csv("submission.csv")
df_sub_xgb["target"] = test_pred_labels_xgb
df_sub_xgb.loc[test_index_0 , 'target'] = 'AbNormal'
# df_sub.loc[test_index_1 , 'target'] = test_pred_labels
print(np.unique(df_sub_xgb['target'],return_counts = True))
print(np.unique(df_sub_xgb['target'],return_counts = True)[1][0]/np.unique(df_sub_xgb['target'],return_counts = True)[1][1])
index = [181,498,679,1510,1739,3055, 3687, 4618, 5311, 5702, 5886, 7075, 8354, 8414, 8898, 9043, 10188, 10191, 10345, 10424,10948, 14807, 15456, 16876]
print(np.unique(df_sub_xgb.loc[index, 'target'],return_counts= True))

(array([0, 1]), array([15006,  2355]))
(array(['AbNormal', 'Normal'], dtype='<U8'), array([ 2355, 15006]))
(array(['AbNormal', 'Normal'], dtype='<U8'), array([ 9, 19]))
(array(['AbNormal', 'Normal'], dtype=object), array([ 2374, 14987]))
0.15840395009007807
(array(['AbNormal', 'Normal'], dtype=object), array([21,  3]))


In [69]:
# 예측에 필요한 데이터 분리
x_test = test.drop(["target"], axis=1)
test_pred_xgb = model_xgb_pre.predict(x_test.drop(['Set ID'],axis = 1))
# sum(test_pred) # True로 예측된 개수
print(np.unique(test_pred_xgb,return_counts = True))
test_pred_labels_xgb = np.where(test_pred_xgb == 1, 'AbNormal', 'Normal')
print(np.unique(test_pred_labels_xgb,return_counts = True))

print(np.unique(test_pred_labels_xgb[test_index_0],return_counts = True))
#제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub_xgb = pd.read_csv("submission.csv")
df_sub_xgb["target"] = test_pred_labels_xgb
df_sub_xgb.loc[test_index_0 , 'target'] = 'AbNormal'
# df_sub.loc[test_index_1 , 'target'] = test_pred_labels
print(np.unique(df_sub_xgb['target'],return_counts = True))
print(np.unique(df_sub_xgb['target'],return_counts = True)[1][0]/np.unique(df_sub_xgb['target'],return_counts = True)[1][1])
index = [181,498,679,1510,1739,3055, 3687, 4618, 5311, 5702, 5886, 7075, 8354, 8414, 8898, 9043, 10188, 10191, 10345, 10424,10948, 14807, 15456, 16876]
print(np.unique(df_sub_xgb.loc[index, 'target'],return_counts= True))

(array([0, 1]), array([16889,   472]))
(array(['AbNormal', 'Normal'], dtype='<U8'), array([  472, 16889]))
(array(['AbNormal', 'Normal'], dtype='<U8'), array([ 7, 21]))
(array(['AbNormal', 'Normal'], dtype=object), array([  493, 16868]))
0.029226938581930283
(array(['AbNormal', 'Normal'], dtype=object), array([21,  3]))


### lgbm f1 기준으로 학습

In [None]:
from lightgbm import LGBMClassifier
import optuna
#### LGBM
# Optuna를 위한 목적 함수
def objective(trial):
    # 탐색할 하이퍼파라미터 설정
    param = {
            'random_state' : 42,
            'n_estimators': trial.suggest_int('n_estimators', 50, 160),
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
            'num_leaves': trial.suggest_int('num_leaves', 31, 128),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'scale_pos_weight': trial.suggest_loguniform('scale_pos_weight', 7.0, ratio),

        }
        
        

    # 모델 생성 및 학습
    model = LGBMClassifier(**param)
    model.fit(X_train, y_train)

    # 예측 및 평가
    y_pred = model.predict(X_test)
    score = f1_score(y_test, y_pred, average='binary')
#     score = precision_score(y_test, y_pred, average='binary')

    return score

# Optuna Study 생성 및 최적화
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)

# 최적화 결과 출력
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [70]:
# pre_0828_try3_lgbm = {'n_estimators': 51, 'learning_rate': 0.019408221863331084, 'num_leaves': 67, 'max_depth': 9, 'scale_pos_weight': 9.024973389511745}
# from lightgbm import LGBMClassifier
# # LightGBM 모델 생성
# model_lgbm_pre = LGBMClassifier(
#     random_state=42,
#     n_estimators=pre_0828_try3_lgbm['n_estimators'],
#     learning_rate= pre_0828_try3_lgbm['learning_rate'],
#     num_leaves=pre_0828_try3_lgbm['num_leaves'],
#     max_depth=pre_0828_try3_lgbm['max_depth'],
#     scale_pos_weight = pre_0828_try3_lgbm['scale_pos_weight']
# )
                            
               
# # 모델 학습
# model_lgbm_pre.fit(X_train, y_train)


pf1_0828_try3_lgbm = {'n_estimators': 55, 'learning_rate': 0.040751490668333945, 'num_leaves': 97, 'max_depth': 5, 'scale_pos_weight': 10.09132588599039}
from lightgbm import LGBMClassifier
# LightGBM 모델 생성
model_lgbm_f1 = LGBMClassifier(
    random_state=42,
    n_estimators=pf1_0828_try3_lgbm['n_estimators'],
    learning_rate= pf1_0828_try3_lgbm['learning_rate'],
    num_leaves=pf1_0828_try3_lgbm['num_leaves'],
    max_depth=pf1_0828_try3_lgbm['max_depth'],
    scale_pos_weight = pf1_0828_try3_lgbm['scale_pos_weight']
)
                            
               
# 모델 학습
model_lgbm_f1.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 1816, number of negative: 30514
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008882 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2378
[LightGBM] [Info] Number of data points in the train set: 32330, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.056171 -> initscore=-2.821549
[LightGBM] [Info] Start training from score -2.821549


In [71]:
# 예측
y_pred_lgbm = model_lgbm_f1.predict(X_test)

# 정확도 및 F1 점수 계산
accuracy_lgbm = accuracy_score(y_test, y_pred_lgbm)
print(f"LGBM Accuracy: {accuracy_lgbm}")
print(f1_score(y_test, y_pred_lgbm, average='binary'))
pred_lgbm = model_lgbm_f1.predict(X_test.fillna(0))
get_clf_eval(y_test, pred_lgbm)
print(np.unique(y_test,return_counts = True))
# 예측에 필요한 데이터 분리
x_test = test.drop(["target"], axis=1)
test_pred_lgbm = model_lgbm_f1.predict(x_test.drop(['Set ID'],axis = 1))
# sum(test_pred) # True로 예측된 개수
print(np.unique(test_pred_lgbm,return_counts = True))
test_pred_labels_lgbm = np.where(test_pred_lgbm == 1, 'AbNormal', 'Normal')
print(len(test_index_0))
print(len(test_index_1))
print(np.unique(test_pred_labels_lgbm,return_counts = True))
print(np.unique(test_pred_labels_lgbm[test_index_0],return_counts = True))
#제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub_lgbm = pd.read_csv("submission.csv")
df_sub_lgbm["target"] = test_pred_labels_lgbm
df_sub_lgbm.loc[test_index_0 , 'target'] = 'AbNormal'
# df_sub.loc[test_index_1 , 'target'] = test_pred_labels
print(np.unique(df_sub_lgbm['target'],return_counts = True))
print(np.unique(df_sub_lgbm['target'],return_counts = True)[1][0]/np.unique(df_sub_lgbm['target'],return_counts = True)[1][1])
index = [181,498,679,1510,1739,3055, 3687, 4618, 5311, 5702, 5886, 7075, 8354, 8414, 8898, 9043, 10188, 10191, 10345, 10424,10948, 14807, 15456, 16876]
print(np.unique(df_sub_lgbm.loc[index, 'target'],return_counts= True))

LGBM Accuracy: 0.8678708400346407
0.21929824561403508
오차행렬:
 [[6865  777]
 [ 291  150]]

정확도: 0.8679
정밀도: 0.1618
재현율: 0.3401
F1: 0.2193
(array([0, 1]), array([7642,  441]))
(array([0, 1]), array([15334,  2027]))
28
17333
(array(['AbNormal', 'Normal'], dtype='<U8'), array([ 2027, 15334]))
(array(['AbNormal', 'Normal'], dtype='<U8'), array([11, 17]))
(array(['AbNormal', 'Normal'], dtype=object), array([ 2044, 15317]))
0.1334464973558791
(array(['AbNormal', 'Normal'], dtype=object), array([21,  3]))


In [72]:
df_sub_lgbm.to_csv('submission.csv',index = False) # 0.18762214983713354

# cat

In [153]:
from catboost import CatBoostClassifier
import optuna
#### catboost
# Optuna를 위한 목적 함수
def objective(trial):
    # 탐색할 하이퍼파라미터 설정
    param = {
            'iterations': trial.suggest_int('iterations',300, 500),
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
            'depth': trial.suggest_int('depth', 3, 12),
            'silent' : True,
            'scale_pos_weight': trial.suggest_loguniform('scale_pos_weight', 7.0, ratio),
            'random_state' : trial.suggest_int('random_state', 1, 60),
            'thread_count' : -1
        }
        
        

    # 모델 생성 및 학습
    model = CatBoostClassifier(**param)
    model.fit(X_train, y_train)

    # 예측 및 평가
    y_pred = model.predict(X_test)
    score = f1_score(y_test, y_pred, average='binary')

    return score

# Optuna Study 생성 및 최적화
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# 최적화 결과 출력
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2024-08-28 10:23:23,403] A new study created in memory with name: no-name-7ddff957-3e48-4435-845f-bd56110fca42
[I 2024-08-28 10:23:26,896] Trial 0 finished with value: 0.20233463035019456 and parameters: {'iterations': 416, 'learning_rate': 0.03480252335487822, 'depth': 3, 'scale_pos_weight': 10.203375952900586, 'random_state': 40}. Best is trial 0 with value: 0.20233463035019456.
[I 2024-08-28 10:24:10,730] Trial 1 finished with value: 0.1277860326894502 and parameters: {'iterations': 345, 'learning_rate': 0.1790146951760621, 'depth': 12, 'scale_pos_weight': 10.383542437947938, 'random_state': 58}. Best is trial 0 with value: 0.20233463035019456.
[I 2024-08-28 10:24:57,148] Trial 2 finished with value: 0.13733905579399142 and parameters: {'iterations': 397, 'learning_rate': 0.09942702022054584, 'depth': 12, 'scale_pos_weight': 10.34044403519091, 'random_state': 9}. Best is trial 0 with value: 0.20233463035019456.
[I 2024-08-28 10:25:17,600] Trial 3 finished with value: 0.1435257410

[I 2024-08-28 10:29:31,589] Trial 59 finished with value: 0.13603473227206947 and parameters: {'iterations': 476, 'learning_rate': 0.2858453359205634, 'depth': 9, 'scale_pos_weight': 7.913686546082529, 'random_state': 37}. Best is trial 52 with value: 0.2129692832764505.
[I 2024-08-28 10:29:34,837] Trial 60 finished with value: 0.2096195262024408 and parameters: {'iterations': 306, 'learning_rate': 0.012938064140420143, 'depth': 7, 'scale_pos_weight': 8.545231631084912, 'random_state': 21}. Best is trial 52 with value: 0.2129692832764505.
[I 2024-08-28 10:29:39,254] Trial 61 finished with value: 0.2026431718061674 and parameters: {'iterations': 318, 'learning_rate': 0.018034720937635505, 'depth': 8, 'scale_pos_weight': 9.654166934343342, 'random_state': 26}. Best is trial 52 with value: 0.2129692832764505.
[I 2024-08-28 10:29:42,603] Trial 62 finished with value: 0.20780939774983453 and parameters: {'iterations': 313, 'learning_rate': 0.014913326607109938, 'depth': 7, 'scale_pos_weight

[I 2024-08-28 10:31:06,661] Trial 89 finished with value: 0.1880184331797235 and parameters: {'iterations': 338, 'learning_rate': 0.014359923394359327, 'depth': 8, 'scale_pos_weight': 7.296865746057533, 'random_state': 2}. Best is trial 70 with value: 0.21487603305785125.
[I 2024-08-28 10:31:08,618] Trial 90 finished with value: 0.18244703847816687 and parameters: {'iterations': 315, 'learning_rate': 0.12489975261669835, 'depth': 4, 'scale_pos_weight': 13.176136066879316, 'random_state': 36}. Best is trial 70 with value: 0.21487603305785125.
[I 2024-08-28 10:31:10,676] Trial 91 finished with value: 0.2050580997949419 and parameters: {'iterations': 341, 'learning_rate': 0.06738710015305208, 'depth': 4, 'scale_pos_weight': 8.920483614733248, 'random_state': 32}. Best is trial 70 with value: 0.21487603305785125.
[I 2024-08-28 10:31:12,944] Trial 92 finished with value: 0.20573108008817045 and parameters: {'iterations': 369, 'learning_rate': 0.015576626292864016, 'depth': 4, 'scale_pos_wei

Number of finished trials: 100
Best trial: {'iterations': 339, 'learning_rate': 0.015504002784165248, 'depth': 6, 'scale_pos_weight': 8.711700270466679, 'random_state': 28}


In [74]:
Best_trial_cat_f1 = {'iterations': 339, 'learning_rate': 0.015504002784165248, 'depth': 6, 'scale_pos_weight': 8.711700270466679, 'random_state': 28}

In [76]:
from catboost import CatBoostClassifier

# CatBoost 모델 생성
model_catboost_f1 = CatBoostClassifier(
    random_state=Best_trial_cat_f1['random_state'],
    iterations=Best_trial_cat_f1['iterations'], # n_estimators와 비슷합니다.
    learning_rate=Best_trial_cat_f1['learning_rate'],
    depth=Best_trial_cat_f1['depth'], # max_depth와 비슷합니다.
    scale_pos_weight=Best_trial_cat_f1['scale_pos_weight'], # max_depth와 비슷합니다.
    silent=True, # 학습 과정에서 메시지를 출력하지 않습니다.
    thread_count = -1
)

# 모델 학습
model_catboost_f1.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x7f4bc1e51840>

In [79]:
# 예측
y_pred_catboost = model_catboost_f1.predict(X_test)

# 정확도 및 F1 점수 계산
accuracy_catboost = accuracy_score(y_test, y_pred_catboost)
print(f"CatBoost Accuracy: {accuracy_catboost}")
print(f1_score(y_test, y_pred_catboost, average='binary'))
pred_cat = model_catboost_f1.predict(X_test.fillna(0))
get_clf_eval(y_test, pred_cat)
# 예측에 필요한 데이터 분리
x_test = test.drop(["target"], axis=1)
test_pred_cat = model_catboost_f1.predict(x_test.drop(['Set ID'],axis = 1))
# sum(test_pred) # True로 예측된 개수
print(np.unique(test_pred_cat,return_counts = True))
test_pred_labels_cat = np.where(test_pred_cat == 1, 'AbNormal', 'Normal')
print(len(test_index_0))
print(len(test_index_1))
print(np.unique(test_pred_labels_cat,return_counts = True))
print(np.unique(test_pred_labels_cat[test_index_0],return_counts = True))
#제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub_cat = pd.read_csv("submission.csv")
df_sub_cat["target"] = test_pred_labels_cat
df_sub_cat.loc[test_index_0 , 'target'] = 'AbNormal'
# df_sub.loc[test_index_1 , 'target'] = test_pred_labels
print(np.unique(df_sub_cat['target'],return_counts = True))
print(np.unique(df_sub_cat['target'],return_counts = True)[1][0]/np.unique(df_sub_cat['target'],return_counts = True)[1][1])
index = [181,498,679,1510,1739,3055, 3687, 4618, 5311, 5702, 5886, 7075, 8354, 8414, 8898, 9043, 10188, 10191, 10345, 10424,10948, 14807, 15456, 16876]
print(np.unique(df_sub_cat.loc[index, 'target'],return_counts= True))


CatBoost Accuracy: 0.8589632562167512
0.21487603305785125
오차행렬:
 [[6787  855]
 [ 285  156]]

정확도: 0.8590
정밀도: 0.1543
재현율: 0.3537
F1: 0.2149
(array([0, 1]), array([15083,  2278]))
28
17333
(array(['AbNormal', 'Normal'], dtype='<U8'), array([ 2278, 15083]))
(array(['AbNormal', 'Normal'], dtype='<U8'), array([ 9, 19]))
(array(['AbNormal', 'Normal'], dtype=object), array([ 2297, 15064]))
0.15248274030801912
(array(['AbNormal', 'Normal'], dtype=object), array([21,  3]))


### precision 기준으로 학습 cat


In [80]:
from catboost import CatBoostClassifier
import optuna
#### catboost
# Optuna를 위한 목적 함수
def objective(trial):
    # 탐색할 하이퍼파라미터 설정
    param = {
            'iterations': trial.suggest_int('iterations',300, 500),
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
            'depth': trial.suggest_int('depth', 3, 12),
            'silent' : True,
            'scale_pos_weight': trial.suggest_loguniform('scale_pos_weight', 7.0, ratio),
            'random_state' : trial.suggest_int('random_state', 1, 60),
            'thread_count' : -1
        }
        
        

    # 모델 생성 및 학습
    model = CatBoostClassifier(**param)
    model.fit(X_train, y_train)

    # 예측 및 평가
    y_pred = model.predict(X_test)
#     score = f1_score(y_test, y_pred, average='binary')
    score = precision_score(y_test, y_pred, average='binary')

    return score

# Optuna Study 생성 및 최적화
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# 최적화 결과 출력
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2024-08-28 11:57:38,542] A new study created in memory with name: no-name-67916cdb-65f8-455b-9856-c731ccdd7e7a
[I 2024-08-28 11:57:41,795] Trial 0 finished with value: 0.14555256064690028 and parameters: {'iterations': 376, 'learning_rate': 0.010447093973966738, 'depth': 5, 'scale_pos_weight': 9.286181478519664, 'random_state': 27}. Best is trial 0 with value: 0.14555256064690028.
[I 2024-08-28 11:57:43,932] Trial 1 finished with value: 0.15353805073431243 and parameters: {'iterations': 335, 'learning_rate': 0.12278769839115222, 'depth': 5, 'scale_pos_weight': 8.14654490448377, 'random_state': 58}. Best is trial 1 with value: 0.15353805073431243.
[I 2024-08-28 11:57:46,197] Trial 2 finished with value: 0.1155527795128045 and parameters: {'iterations': 495, 'learning_rate': 0.21912354277681148, 'depth': 3, 'scale_pos_weight': 12.419370791065711, 'random_state': 59}. Best is trial 1 with value: 0.15353805073431243.
[I 2024-08-28 11:57:50,277] Trial 3 finished with value: 0.10566356720

[I 2024-08-28 12:03:15,833] Trial 30 finished with value: 0.17587939698492464 and parameters: {'iterations': 379, 'learning_rate': 0.27702716339175437, 'depth': 12, 'scale_pos_weight': 9.012274672778846, 'random_state': 55}. Best is trial 27 with value: 0.2131979695431472.
[I 2024-08-28 12:03:30,653] Trial 31 finished with value: 0.1896551724137931 and parameters: {'iterations': 381, 'learning_rate': 0.2929128105458783, 'depth': 10, 'scale_pos_weight': 9.371142977871127, 'random_state': 56}. Best is trial 27 with value: 0.2131979695431472.
[I 2024-08-28 12:03:45,408] Trial 32 finished with value: 0.20909090909090908 and parameters: {'iterations': 383, 'learning_rate': 0.29467047469247704, 'depth': 10, 'scale_pos_weight': 9.442794504413053, 'random_state': 56}. Best is trial 27 with value: 0.2131979695431472.
[I 2024-08-28 12:04:08,005] Trial 33 finished with value: 0.19909502262443438 and parameters: {'iterations': 380, 'learning_rate': 0.29727462717502284, 'depth': 11, 'scale_pos_weig

[I 2024-08-28 12:13:59,480] Trial 60 finished with value: 0.22119815668202766 and parameters: {'iterations': 442, 'learning_rate': 0.14034188366531555, 'depth': 11, 'scale_pos_weight': 8.701556090626637, 'random_state': 9}. Best is trial 60 with value: 0.22119815668202766.
[I 2024-08-28 12:14:26,034] Trial 61 finished with value: 0.1784037558685446 and parameters: {'iterations': 449, 'learning_rate': 0.1447289018900625, 'depth': 11, 'scale_pos_weight': 8.651237677446579, 'random_state': 1}. Best is trial 60 with value: 0.22119815668202766.
[I 2024-08-28 12:14:53,690] Trial 62 finished with value: 0.2094240837696335 and parameters: {'iterations': 469, 'learning_rate': 0.19286389253061986, 'depth': 11, 'scale_pos_weight': 7.305160344173011, 'random_state': 10}. Best is trial 60 with value: 0.22119815668202766.
[I 2024-08-28 12:15:40,927] Trial 63 finished with value: 0.16574585635359115 and parameters: {'iterations': 468, 'learning_rate': 0.18798789654499837, 'depth': 12, 'scale_pos_weig

[I 2024-08-28 12:20:27,306] Trial 91 finished with value: 0.20123839009287925 and parameters: {'iterations': 473, 'learning_rate': 0.2573967618443618, 'depth': 7, 'scale_pos_weight': 7.001391309106264, 'random_state': 11}. Best is trial 78 with value: 0.226890756302521.
[I 2024-08-28 12:20:35,851] Trial 92 finished with value: 0.1729957805907173 and parameters: {'iterations': 458, 'learning_rate': 0.2018335572754516, 'depth': 9, 'scale_pos_weight': 7.387499860715492, 'random_state': 3}. Best is trial 78 with value: 0.226890756302521.
[I 2024-08-28 12:20:41,998] Trial 93 finished with value: 0.21292775665399238 and parameters: {'iterations': 492, 'learning_rate': 0.22836120056049947, 'depth': 8, 'scale_pos_weight': 7.698536872014624, 'random_state': 5}. Best is trial 78 with value: 0.226890756302521.
[I 2024-08-28 12:20:50,925] Trial 94 finished with value: 0.2119815668202765 and parameters: {'iterations': 492, 'learning_rate': 0.2356172181418472, 'depth': 9, 'scale_pos_weight': 7.75377

Number of finished trials: 100
Best trial: {'iterations': 477, 'learning_rate': 0.21149532084747522, 'depth': 9, 'scale_pos_weight': 7.182149667387197, 'random_state': 1}


In [83]:
cat_0828_pre_try3 = {'iterations': 477, 'learning_rate': 0.21149532084747522, 'depth': 9, 'scale_pos_weight': 7.182149667387197, 'random_state': 1}

In [85]:
from catboost import CatBoostClassifier

# CatBoost 모델 생성
model_catboost_pre = CatBoostClassifier(
    random_state=cat_0828_pre_try3['random_state'],
    iterations=cat_0828_pre_try3['iterations'], # n_estimators와 비슷합니다.
    learning_rate=cat_0828_pre_try3['learning_rate'],
    depth=cat_0828_pre_try3['depth'], # max_depth와 비슷합니다.
    scale_pos_weight=cat_0828_pre_try3['scale_pos_weight'], # max_depth와 비슷합니다.
    silent=True, # 학습 과정에서 메시지를 출력하지 않습니다.
    thread_count = -1
)

# 모델 학습
model_catboost_pre.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x7f4bb5bbae90>

In [104]:
# 예측
y_pred_catboost = model_catboost_pre.predict(X_test)

# 정확도 및 F1 점수 계산
accuracy_catboost = accuracy_score(y_test, y_pred_catboost)
print(f"CatBoost Accuracy: {accuracy_catboost}")
print(f1_score(y_test, y_pred_catboost, average='binary'))
pred_cat = model_catboost_pre.predict(X_test.fillna(0))
get_clf_eval(y_test, pred_cat)
# 예측에 필요한 데이터 분리
x_test = test.drop(["target"], axis=1)
test_pred_cat = model_catboost_pre.predict(x_test.drop(['Set ID'],axis = 1))
# sum(test_pred) # True로 예측된 개수
print(np.unique(test_pred_cat,return_counts = True))
test_pred_labels_cat = np.where(test_pred_cat == 1, 'AbNormal', 'Normal')
print(len(test_index_0))
print(len(test_index_1))
print(np.unique(test_pred_labels_cat,return_counts = True))
print(np.unique(test_pred_labels_cat[test_index_0],return_counts = True))
#제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub_cat = pd.read_csv("submission.csv")
df_sub_cat["target"] = test_pred_labels_cat
df_sub_cat.loc[test_index_0 , 'target'] = 'AbNormal'
# df_sub.loc[test_index_1 , 'target'] = test_pred_labels
print(np.unique(df_sub_cat['target'],return_counts = True))
print(np.unique(df_sub_cat['target'],return_counts = True)[1][0]/np.unique(df_sub_cat['target'],return_counts = True)[1][1])
index = [181,498,679,1510,1739,3055, 3687, 4618, 5311, 5702, 5886, 7075, 8354, 8414, 8898, 9043, 10188, 10191, 10345, 10424,10948, 14807, 15456, 16876]
print(np.unique(df_sub_cat.loc[index, 'target'],return_counts= True))


CatBoost Accuracy: 0.9312136582951874
0.147239263803681
오차행렬:
 [[7479  163]
 [ 393   48]]

정확도: 0.9312
정밀도: 0.2275
재현율: 0.1088
F1: 0.1472
(array([0, 1]), array([16867,   494]))
28
17333
(array(['AbNormal', 'Normal'], dtype='<U8'), array([  494, 16867]))
(array(['AbNormal', 'Normal'], dtype='<U8'), array([ 5, 23]))
(array(['AbNormal', 'Normal'], dtype=object), array([  517, 16844]))
0.030693421990026122
(array(['AbNormal', 'Normal'], dtype=object), array([20,  4]))


In [112]:
a = set(list(df_sub_cat[df_sub_cat['target'] == 'AbNormal'].index))
b = pd.read_csv('submission_828_cat3.csv')
c = set(list(b[b['target'] == 'AbNormal'].index))
print(len(a))
print(len(c))
print(len(a - c))
len(a.intersection(c))

517
142
413


104

# voting

In [94]:
from sklearn.ensemble import VotingClassifier

weights = [0.33,0.06666,0.266,0.133,0.2]

# 앙상블을 위한 분류기 리스트 생성
classifiers = [
#     ('lgbm', model_lgbm),
    ('xgb_pre', model_xgb_pre),
    ('xgb_f1', model_xgb_f1),
#         ('xgb_pre', model_xgb_pre),
    ('lgbm_f1', model_lgbm_f1),
        ('cat_pre', model_catboost_pre),
    ('cat_f1', model_catboost_f1),

#     ('catboost', model_catboost),
#     ('decisiontree',model_dt)
]

# 소프트 투표 기반의 앙상블 모델 생성
ensemble_model = VotingClassifier(estimators=classifiers, voting='hard')

# 앙상블 모델 학습
ensemble_model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 1816, number of negative: 30514
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005467 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2378
[LightGBM] [Info] Number of data points in the train set: 32330, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.056171 -> initscore=-2.821549
[LightGBM] [Info] Start training from score -2.821549


In [96]:
# 예측
y_pred_ensemble = ensemble_model.predict(X_test)

# 정확도 및 F1 점수 계산
accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)
print(f"voting Accuracy: {accuracy_ensemble}")
print(f1_score(y_test, y_pred_ensemble, average='binary'))
pred_ensem = ensemble_model.predict(X_test.fillna(0))
get_clf_eval(y_test, pred_ensem)
# 예측에 필요한 데이터 분리
x_test = test.drop(["target"], axis=1)
test_pred_ensemble = ensemble_model.predict(x_test.drop(['Set ID'],axis = 1))
# sum(test_pred) # True로 예측된 개수
print(np.unique(test_pred_ensemble,return_counts = True))
test_pred_labels_ensemble = np.where(test_pred_ensemble == 1, 'AbNormal', 'Normal')
print(len(test_index_0))
print(len(test_index_1))
print(np.unique(test_pred_labels_ensemble,return_counts = True))
print(np.unique(test_pred_labels_ensemble[test_index_0],return_counts = True))
#제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub_ens  = pd.read_csv("submission.csv")
df_sub_ens["target"] = test_pred_labels_ensemble
df_sub_ens.loc[test_index_0 , 'target'] = 'AbNormal'
# df_sub.loc[test_index_1 , 'target'] = test_pred_labels
print(np.unique(df_sub_ens['target'],return_counts = True))
print(np.unique(df_sub_ens['target'],return_counts = True)[1][0]/np.unique(df_sub_ens['target'],return_counts = True)[1][1])
index = [181,498,679,1510,1739,3055, 3687, 4618, 5311, 5702, 5886, 7075, 8354, 8414, 8898, 9043, 10188, 10191, 10345, 10424,10948, 14807, 15456, 16876]
print(np.unique(df_sub_ens.loc[index, 'target'],return_counts= True))
df_sub_ens.loc[index,'target'] = 'AbNormal'
print(np.unique(df_sub_ens.loc[index, 'target'],return_counts= True))

voting Accuracy: 0.8764072745267846
0.21892103205629398
오차행렬:
 [[6944  698]
 [ 301  140]]

정확도: 0.8764
정밀도: 0.1671
재현율: 0.3175
F1: 0.2189
(array([0, 1]), array([15514,  1847]))
28
17333
(array(['AbNormal', 'Normal'], dtype='<U8'), array([ 1847, 15514]))
(array(['AbNormal', 'Normal'], dtype='<U8'), array([10, 18]))
(array(['AbNormal', 'Normal'], dtype=object), array([ 1865, 15496]))
0.12035363964894166
(array(['AbNormal', 'Normal'], dtype=object), array([21,  3]))
(array(['AbNormal'], dtype=object), array([24]))


In [98]:
df_sub_ens.to_csv('submission.csv',index = False) # 0.18896551724137928

# extra tree classifier

In [147]:
import optuna
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split

# Optuna를 위한 목적 함수
def objective(trial):
    # 탐색할 하이퍼파라미터 설정
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_uniform('max_features', 0.1, 1.0),
#         'class_weight': 'balanced',  # 클래스 불균형에 대한 가중치
        'random_state': 42,
                 'class_weight': {
            0: trial.suggest_uniform('class_weight_0', 1,2),
            1: trial.suggest_uniform('class_weight_1', 1, ratio)},
        'n_jobs' : -1
        
    }

    # 모델 생성 및 학습
    model = ExtraTreesClassifier(**param)
    model.fit(X_train, y_train)

    # 예측 및 평가
    y_pred = model.predict(X_test)
#     score = precision_score(y_test, y_pred, average='binary')
    score = f1_score(y_test, y_pred, average='binary')

    
    return score

# Optuna Study 생성 및 최적화
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

# 최적화 결과 출력
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)


[I 2024-08-28 14:05:48,699] A new study created in memory with name: no-name-ed91eeaa-a106-4a59-b70b-0a9b6be3de18
[I 2024-08-28 14:05:54,423] Trial 0 finished with value: 0.15037593984962405 and parameters: {'n_estimators': 244, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_features': 0.3937623838095752, 'class_weight_0': 1.3875599660792566, 'class_weight_1': 9.146745520537298}. Best is trial 0 with value: 0.15037593984962405.
[I 2024-08-28 14:05:59,901] Trial 1 finished with value: 0.1821649976156414 and parameters: {'n_estimators': 499, 'max_depth': 6, 'min_samples_split': 7, 'min_samples_leaf': 2, 'max_features': 0.2819118831548344, 'class_weight_0': 1.1345220657555815, 'class_weight_1': 15.079366701060385}. Best is trial 1 with value: 0.1821649976156414.
[I 2024-08-28 14:06:04,023] Trial 2 finished with value: 0.18871252204585537 and parameters: {'n_estimators': 489, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 9, 'max_features': 0.12155660186

[I 2024-08-28 14:09:12,465] Trial 24 finished with value: 0.20675944333996024 and parameters: {'n_estimators': 180, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 4, 'max_features': 0.9980504120962941, 'class_weight_0': 1.487783731204737, 'class_weight_1': 14.46955153950048}. Best is trial 24 with value: 0.20675944333996024.
[I 2024-08-28 14:09:16,626] Trial 25 finished with value: 0.2068527918781726 and parameters: {'n_estimators': 102, 'max_depth': 7, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_features': 0.9826117414545161, 'class_weight_0': 1.457568770147802, 'class_weight_1': 14.272785578442846}. Best is trial 25 with value: 0.2068527918781726.
[I 2024-08-28 14:09:21,292] Trial 26 finished with value: 0.20552344251766216 and parameters: {'n_estimators': 172, 'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_features': 0.889392874981966, 'class_weight_0': 1.4249609227379745, 'class_weight_1': 14.394265592553849}. Best is trial 25 with value: 0

Number of finished trials: 30
Best trial: {'n_estimators': 102, 'max_depth': 7, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_features': 0.9826117414545161, 'class_weight_0': 1.457568770147802, 'class_weight_1': 14.272785578442846}


In [148]:
f1_0828_try3_extratree =  {'n_estimators': 102, 'max_depth': 7, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_features': 0.9826117414545161, 'class_weight_0': 1.457568770147802, 'class_weight_1': 14.272785578442846}

In [149]:
model_extratree_f1 = ExtraTreesClassifier(
    random_state=42,
#     class_weight = 'balanced',
    class_weight = {0:f1_0828_try3_extratree['class_weight_0'],1:f1_0828_try3_extratree['class_weight_1']},
    n_jobs = -1,
    n_estimators = f1_0828_try3_extratree['n_estimators'],
    max_depth = f1_0828_try3_extratree['max_depth'],
    min_samples_split = f1_0828_try3_extratree['min_samples_split'],
    min_samples_leaf = f1_0828_try3_extratree['min_samples_leaf'],
    
    max_features = f1_0828_try3_extratree['max_features']
)
# 모델 학습
model_extratree_f1.fit(X_train, y_train)

In [160]:
# 예측
y_pred_extratree = model_extratree_f1.predict(X_test)

# 정확도 및 F1 점수 계산
accuracy_extratree = accuracy_score(y_test, y_pred_extratree)
print(f"extratree Accuracy: {accuracy_extratree}")
print(f1_score(y_test, y_pred_extratree, average='binary'))
pred_ext = model_extratree_f1.predict(X_test.fillna(0))
get_clf_eval(y_test, pred_ext)
# 예측에 필요한 데이터 분리
x_test = test.drop(["target"], axis=1)
test_pred_ext = model_extratree_f1.predict(x_test.drop(['Set ID'],axis = 1))
# sum(test_pred) # True로 예측된 개수
print(np.unique(test_pred_ext,return_counts = True))
test_pred_labels_ext = np.where(test_pred_ext == 1, 'AbNormal', 'Normal')
print(len(test_index_0))
print(len(test_index_1))
print(np.unique(test_pred_labels_ext,return_counts = True))
print(np.unique(test_pred_labels_ext[test_index_0],return_counts = True))
#제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub_ext = pd.read_csv("submission.csv")
df_sub_ext["target"] = test_pred_labels_ext
df_sub_ext.loc[test_index_0 , 'target'] = 'AbNormal'
# df_sub.loc[test_index_1 , 'target'] = test_pred_labels
print(np.unique(df_sub_ext['target'],return_counts = True))
print(np.unique(df_sub_ext['target'],return_counts = True)[1][0]/np.unique(df_sub_ext['target'],return_counts = True)[1][1])
index = [181,498,679,1510,1739,3055, 3687, 4618, 5311, 5702, 5886, 7075, 8354, 8414, 8898, 9043, 10188, 10191, 10345, 10424,10948, 14807, 15456, 16876]
print(np.unique(df_sub_ext.loc[index, 'target'],return_counts= True))
df_sub_ext.loc[index,'target'] = 'AbNormal'
print(np.unique(df_sub_ext['target'],return_counts = True))
# df_sub_ext.to_csv('submission.csv',index = False)

extratree Accuracy: 0.8453544476060868
0.2068527918781726
오차행렬:
 [[6670  972]
 [ 278  163]]

정확도: 0.8454
정밀도: 0.1436
재현율: 0.3696
F1: 0.2069
(array([0, 1]), array([14763,  2598]))
28
17333
(array(['AbNormal', 'Normal'], dtype='<U8'), array([ 2598, 14763]))
(array(['AbNormal', 'Normal'], dtype='<U8'), array([10, 18]))
(array(['AbNormal', 'Normal'], dtype=object), array([ 2616, 14745]))
0.17741607324516787
(array(['AbNormal', 'Normal'], dtype=object), array([21,  3]))
(array(['AbNormal', 'Normal'], dtype=object), array([ 2619, 14742]))


In [None]:
df_sub_ext.to_csv('submission.csv',index = False) # 0.19047619047619047

### extra tree pre

In [130]:
import optuna
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split

# Optuna를 위한 목적 함수
def objective(trial):
    # 탐색할 하이퍼파라미터 설정
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_uniform('max_features',0.1, 1.0),
#         'class_weight': 'balanced',  # 클래스 불균형에 대한 가중치
         'class_weight': {
            0: trial.suggest_uniform('class_weight_0', 1,2),
            1: trial.suggest_uniform('class_weight_1', 1, ratio)
        },
        'random_state': 42,
        'n_jobs' : -1
    }

    # 모델 생성 및 학습
    model = ExtraTreesClassifier(**param)
    model.fit(X_train, y_train)

    # 예측 및 평가
    y_pred = model.predict(X_test)
    score = precision_score(y_test, y_pred, average='binary')
#     score = f1_score(y_test, y_pred, average='binary')

    
    return score

# Optuna Study 생성 및 최적화
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

# 최적화 결과 출력
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)


[I 2024-08-28 13:52:43,755] A new study created in memory with name: no-name-a868c7e1-b7f6-4c9a-94e1-f01385019bb9
[I 2024-08-28 13:52:45,195] Trial 0 finished with value: 0.4 and parameters: {'n_estimators': 270, 'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 5, 'max_features': 0.10826657956220093, 'class_weight_0': 1.65828938295771, 'class_weight_1': 4.150704920553123}. Best is trial 0 with value: 0.4.
[I 2024-08-28 13:52:47,548] Trial 1 finished with value: 0.47058823529411764 and parameters: {'n_estimators': 372, 'max_depth': 4, 'min_samples_split': 10, 'min_samples_leaf': 6, 'max_features': 0.19390731898906355, 'class_weight_0': 1.1706750896061449, 'class_weight_1': 3.843227247444197}. Best is trial 1 with value: 0.47058823529411764.
[I 2024-08-28 13:52:52,668] Trial 2 finished with value: 0.21739130434782608 and parameters: {'n_estimators': 225, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 9, 'max_features': 0.48996299927650966, 'class_weight_0': 1.8278

[I 2024-08-28 13:55:26,887] Trial 25 finished with value: 0.7894736842105263 and parameters: {'n_estimators': 430, 'max_depth': 6, 'min_samples_split': 7, 'min_samples_leaf': 2, 'max_features': 0.46104508414189393, 'class_weight_0': 1.7847542140984094, 'class_weight_1': 7.589278736708487}. Best is trial 18 with value: 0.8333333333333334.
[I 2024-08-28 13:55:32,386] Trial 26 finished with value: 0.8125 and parameters: {'n_estimators': 390, 'max_depth': 7, 'min_samples_split': 9, 'min_samples_leaf': 4, 'max_features': 0.3160859863316954, 'class_weight_0': 1.6123099610164258, 'class_weight_1': 2.133007198027223}. Best is trial 18 with value: 0.8333333333333334.
[I 2024-08-28 13:55:34,281] Trial 27 finished with value: 0.6153846153846154 and parameters: {'n_estimators': 278, 'max_depth': 4, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': 0.20955790231686514, 'class_weight_0': 1.9226412954754692, 'class_weight_1': 5.1412725201149785}. Best is trial 18 with value: 0.8333333333

Number of finished trials: 30
Best trial: {'n_estimators': 407, 'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 4, 'max_features': 0.3362388386929923, 'class_weight_0': 1.9839605132757119, 'class_weight_1': 7.290115160960667}


In [131]:
pre_ext_0828_try3 = {'n_estimators': 407, 'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 4, 'max_features': 0.3362388386929923, 'class_weight_0': 1.9839605132757119, 'class_weight_1': 7.290115160960667}

In [132]:
model_extratree_pre = ExtraTreesClassifier(
    random_state=42,
    class_weight = {0:pre_ext_0828_try3['class_weight_0'],1:pre_ext_0828_try3['class_weight_1']},
    n_estimators = pre_ext_0828_try3['n_estimators'],
    max_depth = pre_ext_0828_try3['max_depth'],
    min_samples_split = pre_ext_0828_try3['min_samples_split'],
    min_samples_leaf = pre_ext_0828_try3['min_samples_leaf'],
    n_jobs = -1,
    max_features = pre_ext_0828_try3['max_features']
)
# 모델 학습
model_extratree_pre.fit(X_train, y_train)

In [135]:
# 예측
y_pred_extratree = model_extratree_pre.predict(X_test)

# 정확도 및 F1 점수 계산
accuracy_extratree = accuracy_score(y_test, y_pred_extratree)
print(f"extratree Accuracy: {accuracy_extratree}")
print(f1_score(y_test, y_pred_extratree, average='binary'))
pred_ext = model_extratree_pre.predict(X_test.fillna(0))
get_clf_eval(y_test, pred_ext)
# 예측에 필요한 데이터 분리
x_test = test.drop(["target"], axis=1)
test_pred_ext = model_extratree_pre.predict(x_test.drop(['Set ID'],axis = 1))
# sum(test_pred) # True로 예측된 개수
print(np.unique(test_pred_ext,return_counts = True))
test_pred_labels_ext = np.where(test_pred_ext == 1, 'AbNormal', 'Normal')
print(len(test_index_0))
print(len(test_index_1))
print(np.unique(test_pred_labels_ext,return_counts = True))
print(np.unique(test_pred_labels_ext[test_index_0],return_counts = True))
#제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub_ext = pd.read_csv("submission.csv")
df_sub_ext["target"] = test_pred_labels_ext
df_sub_ext.loc[test_index_0 , 'target'] = 'AbNormal'
# df_sub.loc[test_index_1 , 'target'] = test_pred_labels
print(np.unique(df_sub_ext['target'],return_counts = True))
print(np.unique(df_sub_ext['target'],return_counts = True)[1][0]/np.unique(df_sub_ext['target'],return_counts = True)[1][1])
index = [181,498,679,1510,1739,3055, 3687, 4618, 5311, 5702, 5886, 7075, 8354, 8414, 8898, 9043, 10188, 10191, 10345, 10424,10948, 14807, 15456, 16876]
print(np.unique(df_sub_ext.loc[index, 'target'],return_counts= True))
df_sub_ext.loc[index,'target'] = 'AbNormal'

extratree Accuracy: 0.946925646418409
0.06535947712418301
오차행렬:
 [[7639    3]
 [ 426   15]]

정확도: 0.9469
정밀도: 0.8333
재현율: 0.0340
F1: 0.0654
(array([0, 1]), array([17314,    47]))
28
17333
(array(['AbNormal', 'Normal'], dtype='<U8'), array([   47, 17314]))
(array(['Normal'], dtype='<U8'), array([28]))
(array(['AbNormal', 'Normal'], dtype=object), array([   75, 17286]))
0.0043387712599791736
(array(['AbNormal', 'Normal'], dtype=object), array([19,  5]))


In [142]:
k = set(df_sub_ext[df_sub_ext['target'] == 'AbNormal'].index)
len(k.intersection(a))

75

In [145]:
len(k.intersection(d))

74

# 보팅

In [156]:
from sklearn.ensemble import VotingClassifier

weights = []

# 앙상블을 위한 분류기 리스트 생성
classifiers = [
#     ('lgbm', model_lgbm),
    ('xgb_pre', model_xgb_pre),
    ('xgb_f1', model_xgb_f1),
#         ('xgb_pre', model_xgb_pre),
    ('lgbm_f1', model_lgbm_f1),
        ('cat_pre', model_catboost_pre),
    ('cat_f1', model_catboost_f1),
    ('extratree_pre',model_extratree_pre),
    ('extratree_f1',model_extratree_f1),
    

#     ('catboost', model_catboost),
#     ('decisiontree',model_dt)
]

# 소프트 투표 기반의 앙상블 모델 생성
ensemble_model = VotingClassifier(estimators=classifiers, voting='hard')

# 앙상블 모델 학습
ensemble_model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 1816, number of negative: 30514
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005436 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2378
[LightGBM] [Info] Number of data points in the train set: 32330, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.056171 -> initscore=-2.821549
[LightGBM] [Info] Start training from score -2.821549


In [157]:
# 예측
y_pred_ensemble = ensemble_model.predict(X_test)

# 정확도 및 F1 점수 계산
accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)
print(f"voting Accuracy: {accuracy_ensemble}")
print(f1_score(y_test, y_pred_ensemble, average='binary'))
pred_ensem = ensemble_model.predict(X_test.fillna(0))
get_clf_eval(y_test, pred_ensem)
# 예측에 필요한 데이터 분리
x_test = test.drop(["target"], axis=1)
test_pred_ensemble = ensemble_model.predict(x_test.drop(['Set ID'],axis = 1))
# sum(test_pred) # True로 예측된 개수
print(np.unique(test_pred_ensemble,return_counts = True))
test_pred_labels_ensemble = np.where(test_pred_ensemble == 1, 'AbNormal', 'Normal')
print(len(test_index_0))
print(len(test_index_1))
print(np.unique(test_pred_labels_ensemble,return_counts = True))
print(np.unique(test_pred_labels_ensemble[test_index_0],return_counts = True))
#제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub_ens  = pd.read_csv("submission.csv")
df_sub_ens["target"] = test_pred_labels_ensemble
df_sub_ens.loc[test_index_0 , 'target'] = 'AbNormal'
# df_sub.loc[test_index_1 , 'target'] = test_pred_labels
print(np.unique(df_sub_ens['target'],return_counts = True))
print(np.unique(df_sub_ens['target'],return_counts = True)[1][0]/np.unique(df_sub_ens['target'],return_counts = True)[1][1])
index = [181,498,679,1510,1739,3055, 3687, 4618, 5311, 5702, 5886, 7075, 8354, 8414, 8898, 9043, 10188, 10191, 10345, 10424,10948, 14807, 15456, 16876]
print(np.unique(df_sub_ens.loc[index, 'target'],return_counts= True))
df_sub_ens.loc[index,'target'] = 'AbNormal'
print(np.unique(df_sub_ens.loc[index, 'target'],return_counts= True))

voting Accuracy: 0.8788816033650872
0.21742605915267785
오차행렬:
 [[6968  674]
 [ 305  136]]

정확도: 0.8789
정밀도: 0.1679
재현율: 0.3084
F1: 0.2174
(array([0, 1]), array([15555,  1806]))
28
17333
(array(['AbNormal', 'Normal'], dtype='<U8'), array([ 1806, 15555]))
(array(['AbNormal', 'Normal'], dtype='<U8'), array([10, 18]))
(array(['AbNormal', 'Normal'], dtype=object), array([ 1824, 15537]))
0.1173971809229581
(array(['AbNormal', 'Normal'], dtype=object), array([21,  3]))
(array(['AbNormal'], dtype=object), array([24]))


In [158]:
df_sub_ens.to_csv('submission.csv',index = False) #  0.1918767507002801

## 랜포 pre 기준으로 학습

In [162]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split

# 데이터 분할 (예시)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Optuna를 위한 목적 함수
def objective(trial):
    # 탐색할 하이퍼파라미터 설정
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_uniform('max_features', 0.1, 1.0),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
                'class_weight': {
            0: trial.suggest_uniform('class_weight_0', 1,2),
            1: trial.suggest_uniform('class_weight_1', 1, ratio)
        },
        'n_jobs': -1,
        'random_state': 42
    }

    # 모델 생성 및 학습
    model = RandomForestClassifier(**param)
    model.fit(X_train, y_train)
    
    # 예측 및 평가
    y_pred = model.predict(X_test)
    score = precision_score(y_test, y_pred)
    return score

# Optuna Study 생성 및 최적화
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

# 최적화 결과 출력
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2024-08-28 14:32:52,201] A new study created in memory with name: no-name-3fb79128-bdf1-4eae-b139-9aaefb750ba3
[I 2024-08-28 14:32:55,967] Trial 0 finished with value: 0.7272727272727273 and parameters: {'n_estimators': 415, 'max_depth': 3, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_features': 0.2830693694962557, 'bootstrap': True, 'class_weight_0': 1.5985875372243794, 'class_weight_1': 6.5947634644531306}. Best is trial 0 with value: 0.7272727272727273.
[I 2024-08-28 14:33:15,000] Trial 1 finished with value: 0.15512048192771086 and parameters: {'n_estimators': 467, 'max_depth': 8, 'min_samples_split': 9, 'min_samples_leaf': 3, 'max_features': 0.43102647969191377, 'bootstrap': False, 'class_weight_0': 1.5982776959555407, 'class_weight_1': 12.218487133547402}. Best is trial 0 with value: 0.7272727272727273.
[I 2024-08-28 14:33:29,358] Trial 2 finished with value: 0.6451612903225806 and parameters: {'n_estimators': 255, 'max_depth': 15, 'min_samples_split': 2, 'min_samples_l

Number of finished trials: 10
Best trial: {'n_estimators': 366, 'max_depth': 8, 'min_samples_split': 4, 'min_samples_leaf': 8, 'max_features': 0.4535709631855813, 'bootstrap': False, 'class_weight_0': 1.8342880162332553, 'class_weight_1': 4.171231465384963}


In [163]:
pre_tree_0828_try3 = {'n_estimators': 366, 'max_depth': 8, 'min_samples_split': 4, 'min_samples_leaf': 8, 'max_features': 0.4535709631855813, 'bootstrap': False, 'class_weight_0': 1.8342880162332553, 'class_weight_1': 4.171231465384963}

In [164]:
model_rf_pre = ExtraTreesClassifier(
    random_state=42,
    class_weight = {0:pre_tree_0828_try3['class_weight_0'],1:pre_tree_0828_try3['class_weight_1']},
    n_estimators = pre_tree_0828_try3['n_estimators'],
    n_jobs = -1,
    max_depth = pre_tree_0828_try3['max_depth'],
    min_samples_split = pre_tree_0828_try3['min_samples_split'],
    min_samples_leaf =pre_tree_0828_try3['min_samples_leaf'] ,
    max_features = pre_tree_0828_try3['max_features'],
    bootstrap = pre_tree_0828_try3['bootstrap'],
    
)
# 모델 학습
model_rf_pre.fit(X_train, y_train)

In [166]:
# 예측
y_pred_rf = model_rf_pre.predict(X_test)

# 정확도 및 F1 점수 계산
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"rf Accuracy: {accuracy_rf}")
print(f1_score(y_test, y_pred_rf, average='binary'))
pred_rf = model_rf_pre.predict(X_test.fillna(0))
get_clf_eval(y_test, pred_rf)
# 예측에 필요한 데이터 분리
x_test = test.drop(["target"], axis=1)
test_pred_rf = model_rf_pre.predict(x_test.drop(['Set ID'],axis = 1))
# sum(test_pred) # True로 예측된 개수
print(np.unique(test_pred_rf,return_counts = True))
test_pred_labels_rf = np.where(test_pred_rf == 1, 'AbNormal', 'Normal')
print(len(test_index_0))
print(len(test_index_1))
print(np.unique(test_pred_labels_rf,return_counts = True))
print(np.unique(test_pred_labels_rf[test_index_0],return_counts = True))
#제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub_rf = pd.read_csv("submission.csv")
df_sub_rf["target"] = test_pred_labels_rf
df_sub_rf.loc[test_index_0 , 'target'] = 'AbNormal'
# df_sub.loc[test_index_1 , 'target'] = test_pred_labels
print(np.unique(df_sub_rf['target'],return_counts = True))
print(np.unique(df_sub_rf['target'],return_counts = True)[1][0]/np.unique(df_sub_rf['target'],return_counts = True)[1][1])
index = [181,498,679,1510,1739,3055, 3687, 4618, 5311, 5702, 5886, 7075, 8354, 8414, 8898, 9043, 10188, 10191, 10345, 10424,10948, 14807, 15456, 16876]
print(np.unique(df_sub_rf.loc[index, 'target'],return_counts= True))
df_sub_ext.loc[index,'target'] = 'AbNormal'
print(np.unique(df_sub_rf['target'],return_counts = True))


rf Accuracy: 0.9468019299764939
0.0611353711790393
오차행렬:
 [[7639    3]
 [ 427   14]]

정확도: 0.9468
정밀도: 0.8235
재현율: 0.0317
F1: 0.0611
(array([0, 1]), array([17319,    42]))
28
17333
(array(['AbNormal', 'Normal'], dtype='<U8'), array([   42, 17319]))
(array(['Normal'], dtype='<U8'), array([28]))
(array(['AbNormal', 'Normal'], dtype=object), array([   70, 17291]))
0.004048348852003933
(array(['AbNormal', 'Normal'], dtype=object), array([18,  6]))
(array(['AbNormal', 'Normal'], dtype=object), array([   70, 17291]))


In [167]:
b = df_sub_rf[df_sub_rf['target'] == 'AbNormal'].index

## 랜포 f1

In [175]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split

# 데이터 분할 (예시)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Optuna를 위한 목적 함수
def objective(trial):
    # 탐색할 하이퍼파라미터 설정
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_uniform('max_features', 0.1, 1.0),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
                'class_weight': {
            0: trial.suggest_uniform('class_weight_0', 1,2),
            1: trial.suggest_uniform('class_weight_1', 1, ratio)
        },
        'n_jobs': -1,
        'random_state': 42
    }

    # 모델 생성 및 학습
    model = RandomForestClassifier(**param)
    model.fit(X_train, y_train)
    
    # 예측 및 평가
    y_pred = model.predict(X_test)
#     score = precision_score(y_test, y_pred)
    score = f1_score(y_test, y_pred, average='binary')

    return score

# Optuna Study 생성 및 최적화
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

# 최적화 결과 출력
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2024-08-28 14:44:06,814] A new study created in memory with name: no-name-4766a117-1633-4171-9fe8-b0690cc8cb73
[I 2024-08-28 14:44:18,960] Trial 0 finished with value: 0.0650759219088937 and parameters: {'n_estimators': 380, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 8, 'max_features': 0.265190582875815, 'bootstrap': False, 'class_weight_0': 1.9750235143542398, 'class_weight_1': 2.7330898943887165}. Best is trial 0 with value: 0.0650759219088937.
[I 2024-08-28 14:44:38,291] Trial 1 finished with value: 0.17328519855595667 and parameters: {'n_estimators': 250, 'max_depth': 13, 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_features': 0.8309253762061721, 'bootstrap': True, 'class_weight_0': 1.628420877248494, 'class_weight_1': 15.146832888093916}. Best is trial 1 with value: 0.17328519855595667.
[I 2024-08-28 14:45:18,103] Trial 2 finished with value: 0.11262135922330097 and parameters: {'n_estimators': 440, 'max_depth': 13, 'min_samples_split': 10, 'min_samples

Number of finished trials: 10
Best trial: {'n_estimators': 211, 'max_depth': 14, 'min_samples_split': 10, 'min_samples_leaf': 8, 'max_features': 0.32012735812192705, 'bootstrap': False, 'class_weight_0': 1.6658697753417715, 'class_weight_1': 11.057642890370989}


In [176]:
f1_tree_0828_try3 = {'n_estimators': 252, 'max_depth': 9, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_features': 0.2895392984281333, 'bootstrap': True, 'class_weight_0': 1.1446023134252585, 'class_weight_1': 12.472878769180666}

In [177]:
model_rf_f1 = ExtraTreesClassifier(
    random_state=42,
    class_weight = {0:f1_tree_0828_try3['class_weight_0'],1:f1_tree_0828_try3['class_weight_1']},
    n_estimators = f1_tree_0828_try3['n_estimators'],
    n_jobs = -1,
    max_depth = f1_tree_0828_try3['max_depth'],
    min_samples_split = f1_tree_0828_try3['min_samples_split'],
    min_samples_leaf =f1_tree_0828_try3['min_samples_leaf'] ,
    max_features = f1_tree_0828_try3['max_features'],
    bootstrap = f1_tree_0828_try3['bootstrap'],
    
)
# 모델 학습
model_rf_f1.fit(X_train, y_train)

In [178]:
# 예측
y_pred_rf = model_rf_f1.predict(X_test)

# 정확도 및 F1 점수 계산
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"rf Accuracy: {accuracy_rf}")
print(f1_score(y_test, y_pred_rf, average='binary'))
pred_rf = model_rf_f1.predict(X_test.fillna(0))
get_clf_eval(y_test, pred_rf)
# 예측에 필요한 데이터 분리
x_test = test.drop(["target"], axis=1)
test_pred_rf = model_rf_f1.predict(x_test.drop(['Set ID'],axis = 1))
# sum(test_pred) # True로 예측된 개수
print(np.unique(test_pred_rf,return_counts = True))
test_pred_labels_rf = np.where(test_pred_rf == 1, 'AbNormal', 'Normal')
print(len(test_index_0))
print(len(test_index_1))
print(np.unique(test_pred_labels_rf,return_counts = True))
print(np.unique(test_pred_labels_rf[test_index_0],return_counts = True))
#제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub_rf = pd.read_csv("submission.csv")
df_sub_rf["target"] = test_pred_labels_rf
df_sub_rf.loc[test_index_0 , 'target'] = 'AbNormal'
# df_sub.loc[test_index_1 , 'target'] = test_pred_labels
print(np.unique(df_sub_rf['target'],return_counts = True))
print(np.unique(df_sub_rf['target'],return_counts = True)[1][0]/np.unique(df_sub_rf['target'],return_counts = True)[1][1])
index = [181,498,679,1510,1739,3055, 3687, 4618, 5311, 5702, 5886, 7075, 8354, 8414, 8898, 9043, 10188, 10191, 10345, 10424,10948, 14807, 15456, 16876]
print(np.unique(df_sub_rf.loc[index, 'target'],return_counts= True))
df_sub_ext.loc[index,'target'] = 'AbNormal'
print(np.unique(df_sub_rf['target'],return_counts = True))


rf Accuracy: 0.8782630211555116
0.18811881188118812
오차행렬:
 [[6985  657]
 [ 327  114]]

정확도: 0.8783
정밀도: 0.1479
재현율: 0.2585
F1: 0.1881
(array([0, 1]), array([15563,  1798]))
28
17333
(array(['AbNormal', 'Normal'], dtype='<U8'), array([ 1798, 15563]))
(array(['AbNormal', 'Normal'], dtype='<U8'), array([ 6, 22]))
(array(['AbNormal', 'Normal'], dtype=object), array([ 1820, 15541]))
0.1171095811080368
(array(['AbNormal', 'Normal'], dtype=object), array([20,  4]))
(array(['AbNormal', 'Normal'], dtype=object), array([ 1820, 15541]))


## 보팅 소프트

In [179]:
from sklearn.ensemble import VotingClassifier

weights = []

# 앙상블을 위한 분류기 리스트 생성
classifiers = [
#     ('lgbm', model_lgbm),
    ('xgb_pre', model_xgb_pre),
    ('xgb_f1', model_xgb_f1),
#         ('xgb_pre', model_xgb_pre),
    ('lgbm_f1', model_lgbm_f1),
        ('cat_pre', model_catboost_pre),
    ('cat_f1', model_catboost_f1),
    ('extratree_pre',model_extratree_pre),
    ('extratree_f1',model_extratree_f1),
    
     ('rf_pre',model_rf_pre),
    ('rf_f1',model_rf_f1),

#     ('catboost', model_catboost),
#     ('decisiontree',model_dt)
]

# 소프트 투표 기반의 앙상블 모델 생성
ensemble_model = VotingClassifier(estimators=classifiers, voting='hard')

# 앙상블 모델 학습
ensemble_model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 1816, number of negative: 30514
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005445 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2378
[LightGBM] [Info] Number of data points in the train set: 32330, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.056171 -> initscore=-2.821549
[LightGBM] [Info] Start training from score -2.821549


In [181]:
# 예측
y_pred_ensemble = ensemble_model.predict(X_test)

# 정확도 및 F1 점수 계산
accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)
print(f"voting Accuracy: {accuracy_ensemble}")
print(f1_score(y_test, y_pred_ensemble, average='binary'))
pred_ensem = ensemble_model.predict(X_test.fillna(0))
get_clf_eval(y_test, pred_ensem)
# 예측에 필요한 데이터 분리
x_test = test.drop(["target"], axis=1)
test_pred_ensemble = ensemble_model.predict(x_test.drop(['Set ID'],axis = 1))
# sum(test_pred) # True로 예측된 개수
print(np.unique(test_pred_ensemble,return_counts = True))
test_pred_labels_ensemble = np.where(test_pred_ensemble == 1, 'AbNormal', 'Normal')
print(len(test_index_0))
print(len(test_index_1))
print(np.unique(test_pred_labels_ensemble,return_counts = True))
print(np.unique(test_pred_labels_ensemble[test_index_0],return_counts = True))
#제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub_ens  = pd.read_csv("submission.csv")
df_sub_ens["target"] = test_pred_labels_ensemble
df_sub_ens.loc[test_index_0 , 'target'] = 'AbNormal'
# df_sub.loc[test_index_1 , 'target'] = test_pred_labels
print(np.unique(df_sub_ens['target'],return_counts = True))
print(np.unique(df_sub_ens['target'],return_counts = True)[1][0]/np.unique(df_sub_ens['target'],return_counts = True)[1][1])
index = [181,498,679,1510,1739,3055, 3687, 4618, 5311, 5702, 5886, 7075, 8354, 8414, 8898, 9043, 10188, 10191, 10345, 10424,10948, 14807, 15456, 16876]
print(np.unique(df_sub_ens.loc[index, 'target'],return_counts= True))
df_sub_ens.loc[index,'target'] = 'AbNormal'
print(np.unique(df_sub_ens.loc[index, 'target'],return_counts= True))
print(np.unique(df_sub_ens['target'],return_counts = True))


voting Accuracy: 0.896696771000866
0.20400381315538607
오차행렬:
 [[7141  501]
 [ 334  107]]

정확도: 0.8967
정밀도: 0.1760
재현율: 0.2426
F1: 0.2040
(array([0, 1]), array([15966,  1395]))
28
17333
(array(['AbNormal', 'Normal'], dtype='<U8'), array([ 1395, 15966]))
(array(['AbNormal', 'Normal'], dtype='<U8'), array([ 6, 22]))
(array(['AbNormal', 'Normal'], dtype=object), array([ 1417, 15944]))
0.08887355745107878
(array(['AbNormal', 'Normal'], dtype=object), array([21,  3]))
(array(['AbNormal'], dtype=object), array([24]))
(array(['AbNormal', 'Normal'], dtype=object), array([ 1420, 15941]))


In [182]:
df_sub_ens.to_csv('submission.csv',index = False) # 0.19669421487603309

In [183]:
list(X_train.columns)

['Model.Suffix_Dam',
 'Workorder_Dam',
 'CURE END POSITION X Collect Result_Dam',
 'CURE END POSITION Z Collect Result_Dam',
 'CURE END POSITION Θ Collect Result_Dam',
 'CURE SPEED Collect Result_Dam',
 'CURE START POSITION X Collect Result_Dam',
 'CURE START POSITION Θ Collect Result_Dam',
 'DISCHARGED SPEED OF RESIN Collect Result_Dam',
 'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam',
 'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam',
 'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam',
 'Dispense Volume(Stage1) Collect Result_Dam',
 'Dispense Volume(Stage2) Collect Result_Dam',
 'Dispense Volume(Stage3) Collect Result_Dam',
 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
 'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam',
 'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
 'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam',
 'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam',
 'HEAD NORMAL COORDINATE Y AXIS(Stage3) Colle