# 제품 이상여부 판별 프로젝트


## 1. 데이터 불러오기


### 필수 라이브러리


In [1]:
import numpy as np
import pandas as pd
import optuna

from sklearn.model_selection import train_test_split

from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

def get_clf_eval(y_test, y_pred_proba, threshold=0.5):
    """Print the confusion matrix, accuracy, precision, recall and F1 score.

    The predicted scores are first turned into hard 0/1 labels by comparing
    them with ``threshold``; the metrics are then computed on those labels.

    Args:
        y_test: True labels. They are compared against 0/1 integer
            predictions, so they are assumed to be encoded as 0/1 as well
            (TODO confirm against the callers).
        y_pred_proba: Predicted scores for the positive class. Any
            array-like is accepted (list, NumPy array, pandas Series).
        threshold: Scores greater than or equal to this value are labelled
            1, everything else 0. Defaults to 0.5.

    Returns:
        None. The metrics are only printed.
    """
    # np.asarray makes the comparison work for plain Python lists too, which
    # do not support element-wise ``>=`` against a scalar.
    y_pred = (np.asarray(y_pred_proba) >= threshold).astype(int)

    confusion = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print("Confusion Matrix:\n", confusion)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

In [2]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

### 데이터 읽어오기


In [3]:
# Cut-off compared against the predicted positive-class probabilities in the
# tuning objectives (a row is labelled 1 when its probability is >= THRESHOLD).
THRESHOLD = 0.3
# Seed shared by the train/validation split, the models and the Optuna sampler.
RANDOM_STATE = 110

## --- Point DATA_DIR at the folder that holds your own copy of the data --- ##
# Keeping the folder in one place means only this line has to be edited when
# the notebook is run on another machine.
DATA_DIR = "C:/Users/KimDongyoung/Desktop/git_LGaimers5/Lg_aimers5/data"
train_data = pd.read_csv(f"{DATA_DIR}/train_data_0817.csv")
test_data = pd.read_csv(f"{DATA_DIR}/test_data_0817.csv")

In [4]:
# Columns shared by the dam, fill1 and fill2 subsets
var_dam_fill = [
    "Equipment_same_num",
    "PalletID_Collect_Result_encoded",
    "Production_Qty_Collect_Result",
    "WorkMode Collect Result",
]

In [5]:
# Columns common to every subset
# Encoded columns used when checking correlations
var_all_corr = [
    "model_receip_encoded",
    "workorder_receip_encoded",
]

# Training columns: 'target' followed by the common encoded columns
var_all_train = ["target", *var_all_corr]

# Test columns: 'Set ID' followed by the training columns
var_all_test = ["Set ID", *var_all_train]

In [6]:
# Pick every training column whose name contains '_Dam'
dam_variables = [col for col in train_data.columns if '_Dam' in col]

# Training subset: shared dam/fill columns, common training columns, '_Dam' columns
final_columns_train = [*var_dam_fill, *var_all_train, *dam_variables]
train_data_dam = train_data[final_columns_train]

# Test subset: same layout, using the test-side common columns
final_columns_test = [*var_dam_fill, *var_all_test, *dam_variables]
test_data_dam = test_data[final_columns_test]

In [7]:
# Pick every training column whose name contains '_Fill1'
fill1_variables = [col for col in train_data.columns if '_Fill1' in col]

# Training subset: shared dam/fill columns, common training columns, '_Fill1' columns
final_columns_train = [*var_dam_fill, *var_all_train, *fill1_variables]
train_data_fill1 = train_data[final_columns_train]

# Test subset: same layout, using the test-side common columns
final_columns_test = [*var_dam_fill, *var_all_test, *fill1_variables]
test_data_fill1 = test_data[final_columns_test]

In [8]:
# Pick every training column whose name contains '_Fill2'
fill2_variables = [col for col in train_data.columns if '_Fill2' in col]

# Training subset: shared dam/fill columns, common training columns, '_Fill2' columns
final_columns_train = [*var_dam_fill, *var_all_train, *fill2_variables]
train_data_fill2 = train_data[final_columns_train]

# Test subset: same layout, using the test-side common columns
final_columns_test = [*var_dam_fill, *var_all_test, *fill2_variables]
test_data_fill2 = test_data[final_columns_test]

In [9]:
# Pick every training column whose name contains '_AutoClave'
autoclave_variables = [col for col in train_data.columns if '_AutoClave' in col]

# Training subset: common training columns followed by the '_AutoClave' columns
# (the shared dam/fill columns are not part of this subset)
final_columns_train = [*var_all_train, *autoclave_variables]
train_data_autoclave = train_data[final_columns_train]

# Test subset: same layout, using the test-side common columns
final_columns_test = [*var_all_test, *autoclave_variables]
test_data_autoclave = test_data[final_columns_test]

---

## 3. 모델 학습

### optuna

1. lightgbm

In [10]:
# def objectiveLGBM_dart(trial, x_tr, y_tr, x_val, y_val):
    
#     # 'Normal'과 'AbNormal'을 숫자로 변환
#     y_tr = y_tr.map({'Normal': 0, 'AbNormal': 1})
#     y_val = y_val.map({'Normal': 0, 'AbNormal': 1})
    
#     param = {
#         'n_estimators': trial.suggest_int('n_estimators', 500, 3000),
#         'num_leaves': trial.suggest_int('num_leaves', 500, 3000),
#         'max_depth': trial.suggest_int('max_depth', 10, 300),
#         'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
#         'min_child_samples': trial.suggest_int('min_child_samples', 3, 300),
#         'boosting_type': 'dart', 
#         'random_state': RANDOM_STATE,
#         'verbose': -1
#     }
       
#     model = LGBMClassifier(**param)
#     model.fit(x_tr, y_tr)
#     pred_proba = model.predict_proba(x_val)[:, 1]  # 양성 클래스 확률
#     pred = (pred_proba >= THRESHOLD).astype(int)  # 스레드홀드에 따른 예측
    
#     score = f1_score(y_val, pred, average="binary")
    
#     return score

# # 데이터셋 분할
# x_train, x_val, y_train, y_val = train_test_split(
#     train_data.drop("target", axis=1), # <--- 해당하는 데이터로 변경해주기!!
#     train_data["target"],              # <--- 해당하는 데이터로 변경해주기!!
#     test_size=0.2,
#     shuffle=True,
#     random_state=RANDOM_STATE,
# )

# # 하이퍼 파라미터 튜닝
# study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
# study.optimize(lambda trial: objectiveLGBM_dart(trial, x_train, y_train, x_val, y_val), n_trials=300)

# print('Best trial: score {}, \nparams {}'.format(study.best_trial.value, study.best_trial.params))

2. xgboost

In [11]:
# def objectiveXGB(trial, x_tr, y_tr, x_val, y_val):
    
#     # 'Normal'과 'AbNormal'을 숫자로 변환
#     y_tr = y_tr.map({'Normal': 0, 'AbNormal': 1})
#     y_val = y_val.map({'Normal': 0, 'AbNormal': 1})

#     param = {
#         'n_estimators': trial.suggest_int('n_estimators', 1200, 3200)
#         , 'learning_rate': trial.suggest_float('learning_rate', 0.002, 0.2)
#         , 'max_depth': trial.suggest_int('max_depth', 3, 10)

#         , 'alpha': trial.suggest_float('alpha', 0.00001, 0.01, log=True)
#         , 'gamma': trial.suggest_float('gamma', 0.00001, 0.01, log=True)

#         , 'reg_alpha' : trial.suggest_float('reg_alpha', 0.01, 1)
#         , 'reg_lambda' : trial.suggest_float('reg_lambda', 0.01, 1)
        
#         , 'colsample_bytree' : trial.suggest_float('colsample_bytree', 0.01, 1)
#         , 'subsample' : trial.suggest_float('subsample', 0.03, 1)
#         , 'objective': 'binary:logistic'  
#         , 'tree_method' : "exact"        
#         , 'random_state': RANDOM_STATE
#     }
       
#     model = XGBClassifier(**param)
#     model.fit(x_tr, y_tr)
#     pred_proba = model.predict_proba(x_val)[:, 1]  # 양성 클래스 확률
#     pred = (pred_proba >= THRESHOLD).astype(int)  # 스레드홀드에 따른 예측
    
#     score = f1_score(y_val, pred, average="binary")
    
#     return score

# # 데이터셋 분할
# x_train, x_val, y_train, y_val = train_test_split(
#     train_data.drop("target", axis=1),  # <--- 해당하는 데이터로 변경해주기!!
#     train_data["target"],               # <--- 해당하는 데이터로 변경해주기!!
#     test_size=0.2,
#     shuffle=True,
#     random_state=RANDOM_STATE,
# )

# # 하이퍼 파라미터 튜닝
# study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
# study.optimize(lambda trial: objectiveXGB(trial, x_train, y_train, x_val, y_val), n_trials=200)

# print('Best trial: score {}, \nparams {}'.format(study.best_trial.value, study.best_trial.params))


3. catboost

In [12]:
# def objectiveCatBoost(trial, x_tr, y_tr, x_val, y_val):
    
#     # 'Normal'과 'AbNormal'을 숫자로 변환
#     y_tr = y_tr.map({'Normal': 0, 'AbNormal': 1})
#     y_val = y_val.map({'Normal': 0, 'AbNormal': 1})
    
#     param = {
#         'iterations': trial.suggest_int('iterations', 400, 1500),
#         'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.4),
#         'depth': trial.suggest_int('depth', 3, 14),
#         'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.05, 5.0, log=True),
#         'random_strength': trial.suggest_float('random_strength', 0.1, 10.0),
#         'bagging_temperature': trial.suggest_float('bagging_temperature', 0.1, 10.0),
#         'border_count': trial.suggest_int('border_count', 70, 270),
#         'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.5, 1.2),

#         'random_seed': RANDOM_STATE,
#         'eval_metric': 'F1',
#         'logging_level': 'Silent',
#         'boosting_type': 'Plain'
#     }

#     model = CatBoostClassifier(**param)
#     model.fit(x_tr, y_tr)
#     pred_proba = model.predict_proba(x_val)[:, 1]  # 양성 클래스 확률
#     pred = (pred_proba >= THRESHOLD).astype(int)  # 스레드홀드에 따른 예측
    
#     score = f1_score(y_val, pred, average="binary")
    
#     return score

# # 데이터셋 분할
# x_train, x_val, y_train, y_val = train_test_split(
#     train_data.drop("target", axis=1),  # <--- 해당하는 데이터로 변경해주기!!
#     train_data["target"],               # <--- 해당하는 데이터로 변경해주기!!
#     test_size=0.2,
#     shuffle=True,
#     random_state=RANDOM_STATE,
# )

# # 하이퍼 파라미터 튜닝
# study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
# study.optimize(lambda trial: objectiveCatBoost(trial, x_train, y_train, x_val, y_val), n_trials=100)

# print('Best trial: score {}, \nparams {}'.format(study.best_trial.value, study.best_trial.params))


4. randomforest

In [13]:
# def objectiveRandomForestClassifier(trial, x_tr, y_tr, x_val, y_val):
    
#     # 'Normal'과 'AbNormal'을 숫자로 변환
#     y_tr = y_tr.map({'Normal': 0, 'AbNormal': 1})
#     y_val = y_val.map({'Normal': 0, 'AbNormal': 1})
    
#     param = {
#         'n_estimators': trial.suggest_int('n_estimators', 1000, 2500)
#         , 'max_depth': trial.suggest_int('max_depth', 2, 50)
#         , 'min_samples_split': trial.suggest_int('min_samples_split', 2, 12)
#         , 'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 7)
#         , 'criterion': trial.suggest_categorical('criterion', ['entropy'])
#         , 'bootstrap': trial.suggest_categorical('bootstrap', [False])
#         , 'random_state': RANDOM_STATE
#     }
    
#     model = RandomForestClassifier(**param)
#     model.fit(x_tr, y_tr)
#     pred = model.predict(x_val)
#     score = f1_score(y_val, pred, average="binary")
    
#     return score

# # 데이터셋 분할
# x_train, x_val, y_train, y_val = train_test_split(
#     train_data.drop("target", axis=1),  # <--- 해당하는 데이터로 변경해주기!!
#     train_data["target"],               # <--- 해당하는 데이터로 변경해주기!!
#     test_size=0.2,
#     shuffle=True,
#     random_state=RANDOM_STATE,
# )

# # 하이퍼 파라미터 튜닝
# study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
# study.optimize(lambda trial: objectiveRandomForestClassifier(trial, x_train, y_train, x_val, y_val), n_trials=200)

# print('Best trial: score {}, \nparams {}'.format(study.best_trial.value, study.best_trial.params))

5. extra tree

In [14]:

# def objectiveExtraTreesClassifier(trial, x_tr, y_tr, x_val, y_val):
    
#     # 'Normal'과 'AbNormal'을 숫자로 변환
#     y_tr = y_tr.map({'Normal': 0, 'AbNormal': 1})
#     y_val = y_val.map({'Normal': 0, 'AbNormal': 1})
    
#     param = {
#         'n_estimators': trial.suggest_int('n_estimators', 1000, 2500)
#         , 'max_depth': trial.suggest_int('max_depth', 2, 50)
#         , 'min_samples_split': trial.suggest_int('min_samples_split', 2, 12)
#         , 'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 7)
#         , 'criterion': trial.suggest_categorical('criterion', ['entropy'])
#         , 'bootstrap': trial.suggest_categorical('bootstrap', [False])
#         , 'random_state': RANDOM_STATE
#     }
    
#     model = ExtraTreesClassifier(**param)
#     model.fit(x_tr, y_tr)
#     pred = model.predict(x_val)
#     score = f1_score(y_val, pred, average="binary")
    
#     return score

# # 데이터셋 분할
# x_train, x_val, y_train, y_val = train_test_split(
#     train_data.drop("target", axis=1),  # <--- 해당하는 데이터로 변경해주기!!
#     train_data["target"],               # <--- 해당하는 데이터로 변경해주기!!
#     test_size=0.2,
#     shuffle=True,
#     random_state=RANDOM_STATE,
# )

# # 하이퍼 파라미터 튜닝
# study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
# study.optimize(lambda trial: objectiveExtraTreesClassifier(trial, x_train, y_train, x_val, y_val), n_trials=400)

# print('Best trial: score {}, \nparams {}'.format(study.best_trial.value, study.best_trial.params))


6. adaboost

In [15]:
# def objectiveAdaBoost(trial, x_tr, y_tr, x_val, y_val):
    
#     # 'Normal'과 'AbNormal'을 숫자로 변환
#     y_tr = y_tr.map({'Normal': 0, 'AbNormal': 1})
#     y_val = y_val.map({'Normal': 0, 'AbNormal': 1})

#     # Base Estimator 설정
#     base_estimator = DecisionTreeClassifier(
#         max_depth=trial.suggest_int('max_depth', 1, 10),
#         min_samples_split=trial.suggest_int('min_samples_split', 2, 20),
#         min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 20),
#         max_features=trial.suggest_float('max_features', 0.1, 1.0),
#         random_state=RANDOM_STATE  
#     )

#     # AdaBoost 모델 설정
#     param = {
#         'n_estimators': trial.suggest_int('n_estimators', 50, 500),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
#         'base_estimator': base_estimator,  # NOTE: renamed to 'estimator' in scikit-learn 1.2 (old name removed in 1.4)
#         'random_state': RANDOM_STATE  
#     }
    
#     model = AdaBoostClassifier(**param)
#     model.fit(x_tr, y_tr)
#     pred = model.predict(x_val)
#     score = f1_score(y_val, pred, average="binary")
    
#     return score

# # 데이터셋 분할
# x_train, x_val, y_train, y_val = train_test_split(
#     train_data.drop("target", axis=1),  # <--- 해당하는 데이터로 변경해주기!!
#     train_data["target"],               # <--- 해당하는 데이터로 변경해주기!!
#     test_size=0.2,
#     shuffle=True,
#     random_state=RANDOM_STATE,
# )

# # 하이퍼 파라미터 튜닝
# study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
# study.optimize(lambda trial: objectiveAdaBoost(trial, x_train, y_train, x_val, y_val), n_trials=100)

# print('Best trial: score {}, \nparams {}'.format(study.best_trial.value, study.best_trial.params))


7. Gradient boost

In [16]:
def objectiveGBM(trial, x_tr, y_tr, x_val, y_val):
    """Optuna objective for a GradientBoostingClassifier.

    Samples one hyper-parameter set from ``trial``, fits the model on the
    training split and scores it on the validation split.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        x_tr: Training features.
        y_tr: Training labels holding the strings 'Normal' / 'AbNormal'.
        x_val: Validation features.
        y_val: Validation labels holding the strings 'Normal' / 'AbNormal'.

    Returns:
        Binary F1 score on the validation split, with predictions obtained by
        thresholding the positive-class probability at ``THRESHOLD``.
    """
    # Encode the string labels as integers ('AbNormal' is the positive class).
    label_to_int = {'Normal': 0, 'AbNormal': 1}
    y_tr = y_tr.map(label_to_int)
    y_val = y_val.map(label_to_int)

    # The suggest_* calls are kept in this exact order so that a seeded
    # sampler draws the same values.
    n_estimators = trial.suggest_int('n_estimators', 500, 3000)
    max_depth = trial.suggest_int('max_depth', 3, 300)
    learning_rate = trial.suggest_float('learning_rate', 0.001, 0.1)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 300)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 100)

    gbm = GradientBoostingClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=RANDOM_STATE,
    )
    gbm.fit(x_tr, y_tr)

    # Column 1 of predict_proba is the probability of the positive class.
    positive_proba = gbm.predict_proba(x_val)[:, 1]
    # Apply the custom cut-off instead of the model's default decision rule.
    hard_labels = (positive_proba >= THRESHOLD).astype(int)

    return f1_score(y_val, hard_labels, average="binary")

# Split the data: 80% for training, 20% held out for validation
x_train, x_val, y_train, y_val = train_test_split(
    train_data.drop("target", axis=1), # <--- swap in the dataset you want to tune on
    train_data["target"],              # <--- swap in the matching target column
    test_size=0.2,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# Hyper-parameter tuning
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
try:
    study.optimize(lambda trial: objectiveGBM(trial, x_train, y_train, x_val, y_val), n_trials=300)
finally:
    # Report the best result even when the search is stopped early (for
    # example with a KeyboardInterrupt); the exception itself still propagates.
    has_completed_trial = any(
        t.state == optuna.trial.TrialState.COMPLETE
        for t in study.get_trials(deepcopy=False)
    )
    if has_completed_trial:
        print('Best trial: score {}, \nparams {}'.format(study.best_trial.value, study.best_trial.params))
    else:
        # study.best_trial raises when nothing has completed, which would
        # hide the original exception.
        print('No trial completed; there is no best trial to report.')

[I 2024-08-23 23:31:55,826] A new study created in memory with name: no-name-3285e6d6-b36d-4654-8c95-973008d08ad9
[I 2024-08-23 23:41:32,406] Trial 0 finished with value: 0.23347398030942335 and parameters: {'n_estimators': 790, 'max_depth': 199, 'learning_rate': 0.038183309313687845, 'min_samples_split': 185, 'min_samples_leaf': 68}. Best is trial 0 with value: 0.23347398030942335.
[I 2024-08-24 00:13:08,014] Trial 1 finished with value: 0.2344213649851632 and parameters: {'n_estimators': 2206, 'max_depth': 287, 'learning_rate': 0.06408507099977911, 'min_samples_split': 155, 'min_samples_leaf': 9}. Best is trial 1 with value: 0.2344213649851632.
[I 2024-08-24 00:33:18,447] Trial 2 finished with value: 0.21654135338345865 and parameters: {'n_estimators': 1469, 'max_depth': 89, 'learning_rate': 0.014404705911514527, 'min_samples_split': 5, 'min_samples_leaf': 42}. Best is trial 1 with value: 0.2344213649851632.
[I 2024-08-24 00:38:23,156] Trial 3 finished with value: 0.22058823529411764

KeyboardInterrupt: 

Trial 9 finished with value: 0.23744292237442927 and parameters: {'n_estimators': 532, 'max_depth': 229, 'learning_rate': 0.019083553963246288, 'min_samples_split': 239, 'min_samples_leaf': 96}. Best is trial 9 with value: 0.23744292237442927.

8. Decision Tree

In [None]:
# def objectiveDecisionTree(trial, x_tr, y_tr, x_val, y_val):
    
#     # 'Normal'과 'AbNormal'을 숫자로 변환
#     y_tr = y_tr.map({'Normal': 0, 'AbNormal': 1})
#     y_val = y_val.map({'Normal': 0, 'AbNormal': 1})
    
#     param = {
#         'max_depth': trial.suggest_int('max_depth', 1, 30),
#         'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
#         'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
#         'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),  # 'auto' (same as 'sqrt' for classifiers) was removed in scikit-learn 1.3
#         'random_state': RANDOM_STATE
#     }
       
#     model = DecisionTreeClassifier(**param)
#     model.fit(x_tr, y_tr)
#     pred_proba = model.predict_proba(x_val)[:, 1]  # 양성 클래스 확률
#     pred = (pred_proba >= THRESHOLD).astype(int)  # 스레드홀드에 따른 예측
    
#     score = f1_score(y_val, pred, average="binary")
    
#     return score

# # 데이터셋 분할
# x_train, x_val, y_train, y_val = train_test_split(
#     train_data.drop("target", axis=1), # <--- 해당하는 데이터로 변경해주기!!
#     train_data["target"],              # <--- 해당하는 데이터로 변경해주기!!
#     test_size=0.2,
#     shuffle=True,
#     random_state=RANDOM_STATE,
# )

# # 하이퍼 파라미터 튜닝
# study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
# study.optimize(lambda trial: objectiveDecisionTree(trial, x_train, y_train, x_val, y_val), n_trials=300)

# print('Best trial: score {}, \nparams {}'.format(study.best_trial.value, study.best_trial.params))