# 제품 이상여부 판별 프로젝트

In [1]:
import numpy as np
import pandas as pd

## 모델링

### 데이터 구분

In [2]:
THRESHOLD = 0.3
RANDOM_STATE = 110

# csv 불러오기
train_data = pd.read_csv('train_data_0827.csv')
test_data = pd.read_csv('test_data_0827.csv')

In [3]:
# dam, fill1, fill2 공통 변수
var_dam_fill = [
    'Receip_No_encoded',
    'Equipment_same_num',
    'PalletID_Collect_Result_encoded',
    'Production_Qty_Collect_Result',
    'WorkMode Collect Result'
]

In [4]:
# 전체 공통 변수
### train
var_all_train = [
    'target',
    'model_suffix_encoded',
    'cleaned_workorder_encoded'
]

### test
var_all_test = [
    'Set ID',
    'target',
    'model_suffix_encoded',
    'cleaned_workorder_encoded'
]

In [5]:
# '_Dam'을 포함하는 변수 선택
dam_variables = [var for var in train_data.columns if '_Dam' in var]

# train
final_columns_train = var_dam_fill + var_all_train + dam_variables
train_data_dam = train_data[final_columns_train]

# test 
final_columns_test = var_dam_fill + var_all_test + dam_variables
test_data_dam = test_data[final_columns_test]

In [6]:
# '_Fill1'을 포함하는 변수 선택
fill1_variables = [var for var in train_data.columns if '_Fill1' in var]

# train
final_columns_train = var_dam_fill + var_all_train + fill1_variables
train_data_fill1 = train_data[final_columns_train]

# test 
final_columns_test = var_dam_fill + var_all_test + fill1_variables
test_data_fill1 = test_data[final_columns_test]

In [7]:
# '_Fill2'을 포함하는 변수 선택
fill2_variables = [var for var in train_data.columns if '_Fill2' in var]

# train
final_columns_train = var_dam_fill + var_all_train + fill2_variables
train_data_fill2 = train_data[final_columns_train]

# test 
final_columns_test = var_dam_fill + var_all_test + fill2_variables
test_data_fill2 = test_data[final_columns_test]

In [8]:
# '_AutoClave'을 포함하는 변수 선택
autoclave_variables = [var for var in train_data.columns if '_AutoClave' in var]

# train
final_columns_train = var_all_train + autoclave_variables
train_data_autoclave = train_data[final_columns_train]

# test 
final_columns_test = var_all_test + autoclave_variables
test_data_autoclave = test_data[final_columns_test]


In [9]:
# 각 DataFrame의 칼럼 수 계산
num_columns_train_data = train_data.shape[1]
num_columns_train_data_dam = train_data_dam.shape[1]
num_columns_train_data_autoclave = train_data_autoclave.shape[1]
num_columns_train_data_fill1 = train_data_fill1.shape[1]
num_columns_train_data_fill2 = train_data_fill2.shape[1]

num_columns_test_data = test_data.shape[1]
num_columns_test_data_dam = test_data_dam.shape[1]
num_columns_test_data_autoclave = test_data_autoclave.shape[1]
num_columns_test_data_fill1 = test_data_fill1.shape[1]
num_columns_test_data_fill2 = test_data_fill2.shape[1]

# 각 DataFrame의 칼럼 수 출력
print("----train data-----")
print(f"train_data DataFrame의 칼럼 수: {num_columns_train_data}")
print(f"train_data_dam DataFrame의 칼럼 수: {num_columns_train_data_dam}")
print(f"train_data_autoclave DataFrame의 칼럼 수: {num_columns_train_data_autoclave}")
print(f"train_data_fill1 DataFrame의 칼럼 수: {num_columns_train_data_fill1}")
print(f"train_data_fill2 DataFrame의 칼럼 수: {num_columns_train_data_fill2}")
print("----test data-----")
print(f"test_data DataFrame의 칼럼 수: {num_columns_test_data}")
print(f"test_data_dam DataFrame의 칼럼 수: {num_columns_test_data_dam}")
print(f"test_data_autoclave DataFrame의 칼럼 수: {num_columns_test_data_autoclave}")
print(f"test_data_fill1 DataFrame의 칼럼 수: {num_columns_test_data_fill1}")
print(f"test_data_fill2 DataFrame의 칼럼 수: {num_columns_test_data_fill2}")

----train data-----
train_data DataFrame의 칼럼 수: 40
train_data_dam DataFrame의 칼럼 수: 23
train_data_autoclave DataFrame의 칼럼 수: 8
train_data_fill1 DataFrame의 칼럼 수: 14
train_data_fill2 DataFrame의 칼럼 수: 14
----test data-----
test_data DataFrame의 칼럼 수: 41
test_data_dam DataFrame의 칼럼 수: 24
test_data_autoclave DataFrame의 칼럼 수: 9
test_data_fill1 DataFrame의 칼럼 수: 15
test_data_fill2 DataFrame의 칼럼 수: 15


### 모델 정의

In [10]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.ensemble import VotingClassifier

# 스레드홀드 설정
THRESHOLD = 0.3

# 모델 설정 및 하이퍼파라미터
models = {
    'et': ExtraTreesClassifier(),
    'rf': RandomForestClassifier(),
    'cat': CatBoostClassifier(),
    'lgbm': LGBMClassifier(),
    'xgb': XGBClassifier(),
    'dt': DecisionTreeClassifier(),
    'ada': AdaBoostClassifier()
}

def train_and_evaluate_model(model_name, data, **params):
    if model_name not in models:
        print(f"{model_name}은(는) 지원되지 않는 모델입니다.")
        return
    
    # 데이터셋 분할
    x_train, x_val, y_train, y_val = train_test_split(
        data.drop("target", axis=1),
        data["target"].map({'Normal': 0, 'AbNormal': 1}),
        test_size=0.2,
        shuffle=True,
        random_state=RANDOM_STATE,
    )

    # 모델 선택
    model = models[model_name].__class__()  # 새로운 모델 인스턴스 생성

    # 하이퍼파라미터 설정
    model.set_params(**params)

    # 모델 학습
    model.fit(x_train, y_train)

    # 데이터 이름을 자동으로 추출하기 위한 래퍼 함수
    data_name = [name for name in globals() if globals()[name] is data][0]

    # 예측
    y_val_pred_proba = model.predict_proba(x_val)[:, 1]  # 양성 클래스 확률
    y_val_pred = (y_val_pred_proba >= THRESHOLD).astype(int)  # 스레드홀드에 따른 예측

    # 평가지표 계산
    f1 = f1_score(y_val, y_val_pred, average="binary")
    accuracy = accuracy_score(y_val, y_val_pred)
    precision = precision_score(y_val, y_val_pred, zero_division=0)
    recall = recall_score(y_val, y_val_pred)
    conf_matrix = confusion_matrix(y_val, y_val_pred)
    
    # 결과 출력
    print(f'{model_name} 모델이 {data_name} 데이터로 학습한 결과:')
    print(f'F1 Score: {f1}')
    print('---')
    print('Confusion Matrix:')
    print(conf_matrix)
    print('---')
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print('\n')

    return model  # 학습된 모델 반환

def fit_all_train_data_function(model_name, data, **params):
    if model_name not in models:
        print(f"{model_name}은(는) 지원되지 않는 모델입니다.")
        return None  # 지원되지 않는 모델일 경우 None 반환
    
    # 모델 선택
    model = models[model_name].__class__()  # 새로운 모델 인스턴스 생성

    # 하이퍼파라미터 설정
    model.set_params(**params)

    # 모델 학습
    model.fit(data.drop("target", axis=1), data["target"].map({'Normal': 0, 'AbNormal': 1}))

    # 데이터 이름을 자동으로 추출하기 위한 래퍼 함수
    data_name = [name for name in globals() if globals()[name] is data][0]

    print(f'{model_name} 모델이 {data_name} 데이터로 학습 완료')
    return model  # 학습된 모델 반환

def voting_function(data, estimators, voting='hard', threshold=0.5):
    # 데이터셋 분할 # voting='hard'일 경우 threshold는 사용되지 않음
    x_train, x_val, y_train, y_val = train_test_split(
        data.drop("target", axis=1),
        data["target"].map({'Normal': 0, 'AbNormal': 1}),
        test_size=0.2,
        shuffle=True,
        random_state=RANDOM_STATE,
    )

    # VotingClassifier 설정
    voting_clf = VotingClassifier(estimators=estimators, voting=voting)

    # 모델 학습
    voting_clf.fit(x_train, y_train)

    if voting == 'soft':
        # 소프트 보팅의 경우 확률 예측
        y_val_pred_proba = voting_clf.predict_proba(x_val)[:, 1]
        y_val_pred = (y_val_pred_proba >= threshold).astype(int)
    else:
        # 하드 보팅의 경우 직접 예측
        y_val_pred = voting_clf.predict(x_val)

    # 평가지표 계산
    f1 = f1_score(y_val, y_val_pred, average="binary")
    accuracy = accuracy_score(y_val, y_val_pred)
    precision = precision_score(y_val, y_val_pred, zero_division=0)
    recall = recall_score(y_val, y_val_pred)
    conf_matrix = confusion_matrix(y_val, y_val_pred)
    
    # 결과 출력
    print(f'Voting Classifier로 학습한 결과:')
    print(f'F1 Score: {f1}')
    print('---')
    print('Confusion Matrix:')
    print(conf_matrix)
    print('---')
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print('\n')

    return voting_clf  # 학습된 VotingClassifier 반환

def voting(preds_or_probs, method='soft', threshold=0.3):
    """
    하드 보팅 또는 소프트 보팅을 사용하여 최종 예측을 수행합니다.

    Parameters:
    preds_or_probs (list of np.array): 각 모델의 예측 배열 리스트 (하드 보팅) 또는 예측 확률 배열 리스트 (소프트 보팅)
    method (str): 'soft' 또는 'hard' 보팅 방법 선택
    threshold (float): 소프트 보팅 시 예측을 양성으로 간주할 확률 임계값

    Returns:
    np.array: 최종 예측 결과
    """
    if method == 'soft':
        # 소프트 보팅: 각 모델의 확률 평균 계산
        soft_voting_probs = np.mean(preds_or_probs, axis=0)
        # 최종 예측: 평균 확률에 대해 스레드 홀드 적용
        final_predictions = (soft_voting_probs >= threshold).astype(int)
    elif method == 'hard':
        # 하드 보팅: 각 모델의 예측을 모아서 다수결 원칙 적용
        preds = np.array(preds_or_probs)
        final_predictions = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=preds)
    else:
        raise ValueError("method 인자는 'soft' 또는 'hard'여야 합니다.")
    
    return final_predictions

### 공정별 모델 구축 (val 데이터)

- dam

In [12]:
train_model_lgbm = train_and_evaluate_model(
    'lgbm', train_data_dam
    , n_estimators=2470
    , num_leaves=2454
    , max_depth=26
    , learning_rate=0.06067228197373452
    , min_child_samples=134
    , boosting_type='dart'
    , random_state=RANDOM_STATE
    , verbose=-1
)

lgbm 모델이 train_data_dam 데이터로 학습한 결과:
F1 Score: 0.22585034013605443
---
Confusion Matrix:
[[7450  212]
 [ 357   83]]
---
Accuracy: 0.9297704270550481
Precision: 0.28135593220338984
Recall: 0.18863636363636363




In [13]:
train_model_xgb = train_and_evaluate_model(
    'xgb', train_data_dam
    , n_estimators = 1244
    , learning_rate = 0.1258535425769987
    , max_depth = 26
    , alpha = 2.1820842842359597e-06
    , gamma = 0.00010809657684921935
    , reg_alpha = 0.5844029076359536
    , reg_lambda = 0.4748752246073433
    , colsample_bytree = 0.9607659760060685
    , subsample = 0.7147741317935203
    , objective = 'binary:logistic'
    , tree_method = 'exact'
    , random_state=RANDOM_STATE
)

xgb 모델이 train_data_dam 데이터로 학습한 결과:
F1 Score: 0.21862348178137653
---
Confusion Matrix:
[[7442  220]
 [ 359   81]]
---
Accuracy: 0.9285361639101456
Precision: 0.2691029900332226
Recall: 0.18409090909090908




In [14]:
train_model_cat = train_and_evaluate_model(
    'cat', train_data_dam,
    iterations = 1478, 
    learning_rate = 0.009068953796649421, 
    depth = 11, 
    min_data_in_leaf = 2,
    l2_leaf_reg = 1.187291687951122,
    random_strength = 0.43102541391012816, 
    bagging_temperature = 3.1790702578164853, 
    border_count = 155, 
    scale_pos_weight = 1.4418307437388553,
    grow_policy = 'Depthwise',

    random_state = RANDOM_STATE,
    eval_metric = 'F1',
    logging_level = 'Silent',
    boosting_type = 'Plain'
)

cat 모델이 train_data_dam 데이터로 학습한 결과:
F1 Score: 0.25000000000000006
---
Confusion Matrix:
[[7514  148]
 [ 356   84]]
---
Accuracy: 0.9377931374969144
Precision: 0.3620689655172414
Recall: 0.19090909090909092




In [15]:
# VotingClassifier 사용 
estimators = [
    ('lgbm', train_model_lgbm)
    , ('xgb', train_model_xgb)
    , ('cat', train_model_cat)
]

# VotingClassifier 학습 및 평가
voting_clf_soft = voting_function(train_data, estimators, voting='soft', threshold=0.3)

Voting Classifier로 학습한 결과:
F1 Score: 0.22781065088757393
---
Confusion Matrix:
[[7503  159]
 [ 363   77]]
---
Accuracy: 0.9355714638360898
Precision: 0.326271186440678
Recall: 0.175




In [16]:
voting_clf_soft = voting_function(train_data, estimators, voting='soft', threshold=0.28)

Voting Classifier로 학습한 결과:
F1 Score: 0.22799422799422794
---
Confusion Matrix:
[[7488  174]
 [ 361   79]]
---
Accuracy: 0.9339669217477166
Precision: 0.31225296442687744
Recall: 0.17954545454545454




In [17]:
voting_clf_soft = voting_function(train_data, estimators, voting='soft', threshold=0.26)

Voting Classifier로 학습한 결과:
F1 Score: 0.2336578581363004
---
Confusion Matrix:
[[7467  195]
 [ 356   84]]
---
Accuracy: 0.9319921007158726
Precision: 0.3010752688172043
Recall: 0.19090909090909092




In [18]:
voting_clf_soft = voting_function(train_data, estimators, voting='soft', threshold=0.31)

Voting Classifier로 학습한 결과:
F1 Score: 0.22122571001494765
---
Confusion Matrix:
[[7507  155]
 [ 366   74]]
---
Accuracy: 0.93569489015058
Precision: 0.3231441048034934
Recall: 0.16818181818181818




- autoclave

In [19]:
train_model_lgbm = train_and_evaluate_model(
    'lgbm', train_data_autoclave
    , n_estimators=731
    , num_leaves=996
    , max_depth=273
    , learning_rate=0.0912254393922836
    , min_child_samples=195
    , boosting_type='dart'
    , random_state=RANDOM_STATE
    , verbose=-1
)

lgbm 모델이 train_data_autoclave 데이터로 학습한 결과:
F1 Score: 0.23738872403560832
---
Confusion Matrix:
[[7508  154]
 [ 360   80]]
---
Accuracy: 0.9365588743520118
Precision: 0.3418803418803419
Recall: 0.18181818181818182




In [20]:
train_model_xgb = train_and_evaluate_model(
    'xgb', train_data_autoclave,
    n_estimators = 1152, 
    learning_rate = 0.02466611382982541, 
    max_depth = 29, 
    alpha = 2.9180083404308157e-05, 
    gamma = 0.00012667501319666823, 
    reg_alpha = 0.6903592486292155, 
    reg_lambda = 0.5638873235014423, 
    colsample_bytree = 0.9432782030604233, 
    subsample = 0.19192246128663584,
    objective = 'binary:logistic',  # 이진 분류
    tree_method = "exact", 
    random_state=RANDOM_STATE
)

xgb 모델이 train_data_autoclave 데이터로 학습한 결과:
F1 Score: 0.24687933425797504
---
Confusion Matrix:
[[7470  192]
 [ 351   89]]
---
Accuracy: 0.9329795112317946
Precision: 0.3167259786476868
Recall: 0.20227272727272727




In [21]:
train_model_cat = train_and_evaluate_model(
    'cat', train_data_autoclave,
    iterations = 1299, 
    learning_rate =  0.03808793470493637, 
    depth = 9, 
    min_data_in_leaf = 5,
    l2_leaf_reg = 4.942829707223811, 
    random_strength = 3.804933757402697, 
    bagging_temperature = 1.3151583440997139, 
    border_count = 286, 
    scale_pos_weight = 1.9749286362629779,
    grow_policy = 'SymmetricTree',

    random_state = RANDOM_STATE,
    eval_metric = 'F1',
    logging_level = 'Silent',
    boosting_type = 'Plain'
)

cat 모델이 train_data_autoclave 데이터로 학습한 결과:
F1 Score: 0.2529002320185615
---
Confusion Matrix:
[[7349  313]
 [ 331  109]]
---
Accuracy: 0.9205134534682794
Precision: 0.25829383886255924
Recall: 0.24772727272727274




In [22]:
# VotingClassifier 사용 
estimators = [
    ('lgbm', train_model_lgbm)
    , ('xgb', train_model_xgb)
    , ('cat', train_model_cat)
]

# VotingClassifier 학습 및 평가
voting_clf_soft = voting_function(train_data, estimators, voting='soft', threshold=0.3)

Voting Classifier로 학습한 결과:
F1 Score: 0.23123123123123118
---
Confusion Matrix:
[[7513  149]
 [ 363   77]]
---
Accuracy: 0.9368057269809923
Precision: 0.3407079646017699
Recall: 0.175




In [23]:
voting_clf_soft = voting_function(train_data, estimators, voting='soft', threshold=0.28)

Voting Classifier로 학습한 결과:
F1 Score: 0.2344428364688857
---
Confusion Matrix:
[[7492  170]
 [ 359   81]]
---
Accuracy: 0.9347074796346581
Precision: 0.32270916334661354
Recall: 0.18409090909090908




In [24]:
voting_clf_soft = voting_function(train_data, estimators, voting='soft', threshold=0.26)

Voting Classifier로 학습한 결과:
F1 Score: 0.24000000000000002
---
Confusion Matrix:
[[7464  198]
 [ 353   87]]
---
Accuracy: 0.9319921007158726
Precision: 0.30526315789473685
Recall: 0.19772727272727272




In [25]:
voting_clf_soft = voting_function(train_data, estimators, voting='soft', threshold=0.31)

Voting Classifier로 학습한 결과:
F1 Score: 0.23112480739599384
---
Confusion Matrix:
[[7528  134]
 [ 365   75]]
---
Accuracy: 0.9384102690693656
Precision: 0.3588516746411483
Recall: 0.17045454545454544




In [None]:
voting_clf_soft = voting_function(train_data, estimators, voting='soft', threshold=0.29)

In [None]:
voting_clf_soft = voting_function(train_data, estimators, voting='soft', threshold=0.24)

- fill1

In [None]:
train_model_lgbm = train_and_evaluate_model(
    'lgbm', train_data_fill1
    , n_estimators=821
    , num_leaves=1400
    , max_depth=52
    , learning_rate=0.002743887584386348
    , min_child_samples=231
    , boosting_type='dart'
    , random_state=RANDOM_STATE
    , verbose=-1
)

In [None]:
train_model_xgb = train_and_evaluate_model(
    'xgb', train_data_fill1,
    n_estimators = 1899, 
    learning_rate = 0.011878583548993711, 
    max_depth = 12, 
    alpha = 0.004515243354832891,
    gamma = 0.0015693650802180896,
    reg_alpha = 0.7484424912256998, 
    reg_lambda = 0.27164326303977143, 
    colsample_bytree = 0.7901385059430825,
    subsample = 0.9924662032617025,
    objective = 'binary:logistic',
    tree_method = 'exact',
    random_state=RANDOM_STATE
)

In [None]:
train_model_cat = train_and_evaluate_model(
    'cat', train_data_fill1,
    iterations = 2842, 
    learning_rate = 0.01099464761153367, 
    depth = 4, 
    min_data_in_leaf = 3,
    l2_leaf_reg = 3.7373183252945945, 
    random_strength = 9.3675281753561, 
    bagging_temperature = 4.750112155842117, 
    border_count = 160, 
    scale_pos_weight = 2.53860325765727,
    grow_policy = 'Lossguide',

    random_state = RANDOM_STATE,
    eval_metric = 'F1',
    logging_level = 'Silent',
    boosting_type = 'Plain'
)

In [None]:
# VotingClassifier 사용 
estimators = [
    ('lgbm', train_model_lgbm)
    , ('xgb', train_model_xgb)
    , ('cat', train_model_cat)
]

# VotingClassifier 학습 및 평가
voting_clf_soft = voting_function(train_data, estimators, voting='soft', threshold=0.3)

- fill2

In [None]:
train_model_lgbm = train_and_evaluate_model(
    'lgbm', train_data_fill2
    , n_estimators=1005
    , num_leaves=2304
    , max_depth=293
    , learning_rate=0.08460539739469425
    , min_child_samples=272
    , boosting_type='dart'
    , random_state=RANDOM_STATE
    , verbose=-1
)

In [None]:
train_model_xgb = train_and_evaluate_model(
    'xgb', train_data_fill2,
    n_estimators = 1162, 
    learning_rate = 0.014523070494025153, 
    max_depth = 8, 
    alpha = 0.00012198482017902725, 
    gamma = 0.001236902841680112, 
    reg_alpha = 0.7331637000614692, 
    reg_lambda = 0.5237223061096699, 
    colsample_bytree = 0.8250374170841293, 
    subsample = 0.31906427054137687,
    objective = 'binary:logistic',
    tree_method = 'exact',
    random_state=RANDOM_STATE
)

In [None]:
train_model_cat = train_and_evaluate_model(
    'cat', train_data_fill2,
    iterations = 1458, 
    learning_rate = 0.004706507801075929, 
    depth = 13, 
    min_data_in_leaf = 4,
    l2_leaf_reg = 1.909987690181427, 
    random_strength = 9.047942432889677, 
    bagging_temperature = 3.545210494821586, 
    border_count = 300, 
    scale_pos_weight = 3.4781865667208467,
    grow_policy = 'Lossguide',


    random_state = RANDOM_STATE,
    eval_metric = 'F1',
    logging_level = 'Silent',
    boosting_type = 'Plain'
)

In [None]:
# VotingClassifier 사용 
estimators = [
    ('lgbm', train_model_lgbm)
    , ('xgb', train_model_xgb)
    , ('cat', train_model_cat)
]

# VotingClassifier 학습 및 평가
voting_clf_soft = voting_function(train_data, estimators, voting='soft', threshold=0.3)

- all

In [None]:
train_model_lgbm = train_and_evaluate_model(
    'lgbm', train_data
    , n_estimators=1496
    , num_leaves=1611
    , max_depth=148
    , learning_rate=0.0822880159816304
    , min_child_samples=194
    , boosting_type='dart'
    , random_state=RANDOM_STATE
    , verbose=-1
)

In [None]:
train_model_xgb = train_and_evaluate_model(
    'xgb', train_data,
    n_estimators = 2427,
    learning_rate = 0.010774204513905965, 
    max_depth = 17, 
    alpha = 0.0005233654110538582, 
    gamma = 5.551445919277608e-05, 
    reg_alpha = 0.9652805882189326, 
    reg_lambda = 0.3542856398135083, 
    colsample_bytree = 0.9094884645797131, 
    subsample = 0.1733751790853043,
    objective = 'binary:logistic',  # 이진 분류
    tree_method = "exact", 
    random_state=RANDOM_STATE
)

In [None]:
train_model_cat = train_and_evaluate_model(
    'cat', train_data,
    iterations=1349,
    learning_rate=0.012526639112437014,
    depth=9,
    min_data_in_leaf=4,
    l2_leaf_reg=2.245006704049574,
    random_strength=0.6922797458293842,
    bagging_temperature=8.230635636022027,
    border_count=211,
    scale_pos_weight=2.0709015241138236,
    grow_policy='Depthwise',
    
    random_state=RANDOM_STATE,
    eval_metric='F1',
    logging_level='Silent',
    boosting_type='Plain'
)

In [None]:
# VotingClassifier 사용 
estimators = [
    ('lgbm', train_model_lgbm)
    , ('xgb', train_model_xgb)
    , ('cat', train_model_cat)
]

# VotingClassifier 학습 및 평가
voting_clf_soft = voting_function(train_data, estimators, voting='soft', threshold=0.3)

### 공정별 모델 구축 (전체 데이터)

- dam

In [None]:
model_Dam_lgbm = fit_all_train_data_function(
    'lgbm', train_data_dam
    , n_estimators=2470
    , num_leaves=2454
    , max_depth=26
    , learning_rate=0.06067228197373452
    , min_child_samples=134
    , boosting_type='dart'
    , random_state=RANDOM_STATE
    , verbose=-1
)

model_Dam_xgb = fit_all_train_data_function(
    'xgb', train_data_dam
    , n_estimators = 1244
    , learning_rate = 0.1258535425769987
    , max_depth = 26
    , alpha = 2.1820842842359597e-06
    , gamma = 0.00010809657684921935
    , reg_alpha = 0.5844029076359536
    , reg_lambda = 0.4748752246073433
    , colsample_bytree = 0.9607659760060685
    , subsample = 0.7147741317935203
    , objective = 'binary:logistic'
    , tree_method = 'exact'
    , random_state=RANDOM_STATE
)

model_Dam_cat = fit_all_train_data_function(
    'cat', train_data_dam,
    iterations = 1478, 
    learning_rate = 0.009068953796649421, 
    depth = 11, 
    min_data_in_leaf = 2,
    l2_leaf_reg = 1.187291687951122,
    random_strength = 0.43102541391012816, 
    bagging_temperature = 3.1790702578164853, 
    border_count = 155, 
    scale_pos_weight = 1.4418307437388553,
    grow_policy = 'Depthwise',

    random_state = RANDOM_STATE,
    eval_metric = 'F1',
    logging_level = 'Silent',
    boosting_type = 'Plain'
)

In [None]:
# 예측에 필요한 데이터 분리
x_test_dam = test_data_dam.drop(["target", "Set ID"], axis=1)

# 모델별 예측 확률
proba_lgbm = model_Dam_lgbm.predict_proba(x_test_dam)[:, 1]
proba_xgb = model_Dam_xgb.predict_proba(x_test_dam)[:, 1]
proba_cat = model_Dam_cat.predict_proba(x_test_dam)[:, 1]

# dam 예측 확률 평균
proba_dam = (proba_lgbm + proba_xgb + proba_cat)/3

- autoclave

In [None]:
model_AutoClave_lgbm = fit_all_train_data_function(
    'lgbm', train_data_autoclave
    , n_estimators=731
    , num_leaves=996
    , max_depth=273
    , learning_rate=0.0912254393922836
    , min_child_samples=195
    , boosting_type='dart'
    , random_state=RANDOM_STATE
    , verbose=-1
)

model_AutoClave_xgb = fit_all_train_data_function(
    'xgb', train_data_autoclave,
    n_estimators = 1152, 
    learning_rate = 0.02466611382982541, 
    max_depth = 29, 
    alpha = 2.9180083404308157e-05, 
    gamma = 0.00012667501319666823, 
    reg_alpha = 0.6903592486292155, 
    reg_lambda = 0.5638873235014423, 
    colsample_bytree = 0.9432782030604233, 
    subsample = 0.19192246128663584,
    objective = 'binary:logistic',  # 이진 분류
    tree_method = "exact", 
    random_state=RANDOM_STATE
)

model_AutoClave_cat = fit_all_train_data_function(
    'cat', train_data_autoclave,
    iterations = 1299, 
    learning_rate =  0.03808793470493637, 
    depth = 9, 
    min_data_in_leaf = 5,
    l2_leaf_reg = 4.942829707223811, 
    random_strength = 3.804933757402697, 
    bagging_temperature = 1.3151583440997139, 
    border_count = 286, 
    scale_pos_weight = 1.9749286362629779,
    grow_policy = 'SymmetricTree',

    random_state = RANDOM_STATE,
    eval_metric = 'F1',
    logging_level = 'Silent',
    boosting_type = 'Plain'
)

In [None]:
# 예측에 필요한 데이터 분리
x_test_autoclave = test_data_autoclave.drop(["target", "Set ID"], axis=1)

# 모델별 예측 확률
proba_lgbm = model_AutoClave_lgbm.predict_proba(x_test_autoclave)[:, 1]
proba_xgb = model_AutoClave_xgb.predict_proba(x_test_autoclave)[:, 1]
proba_cat = model_AutoClave_cat.predict_proba(x_test_autoclave)[:, 1]

# autoclave 예측 확률 평균
proba_autoclave = (proba_lgbm + proba_xgb + proba_cat)/3

- fill1

In [None]:
model_Fill1_lgbm = fit_all_train_data_function(
    'lgbm', train_data_fill1
    , n_estimators=821
    , num_leaves=1400
    , max_depth=52
    , learning_rate=0.002743887584386348
    , min_child_samples=231
    , boosting_type='dart'
    , random_state=RANDOM_STATE
    , verbose=-1
)

model_Fill1_xgb = fit_all_train_data_function(
    'xgb', train_data_fill1,
    n_estimators = 1899, 
    learning_rate = 0.011878583548993711, 
    max_depth = 12, 
    alpha = 0.004515243354832891,
    gamma = 0.0015693650802180896,
    reg_alpha = 0.7484424912256998, 
    reg_lambda = 0.27164326303977143, 
    colsample_bytree = 0.7901385059430825,
    subsample = 0.9924662032617025,
    objective = 'binary:logistic',
    tree_method = 'exact',
    random_state=RANDOM_STATE
)

model_Fill1_cat = fit_all_train_data_function(
    'cat', train_data_fill1,
    iterations = 2842, 
    learning_rate = 0.01099464761153367, 
    depth = 4, 
    min_data_in_leaf = 3,
    l2_leaf_reg = 3.7373183252945945, 
    random_strength = 9.3675281753561, 
    bagging_temperature = 4.750112155842117, 
    border_count = 160, 
    scale_pos_weight = 2.53860325765727,
    grow_policy = 'Lossguide',

    random_state = RANDOM_STATE,
    eval_metric = 'F1',
    logging_level = 'Silent',
    boosting_type = 'Plain'
)

In [None]:
# 예측에 필요한 데이터 분리
x_test_fill1 = test_data_fill1.drop(["target", "Set ID"], axis=1)

# 모델별 예측 확률
proba_lgbm = model_Fill1_lgbm.predict_proba(x_test_fill1)[:, 1]
proba_xgb = model_Fill1_xgb.predict_proba(x_test_fill1)[:, 1]
proba_cat = model_Fill1_cat.predict_proba(x_test_fill1)[:, 1]

# fill1 예측 확률 평균
proba_fill1 = (proba_lgbm + proba_xgb + proba_cat)/3

- fill2

In [None]:
model_Fill2_lgbm = fit_all_train_data_function(
    'lgbm', train_data_fill2
    , n_estimators=1005
    , num_leaves=2304
    , max_depth=293
    , learning_rate=0.08460539739469425
    , min_child_samples=272
    , boosting_type='dart'
    , random_state=RANDOM_STATE
    , verbose=-1
)

model_Fill2_xgb = fit_all_train_data_function(
    'xgb', train_data_fill2,
    n_estimators = 1162, 
    learning_rate = 0.014523070494025153, 
    max_depth = 8, 
    alpha = 0.00012198482017902725, 
    gamma = 0.001236902841680112, 
    reg_alpha = 0.7331637000614692, 
    reg_lambda = 0.5237223061096699, 
    colsample_bytree = 0.8250374170841293, 
    subsample = 0.31906427054137687,
    objective = 'binary:logistic',
    tree_method = 'exact',
    random_state=RANDOM_STATE
)

model_Fill2_cat = fit_all_train_data_function(
    'cat', train_data_fill2,
    iterations = 1458, 
    learning_rate = 0.004706507801075929, 
    depth = 13, 
    min_data_in_leaf = 4,
    l2_leaf_reg = 1.909987690181427, 
    random_strength = 9.047942432889677, 
    bagging_temperature = 3.545210494821586, 
    border_count = 300, 
    scale_pos_weight = 3.4781865667208467,
    grow_policy = 'Lossguide',


    random_state = RANDOM_STATE,
    eval_metric = 'F1',
    logging_level = 'Silent',
    boosting_type = 'Plain'
)

In [None]:
# 예측에 필요한 데이터 분리
x_test_fill2 = test_data_fill2.drop(["target", "Set ID"], axis=1)

# 모델별 예측 확률
proba_lgbm = model_Fill2_lgbm.predict_proba(x_test_fill2)[:, 1]
proba_xgb = model_Fill2_xgb.predict_proba(x_test_fill2)[:, 1]
proba_cat = model_Fill2_cat.predict_proba(x_test_fill2)[:, 1]

# fill2 예측 확률 평균
proba_fill2 = (proba_lgbm + proba_xgb + proba_cat)/3

- all

In [None]:
model_All_lgbm = fit_all_train_data_function(
    'lgbm', train_data
    , n_estimators=1496
    , num_leaves=1611
    , max_depth=148
    , learning_rate=0.0822880159816304
    , min_child_samples=194
    , boosting_type='dart'
    , random_state=RANDOM_STATE
    , verbose=-1
)

model_All_xgb = fit_all_train_data_function(
    'xgb', train_data,
    n_estimators = 2427,
    learning_rate = 0.010774204513905965, 
    max_depth = 17, 
    alpha = 0.0005233654110538582, 
    gamma = 5.551445919277608e-05, 
    reg_alpha = 0.9652805882189326, 
    reg_lambda = 0.3542856398135083, 
    colsample_bytree = 0.9094884645797131, 
    subsample = 0.1733751790853043,
    objective = 'binary:logistic',  # 이진 분류
    tree_method = "exact", 
    random_state=RANDOM_STATE
)

model_All_cat = fit_all_train_data_function(
    'cat', train_data,
    iterations=1349,
    learning_rate=0.012526639112437014,
    depth=9,
    min_data_in_leaf=4,
    l2_leaf_reg=2.245006704049574,
    random_strength=0.6922797458293842,
    bagging_temperature=8.230635636022027,
    border_count=211,
    scale_pos_weight=2.0709015241138236,
    grow_policy='Depthwise',
    
    random_state=RANDOM_STATE,
    eval_metric='F1',
    logging_level='Silent',
    boosting_type='Plain'
)

In [None]:
# 예측에 필요한 데이터 분리
x_test_all = train_data.drop(["target", "Set ID"], axis=1)

# 모델별 예측 확률
proba_lgbm = model_All_lgbm.predict_proba(x_test_all)[:, 1]
proba_xgb = model_All_xgb.predict_proba(x_test_all)[:, 1]
proba_cat = model_All_cat.predict_proba(x_test_all)[:, 1]

# fill2 예측 확률 평균
proba_all = (proba_lgbm + proba_xgb + proba_cat)/3