# KampVoter

## Import Library

### Basic Libary

In [2]:
from kamp.preprocess import KampDataLoader
from kamp.models import KampVoter

from sklearn.model_selection import RandomizedSearchCV

### Models

In [3]:
# Forest Models
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

# Boosting Models
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier

# Linear Models
from sklearn.linear_model import LogisticRegression

# Discriminan Models
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

# Distance Based Models
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# NN Models
from sklearn.neural_network import MLPClassifier

## Data Load

In [7]:
DATA_PATH = './data/경진대회용 주조 공정최적화 데이터셋.csv'


data_loader = KampDataLoader(
    path = DATA_PATH,

    # 처리 안한게 더 좋았음
    # 처리 안한 것 : 0.944
    # 처리 한 것 : 최대 0.922
    do_count_trend=False,
    drop_count=False,

    get_useful_p_data=True,
    p_threshold=0.05,

    outlier_method='iso',
    iso_outlier_rate=0.0075,

    do_resample=False,
    # downsampled_pass_rate=1.0,
    # upsampled_fail_rate_about_pass=0.10,
    # upsample_method='adasyn',

    scale_include_cat=True
)

data_loader.process()

data = data_loader.load()

x_train = data['train_data'].drop('passorfail',axis=1)
y_train = data['train_label']
x_test = data['test_data'].drop('passorfail',axis=1)
y_test=  data['test_label']


[Process Log] Loading Raw Data...
[Process Log] Done

[Process Log] Processing Nan Value...
[Process Log] Done

[Process Log] Encoding Categorical Features...
[Process Log] Done

[Process Log] Removing Outliers (IsoForest)...
[Outlier-Remover Log] With Outliers Shape : (89753, 23)
[Outlier-Remover Log] Without Outliers Shape : (89079, 23)
[Process Log] Done

[Process Log] T-Testing...
[Process Log] Done

[Process Log] Data Scaling (MinMaxScaler)...
[Process Log] Done

[Process Log] Train Test Spliting...
[Process Log] Done



In [9]:
x_test

Unnamed: 0,count,working,facility_operation_cycleTime,production_cycletime,low_section_speed,high_section_speed,cast_pressure,biscuit_thickness,upper_mold_temp1,upper_mold_temp2,lower_mold_temp1,lower_mold_temp2,sleeve_temperature,EMS_operation_time,tryshot_signal,mold_code,heating_furnace
0,0.576577,1.0,0.118557,0.278008,0.733333,0.288660,0.927273,0.120853,0.091737,0.037099,0.464183,0.159664,0.051336,0.00,0.0,0.500000,0.0
1,0.291291,1.0,0.131443,0.253112,0.733333,0.288660,0.922727,0.120853,0.149860,0.039477,0.544413,0.130952,0.404360,0.92,0.0,0.333333,0.0
2,0.243243,1.0,0.123711,0.246888,0.733333,0.288660,0.922727,0.132701,0.057423,0.033532,0.593123,0.086835,0.325598,0.92,0.0,1.000000,0.0
3,0.819820,1.0,0.126289,0.275934,0.733333,0.288660,0.918182,0.118483,0.123950,0.038526,0.710602,0.114846,0.271449,0.92,0.0,1.000000,1.0
4,0.759760,1.0,0.123711,0.244813,0.733333,0.288660,0.918182,0.125592,0.060224,0.037812,0.673352,0.089636,0.316456,0.92,0.0,1.000000,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17811,0.648649,1.0,0.136598,0.255187,0.733333,0.288660,0.918182,0.113744,0.148459,0.043282,0.575931,0.125350,0.263713,0.92,0.0,0.166667,1.0
17812,0.021021,1.0,0.144330,0.170124,0.726667,0.288660,0.140909,0.127962,0.133754,0.026873,0.318052,0.097339,0.308720,0.92,0.0,0.333333,0.5
17813,0.336336,1.0,0.128866,0.251037,0.700000,0.273196,0.890909,0.109005,0.127451,0.035196,0.550143,0.161765,0.309423,0.92,0.0,0.833333,1.0
17814,0.099099,1.0,0.193299,0.244813,0.733333,0.291237,0.927273,0.118483,0.133053,0.029251,0.358166,0.144258,0.094233,0.24,0.0,0.000000,0.5


## Modeling

### Best Version

```python
voting_models = {
    'catboost' : best_catboost,
    'lgbm' : best_lgbm,
    'xgb' : best_xgb,
}

model_weights = [1.5,1,1]

kamp_voter = KampVoter(voting_models=voting_models,
                       model_weights=model_weights, 
                       voting_method='soft')
```

```python
"-----------------------------------------------------------"
f1_score : 0.9614791987673343

confusion matrix : 
[[17142    18]
 [   32   624]]

classification report : 
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     17160
         1.0       0.97      0.95      0.96       656

    accuracy                           1.00     17816
   macro avg       0.99      0.98      0.98     17816
weighted avg       1.00      1.00      1.00     17816
```

### Base Model Tuning

#### CatBoostClassifier

In [None]:
cat_param_grid = {
    'iterations' : [1000, 1500],
    'learning_rate' : [0.01, 0.03, 0.05],
    'depth' : [4, 6],
    'l2_leaf_reg' : [1, 3, 5],
    # 'loss_function' : ['CrossEntropy'],
    'random_seed' : [42],
    'verbose' : [0],
}

random_search = RandomizedSearchCV(
    estimator = CatBoostClassifier(
                           task_type="GPU",
                           devices='0'
                           ),
    param_distributions = cat_param_grid,
    n_iter = 20,
    cv = 3,
    verbose = 2,
    random_state = 42,
    # n_jobs = -1
)

random_search.fit(x_train, y_train)

print(f"Best HyperParameters : {random_search.best_params_}")

best_catboost = random_search.best_estimator_
# Best HyperParameters : {'verbose': 0, 'random_seed': 42, 'learning_rate': 0.05, 'l2_leaf_reg': 1, 'iterations': 1500, 'depth': 6}

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END depth=6, iterations=1500, l2_leaf_reg=5, learning_rate=0.05, random_seed=42, verbose=0; total time= 1.6min
[CV] END depth=6, iterations=1500, l2_leaf_reg=5, learning_rate=0.05, random_seed=42, verbose=0; total time= 1.6min
[CV] END depth=6, iterations=1500, l2_leaf_reg=5, learning_rate=0.05, random_seed=42, verbose=0; total time= 1.6min
[CV] END depth=4, iterations=1500, l2_leaf_reg=3, learning_rate=0.03, random_seed=42, verbose=0; total time= 1.6min
[CV] END depth=4, iterations=1500, l2_leaf_reg=3, learning_rate=0.03, random_seed=42, verbose=0; total time= 1.6min
[CV] END depth=4, iterations=1500, l2_leaf_reg=3, learning_rate=0.03, random_seed=42, verbose=0; total time= 1.6min
[CV] END depth=6, iterations=1000, l2_leaf_reg=5, learning_rate=0.05, random_seed=42, verbose=0; total time= 1.1min
[CV] END depth=6, iterations=1000, l2_leaf_reg=5, learning_rate=0.05, random_seed=42, verbose=0; total time= 1.0min
[CV] END de

#### LGBMClassifier

In [None]:
# 2m 20s
lgbm_param_grid = {
    # 'objective' : ['binary'],
    'boosting_type' : ['gbdt', 'dart'],
    # 'n_estimators' : [100, 150, 200, 250],
    'learning_rate' : [0.01, 0.1, 0.5],
    'max_depth' : [-1],
    'random_state' : [42],
    'verbose' : [0]
}

random_search = RandomizedSearchCV(
    estimator = LGBMClassifier(),
    param_distributions = lgbm_param_grid,
    n_iter = 30,
    cv = 3,
    verbose = 2,
    random_state = 42,
    n_jobs = -1
)

random_search.fit(x_train, y_train)

print(f"Best HyperParameters : {random_search.best_params_}")

best_lgbm = random_search.best_estimator_
# {'verbose': 0, 'random_state': 42, 'max_depth': -1, 'learning_rate': 0.1, 'boosting_type': 'gbdt'}

Fitting 3 folds for each of 6 candidates, totalling 18 fits




[CV] END boosting_type=gbdt, learning_rate=0.5, max_depth=-1, random_state=42, verbose=0; total time=16.3min
[CV] END boosting_type=gbdt, learning_rate=0.5, max_depth=-1, random_state=42, verbose=0; total time=20.8min
[CV] END boosting_type=dart, learning_rate=0.01, max_depth=-1, random_state=42, verbose=0; total time=21.5min
[CV] END boosting_type=dart, learning_rate=0.01, max_depth=-1, random_state=42, verbose=0; total time=21.7min
[CV] END boosting_type=dart, learning_rate=0.01, max_depth=-1, random_state=42, verbose=0; total time=21.7min
[CV] END boosting_type=gbdt, learning_rate=0.01, max_depth=-1, random_state=42, verbose=0; total time=21.9min
[CV] END boosting_type=gbdt, learning_rate=0.5, max_depth=-1, random_state=42, verbose=0; total time=22.0min
[CV] END boosting_type=gbdt, learning_rate=0.01, max_depth=-1, random_state=42, verbose=0; total time=22.2min
[CV] END boosting_type=gbdt, learning_rate=0.01, max_depth=-1, random_state=42, verbose=0; total time=22.2min
[CV] END boos

#### XGBClassifier

In [None]:
xgb_param_grid = {
    # 'booster' : ['gbtree'],
    'n_estimators' : [100, 150, 200],
    'learning_rate' : [0.1, 0.3, 0.5],
    'max_depth' : [10, 12, 14, 16],
    'random_state' : [42],
    'verbosity' : [0]
}

random_search = RandomizedSearchCV(
    estimator = XGBClassifier(tree_method="hist", device="cuda"),
    param_distributions = xgb_param_grid,
    n_iter = 30,
    cv = 3,
    verbose = 2,
    random_state = 42,
    # n_jobs = -1
)

random_search.fit(x_train, y_train)

print(f"Best HyperParameters : {random_search.best_params_}")

best_xgb = random_search.best_estimator_
# {'verbosity': 0, 'random_state': 42, 'n_estimators': 150, 'max_depth': 16, 'learning_rate': 0.5}

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[CV] END learning_rate=0.5, max_depth=16, n_estimators=200, random_state=42, verbosity=0; total time=   1.8s
[CV] END learning_rate=0.5, max_depth=16, n_estimators=200, random_state=42, verbosity=0; total time=   1.6s
[CV] END learning_rate=0.5, max_depth=16, n_estimators=200, random_state=42, verbosity=0; total time=   1.6s
[CV] END learning_rate=0.3, max_depth=10, n_estimators=150, random_state=42, verbosity=0; total time=   1.3s
[CV] END learning_rate=0.3, max_depth=10, n_estimators=150, random_state=42, verbosity=0; total time=   1.1s
[CV] END learning_rate=0.3, max_depth=10, n_estimators=150, random_state=42, verbosity=0; total time=   1.1s
[CV] END learning_rate=0.5, max_depth=10, n_estimators=200, random_state=42, verbosity=0; total time=   1.2s
[CV] END learning_rate=0.5, max_depth=10, n_estimators=200, random_state=42, verbosity=0; total time=   1.2s
[CV] END learning_rate=0.5, max_depth=10, n_estimators=200, random_

### Voting Model Tuning

In [88]:
voting_models = {
    'catboost' : best_catboost,
    'lgbm' : best_lgbm,
    'xgb' : best_xgb,
    # 'rf' : RandomForestClassifier(random_state=42)
}

model_weights = [1.5,
                 1.0,
                 1.0,
                #  1.0
                 ]

kamp_voter = KampVoter(voting_models = voting_models,
                       model_weights = model_weights, 
                       voting_method = 'soft')

In [89]:
kamp_voter.fit(x_train, y_train)

[Voting] ................. (1 of 3) Processing catboost, total=  16.2s
[Voting] ..................... (2 of 3) Processing lgbm, total=   0.3s
[Voting] ...................... (3 of 3) Processing xgb, total=   0.9s


## Evaluation

In [90]:
kamp_voter.evaluate(x_train, y_train)

f1_score : 0.9992369324685234

confusion matrix : 
[[68640     0]
 [    4  2619]]

classification report : 
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     68640
         1.0       1.00      1.00      1.00      2623

    accuracy                           1.00     71263
   macro avg       1.00      1.00      1.00     71263
weighted avg       1.00      1.00      1.00     71263




In [91]:
kamp_voter.evaluate(x_test, y_test)

f1_score : 0.9606177606177606

confusion matrix : 
[[17143    17]
 [   34   622]]

classification report : 
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     17160
         1.0       0.97      0.95      0.96       656

    accuracy                           1.00     17816
   macro avg       0.99      0.97      0.98     17816
weighted avg       1.00      1.00      1.00     17816




In [87]:
# CatBoost
# [[17138    22]
#  [   35   621]]

# LGBM
# [[17139    21]
#  [   34   622]]

# XGB
# [[17140    20]
#  [   34   622]]
