# KampVoter

## Import Library

### Basic Library

In [4]:
from kamp.preprocess import KampDataLoader
from kamp.models import KampVoter

from sklearn.model_selection import RandomizedSearchCV

### Models

In [5]:
# Forest Models
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

# Boosting Models
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier

# Linear Models
from sklearn.linear_model import LogisticRegression

# Discriminant Models
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

# Distance Based Models
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# NN Models
from sklearn.neural_network import MLPClassifier

## Data Load

In [6]:
DATA_PATH = './data/경진대회용 주조 공정최적화 데이터셋.csv'

# Configure preprocessing for the casting-process dataset, run it, and pull
# out the train/test splits used by every later cell.
data_loader = KampDataLoader(
    path=DATA_PATH,
    # Leaving this processing off gave better results
    # (without: 0.944, with: at most 0.922).
    # NOTE(review): presumably refers to the two count options below and to
    # the score reported by the evaluation cells -- confirm.
    do_count_trend=False,
    drop_count=False,
    get_useful_p_data=True,
    p_threshold=0.05,
    outlier_method='iso',
    iso_outlier_rate=0.0075,
    # Resampling is disabled; the related options are kept for reference:
    #   downsampled_pass_rate=1.0,
    #   upsampled_fail_rate_about_pass=0.10,
    #   upsample_method='adasyn',
    do_resample=False,
    scale_include_cat=False,
)
data_loader.process()

data = data_loader.load()
x_train, y_train = data['train_data'], data['train_label']
x_test, y_test = data['test_data'], data['test_label']


[Process Log] Loading Raw Data...
[Process Log] Done

[Process Log] Processing Nan Value...
[Process Log] Done

[Process Log] Encoding Categorical Features...
[Process Log] Done

[Process Log] Removing Outliers (IsoForest)...
[Outlier-Remover Log] With Outliers Shape : (89753, 23)
[Outlier-Remover Log] Without Outliers Shape : (89079, 23)
[Process Log] Done

[Process Log] T-Testing...
[Process Log] Done

[Process Log] Data Scaling (MinMaxScaler)...
[Process Log] Done

[Process Log] Train Test Spliting...
[Process Log] Done



In [None]:
# Attach the labels to a copy of the test features, export the combined
# frame to CSV, and display it.
test = x_test.assign(passorfail=y_test)
test.to_csv('test_output.csv')
test

Unnamed: 0,count,facility_operation_cycleTime,production_cycletime,low_section_speed,high_section_speed,cast_pressure,biscuit_thickness,upper_mold_temp1,upper_mold_temp2,lower_mold_temp1,lower_mold_temp2,sleeve_temperature,tryshot_signal,working,EMS_operation_time,mold_code,heating_furnace,passorfail
0,0.753754,0.128866,0.248963,0.733333,0.288660,0.927273,0.127962,0.057423,0.037337,0.670487,0.089636,0.318565,0.0,1.0,23,6.0,0.0,0.0
1,0.867868,0.121134,0.244813,0.733333,0.288660,0.890909,0.111374,0.140056,0.031153,0.320917,0.142157,0.111111,0.0,1.0,6,0.0,1.0,0.0
2,0.756757,0.126289,0.255187,0.700000,0.275773,0.895455,0.113744,0.133053,0.037099,0.598854,0.166667,0.270745,0.0,1.0,23,5.0,2.0,0.0
3,0.483483,0.128866,0.248963,0.733333,0.288660,0.918182,0.109005,0.108543,0.022592,0.716332,0.116246,0.309423,0.0,1.0,23,5.0,2.0,0.0
4,0.279279,0.131443,0.253112,0.733333,0.288660,0.918182,0.125592,0.057423,0.035910,0.590258,0.088936,0.317862,0.0,1.0,23,6.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22265,0.507508,0.121134,0.240664,0.733333,0.286082,0.922727,0.125592,0.163165,0.045660,0.702006,0.142857,0.188467,0.0,1.0,0,6.0,0.0,0.0
22266,0.579580,0.126289,0.251037,0.933333,0.371134,0.931818,0.113744,0.107143,0.028300,0.429799,0.196078,0.343179,0.0,1.0,23,0.0,2.0,0.0
22267,0.042042,0.121134,0.244813,0.733333,0.288660,0.931818,0.130332,0.057423,0.032105,0.618911,0.084034,0.319972,0.0,1.0,23,6.0,1.0,0.0
22268,0.624625,0.118557,0.242739,0.733333,0.288660,0.913636,0.113744,0.113445,0.036861,0.687679,0.110644,0.271449,0.0,1.0,23,6.0,2.0,0.0


## Modeling

### Best Version

```python
voting_models = {
    'catboost' : best_catboost,
    'lgbm' : best_lgbm,
    'xgb' : best_xgb,
}

model_weights = [1.5,1,1]

kamp_voter = KampVoter(voting_models=voting_models,
                       model_weights=model_weights, 
                       voting_method='soft')
```

```text
"-----------------------------------------------------------"
f1_score : 0.9614791987673343

confusion matrix : 
[[17142    18]
 [   32   624]]

classification report : 
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     17160
         1.0       0.97      0.95      0.96       656

    accuracy                           1.00     17816
   macro avg       0.99      0.98      0.98     17816
weighted avg       1.00      1.00      1.00     17816
```

### Base Model Tuning

#### CatBoostClassifier

In [7]:
# Randomized hyper-parameter search for CatBoostClassifier (currently disabled).
# When uncommented, this cell defines `best_catboost`, the estimator that the
# "Best Version" snippet above plugs in as the 'catboost' voter.
# NOTE(review): no `scoring` argument is passed, so candidates would be ranked
# by the estimator's default score rather than the F1 score reported in the
# evaluation cells -- confirm that this is intended.
# cat_param_grid = {
#     'iterations' : [1000, 1500],
#     'learning_rate' : [0.01, 0.03, 0.05],
#     'depth' : [4, 6],
#     'l2_leaf_reg' : [1, 3, 5],
#     'random_seed' : [42],
#     'verbose' : [0]
# }

# random_search = RandomizedSearchCV(
#     estimator = CatBoostClassifier(),
#     param_distributions = cat_param_grid,
#     n_iter = 20,
#     cv = 3,
#     verbose = 2,
#     random_state = 42,
#     n_jobs = -1
# )

# random_search.fit(x_train, y_train)

# print(f"Best HyperParameters : {random_search.best_params_}")

# best_catboost = random_search.best_estimator_

#### LGBMClassifier

In [8]:
# Randomized hyper-parameter search for LGBMClassifier (currently disabled).
# When uncommented, this cell defines `best_lgbm`, the estimator that the
# "Best Version" snippet above plugs in as the 'lgbm' voter.
# Recorded note: 2m 20s (presumably the run time of this cell).
# NOTE(review): with 'n_estimators' commented out the grid holds only
# 2 x 3 = 6 combinations, fewer than n_iter = 30, so the search cannot try
# more than those 6 candidates -- confirm n_iter or widen the grid.
# NOTE(review): no `scoring` argument is passed, so candidates would be ranked
# by the estimator's default score rather than the F1 score reported in the
# evaluation cells -- confirm that this is intended.
# lgbm_param_grid = {
#     'boosting_type' : ['gbdt', 'dart'],
#     # 'n_estimators' : [100, 150, 200, 250],
#     'learning_rate' : [0.01, 0.1, 0.5],
#     'max_depth' : [-1],
#     'random_state' : [42],
#     'verbose' : [0]
# }

# random_search = RandomizedSearchCV(
#     estimator = LGBMClassifier(),
#     param_distributions = lgbm_param_grid,
#     n_iter = 30,
#     cv = 3,
#     verbose = 2,
#     random_state = 42,
#     n_jobs = -1
# )

# random_search.fit(x_train, y_train)

# print(f"Best HyperParameters : {random_search.best_params_}")

# best_lgbm = random_search.best_estimator_

#### XGBClassifier

In [9]:
# Randomized hyper-parameter search for XGBClassifier (currently disabled).
# When uncommented, this cell defines `best_xgb`, the estimator that the
# "Best Version" snippet above plugs in as the 'xgb' voter.
# NOTE(review): no `scoring` argument is passed, so candidates would be ranked
# by the estimator's default score rather than the F1 score reported in the
# evaluation cells -- confirm that this is intended.
# xgb_param_grid = {
#     # 'booster' : ['gbtree'],
#     'n_estimators' : [100, 150, 200],
#     'learning_rate' : [0.1, 0.3, 0.5],
#     'max_depth' : [10, 12, 14, 16],
#     'random_state' : [42],
#     'verbosity' : [0]
# }

# random_search = RandomizedSearchCV(
#     estimator = XGBClassifier(),
#     param_distributions = xgb_param_grid,
#     n_iter = 30,
#     cv = 3,
#     verbose = 2,
#     random_state = 42,
#     n_jobs = -1
# )

# random_search.fit(x_train, y_train)

# print(f"Best HyperParameters : {random_search.best_params_}")

# best_xgb = random_search.best_estimator_

### Voting Model Tuning

In [28]:
# Base estimators for the soft-voting ensemble, keyed by model name.
# Alternatives kept from earlier experiments:
#   - the tuned estimators best_catboost / best_lgbm / best_xgb produced by
#     the (disabled) tuning cells can be swapped in for the defaults below;
#   - an optional fourth voter,
#     'rf' : RandomForestClassifier(random_state=42, verbose=0), weight 1.0.
voting_models = dict(
    catboost=CatBoostClassifier(random_state=42, verbose=0),
    lgbm=LGBMClassifier(random_state=42),
    xgb=XGBClassifier(random_state=42),
)

# One weight per voter, listed in the same order as voting_models
# (catboost, lgbm, xgb).
# NOTE(review): presumably KampVoter matches weights to models by position --
# confirm against its implementation.
model_weights = [1.0, 1.5, 1.0]

kamp_voter = KampVoter(
    voting_models=voting_models,
    model_weights=model_weights,
    voting_method='soft',
)

In [29]:
# Fit the voting ensemble on the training split.
kamp_voter.fit(x_train, y_train)

[Voting] ................. (1 of 3) Processing catboost, total=   5.0s
[LightGBM] [Info] Number of positive: 2459, number of negative: 64350
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001394 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2109
[LightGBM] [Info] Number of data points in the train set: 66809, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036806 -> initscore=-3.264582
[LightGBM] [Info] Start training from score -3.264582
[Voting] ..................... (2 of 3) Processing lgbm, total=   0.2s
[Voting] ...................... (3 of 3) Processing xgb, total=   0.1s


## Evaluation

In [30]:
# Evaluate on the same data the ensemble was fitted on; compare with the
# test-split evaluation below to gauge overfitting.
kamp_voter.evaluate(x_train, y_train)

f1_score : 0.996328029375765

confusion matrix : 
[[64349     1]
 [   17  2442]]

classification report : 
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     64350
         1.0       1.00      0.99      1.00      2459

    accuracy                           1.00     66809
   macro avg       1.00      1.00      1.00     66809
weighted avg       1.00      1.00      1.00     66809




In [31]:
# Evaluate on the held-out test split.
kamp_voter.evaluate(x_test, y_test)

f1_score : 0.9572755417956657

confusion matrix : 
[[21428    22]
 [   47   773]]

classification report : 
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     21450
         1.0       0.97      0.94      0.96       820

    accuracy                           1.00     22270
   macro avg       0.99      0.97      0.98     22270
weighted avg       1.00      1.00      1.00     22270


