# KampStacking, Voting

## Import Library

### Basic Libary

In [1]:
from kamp.preprocess import KampDataLoader
from kamp.models import KampVoter

### Models

In [17]:
# Forest Models
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

# Boosting Models
from xgboost import XGBClassifier, XGBRFClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier

# Linear Models
from sklearn.linear_model import LogisticRegression

# Discriminan Models
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

# Distance Based Models
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# NN Models
from sklearn.neural_network import MLPClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from xgboost import XGBClassifier  # Ensure this is installed
from lightgbm import LGBMClassifier  # Ensure this is installed
from catboost import CatBoostClassifier  # Ensure this is installed
import matplotlib.pyplot as plt
import seaborn as sns


## Data Load

In [3]:
DATA_PATH = './data/경진대회용 주조 공정최적화 데이터셋.csv'


data_loader = KampDataLoader(
    path = DATA_PATH,

    # 처리 안한게 더 좋았음
    # 처리 안한 것 : 0.944
    # 처리 한 것 : 최대 0.922
    do_count_trend=False,
    drop_count=False,

    get_useful_p_data=True,
    p_threshold=0.05,

    outlier_method='iso',
    iso_outlier_rate=0.0075,

    do_resample=False,
    # downsampled_pass_rate=1.0,
    # upsampled_fail_rate_about_pass=1.0,
    # upsample_method='adasyn',

    scale_include_cat=False
)

data_loader.process()

data = data_loader.load()

x_train = data['train_data']
y_train = data['train_label']
x_test = data['test_data']
y_test=  data['test_label']


[Process Log] Loading Raw Data...
[Process Log] Done

[Process Log] Processing Nan Value...
[Process Log] Done

[Process Log] Encoding Categorical Features...
[Process Log] Done

[Process Log] Removing Outliers (IsoForest)...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[:,feature] = data.loc[:, feature].fillna(fill_val)


[Outlier-Remover Log] With Outliers Shape : (89753, 23)
[Outlier-Remover Log] Without Outliers Shape : (89079, 23)
[Process Log] Done

[Process Log] T-Testing...
[Process Log] Done

[Process Log] Data Scaling (MinMaxScaler)...
[Process Log] Done

[Process Log] Train Test Spliting...
[Process Log] Done



In [4]:
x_train

Unnamed: 0,count,facility_operation_cycleTime,production_cycletime,low_section_speed,high_section_speed,cast_pressure,biscuit_thickness,upper_mold_temp1,upper_mold_temp2,lower_mold_temp1,lower_mold_temp2,sleeve_temperature,tryshot_signal,working,EMS_operation_time,mold_code,heating_furnace
0,0.519520,0.141753,0.248963,0.700000,0.273196,0.890909,0.106635,0.067927,0.035672,0.512894,0.121148,0.253868,0.0,1.0,23,5.0,2.0
1,0.309309,0.134021,0.259336,0.733333,0.288660,0.918182,0.104265,0.146359,0.037099,0.650430,0.151961,0.248945,0.0,1.0,23,1.0,2.0
2,0.048048,0.146907,0.259336,0.733333,0.288660,0.922727,0.130332,0.144258,0.037812,0.363897,0.128151,0.317159,0.0,1.0,23,2.0,0.0
3,0.330330,0.123711,0.244813,0.733333,0.288660,0.922727,0.132701,0.057423,0.033532,0.595989,0.086835,0.326301,0.0,1.0,23,6.0,1.0
4,0.279279,0.126289,0.248963,0.733333,0.288660,0.922727,0.087678,0.074230,0.031391,0.621777,0.057423,0.316456,0.0,1.0,23,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71258,0.147147,0.131443,0.253112,0.733333,0.288660,0.913636,0.113744,0.118347,0.036147,0.679083,0.108543,0.278481,0.0,1.0,23,6.0,2.0
71259,0.324324,0.113402,0.240664,0.666667,0.262887,0.927273,0.109005,0.081232,0.030678,0.538682,0.121148,0.322785,0.0,1.0,3,0.0,0.0
71260,0.753754,0.159794,0.273859,0.693333,0.275773,0.895455,0.111374,0.121148,0.034483,0.524355,0.156863,0.358650,0.0,1.0,23,5.0,2.0
71261,0.288288,0.136598,0.251037,0.733333,0.288660,0.918182,0.099526,0.108543,0.023781,0.458453,0.173669,0.302391,0.0,1.0,23,5.0,2.0


## Modeling

In [None]:
# Base models with increased max_iter for LogisticRegression
base_models = [
    ('lr', LogisticRegression(random_state=42, max_iter=10000)),
    ('lgbm', LGBMClassifier(random_state=42, verbose=0)),
    ('gb', GradientBoostingClassifier(random_state=42)),
    ('knn', KNeighborsClassifier(n_neighbors=4)), # 4
    ('xgb', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss', n_jobs=-1)),
    ('rf', RandomForestClassifier(random_state=42, n_jobs=-1, criterion='gini')),
    ('cat', CatBoostClassifier(random_seed=42, verbose=0)),
    # ('ada', AdaBoostClassifier(random_state=42)),
    ('svm', SVC(random_state=42))
]

meta_model = LogisticRegression(max_iter=10000)  # Increased max_iter for meta model as well

# Create the Stacking Classifier
clf = StackingClassifier(estimators=base_models, final_estimator=meta_model, n_jobs=-1)

# Fit the model (assuming x_train and y_train are defined)
clf.fit(x_train, y_train)

# Make predictions (assuming y_test and x_test are defined)
y_pred = clf.predict(x_test)

# Print F1 score (macro average for multi-class classification)
f1 = f1_score(y_test, y_pred, average='macro')
print(f"F1 Score (Macro Average): {f1:.4f}")

# Compute confusion matrix and print it
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# [[17143    17]
#  [   33   623]]

F1 Score (Macro Average): 0.9800
Confusion Matrix:
[[17143    17]
 [   33   623]]
Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     17160
         1.0       0.97      0.95      0.96       656

    accuracy                           1.00     17816
   macro avg       0.99      0.97      0.98     17816
weighted avg       1.00      1.00      1.00     17816



[Best Version]

```python
voting_models = {
    'knn' : KNeighborsClassifier(n_neighbors=10),
    'catboost' : CatBoostClassifier(random_state=42, verbose=0),
    'rf' : RandomForestClassifier(random_state=42),
    'lgbm' : LGBMClassifier(random_state=42, verbose=0),
    'xgb' : XGBClassifier(random_state=42, verbose=0)
}
```

```python
"-----------------------------------------------------------"
f1_score : 0.9541569541569541

confusion matrix : 
[[17143    17]
 [   42   614]]

classification report : 
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     17160
         1.0       0.97      0.94      0.95       656

    accuracy                           1.00     17816
   macro avg       0.99      0.97      0.98     17816
weighted avg       1.00      1.00      1.00     17816
```

In [60]:
voting_models = {
    'catboost' : CatBoostClassifier(random_state=42, verbose=0),
    'rf' : RandomForestClassifier(random_state=42),
    'lgbm' : LGBMClassifier(random_state=42, verbose=0),
    'xgb' : XGBClassifier(random_state=42, verbose=0),
    # 'gb' : GradientBoostingClassifier(random_state=42),
    # 'lr': LogisticRegression(random_state=42, max_iter=10000),
    # 'knn': KNeighborsClassifier(n_neighbors=4), # 4
    # 'ada': AdaBoostClassifier(random_state=42),
    # 'svm': SVC(random_state=42)
}

kamp_voter = KampVoter(voting_models=voting_models, voting_method='soft')

In [61]:
kamp_voter.fit(x_train, y_train)

[Voting] ................. (1 of 4) Processing catboost, total=   5.4s
[Voting] ....................... (2 of 4) Processing rf, total=   8.6s
[Voting] ..................... (3 of 4) Processing lgbm, total=   2.3s


Parameters: { "verbose" } are not used.



[Voting] ...................... (4 of 4) Processing xgb, total=   0.8s


## Evaluation

In [62]:
kamp_voter.evaluate(x_train, y_train)

f1_score : 0.9971324794494361

confusion matrix : 
[[68640     0]
 [   15  2608]]

classification report : 
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     68640
         1.0       1.00      0.99      1.00      2623

    accuracy                           1.00     71263
   macro avg       1.00      1.00      1.00     71263
weighted avg       1.00      1.00      1.00     71263




In [63]:
kamp_voter.evaluate(x_test, y_test)

f1_score : 0.9606784888203547

confusion matrix : 
[[17142    18]
 [   33   623]]

classification report : 
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     17160
         1.0       0.97      0.95      0.96       656

    accuracy                           1.00     17816
   macro avg       0.98      0.97      0.98     17816
weighted avg       1.00      1.00      1.00     17816




    KNN
    [[17143    17]
    [   99   557]]

    SVC
    [[17157     3]
    [  157   499]]

    Ada
    [[17126    34]
    [   95   561]]

    Cat
    [[17140    20]
    [   38   618]]

    RF
    [[17137    23]
    [   34   622]]

    GB
    [[17134    26]
    [   53   603]]

    LGBM
    [[17139    21]
    [   34   622]]

    LR
    [[17152     8]
    [  119   537]]

    XGB
    [[17138    22]
    [   46   610]]

    RF, LGBM, XGB, Cat - soft
    [[17142    18]
    [   33   623]]