# Data 불러오기

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the CSV files
# read later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install iterative-stratification
!pip install optuna
!pip install --upgrade category_encoders
!pip install catboost

In [None]:
!pip install flaml

In [None]:
# @title
# All imports for the notebook, deduplicated and grouped. Every name the
# original cell brought into scope is still imported here.

# Standard library
import os
import sys
import warnings

# Numerical / tabular
import numpy as np
import pandas as pd

# Silence warnings before importing the ML libraries so that their
# import-time warnings are suppressed too. The blanket filter already covers
# the category-specific ones; they are kept to document the intent.
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# scikit-learn: models, decomposition, model selection and metrics
from sklearn.decomposition import PCA
from sklearn.ensemble import (
    GradientBoostingClassifier,
    HistGradientBoostingClassifier,
    RandomForestClassifier,  # Bagging
)
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score
from sklearn.model_selection import train_test_split

# Gradient boosting libraries
import lightgbm as lgb
import xgboost as xgb
from lightgbm import LGBMClassifier
from xgboost.sklearn import XGBClassifier  # GBM

# Imbalanced-learn
from imblearn.ensemble import BalancedRandomForestClassifier

# AutoML
from flaml import AutoML

# Hide convergence warnings for now
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [None]:
# Load the data that has already been through preprocessing and feature
# engineering. The directory is defined once so it can be changed in a single
# place instead of being repeated in every path.
DATA_DIR = '/content/drive/MyDrive/Kaggle/SC_PJ/최종'

X_train = pd.read_csv(os.path.join(DATA_DIR, 'X_train_pre.csv'))
X_test = pd.read_csv(os.path.join(DATA_DIR, 'X_test_pre.csv'))
y_train = pd.read_csv(os.path.join(DATA_DIR, 'y_train_pre.csv'))

# Fail early with a clear message if features and labels are misaligned,
# rather than deep inside a later model fit.
assert len(X_train) == len(y_train), (
    f"X_train has {len(X_train)} rows but y_train has {len(y_train)}"
)

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# FLAML AutoML 시작

In [None]:
# Convert the inputs to plain NumPy arrays for the AutoML runs below.
# np.asarray is used instead of `.values` so this cell is idempotent:
# re-running it on arrays that were already converted is a no-op, whereas
# `.values` raises AttributeError on an ndarray.
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)
y_train = np.asarray(y_train)

## LGBM

In [None]:
# FLAML search restricted to LightGBM, minimising log loss.
# Note: `auto_model` and `settings` are rebound by every AutoML cell in this
# notebook, so later cells see whichever search ran most recently.
auto_model = AutoML()
settings = dict(
    metric='log_loss',                              # error metric the search minimises
    estimator_list=['lgbm'],                        # only tune LightGBM in this run
    task='classification',                          # task type
    log_file_name='carbon_monoxide_predictor.log',  # FLAML search log file
    time_budget=100,                                # search time limit, in seconds
)
auto_model.fit(X_train=X_train, y_train=y_train, **settings)

[flaml.automl.logger: 07-27 07:00:06] {1693} INFO - task = classification
[flaml.automl.logger: 07-27 07:00:06] {1700} INFO - Data split method: stratified
[flaml.automl.logger: 07-27 07:00:06] {1703} INFO - Evaluation method: cv
[flaml.automl.logger: 07-27 07:00:06] {1801} INFO - Minimizing error metric: log_loss
[flaml.automl.logger: 07-27 07:00:06] {1911} INFO - List of ML learners in AutoML Run: ['lgbm']
[flaml.automl.logger: 07-27 07:00:06] {2221} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 07-27 07:00:07] {2347} INFO - Estimated sufficient time budget=10942s. Estimated necessary time budget=11s.
[flaml.automl.logger: 07-27 07:00:07] {2394} INFO -  at 1.1s,	estimator lgbm's best error=0.4293,	best estimator lgbm's best error=0.4293
[flaml.automl.logger: 07-27 07:00:07] {2221} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 07-27 07:00:08] {2394} INFO -  at 1.8s,	estimator lgbm's best error=0.4293,	best estimator lgbm's best error=0.4293
[flaml.aut

In [None]:
# Summarise the most recent search: winning learner, its hyperparameters,
# how long the best configuration took to train, and the fitted estimator.
print(f'Best estimator: {auto_model.best_estimator}')
print(f'Best hyperparmeter config: {auto_model.best_config}')
print(f'Training duration of best run: {auto_model.best_config_train_time:.4g} s')
print(auto_model.model.estimator)

Best estimator: lgbm
Best hyperparmeter config: {'n_estimators': 94, 'num_leaves': 9, 'min_child_samples': 4, 'learning_rate': 0.20204929944793612, 'log_max_bin': 10, 'colsample_bytree': 0.8302082011667893, 'reg_alpha': 0.003148211294343311, 'reg_lambda': 0.003165384302202214}
Training duration of best run: 0.3767 s
LGBMClassifier(colsample_bytree=0.8302082011667893,
               learning_rate=0.20204929944793612, max_bin=1023,
               min_child_samples=4, n_estimators=94, num_leaves=9,
               reg_alpha=0.003148211294343311, reg_lambda=0.003165384302202214,
               verbose=-1)


## XGBoost

In [None]:
# FLAML search restricted to XGBoost, minimising log loss.
# Note: `auto_model` and `settings` are rebound by every AutoML cell in this
# notebook, so later cells see whichever search ran most recently.
auto_model = AutoML()
settings = dict(
    metric='log_loss',                              # error metric the search minimises
    estimator_list=['xgboost'],                     # only tune XGBoost in this run
    task='classification',                          # task type
    log_file_name='carbon_monoxide_predictor.log',  # FLAML search log file
    time_budget=60,                                 # search time limit, in seconds
)
auto_model.fit(X_train=X_train, y_train=y_train, **settings)

# Summarise the search: winning learner, its hyperparameters, and how long
# the best configuration took to train.
print(f'Best estimator: {auto_model.best_estimator}')
print(f'Best hyperparmeter config: {auto_model.best_config}')
print(f'Training duration of best run: {auto_model.best_config_train_time:.4g} s')

[flaml.automl.logger: 07-27 07:12:33] {1693} INFO - task = classification
[flaml.automl.logger: 07-27 07:12:33] {1700} INFO - Data split method: stratified
[flaml.automl.logger: 07-27 07:12:33] {1703} INFO - Evaluation method: cv
[flaml.automl.logger: 07-27 07:12:33] {1801} INFO - Minimizing error metric: log_loss
[flaml.automl.logger: 07-27 07:12:33] {1911} INFO - List of ML learners in AutoML Run: ['xgboost']
[flaml.automl.logger: 07-27 07:12:33] {2221} INFO - iteration 0, current learner xgboost
[flaml.automl.logger: 07-27 07:12:33] {2347} INFO - Estimated sufficient time budget=1253s. Estimated necessary time budget=1s.
[flaml.automl.logger: 07-27 07:12:33] {2394} INFO -  at 0.1s,	estimator xgboost's best error=0.5133,	best estimator xgboost's best error=0.5133
[flaml.automl.logger: 07-27 07:12:33] {2221} INFO - iteration 1, current learner xgboost
[flaml.automl.logger: 07-27 07:12:33] {2394} INFO -  at 0.3s,	estimator xgboost's best error=0.5133,	best estimator xgboost's best erro

In [None]:
# Display the underlying estimator of the best model found by the most recent
# search. `auto_model` is reassigned by every AutoML cell, so this shows the
# XGBoost model only if the XGBoost cell was the last one executed.
auto_model.model.estimator

## CatBoost

In [None]:
# FLAML search restricted to CatBoost, minimising log loss.
# Note: `auto_model` and `settings` are rebound by every AutoML cell in this
# notebook, so later cells see whichever search ran most recently.
auto_model = AutoML()
settings = dict(
    metric='log_loss',                              # error metric the search minimises
    estimator_list=['catboost'],                    # only tune CatBoost in this run
    task='classification',                          # task type
    log_file_name='carbon_monoxide_predictor.log',  # FLAML search log file
    time_budget=60,                                 # search time limit, in seconds
)
auto_model.fit(X_train=X_train, y_train=y_train, **settings)

# Summarise the search: winning learner, its hyperparameters, and how long
# the best configuration took to train.
print(f'Best estimator: {auto_model.best_estimator}')
print(f'Best hyperparmeter config: {auto_model.best_config}')
print(f'Training duration of best run: {auto_model.best_config_train_time:.4g} s')

[flaml.automl.logger: 07-27 07:14:10] {1693} INFO - task = classification
[flaml.automl.logger: 07-27 07:14:10] {1700} INFO - Data split method: stratified
[flaml.automl.logger: 07-27 07:14:10] {1703} INFO - Evaluation method: cv
[flaml.automl.logger: 07-27 07:14:10] {1801} INFO - Minimizing error metric: log_loss
[flaml.automl.logger: 07-27 07:14:10] {1911} INFO - List of ML learners in AutoML Run: ['catboost']
[flaml.automl.logger: 07-27 07:14:10] {2221} INFO - iteration 0, current learner catboost
[flaml.automl.logger: 07-27 07:14:15] {2347} INFO - Estimated sufficient time budget=51236s. Estimated necessary time budget=51s.
[flaml.automl.logger: 07-27 07:14:15] {2394} INFO -  at 5.1s,	estimator catboost's best error=0.0575,	best estimator catboost's best error=0.0575
[flaml.automl.logger: 07-27 07:14:15] {2221} INFO - iteration 1, current learner catboost
[flaml.automl.logger: 07-27 07:14:25] {2394} INFO -  at 14.8s,	estimator catboost's best error=0.0567,	best estimator catboost's

In [None]:
# The default repr of a CatBoostClassifier is only an object address, which
# says nothing about the tuned model, so display its parameters instead.
# `auto_model` is reassigned by every AutoML cell; this reflects the CatBoost
# search only if that cell was the last one executed.
auto_model.model.estimator.get_params()

<catboost.core.CatBoostClassifier at 0x7e84399217b0>

## RandomForest

In [None]:
# FLAML search restricted to random forest, minimising log loss.
# Note: `auto_model` and `settings` are rebound by every AutoML cell in this
# notebook, so later cells see whichever search ran most recently.
auto_model = AutoML()
settings = dict(
    metric='log_loss',                              # error metric the search minimises
    estimator_list=['rf'],                          # only tune random forest in this run
    task='classification',                          # task type
    log_file_name='carbon_monoxide_predictor.log',  # FLAML search log file
    time_budget=60,                                 # search time limit, in seconds
)
auto_model.fit(X_train=X_train, y_train=y_train, **settings)

# Summarise the search: winning learner, its hyperparameters, and how long
# the best configuration took to train.
print(f'Best estimator: {auto_model.best_estimator}')
print(f'Best hyperparmeter config: {auto_model.best_config}')
print(f'Training duration of best run: {auto_model.best_config_train_time:.4g} s')

[flaml.automl.logger: 07-27 07:09:11] {1693} INFO - task = classification
[flaml.automl.logger: 07-27 07:09:11] {1700} INFO - Data split method: stratified
[flaml.automl.logger: 07-27 07:09:11] {1703} INFO - Evaluation method: cv
[flaml.automl.logger: 07-27 07:09:11] {1801} INFO - Minimizing error metric: log_loss
[flaml.automl.logger: 07-27 07:09:11] {1911} INFO - List of ML learners in AutoML Run: ['rf']
[flaml.automl.logger: 07-27 07:09:11] {2221} INFO - iteration 0, current learner rf
[flaml.automl.logger: 07-27 07:09:12] {2347} INFO - Estimated sufficient time budget=2408s. Estimated necessary time budget=2s.
[flaml.automl.logger: 07-27 07:09:12] {2394} INFO -  at 0.3s,	estimator rf's best error=0.3387,	best estimator rf's best error=0.3387
[flaml.automl.logger: 07-27 07:09:12] {2221} INFO - iteration 1, current learner rf
[flaml.automl.logger: 07-27 07:09:12] {2394} INFO -  at 0.5s,	estimator rf's best error=0.2755,	best estimator rf's best error=0.2755
[flaml.automl.logger: 07-2

In [None]:
# Print the underlying estimator of the best model found by the most recent
# search. `auto_model` is reassigned by every AutoML cell, so this is the
# random-forest model only if the RandomForest cell was the last one executed.
print(auto_model.model.estimator)

RandomForestClassifier(criterion='entropy', max_features=0.7720255259265024,
                       max_leaf_nodes=114, n_estimators=59, n_jobs=-1,
                       random_state=12032022)
