# Bayesian Optimization

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.1-py2.py3-none-any.whl (107 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.7/107.7 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-23.12.0-py3-none-any.whl (23 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-23.12.0 scikit-optimize-0.10.1


In [6]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.model_selection import train_test_split

In [7]:
df_path = '/content/drive/MyDrive/모델 튜닝/train.csv'
df = pd.read_csv(df_path)

In [8]:
X = df.drop(['person_id', 'login'], axis=1)
y = df['login']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:

# Define the search space for hyperparameters
search_space = {
    'n_estimators': Integer(10, 1000),
    'criterion': Categorical(['gini', 'entropy']),
    'max_depth': Integer(1, 50, prior='uniform'),
    'min_samples_split': Integer(2, 10),
    'min_samples_leaf': Integer(1, 10),
    'min_weight_fraction_leaf': Real(0.0, 0.5),
    'max_features': Categorical(['auto', 'sqrt', 'log2', None]),
    'max_leaf_nodes': Integer(10, 1000),
    'min_impurity_decrease': Real(0.0, 0.2),
    'bootstrap': Categorical([True, False])
}

# Initialize the Bayesian optimizer with the RandomForest model and the search space
bayes_cv = BayesSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    search_spaces=search_space,
    n_iter=30, # Number of iterations
    cv=5,      # 5-fold cross-validation
    n_jobs=-1, # Use all available cores
    random_state=42
)

# Fit the BayesSearchCV to find the best hyperparameters
bayes_cv.fit(X_train, y_train)

# 최적의 하이퍼파라미터 출력
print("Best Parameters:", bayes_cv.best_params_)

# 최적의 모델 성능 출력
print("Best Score:", bayes_cv.best_score_)

Best Parameters: OrderedDict([('bootstrap', True), ('criterion', 'entropy'), ('max_depth', 45), ('max_features', None), ('max_leaf_nodes', 995), ('min_impurity_decrease', 0.006155506559345229), ('min_samples_leaf', 8), ('min_samples_split', 8), ('min_weight_fraction_leaf', 0.0008500985179336569), ('n_estimators', 378)])
Best Score: 0.9159398496240602


In [10]:
# submission
submit = pd.read_csv('/content/drive/MyDrive/모델 튜닝/sample_submission.csv')

# GridSearchCV로 찾은 최적의 파라미터
best_params = bayes_cv.best_params_

# 찾은 최적의 파라미터들을 제출 양식에 맞게 업데이트
for param, value in best_params.items():
    if param in submit.columns:
        submit.loc[0, param] = value

# 제출 파일 저장 경로
submit_file_path = 'submit.csv'
submit.to_csv(submit_file_path, index=False)

# 제출 파일 다운로드
from google.colab import files
files.download('submit.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>