<a href="https://colab.research.google.com/github/genji970/LGBM_optimization/blob/main/lgbm_basic_optimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install lightgbm
import lightgbm as lgb



Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
from sklearn.model_selection import train_test_split

from hyperopt import hp, tpe, fmin, Trials, STATUS_OK
import numpy as np
import pandas as pd
import torch

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
df = pd.read_csv('/content/drive/MyDrive/train.csv')

In [29]:
df=df.dropna(axis=0)

df_target = df['target']
df=df.drop(['stock_id','imbalance_buy_sell_flag','row_id','target'],axis = 1)

In [31]:
# 데이터 로드 및 분할
X_train, X_test, y_train, y_test = train_test_split(df, df_target, test_size=0.2, random_state=42)

# LightGBM 데이터셋 생성
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

In [32]:
# 하이퍼파라미터 공간 정의
param_space = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mse',
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'num_leaves': hp.quniform('num_leaves', 20, 150, 1),
    'max_depth': hp.quniform('max_depth', 5, 15, 1),
    'feature_fraction': hp.uniform('feature_fraction', 0.5, 1.0),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.5, 1.0),
    'bagging_freq': hp.quniform('bagging_freq', 1, 10, 1)
}

In [34]:
# 목표 함수 정의
def objective(params):
    params['num_leaves'] = int(params['num_leaves'])  # num_leaves는 정수로 변환
    params['max_depth'] = int(params['max_depth'])
    params['bagging_freq'] = int(params['bagging_freq'])

    # 모델 학습
    model = lgb.train(params,
                      train_data,
                      num_boost_round=100,
                      valid_sets=[test_data],
                      #early_stopping_rounds=10,
                      #verbose_eval=False)
    )

    # 예측 및 평가
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    loss = np.sqrt(np.mean((y_test - y_pred) ** 2))  # RMSE 계산

    return {'loss': loss, 'status': STATUS_OK}

# 최적화 수행
trials = Trials()
best_params = fmin(fn=objective,
                   space=param_space,
                   algo=tpe.suggest,
                   max_evals=50,
                   trials=trials)

# 최적의 하이퍼파라미터 출력
print("Best Parameters:", best_params)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.046374 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3085
[LightGBM] [Info] Number of data points in the train set: 1874910, number of used features: 13
[LightGBM] [Info] Start training from score -0.041933
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.073217 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3085
[LightGBM] [Info] Number of data points in the train set: 1874910, number of used features: 13
[LightGBM] [Info] Start training from score -0.041933
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.047595 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3085
[LightGBM] [Info] Number of data points in the train set: 1874910, number of used features: 13
[LightGBM] [Info] S

In [46]:
# 하이퍼파라미터 공간 정의
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mse',
    'learning_rate': 0.191646018876034,
    'num_leaves': int(149),
    'max_depth': int(11),
    'feature_fraction': 0.8493447931778073,
    'bagging_fraction': 0.805204985743,
    'bagging_freq': 4,
}

In [48]:
model = lgb.train(params,
                  train_data,
                  num_boost_round=100,            # 부스팅 반복 횟수
                  valid_sets=[test_data],         # 검증 데이터셋
                  #early_stopping_rounds=10,       # 조기 종료 설정
                  #verbose_eval=10)                # 10회 마다 결과 출력
)

# 7. 테스트 데이터 예측
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

# 8. 모델 성능 평가 (RMSE)
mse = ((y_test - y_pred)**2)/len(y_test)
print(f'MSE: {mse}')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019215 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3085
[LightGBM] [Info] Number of data points in the train set: 1874910, number of used features: 13
[LightGBM] [Info] Start training from score -0.041933
MSE: 699908     7.012129e-07
2873192    5.048006e-06
3926555    7.304533e-06
2138143    4.050564e-05
4078822    3.393248e-04
               ...     
3488887    4.385076e-06
187317     5.945536e-05
4971466    2.864385e-04
1045015    1.150137e-04
51766      6.885000e-07
Name: target, Length: 468728, dtype: float64


베이지안 최적화에 대한 이해

구글 드라이브에 모델 저장

In [None]:
# 전체 모델 저장
torch.save(model, '/content/drive/MyDrive/simple_model.pth')

# 또는 모델의 state_dict만 저장
torch.save(model.state_dict(), '/content/drive/MyDrive/simple_model_state_dict.pth')