In [9]:
# 1. ライブラリ読み込み
import pandas as pd
import lightgbm as lgb
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
from lightgbm import early_stopping
from lightgbm import early_stopping, log_evaluation

import os
os.getcwd()
os.chdir("C:\\Users\\haiir\\Documents\\python_DIY\\Python_Learning\\lightgbm-app")


# 2. データ読み込み
df = pd.read_csv("data/sample_data.csv")  # パスは適宜調整
df.head()

# 3. 特徴量と目的変数の分割
X = df.drop(columns=["fim_out"])
y = df["fim_out"]

# 4. 訓練・検証データに分割
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Optuna 最適化関数の定義
def objective(trial):
    params = {
        "objective": "regression",
        "metric": "rmse",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 16, 128),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 5),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
    }

    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)



    gbm = lgb.train(
    params,
    dtrain,
    valid_sets=[dvalid],
    num_boost_round=1000,
    callbacks=[
        early_stopping(50),
        log_evaluation(0)  # ← 必要に応じて 100 などに変更してもOK
    ]
)

    preds = gbm.predict(X_valid)
    rmse = mean_squared_error(y_valid, preds, squared=False)
    return rmse

# 6. Optuna スタディ実行（軽めに10試行）
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)

# 7. 結果の確認
print("Best RMSE:", study.best_value)
print("Best params:", study.best_params)

# 8. ベストパラメータでモデル再学習
best_params = study.best_params
best_params["objective"] = "regression"
best_params["metric"] = "rmse"

final_model = lgb.train(
    best_params,
    lgb.Dataset(X, label=y),
    num_boost_round=study.best_trial.number
)

# 9. モデルを保存
import pickle
with open("models/best_model.pkl", "wb") as f:
    pickle.dump(final_model, f)


[I 2025-07-09 01:38:56,119] A new study created in memory with name: no-name-64702bef-cfe5-44d6-b35d-99c13c646a12


Training until validation scores don't improve for 50 rounds


[I 2025-07-09 01:38:56,938] Trial 0 finished with value: 2.216043351102563 and parameters: {'learning_rate': 0.012640471492909783, 'num_leaves': 73, 'feature_fraction': 0.625277316903139, 'bagging_fraction': 0.8918250936066789, 'bagging_freq': 5, 'max_depth': 8}. Best is trial 0 with value: 2.216043351102563.


Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 2.21604
Training until validation scores don't improve for 50 rounds


[I 2025-07-09 01:38:57,318] Trial 1 finished with value: 5.80989312408945 and parameters: {'learning_rate': 0.0019369419267920476, 'num_leaves': 64, 'feature_fraction': 0.8784720192281417, 'bagging_fraction': 0.7240732259179166, 'bagging_freq': 5, 'max_depth': 4}. Best is trial 0 with value: 2.216043351102563.


Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 5.80989
Training until validation scores don't improve for 50 rounds


[I 2025-07-09 01:38:57,727] Trial 2 finished with value: 1.885643647047301 and parameters: {'learning_rate': 0.01818391131889043, 'num_leaves': 54, 'feature_fraction': 0.7033177507950659, 'bagging_fraction': 0.7192730798405301, 'bagging_freq': 5, 'max_depth': 6}. Best is trial 2 with value: 1.885643647047301.


Early stopping, best iteration is:
[631]	valid_0's rmse: 1.88564
Training until validation scores don't improve for 50 rounds


[I 2025-07-09 01:38:58,405] Trial 3 finished with value: 1.850920504352505 and parameters: {'learning_rate': 0.011388545137965646, 'num_leaves': 43, 'feature_fraction': 0.7979879620631398, 'bagging_fraction': 0.673186437452774, 'bagging_freq': 1, 'max_depth': 10}. Best is trial 3 with value: 1.850920504352505.


Early stopping, best iteration is:
[900]	valid_0's rmse: 1.85092
Training until validation scores don't improve for 50 rounds


[I 2025-07-09 01:38:59,215] Trial 4 finished with value: 6.351979932859768 and parameters: {'learning_rate': 0.0016465945896678704, 'num_leaves': 62, 'feature_fraction': 0.7695092003845663, 'bagging_fraction': 0.7643353656660187, 'bagging_freq': 3, 'max_depth': 12}. Best is trial 3 with value: 1.850920504352505.


Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 6.35198
Training until validation scores don't improve for 50 rounds


[I 2025-07-09 01:38:59,441] Trial 5 finished with value: 2.124843852327133 and parameters: {'learning_rate': 0.0613206766961608, 'num_leaves': 89, 'feature_fraction': 0.8703810840110188, 'bagging_fraction': 0.8591444554797121, 'bagging_freq': 2, 'max_depth': 12}. Best is trial 3 with value: 1.850920504352505.


Early stopping, best iteration is:
[210]	valid_0's rmse: 2.12484
Training until validation scores don't improve for 50 rounds


[I 2025-07-09 01:38:59,661] Trial 6 finished with value: 1.8505645067481924 and parameters: {'learning_rate': 0.01263584456716943, 'num_leaves': 49, 'feature_fraction': 0.7024410998751933, 'bagging_fraction': 0.7376917714346204, 'bagging_freq': 4, 'max_depth': 3}. Best is trial 6 with value: 1.8505645067481924.


Did not meet early stopping. Best iteration is:
[994]	valid_0's rmse: 1.85056
Training until validation scores don't improve for 50 rounds


[I 2025-07-09 01:39:00,198] Trial 7 finished with value: 6.057165632982477 and parameters: {'learning_rate': 0.001506020773133794, 'num_leaves': 80, 'feature_fraction': 0.9902534999384769, 'bagging_fraction': 0.6618126638036267, 'bagging_freq': 5, 'max_depth': 5}. Best is trial 6 with value: 1.8505645067481924.


Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 6.05717
Training until validation scores don't improve for 50 rounds


[I 2025-07-09 01:39:00,432] Trial 8 finished with value: 2.1152025186676804 and parameters: {'learning_rate': 0.0544079771077837, 'num_leaves': 77, 'feature_fraction': 0.8313011268850988, 'bagging_fraction': 0.8384434470589537, 'bagging_freq': 2, 'max_depth': 9}. Best is trial 6 with value: 1.8505645067481924.


Early stopping, best iteration is:
[266]	valid_0's rmse: 2.1152
Training until validation scores don't improve for 50 rounds


[I 2025-07-09 01:39:00,991] Trial 9 finished with value: 1.8986805155049433 and parameters: {'learning_rate': 0.014422585551347348, 'num_leaves': 72, 'feature_fraction': 0.9526380576586582, 'bagging_fraction': 0.7551465339715365, 'bagging_freq': 4, 'max_depth': 9}. Best is trial 6 with value: 1.8505645067481924.


Early stopping, best iteration is:
[664]	valid_0's rmse: 1.89868
Best RMSE: 1.8505645067481924
Best params: {'learning_rate': 0.01263584456716943, 'num_leaves': 49, 'feature_fraction': 0.7024410998751933, 'bagging_fraction': 0.7376917714346204, 'bagging_freq': 4, 'max_depth': 3}


In [7]:
import os
os.getcwd()

'C:\\Users\\haiir\\Documents\\python_DIY\\Python_Learning\\lightgbm-app'

In [None]:
import argparse
import os
import pandas as pd
import numpy as np
import optuna
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from lightgbm import early_stopping, log_evaluation
import pickle

def load_data(data_path):
    df = pd.read_csv(data_path)
    df.columns = df.columns.str.strip().str.replace(r"[^\w]", "_", regex=True)
    X = df.drop(columns=["fim_out"])
    y = df["fim_out"]
    return train_test_split(X, y, test_size=0.2, random_state=42)

def objective(trial, X_train, y_train, X_valid, y_valid):
    params = {
        "objective": "regression",
        "metric": "rmse",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 16, 128),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 5),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
    }

    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)

    gbm = lgb.train(
        params,
        dtrain,
        valid_sets=[dvalid],
        num_boost_round=1000,
        callbacks=[early_stopping(50), log_evaluation(0)]
    )

    preds = gbm.predict(X_valid)
    rmse = mean_squared_error(y_valid, preds, squared=False)
    return rmse

def main(args):
    print(f"📂 Loading data from {args.data} ...")
    X_train, X_valid, y_train, y_valid = load_data(args.data)

    print(f"🔍 Running Optuna with {args.trials} trials...")
    study = optuna.create_study(direction="minimize")
    study.optimize(lambda trial: objective(trial, X_train, y_train, X_valid, y_valid), n_trials=args.trials)

    print("✅ Best RMSE:", study.best_value)
    print("🏆 Best Parameters:", study.best_params)

    print("🚀 Training final model on all data...")
    df = pd.read_csv(args.data)
    df.columns = df.columns.str.strip().str.replace(r"[^\w]", "_", regex=True)
    X = df.drop(columns=["fim_out"])
    y = df["fim_out"]

    best_params = study.best_params
    best_params["objective"] = "regression"
    best_params["metric"] = "rmse"

    final_model = lgb.train(
        best_params,
        lgb.Dataset(X, label=y),
        num_boost_round=study.best_trial.number or 100
    )

    os.makedirs(os.path.dirname(args.output), exist_ok=True)
    with open(args.output, "wb") as f:
        pickle.dump(final_model, f)

    print(f"💾 Model saved to {args.output}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="LightGBM + Optuna Training Script")
    parser.add_argument("--data", type=str, required=True, help="Path to input CSV file")
    parser.add_argument("--trials", type=int, default=10, help="Number of Optuna trials")
    parser.add_argument("--output", type=str, required=True, help="Path to save the model file")
    args = parser.parse_args()
    main(args)

usage: ipykernel_launcher.py [-h] --data DATA [--trials TRIALS] --output
                             OUTPUT
ipykernel_launcher.py: error: the following arguments are required: --data, --output


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [10]:
import os
import pandas as pd

os.chdir("C:\\Users\\haiir\\Documents\\python_DIY\\Python_Learning\\lightgbm-app")
df = pd.read_csv("data/sample_data.csv")
df.columns = df.columns.str.strip().str.replace(r"[^\w]", "_", regex=True)
X = df[["fim_in", "age", "mmse", "paralysis"]]
y = df["fim_out"]