# LightGBM

In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split

import lightgbm as lgbm

In [2]:
# Read the training table from disk.
df = pd.read_csv('../datasets/pseudodata_train.csv')

# Column 'E' is the prediction target; every other column is a feature.
y = df['E']
X = df.drop(columns=['E'])

In [7]:

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split reproducible between runs.
x_train, x_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)

# No column is declared categorical; this (empty) list is handed to
# lgbm.Dataset when the datasets are built.
categorical_features = []

In [8]:
# Build the LightGBM training dataset.
lgb_train = lgbm.Dataset(
    x_train,
    y_train,
    categorical_feature=categorical_features,
    free_raw_data=False,
)

# Build the LightGBM validation dataset. It is tied to the training dataset
# through `reference=`, as LightGBM expects for validation data.
lgb_eval = lgbm.Dataset(
    x_valid,
    y_valid,
    reference=lgb_train,
    categorical_feature=categorical_features,
    free_raw_data=False,
)

# Hyperparameters.
params = {
    'boosting_type': 'gbdt',

    'objective': 'binary',  # binary classification
    # 'objective': 'regression',  # alternative: regression

    'metric': 'auc',  # evaluate with ROC-AUC
    # 'metric': {'l2', 'l1'},  # alternative metrics for regression

    # Optional tuning knobs, currently left at the library defaults:
    # 'num_leaves': 50,
    # 'learning_rate': 0.05,
    # 'feature_fraction': 0.9,
    # 'bagging_fraction': 0.8,
    # 'bagging_freq': 5,
    # 'verbose': 0,
}

# Train the model.
# Early stopping and periodic logging are configured through callbacks: the
# `early_stopping_rounds=` / `verbose_eval=` keyword arguments of lgbm.train()
# were deprecated in LightGBM 3.3 and removed in 4.0, whereas the callback
# form works on both.
lgbm_model = lgbm.train(
    params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train, lgb_eval],
    callbacks=[
        # Stop once the validation metric has not improved for 20 rounds.
        lgbm.early_stopping(stopping_rounds=20),
        # Print the evaluation metrics every 10 boosting rounds.
        lgbm.log_evaluation(period=10),
    ],
)

# Record the boosting round that achieved the best validation score
# (this only stores the iteration number; nothing is written to disk).
optimum_boost_rounds = lgbm_model.best_iteration

[LightGBM] [Info] Number of positive: 38, number of negative: 42
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.475000 -> initscore=-0.100083
[LightGBM] [Info] Start training from score -0.100083
Training until validation scores don't improve for 20 rounds
[10]	training's auc: 0.955514	valid_1's auc: 0.97
[20]	training's auc: 0.972744	valid_1's auc: 0.99
[30]	training's auc: 0.973684	valid_1's auc: 1
[40]	training's auc: 0.984023	valid_1's auc: 1
Early stopping, best iteration is:
[27]	training's auc: 0.973684	valid_1's auc: 1


In [9]:
# Read the held-out test table. Note that this rebinds `df`, which previously
# held the training table.
df = pd.read_csv('../datasets/pseudodata_test.csv')

# Same layout as the training data: 'E' is the target, the rest are features.
y_test = df['E']
x_test = df.drop(columns=['E'])

In [10]:
# Predict with LightGBM, using the trees up to the best early-stopping round.
# The keyword must be spelled `num_iteration`; the previous `num_interation`
# was not a recognised argument and therefore never bound to that parameter.
y_proba = lgbm_model.predict(x_test, num_iteration=lgbm_model.best_iteration)

# Turn the predicted scores into hard 0/1 labels with a 0.5 threshold.
y_pred = (y_proba > 0.5).astype(int)




In [11]:
# --- Compute the test-set metrics -------------------------------------------
# The threshold-based metrics are computed from the hard labels in y_pred.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# ROC-AUC is computed from the continuous scores in y_proba rather than from
# the thresholded labels.
roc_auc = roc_auc_score(y_test, y_proba)

# --- Report ------------------------------------------------------------------
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 score: {f1}')
print(f'ROC-AUC: {roc_auc}')

Accuracy: 0.89
Precision: 0.898989898989899
Recall: 0.8811881188118812
F1 score: 0.89
ROC-AUC: 0.9605960596059606
