In [1]:
# 関連ライブラリをインポート
import lightgbm as lgb
import numpy as np
#import os
import pandas as pd
import random
#import torch

# scikit-learn関連をインポート
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error

# warningを非表示
import warnings
warnings.filterwarnings('ignore')

In [2]:
SEED = 42

# 参考: 乱数固定
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    #os.environ['PYTHONHASHSEED'] = str(seed)
    # torch.manual_seed(seed)
    # torch.cuda.manual_seed(seed)
    # torch.backends.cudnn.daterministic = True
seed_everything(SEED)

## データ読み込み

In [3]:
data = fetch_california_housing()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

print(X.shape)
print(y.shape)

(20640, 8)
(20640,)


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

print(X_train.shape) 
display(X_train.head(3))
print(y_train.shape) 
display(y_train.head(3))
print(X_test.shape) 
display(X_test.head(3))
print(y_test.shape) 
display(y_test.head(3))

(16512, 8)


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,3.2596,33.0,5.017657,1.006421,2300.0,3.691814,32.71,-117.03
1,3.8125,49.0,4.473545,1.041005,1314.0,1.738095,33.77,-118.16
2,4.1563,4.0,5.645833,0.985119,915.0,2.723214,34.66,-120.48


(16512,)


0    1.030
1    3.821
2    1.726
dtype: float64

(4128, 8)


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,1.6812,25.0,4.192201,1.022284,1392.0,3.877437,36.06,-119.01
1,2.5313,30.0,5.039384,1.193493,1565.0,2.679795,35.14,-119.46
2,3.4801,52.0,3.977155,1.185877,1310.0,1.360332,37.8,-122.44


(4128,)


0    0.47700
1    0.45800
2    5.00001
dtype: float64

## 交差検証準備

In [5]:
kf = KFold(n_splits=3, shuffle=True, random_state=SEED)

## メインループ

In [12]:
y_preds = []
models = []
y_pred_oof = np.zeros((len(X_train),)) # 各分割でのoof(訓練に使用されなかったfold)に対する予測値
rmse_scores = []
categorical_features = [] # カテゴリ変数

# ハイパーパラメータ
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.1,
    'num_leaves': 31,
    'verbosity': -1,
    'seed': SEED
}

In [13]:
# メインループ
for fold_id, (train_index, valid_index) in enumerate(kf.split(X_train, y_train)):
    # 表示
    print('-------------------')
    print(f'Fold: {fold_id}')

    # 訓練データと検証データに分割
    X_tr = X_train.loc[train_index, :]
    X_val = X_train.loc[valid_index, :]
    y_tr = y_train[train_index]
    y_val = y_train[valid_index]
    
    # データセットを生成
    lgb_train = lgb.Dataset(X_tr, y_tr, categorical_feature=categorical_features)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train, categorical_feature=categorical_features)

    # 訓練
    model = lgb.train(params, lgb_train, 
                        valid_sets=[lgb_train, lgb_eval],
                        #num_boost_round=1000,
                        callbacks=[lgb.early_stopping(stopping_rounds=10,
                                                      verbose=True),
                                    lgb.log_evaluation(period=10)])
    
    # 予測
    y_pred_val = model.predict(X_val, num_iteration=model.best_iteration)
    y_pred_oof[valid_index] = y_pred_val
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)

    # RMSEを計算
    rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))

    # 結果を保存
    y_preds.append(y_pred)
    models.append(model)
    rmse_scores.append(rmse)


-------------------
Fold: 0
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 0.690893	valid_1's rmse: 0.716723
[20]	training's rmse: 0.550405	valid_1's rmse: 0.585947
[30]	training's rmse: 0.484253	valid_1's rmse: 0.52938
[40]	training's rmse: 0.452608	valid_1's rmse: 0.505973
[50]	training's rmse: 0.432136	valid_1's rmse: 0.495781
[60]	training's rmse: 0.417186	valid_1's rmse: 0.489883
[70]	training's rmse: 0.404101	valid_1's rmse: 0.485307
[80]	training's rmse: 0.393119	valid_1's rmse: 0.481115
[90]	training's rmse: 0.383744	valid_1's rmse: 0.47901
[100]	training's rmse: 0.375538	valid_1's rmse: 0.476297
Did not meet early stopping. Best iteration is:
[100]	training's rmse: 0.375538	valid_1's rmse: 0.476297
-------------------
Fold: 1
Training until validation scores don't improve for 10 rounds
[10]	training's rmse: 0.700676	valid_1's rmse: 0.71159
[20]	training's rmse: 0.556384	valid_1's rmse: 0.57818
[30]	training's rmse: 0.491276	valid_1's rmse: 0

## 評価

In [10]:
print(y_pred_oof[:10])

[1.33857965 3.44364903 2.34178004 0.93449189 1.41572654 3.14414378
 1.54324824 4.68059735 1.9535667  2.90658437]


In [14]:
rmse_scores

[0.4762966660167601, 0.46730236799673963, 0.4783137514325842]

In [15]:
cv_score = sum(rmse_scores) / len(rmse_scores)
print('=== CV score ===')
print(cv_score)

=== CV score ===
0.47397092848202793


## テスト

In [19]:
y_preds

[array([0.53551911, 0.94839274, 5.12449949, ..., 5.05884548, 0.74876133,
        1.68696057]),
 array([0.57497302, 0.79578338, 5.03911956, ..., 4.93604868, 0.6230162 ,
        1.69786877]),
 array([0.58674261, 0.90330143, 4.73546777, ..., 4.90567227, 0.65916445,
        1.77708261])]

In [20]:
y_sub = sum(y_preds) / len(y_preds)
y_sub

array([0.56574492, 0.88249252, 4.96636227, ..., 4.96685548, 0.67698066,
       1.72063732])

In [21]:
rmse_sub = np.sqrt(mean_squared_error(y_test, y_sub))
print('=== RMSE for test data ===')
print(rmse_sub)

=== RMSE for test data ===
0.4622482797858079
