### 【#9】案B OOF + Isotonic + Sampling

In [1]:
# ライブラリのインポート
import numpy as np
import pandas as pd
import os

# Load the competition's train and test tables from the Kaggle input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s5e9/train.csv",
        "/kaggle/input/playground-series-s5e9/test.csv",
    )
)

# Stack train on top of test (row-wise) with a fresh 0..n-1 index,
# and remember how many rows the combined frame has.
all_df = pd.concat([train_df, test_df], ignore_index=True, axis=0)
max_row = all_df.shape[0]

In [2]:
# Split the training columns (everything except "id") into numeric and
# categorical column-name lists.
train_df2 = train_df.drop(columns=["id"])

# A column is treated as categorical only when its dtype is "object";
# every other dtype is counted as numeric. Only "id" is dropped above, so
# any other non-object column of train_df ends up in num_col.
num_col = [name for name in train_df2.columns if train_df[name].dtypes != "object"]
cat_col = [name for name in train_df2.columns if train_df[name].dtypes == "object"]

### 【Light GBM】

In [3]:
###################################################
##### Light GBM + OOF Isotonic + Sampling #########
###################################################
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.isotonic import IsotonicRegression

# ===== Parameters =====
n_class = 20   # number of equal-width target bins, i.e. classes given to LightGBM
SEED = 42      # seed for the KFold shuffle and for the final resampling step
gamma = 0.07   # blend weight of values resampled from the train target

lgbm_params = {
    'objective': 'multiclass',
    'num_class': n_class,
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'learning_rate': 0.01,
    'num_leaves': 64,
    'verbose': -1,
    'device': 'cpu',
}

# ===== Bin the target and keep one representative BPM value per bin =====
# retbins=True additionally returns the n_class + 1 bin edges, which are
# needed below to give a value to bins that contain no training rows.
bin_labels, bin_edges = pd.cut(
    train_df["BeatsPerMinute"], bins=n_class, labels=False, retbins=True
)
# NOTE: this adds a "bin" column to the shared train_df (kept for
# compatibility with the original cell); it is dropped again when building X.
train_df["bin"] = bin_labels
bin_to_median = train_df.groupby("bin")["BeatsPerMinute"].median().to_dict()

# groupby only yields bins that actually occur in the data, so an empty
# equal-width bin would have no entry and the class -> BPM lookup further down
# could raise KeyError. Fall back to the bin midpoint for any missing class.
bin_midpoints = (bin_edges[:-1] + bin_edges[1:]) / 2.0
for c in range(n_class):
    bin_to_median.setdefault(c, float(bin_midpoints[c]))

# ===== Model inputs =====
X = train_df.drop(["id", "BeatsPerMinute", "bin"], axis=1)
y = train_df["bin"]
# Select the test features by name, in X's column order, so the model always
# sees test columns in the same positions as during training (a missing
# feature fails loudly with KeyError instead of being silently misaligned).
X_test = test_df.drop("id", axis=1)[X.columns]

# ===== Buffers for out-of-fold (OOF) and test probabilities =====
oof_proba = np.zeros((len(X), n_class))
test_proba_all = np.zeros((len(X_test), n_class))

kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

# ===== Cross-validation loop =====
for fold, (tr_idx, va_idx) in enumerate(kf.split(X)):
    print(f"### Fold {fold+1}")
    X_train, y_train = X.iloc[tr_idx], y.iloc[tr_idx]
    X_valid, y_valid = X.iloc[va_idx], y.iloc[va_idx]

    dtrain = lgb.Dataset(X_train, y_train)
    dvalid = lgb.Dataset(X_valid, y_valid)

    # Both sets are passed so the log shows train and valid loss side by side.
    # Per the LightGBM docs, early stopping ignores the training set, so the
    # stopping decision is driven by the "valid" metric.
    model = lgb.train(
        lgbm_params,
        dtrain,
        valid_sets=[dtrain, dvalid],
        valid_names=["train", "valid"],
        num_boost_round=2000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=False),
            lgb.log_evaluation(100),
        ],
    )

    # OOF class probabilities: each training row is predicted by the one fold
    # model that did not train on it.
    oof_proba[va_idx] = model.predict(X_valid, num_iteration=model.best_iteration)

    # Test class probabilities: running average over the fold models.
    test_proba_all += model.predict(X_test, num_iteration=model.best_iteration) / kf.n_splits

# ===== Isotonic calibration (one-vs-rest, fitted on the OOF probabilities) =====
isos = []
for c in range(n_class):
    # out_of_bounds="clip": inputs outside the fitted range are mapped to the
    # nearest end of the fitted range instead of producing NaN.
    iso = IsotonicRegression(out_of_bounds="clip")
    y_bin = (y.values == c).astype(int)   # 1 where the true bin is c, else 0
    iso.fit(oof_proba[:, c], y_bin)
    isos.append(iso)

test_proba_corrected = np.zeros_like(test_proba_all)
for c, iso in enumerate(isos):
    test_proba_corrected[:, c] = iso.transform(test_proba_all[:, c])

# Renormalise: the per-class maps are fitted independently, so rows no longer
# sum to 1. The lower clip keeps every row sum strictly positive.
test_proba_corrected = test_proba_corrected.clip(1e-12, None)
test_proba_corrected /= test_proba_corrected.sum(axis=1, keepdims=True)

# ===== Most likely bin -> representative BPM of that bin =====
pred_class = test_proba_corrected.argmax(axis=1)
pred_bpm_mapped = np.array([bin_to_median[int(c)] for c in pred_class])

# ===== Blend with values resampled from the train target =====
# Every prediction is mixed with one value drawn (with replacement) from the
# training BeatsPerMinute values, weighted by gamma.
np.random.seed(SEED)
sampled = np.random.choice(train_df["BeatsPerMinute"].values, size=len(pred_bpm_mapped), replace=True)
final_pred = (1 - gamma) * pred_bpm_mapped + gamma * sampled

# ===== Submission (disabled alternative writer, kept from the original cell) =====
# submission = pd.DataFrame({"id": test_df["id"], "BeatsPerMinute": final_pred})
# submission.to_csv("submission_isomap_mix.csv", index=False)
# print("submission_isomap_mix.csv saved.")


### Fold 1
[100]	train's multi_logloss: 2.58525	valid's multi_logloss: 2.61872
### Fold 2
[100]	train's multi_logloss: 2.58505	valid's multi_logloss: 2.61957
### Fold 3
[100]	train's multi_logloss: 2.58398	valid's multi_logloss: 2.62148
### Fold 4
[100]	train's multi_logloss: 2.58495	valid's multi_logloss: 2.61849
### Fold 5
[100]	train's multi_logloss: 2.58555	valid's multi_logloss: 2.61722


In [4]:
# Build the submission: read the sample submission file, overwrite (or add)
# its BeatsPerMinute column with the final predictions, and save it as CSV.
sample_submission = pd.read_csv(
    "/kaggle/input/playground-series-s5e9/sample_submission.csv"
).assign(BeatsPerMinute=final_pred)

sample_submission.to_csv('submission.csv', index=False)
print('Submission file saved.')

Submission file saved.


In [5]:
# # 訓練データのスコア
# # from sklearn.metrics import f1_score
# from sklearn.metrics import mean_squared_error
# import matplotlib.pyplot as plt

# # pred_rescaled = pred_lgb * y_std + y_mean

# # RMSE_lgb = np.sqrt(mean_squared_error(y_train,pred_rescaled))
# RMSE_lgb = np.sqrt(mean_squared_error(y,pred_lgb))
# print(f"LGB: RMSE score = {RMSE_lgb}")

# # 学習曲線
# # lgb.plot_metric(evals_result_lgb,title="LightGBM RMSE",)

In [6]:
# # 提出データ
# ensemble_test = (pred_lgb_test + preds_test) / 2.0

# # 提出データ作成
# sample_submission = pd.read_csv("/kaggle/input/playground-series-s5e9/sample_submission.csv")

# sample_submission['BeatsPerMinute'] = ensemble_test
# sample_submission.to_csv('submission.csv', index=False)
# print('Submission file saved.')

In [7]:
# 提出データ作成
# sample_submission = pd.read_csv("/kaggle/input/playground-series-s5e9/sample_submission.csv")

# sample_submission['BeatsPerMinute'] = pred_lgb_test
# sample_submission.to_csv('submission.csv', index=False)
# print('Submission file saved.')