In [5]:
# 必要なライブラリのインポート
import pandas as pd
import numpy as np

# モデル関連
from sklearn.model_selection import KFold
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import cohen_kappa_score

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

from scipy.optimize import minimize

# データの読み込み
train = pd.read_parquet("../data/input/train_preprocessed.parquet")
test = pd.read_parquet("../data/input/test_preprocessed.parquet")
sample = pd.read_csv("../data/input/sample_submission.csv")

# 訓練データとテストデータの準備
X = train.drop(['sii'], axis=1)
y = train['sii']
X_test = test

In [6]:

# モデルの定義
lgb_model = lgb.LGBMRegressor(
    n_estimators=200,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=-1,
    random_state=42
)

xgb_model = xgb.XGBRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=6,
    random_state=42
)

cat_model = CatBoostRegressor(
    iterations=200,
    learning_rate=0.05,
    depth=6,
    random_state=42,
    verbose=0
)

# アンサンブルモデルの作成
ensemble_model = VotingRegressor(
    estimators=[
        ('lgb', lgb_model),
        ('xgb', xgb_model),
        ('cat', cat_model)
    ],
    weights=[1, 1, 1]
)

# クロスバリデーションの設定
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# 予測値とスコアを格納するための配列を初期化
oof_predictions = np.zeros(len(X))
test_predictions = np.zeros(len(X_test))
val_scores = []

# クロスバリデーションのループ
for fold, (train_index, val_index) in enumerate(kf.split(X)):
    print(f"Fold {fold + 1}")
    
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    # モデルの訓練
    ensemble_model.fit(X_train, y_train)
    
    # 検証データでの予測
    val_pred = ensemble_model.predict(X_val)
    oof_predictions[val_index] = val_pred
    
    # テストデータでの予測
    test_pred = ensemble_model.predict(X_test)
    test_predictions += test_pred / n_splits
    
    # 検証スコアの計算
    val_pred_rounded = np.round(val_pred).astype(int)
    val_pred_rounded = np.clip(val_pred_rounded, y.min(), y.max())
    score = cohen_kappa_score(y_val, val_pred_rounded, weights='quadratic')
    val_scores.append(score)
    
    print(f"Validation QWK Score: {score}")

# 平均検証スコアの表示
mean_val_score = np.mean(val_scores)
print(f"Mean Validation QWK Score: {mean_val_score}")

# 最適な閾値の計算
def optimize_thresholds(y_true, y_pred):
    def loss_func(thresholds):
        y_pred_adj = np.digitize(y_pred, bins=thresholds)
        return -cohen_kappa_score(y_true, y_pred_adj, weights='quadratic')
    
    initial_thresholds = [0.5, 1.5, 2.5]
    result = minimize(loss_func, initial_thresholds, method='nelder-mead')
    return result.x

best_thresholds = optimize_thresholds(y, oof_predictions)
print(f"Optimized thresholds: {best_thresholds}")

# テストデータの予測値に閾値を適用
final_test_predictions = np.digitize(test_predictions, bins=best_thresholds)
final_test_predictions = np.clip(final_test_predictions, y.min(), y.max()).astype(int)


Fold 1
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001254 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23673
[LightGBM] [Info] Number of data points in the train set: 2188, number of used features: 154
[LightGBM] [Info] Start training from score 0.587751
Validation QWK Score: 0.3641772421558781
Fold 2
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001426 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23702
[LightGBM] [Info] Number of data points in the train set: 2189, number of used features: 154
[LightGBM] [Info] Start training from score 0.596619
Validation QWK Score: 0.36002585834228995
Fold 3
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001301 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23723
[LightGBM] [I

In [7]:

# 提出用ファイルの作成
submission = pd.DataFrame({
    'id': sample['id'],
    'sii': final_test_predictions
})

submission.to_csv('submission.csv', index=False)
print("Submission file 'submission.csv' has been generated.")


Submission file 'submission.csv' has been generated.


In [8]:
submission

Unnamed: 0,id,sii
0,00008ff9,1
1,000fd460,0
2,00105258,0
3,00115b9f,0
4,0016bb22,1
5,001f3379,1
6,0038ba98,0
7,0068a485,0
8,0069fbed,1
9,0083e397,0
