In [1]:
# 関連ライブラリをインポート
import lightgbm as lgb
import numpy as np
#import os
import pandas as pd
import random
#import torch

# scikit-learn関連をインポート
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# warningを非表示
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Reference: fix the random seeds so runs are repeatable
def seed_everything(seed=1234):
    """Seed Python's ``random`` module and NumPy's legacy global RNG.

    Args:
        seed: Integer seed handed to both generators (default 1234).

    Returns:
        None. Only the global RNG states are changed.
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)
    # Further seeding hooks, left disabled because the `os` and `torch`
    # imports are commented out in this notebook:
    # os.environ['PYTHONHASHSEED'] = str(seed)
    # torch.manual_seed(seed)
    # torch.cuda.manual_seed(seed)
    # torch.backends.cudnn.deterministic = True


seed_everything(1234)

## データ読み込み

In [3]:
# Load the training table, the test table and the sample submission file
train, test, gender_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../ignore_dir/input/train.csv',
        '../ignore_dir/input/test.csv',
        '../ignore_dir/input/gender_submission.csv',
    )
)

## 前処理・特徴量エンジニアリング

In [4]:
# Concatenate train and test so both receive identical preprocessing.
# NOTE: `train` / `test` are rebound to the preprocessed frames further down,
# so re-run the data-loading cell before re-running this one.
n_train = len(train)
data = pd.concat([train, test], sort=False)

# Preprocessing.
# Each step assigns its result back to the column instead of calling
# `data[col].method(..., inplace=True)`: that form mutates an intermediate
# Series, which leaves `data` unchanged under pandas Copy-on-Write, and the
# warning about it is hidden by the notebook's warnings filter.
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1}).astype(int)
data['Embarked'] = data['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
data['Fare'] = data['Fare'].fillna(data['Fare'].mean())
data['Age'] = data['Age'].fillna(data['Age'].median())
data['FamilySize'] = data['Parch'] + data['SibSp'] + 1
data['IsAlone'] = (data['FamilySize'] == 1).astype(int)  # 1 when travelling alone

# Drop the columns that are not used as features
delete_columns = ['Name', 'PassengerId', 'Ticket', 'Cabin']
data = data.drop(columns=delete_columns)

# Split back into the training rows and the test rows
train = data[:n_train]
test = data[n_train:]

# Separate the target from the explanatory variables
y_train = train['Survived']
X_train = train.drop('Survived', axis=1)
X_test = test.drop('Survived', axis=1)

# Show shapes and the first rows
print(f"X_train: {X_train.shape}")
display(X_train.head())
print(f"y_train: {y_train.shape}")
display(y_train.head())
print(f"X_test: {X_test.shape}")
display(X_test.head())

X_train: (891, 9)


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone
0,3,0,22.0,1,0,7.25,0,2,0
1,1,1,38.0,1,0,71.2833,1,2,0
2,3,1,26.0,0,0,7.925,0,1,1
3,1,1,35.0,1,0,53.1,0,2,0
4,3,0,35.0,0,0,8.05,0,1,1


y_train: (891,)


0    0.0
1    1.0
2    1.0
3    1.0
4    0.0
Name: Survived, dtype: float64

X_test: (418, 9)


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone
0,3,0,34.5,0,0,7.8292,2,1,1
1,3,1,47.0,1,0,7.0,0,2,0
2,2,0,62.0,0,0,9.6875,2,1,1
3,3,0,27.0,0,0,8.6625,0,1,1
4,3,1,22.0,1,1,12.2875,0,3,0


## 交差検証の準備

In [5]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Check that the target is split evenly across the folds.
# cv.split() yields positional row indices, so rows are selected with .iloc;
# label-based .loc / [] only works while the index happens to be 0..n-1.
for fold_id, (train_index, valid_index) in enumerate(cv.split(X_train, y_train)):
    X_tr = X_train.iloc[train_index]
    X_val = X_train.iloc[valid_index]
    y_tr = y_train.iloc[train_index]
    y_val = y_train.iloc[valid_index]

    print(f'fold_id: {fold_id}')
    print(f'y_tr y==1 rate: {y_tr.mean()}')
    print(f'y_val y==1 rate: {y_val.mean()}')
    print("--------------------------------")

fold_id: 0
y_tr y==1 rate: 0.38342696629213485
y_val y==1 rate: 0.3854748603351955
--------------------------------
fold_id: 1
y_tr y==1 rate: 0.38429172510518933
y_val y==1 rate: 0.38202247191011235
--------------------------------
fold_id: 2
y_tr y==1 rate: 0.38429172510518933
y_val y==1 rate: 0.38202247191011235
--------------------------------
fold_id: 3
y_tr y==1 rate: 0.38429172510518933
y_val y==1 rate: 0.38202247191011235
--------------------------------
fold_id: 4
y_tr y==1 rate: 0.38288920056100983
y_val y==1 rate: 0.38764044943820225
--------------------------------


## メインループ

In [6]:
# Accumulators filled by the main loop:
#   y_preds - one array of test-set predictions per fold
#   models  - the trained model of each fold
y_preds, models = [], []
# Out-of-fold predictions: one slot per training row, filled by the fold
# in which that row was held out
oof_train = np.zeros(len(X_train))
# Columns LightGBM should treat as categorical
categorical_features = ['Embarked', 'Pclass', 'Sex']

# Hyperparameters
params = dict(
    objective='binary',
    max_bin=300,
    learning_rate=0.05,
    num_leaves=40,
)

In [7]:
# Main loop: train one LightGBM model per fold.
# The accumulators are reset here so that re-running this cell does not
# append a second set of folds to `models` / `y_preds`, which would skew the
# averaged scores and test predictions computed afterwards.
y_preds = []
models = []
oof_train = np.zeros(len(X_train))

for fold_id, (train_index, valid_index) in enumerate(cv.split(X_train, y_train)):
    # Progress output
    print('-------------------')
    print(f'Fold: {fold_id}')

    # Split into training and validation rows.
    # cv.split() yields positional indices, hence .iloc rather than .loc / [].
    X_tr = X_train.iloc[train_index]
    X_val = X_train.iloc[valid_index]
    y_tr = y_train.iloc[train_index]
    y_val = y_train.iloc[valid_index]

    # Build the LightGBM datasets
    lgb_train = lgb.Dataset(X_tr, y_tr, categorical_feature=categorical_features)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train, categorical_feature=categorical_features)

    # Train. lgb_eval is the second entry of valid_sets; the scoring cell
    # reads its result back under the name 'valid_1', so keep this order.
    model = lgb.train(params, lgb_train,
                      valid_sets=[lgb_train, lgb_eval],
                      num_boost_round=1000,
                      callbacks=[lgb.early_stopping(stopping_rounds=10,
                                                    verbose=True),
                                 lgb.log_evaluation(10)])

    # Predict the held-out rows (stored at their positions) and the test set
    oof_train[valid_index] = model.predict(X_val, num_iteration=model.best_iteration)
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)

    # Keep the test predictions and the trained model of this fold
    y_preds.append(y_pred)
    models.append(model)

-------------------
Fold: 0
[LightGBM] [Info] Number of positive: 273, number of negative: 439
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 213
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.503721	valid_1's binary_logloss: 0.507892
[20]	training's binary_logloss: 0.426394	valid_1's binary_logloss: 0.439962
[30]	training's binary_logloss: 0.37914	valid_1's binary_logloss: 0.401837
[40]	training's binary_logloss: 0.34819	valid_1's binary_logloss: 0.389454
[50]	training's binary_logloss: 0.325355	valid_1's binary_logloss: 0.384696
[60]	training's binary_logloss: 0.304134	valid_1's binary_logloss: 0.381407
[70]	traini

## 評価

In [8]:
# Save the out-of-fold predictions (predictions for the fold each row was
# held out of) to CSV, then show the first ten values
pd.DataFrame(data=oof_train).to_csv(path_or_buf='oof_train_skfold.csv', index=False)
print(oof_train[0:10])

[0.10743915 0.9336148  0.24343427 0.97174844 0.23722008 0.05967737
 0.10523416 0.27028808 0.37330822 0.9413475 ]


In [9]:
# Collect and show each fold's validation score (logloss)
scores = list(map(
    lambda fold_model: fold_model.best_score['valid_1']['binary_logloss'],
    models,
))
print(scores)

[0.38009408187107163, 0.37924619589793596, 0.40508420679400237, 0.4450699329180633, 0.4825556025529494]


In [10]:
# Average the per-fold validation scores into the final CV score
score = sum(scores) / len(scores)
print('===CV scores===', score, sep='\n')

===CV scores===
0.41841000400680456


In [11]:
# Binarise the out-of-fold predictions at a 0.5 threshold and report accuracy
y_pred_oof = np.where(oof_train > 0.5, 1, 0)
print(accuracy_score(y_true=y_train, y_pred=y_pred_oof))

0.8294051627384961


## 提出

In [12]:
# Average the per-fold test-set predictions, then binarise at a 0.5 threshold
y_sub = ((sum(y_preds) / len(y_preds)) > 0.5).astype(int)
y_sub[:10]

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0])

In [13]:
# Build the submission file: take the sample submission as the template and
# overwrite its 'Survived' column with the predictions
sub = pd.read_csv('../ignore_dir/input/gender_submission.csv').assign(Survived=y_sub)
sub.to_csv('submission_lightgbm_skfold.csv', index=False)

sub.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
