# UEC実践ソフトウェア開発基礎論2023-レポート課題2
田中久温

## 概要
元コンペ: https://www.kaggle.com/competitions/bnp-paribas-cardif-claims-management \
個人向け保険のデータセット．顧客から届いた保険金請求を速やかに審査し，承認するか否かを決める必要がある．\
データセットの請求は2つのカテゴリに分かれており，各請求がどちらのカテゴリに属するかを判定したい．\
出力は確率，評価指標は logloss \
目的変数も説明変数も匿名で，特に説明無し．順序変数はないらしい．

## 手法
この notebook では，LightGBM を試す．\
XGBoost の次に実装する．

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the competition's train / test / sample-submission tables, in that
# order; each .zip path is handed to read_csv unchanged.
train_df, test_df, sample_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/bnp-paribas-cardif-claims-management/train.csv.zip",
        "../input/bnp-paribas-cardif-claims-management/test.csv.zip",
        "../input/bnp-paribas-cardif-claims-management/sample_submission.csv.zip",
    )
)

In [None]:
# Stack train rows on top of test rows so preprocessing is applied to both
# at once (the original row labels of each frame are kept as-is).
train_test_df = pd.concat([train_df, test_df], axis=0)

## EDA: XGBoost と同じなので省略

In [None]:
# Object-dtype columns are treated as categorical (approach taken from the
# instructor's sample). Every remaining column after the first two
# (presumably ID and target -- confirm against the CSV) is treated as numeric.
categorical_columns = [
    col for col, col_dtype in train_test_df.dtypes.items() if col_dtype == 'O'
]
_categorical_lookup = set(categorical_columns)
numeric_columns = [
    col for col in train_test_df.columns[2:] if col not in _categorical_lookup
]

## 欠損値の処理
LightGBM は欠損値を含むデータをそのまま入力できるので，いったん欠損値処理は行わない．

## カテゴリ変数の変換

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace every categorical column with integer codes. Each column gets its
# own encoder, fitted on the combined train+test values.
# NOTE(review): missing values have not been filled at this point; how
# LabelEncoder treats NaN depends on the scikit-learn version -- confirm.
for col_name in categorical_columns:
    raw_values = train_test_df[col_name]
    train_test_df[col_name] = LabelEncoder().fit_transform(raw_values)

## 学習

In [None]:
# Feature columns fed to the model: numeric features first, then categorical.
use_cols = [*numeric_columns, *categorical_columns]

In [None]:
# Data split
from sklearn.model_selection import train_test_split

# Rows with a target value are the labelled training rows; rows whose target
# is missing are the ones to predict. The mask is computed once and reused.
has_target = train_test_df['target'].notna()

train_ft_df = train_test_df.loc[has_target, use_cols]
train_y_df = train_test_df.loc[has_target, 'target'].astype(int)
test_ft_df = train_test_df.loc[~has_target, use_cols]

# Hold out 20% for validation. The seed is fixed so that the split -- and
# therefore early stopping and the validation scores -- is reproducible
# between runs.
train_X, valid_X, train_y, valid_y = train_test_split(
    train_ft_df, train_y_df, test_size=0.2, shuffle=True, random_state=42
)

In [None]:
import lightgbm as lgb

In [None]:
# Wrap the train / validation splits as LightGBM Datasets. Both are built
# with the same feature names and the same categorical column list.
_dataset_kwargs = dict(
    feature_name=use_cols,
    categorical_feature=categorical_columns,
)
train_data = lgb.Dataset(train_X, label=train_y, **_dataset_kwargs)
valid_data = lgb.Dataset(valid_X, label=valid_y, **_dataset_kwargs)

In [None]:
# LightGBM training parameters: only the binary objective is set here.
params = dict(objective='binary')

In [None]:
n_rounds = 1000

# Callbacks: early stopping with stopping_rounds=10 (verbose), and
# log_evaluation with a period of 0.
training_callbacks = [
    lgb.early_stopping(stopping_rounds=10, verbose=True),
    lgb.log_evaluation(0),
]

# Train for at most n_rounds boosting rounds, evaluating on both the
# training set ('train') and the hold-out set ('valid').
clf = lgb.train(
    params,
    train_data,
    num_boost_round=n_rounds,
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'valid'],
    callbacks=training_callbacks,
)

In [None]:
# Predict with the trained Booster on the test feature rows.
# feature_name / categorical_feature are Dataset-construction arguments;
# Booster.predict would only forward them as extra prediction parameters, so
# they are not passed here. The iteration recorded as best during training
# is requested explicitly.
pred_test = clf.predict(test_ft_df, num_iteration=clf.best_iteration)
pred_test

In [None]:
# Build the submission table (ID column from test_df plus the predictions)
# and keep it in sub_df; assigning the result of to_csv() would leave
# sub_df as None.
sub_df = pd.DataFrame({
    "ID": test_df["ID"],
    "PredictedProb": pred_test,
})
sub_df.to_csv("tanaka_sub_lgb.csv", index=False)

public 0.474 で XGBoost より悪い．

欠損値処理も XGBoost のやつに合わせてみる．

## 特徴量作成2

In [None]:
# Rebuild the combined frame from the raw train/test tables, discarding the
# in-place changes made to the previous train_test_df.
train_test_df = pd.concat([train_df, test_df], axis=0)

In [None]:
# Missing-value handling
# Placeholder used to turn missing categorical values into their own category.
MISSING_CATEGORY = "__MISSING__"

# numeric
for c in numeric_columns:
    nan_ratio = train_test_df[c].isna().mean()

    if nan_ratio < 0.1:
        # Fewer than 10% missing: fill with the column mean so the gaps
        # carry little weight.
        fill_value = train_test_df[c].mean()
    else:
        # 10% or more missing: fill with -999 so the model can pick up on
        # missingness itself.
        fill_value = -999
    train_test_df[c] = train_test_df[c].fillna(fill_value)

# categorical
for c in categorical_columns:
    nan_count = train_test_df[c].isna().sum()

    if nan_count <= 10:
        # At most 10 missing values: fill with the most frequent value.
        fill_value = train_test_df[c].mode()[0]
    else:
        # More than 10 missing values: keep missingness as a separate
        # category. (This branch previously filled with the mode as well,
        # which made it identical to the branch above and contradicted its
        # own comment.)
        fill_value = MISSING_CATEGORY
    train_test_df[c] = train_test_df[c].fillna(fill_value)

In [None]:
# Label encoding: swap each categorical column for integer codes, using a
# separate encoder per column fitted on the combined train+test values.
for col_name in categorical_columns:
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(train_test_df[col_name])
    train_test_df[col_name] = encoded_values

In [None]:
# Data split
# Rows with a target value are the labelled training rows; rows whose target
# is missing are the ones to predict. The mask is computed once and reused.
has_target = train_test_df['target'].notna()

train_ft_df = train_test_df.loc[has_target, use_cols]
train_y_df = train_test_df.loc[has_target, 'target'].astype(int)
test_ft_df = train_test_df.loc[~has_target, use_cols]

# Hold out 20% for validation. The seed is fixed so that the split -- and
# therefore early stopping and the validation scores -- is reproducible
# between runs.
train_X, valid_X, train_y, valid_y = train_test_split(
    train_ft_df, train_y_df, test_size=0.2, shuffle=True, random_state=42
)

In [None]:
# Re-create the LightGBM Datasets from the newly preprocessed splits, using
# the same feature names and categorical column list for both.
_dataset_kwargs = dict(
    feature_name=use_cols,
    categorical_feature=categorical_columns,
)
train_data = lgb.Dataset(train_X, label=train_y, **_dataset_kwargs)
valid_data = lgb.Dataset(valid_X, label=valid_y, **_dataset_kwargs)

In [None]:
# Same configuration as the first run: binary objective, up to 1000 rounds,
# early stopping with stopping_rounds=10, log_evaluation with period 0.
params = dict(objective='binary')
n_rounds = 1000

training_callbacks = [
    lgb.early_stopping(stopping_rounds=10, verbose=True),
    lgb.log_evaluation(0),
]
clf = lgb.train(
    params,
    train_data,
    num_boost_round=n_rounds,
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'valid'],
    callbacks=training_callbacks,
)

In [None]:
# Predict with the trained Booster on the test feature rows.
# feature_name / categorical_feature are Dataset-construction arguments;
# Booster.predict would only forward them as extra prediction parameters, so
# they are not passed here. The iteration recorded as best during training
# is requested explicitly.
pred_test = clf.predict(test_ft_df, num_iteration=clf.best_iteration)
pred_test

In [None]:
# Build the submission table (ID column from test_df plus the predictions)
# and keep it in sub_df; assigning the result of to_csv() would leave
# sub_df as None.
sub_df = pd.DataFrame({
    "ID": test_df["ID"],
    "PredictedProb": pred_test,
})
sub_df.to_csv("tanaka_sub_lgb.csv", index=False)

public 0.473 やっぱり XGBoost より悪い．

ハイパラチューニングする．

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter grid: every combination of learning rate, minimum number
# of samples per leaf and maximum tree depth is evaluated.
cv_params = dict(
    learning_rate=[0.05, 0.1, 0.3],
    min_data_in_leaf=[10, 20, 50, 100],
    max_depth=[2, 4, 6, 8],
)

clf = lgb.LGBMClassifier(n_estimators=1000, objective='binary')

# 3-fold cross-validated grid search scored with 'neg_log_loss', using all
# available workers (n_jobs=-1).
gridcv = GridSearchCV(
    estimator=clf,
    param_grid=cv_params,
    cv=3,
    scoring='neg_log_loss',
    n_jobs=-1,
)

# Extra fit arguments handed to GridSearchCV.fit: the categorical column
# list, the hold-out split as eval_set, and the early-stopping / logging
# callbacks.
fit_kwargs = dict(
    categorical_feature=categorical_columns,
    eval_set=[(valid_X, valid_y)],
    callbacks=[
        lgb.early_stopping(stopping_rounds=10, verbose=True),
        lgb.log_evaluation(0),
    ],
)
gridcv.fit(train_X, train_y, **fit_kwargs)

# Report the winning parameter combination and its cross-validated score.
best_params = gridcv.best_params_
best_score = gridcv.best_score_
print(f'最適パラメータ {best_params}\nスコア {best_score}')

In [None]:
# Predict probabilities with the fitted grid search and keep the column at
# index 1 as the submitted probability.
class_probabilities = gridcv.predict_proba(test_ft_df)
pred_test = class_probabilities[:, 1]
pred_test

In [None]:
# Build the submission table and keep it in sub_df; assigning the result of
# to_csv() would leave sub_df as None.
sub_df = pd.DataFrame({
    "ID": test_df["ID"],
    "PredictedProb": pred_test,
})
# These are LightGBM predictions, so the file uses the same "lgb" name as
# the other submissions in this notebook rather than the "xgb" name carried
# over from the XGBoost notebook.
sub_df.to_csv("tanaka_sub_lgb.csv", index=False)

public 0.475

結局，XGBoost に勝てず．