# UEC実践ソフトウェア開発基礎論2023-レポート課題2
田中久温

## 概要
元コンペ: https://www.kaggle.com/competitions/bnp-paribas-cardif-claims-management \
個人保険のデータセット．顧客から来た保険金の請求を速やかにチェックして承認 or 非承認しないといけない．\
データセットに含まれる請求には2つのカテゴリが含まれていて，どっちのカテゴリか判定したい．\
出力は確率，評価指標は logloss \
目的変数も説明変数も匿名で，特に説明無し．順序変数はないらしい．

## 手法
この notebook では，CatBoost を試す．\
LightGBM の次に実装する．

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
train_df = pd.read_csv("../input/bnp-paribas-cardif-claims-management/train.csv.zip")
test_df = pd.read_csv("../input/bnp-paribas-cardif-claims-management/test.csv.zip")
sample_df = pd.read_csv("../input/bnp-paribas-cardif-claims-management/sample_submission.csv.zip")

In [3]:
train_test_df = pd.concat([train_df, test_df])

In [4]:
categorical_columns = [c for c in train_test_df.columns if train_test_df[c].dtype == 'O'] # 先生のサンプルから
numeric_columns = [c for c in train_test_df.columns[2:] if c not in categorical_columns]

## カテゴリ変数の変換
欠損値処理等とくにせずにやる

In [5]:
from sklearn.preprocessing import LabelEncoder

for c in categorical_columns:
    le = LabelEncoder()
    train_test_df[c] = le.fit_transform(train_test_df[c])



## 学習

In [6]:
use_cols = numeric_columns + categorical_columns

In [7]:
# データ分割
from sklearn.model_selection import train_test_split 

train_ft_df = train_test_df[train_test_df['target'].notna()][use_cols]
train_y_df = train_test_df[train_test_df['target'].notna()]['target'].astype(int)
test_ft_df = train_test_df[train_test_df['target'].isna()][use_cols]

train_X, valid_X, train_y, valid_y = train_test_split(train_ft_df, train_y_df, test_size=0.2, shuffle=True)

In [8]:
from catboost import CatBoostClassifier, Pool

In [9]:
clf = CatBoostClassifier(iterations=1000, loss_function='Logloss')

In [10]:
clf.fit(train_X, train_y, eval_set=(valid_X, valid_y), early_stopping_rounds=10, verbose_eval=False)

<catboost.core.CatBoostClassifier at 0x7833e1c47520>

In [11]:
pred_test = clf.predict_proba(test_ft_df)[:, 1]

In [12]:
sub_df = pd.DataFrame({
    "ID": test_df["ID"], 
    "PredictedProb": pred_test
}).to_csv("tanaka_sub_cat.csv", index=False)

public 0.469

### cat features 指定してみる

In [13]:
clf = CatBoostClassifier(iterations=1000, loss_function='Logloss', cat_features=categorical_columns)

In [14]:
clf.fit(train_X, train_y, eval_set=(valid_X, valid_y), early_stopping_rounds=10, logging_level='Verbose')

Learning rate set to 0.096678
0:	learn: 0.6518990	test: 0.6515668	best: 0.6515668 (0)	total: 415ms	remaining: 6m 54s
1:	learn: 0.6180077	test: 0.6174302	best: 0.6174302 (1)	total: 809ms	remaining: 6m 43s
2:	learn: 0.5904154	test: 0.5898452	best: 0.5898452 (2)	total: 1.16s	remaining: 6m 26s
3:	learn: 0.5685905	test: 0.5680161	best: 0.5680161 (3)	total: 1.46s	remaining: 6m 3s
4:	learn: 0.5521475	test: 0.5518286	best: 0.5518286 (4)	total: 1.77s	remaining: 5m 51s
5:	learn: 0.5375618	test: 0.5371716	best: 0.5371716 (5)	total: 2.1s	remaining: 5m 48s
6:	learn: 0.5274062	test: 0.5269240	best: 0.5269240 (6)	total: 2.45s	remaining: 5m 47s
7:	learn: 0.5180816	test: 0.5175125	best: 0.5175125 (7)	total: 2.78s	remaining: 5m 45s
8:	learn: 0.5110858	test: 0.5104440	best: 0.5104440 (8)	total: 3.14s	remaining: 5m 46s
9:	learn: 0.5057954	test: 0.5049984	best: 0.5049984 (9)	total: 3.46s	remaining: 5m 42s
10:	learn: 0.5008016	test: 0.4999388	best: 0.4999388 (10)	total: 3.79s	remaining: 5m 40s
11:	learn: 0.

<catboost.core.CatBoostClassifier at 0x7833e3641330>

In [15]:
pred_test = clf.predict_proba(test_ft_df)[:, 1]
sub_df = pd.DataFrame({
    "ID": test_df["ID"], 
    "PredictedProb": pred_test
}).to_csv("tanaka_sub_cat.csv", index=False)

public 0.4489 めちゃくちゃ良い

これ以上良くなるとは思えないけど、一応 XGBoost にやった欠損値処理を入れてやってみる

## XGBoost でやった欠損値処理入れる

In [16]:
train_test_df = pd.concat([train_df, test_df])

In [17]:
# numeric
for c in numeric_columns:
    nan_ratio = train_test_df[c].isna().mean()

    if nan_ratio < 0.1:
        # nan が 10% 未満なら平均値で埋める．(あんまり考慮したくない)
        column_mean = train_test_df[c].mean()
        train_test_df[c] = train_test_df[c].fillna(column_mean)
    else:
        # nan が 10% 以上あれば -999 で埋める．(モデルが考慮できるようにする)
        train_test_df[c] = train_test_df[c].fillna(-999)
    
# categorical
for c in categorical_columns:
    nan_count = train_test_df[c].isna().sum()

    if nan_count <= 10:
        # nan が 10個以下なら，最頻値で埋める．(あんまり考慮したくない)
        train_test_df[c] = train_test_df[c].fillna(train_test_df[c].mode()[0])
    else:
        # nan が 10個より多いなら，カテゴリとして扱う．(モデルが考慮できるようにする)
        train_test_df[c] = train_test_df[c].fillna(train_test_df[c].mode()[0])

In [18]:
for c in categorical_columns:
    le = LabelEncoder()
    train_test_df[c] = le.fit_transform(train_test_df[c])

In [19]:
train_ft_df = train_test_df[train_test_df['target'].notna()][use_cols]
train_y_df = train_test_df[train_test_df['target'].notna()]['target'].astype(int)
test_ft_df = train_test_df[train_test_df['target'].isna()][use_cols]

train_X, valid_X, train_y, valid_y = train_test_split(train_ft_df, train_y_df, test_size=0.2, shuffle=True)

In [20]:
clf = CatBoostClassifier(iterations=1000, loss_function='Logloss', cat_features=categorical_columns)

In [21]:
clf.fit(train_X, train_y, eval_set=(valid_X, valid_y), early_stopping_rounds=10, logging_level='Verbose')

Learning rate set to 0.096678
0:	learn: 0.6501422	test: 0.6498061	best: 0.6498061 (0)	total: 383ms	remaining: 6m 22s
1:	learn: 0.6169190	test: 0.6162776	best: 0.6162776 (1)	total: 735ms	remaining: 6m 6s
2:	learn: 0.5876432	test: 0.5869233	best: 0.5869233 (2)	total: 1.1s	remaining: 6m 4s
3:	learn: 0.5676105	test: 0.5668022	best: 0.5668022 (3)	total: 1.48s	remaining: 6m 7s
4:	learn: 0.5511861	test: 0.5502456	best: 0.5502456 (4)	total: 1.83s	remaining: 6m 3s
5:	learn: 0.5392388	test: 0.5382665	best: 0.5382665 (5)	total: 2.19s	remaining: 6m 3s
6:	learn: 0.5284741	test: 0.5274406	best: 0.5274406 (6)	total: 2.54s	remaining: 6m
7:	learn: 0.5197446	test: 0.5186236	best: 0.5186236 (7)	total: 2.88s	remaining: 5m 57s
8:	learn: 0.5115953	test: 0.5105434	best: 0.5105434 (8)	total: 3.24s	remaining: 5m 57s
9:	learn: 0.5060063	test: 0.5047548	best: 0.5047548 (9)	total: 3.57s	remaining: 5m 53s
10:	learn: 0.5010119	test: 0.4998579	best: 0.4998579 (10)	total: 3.91s	remaining: 5m 51s
11:	learn: 0.4964612	

<catboost.core.CatBoostClassifier at 0x7833e199f9a0>

In [22]:
pred_test = clf.predict_proba(test_ft_df)[:, 1]
sub_df = pd.DataFrame({
    "ID": test_df["ID"], 
    "PredictedProb": pred_test
}).to_csv("tanaka_sub_cat.csv", index=False)

public 0.452 で悪化．

欠損値を埋めたりするときに，情報が損失してるせい．

catboost の前処理が強すぎて，自作の前処理いらない．入れるとしても情報を増やすような前処理じゃないといけなさそう．

## 最後にハイパラチューニング版
欠損値埋めなしで，ハイパラチューニングする．

grid search したら時間がめちゃくちゃかかって終わらない上，そんなに精度も上がらなさそうだったので断念．

提出用にもう一回だけ学習する

In [23]:
train_test_df = pd.concat([train_df, test_df])
for c in categorical_columns:
    le = LabelEncoder()
    train_test_df[c] = le.fit_transform(train_test_df[c])

train_ft_df = train_test_df[train_test_df['target'].notna()][use_cols]
train_y_df = train_test_df[train_test_df['target'].notna()]['target'].astype(int)
test_ft_df = train_test_df[train_test_df['target'].isna()][use_cols]

train_X, valid_X, train_y, valid_y = train_test_split(train_ft_df, train_y_df, test_size=0.2, shuffle=True)

In [24]:
clf = CatBoostClassifier(iterations=1000, loss_function='Logloss', cat_features=categorical_columns)
clf.fit(train_X, train_y, eval_set=(valid_X, valid_y), early_stopping_rounds=10, logging_level='Verbose')

Learning rate set to 0.096678
0:	learn: 0.6501632	test: 0.6498034	best: 0.6498034 (0)	total: 369ms	remaining: 6m 8s
1:	learn: 0.6169758	test: 0.6163594	best: 0.6163594 (1)	total: 715ms	remaining: 5m 56s
2:	learn: 0.5907769	test: 0.5899961	best: 0.5899961 (2)	total: 1.08s	remaining: 5m 59s
3:	learn: 0.5687475	test: 0.5677544	best: 0.5677544 (3)	total: 1.42s	remaining: 5m 53s
4:	learn: 0.5514032	test: 0.5502908	best: 0.5502908 (4)	total: 1.81s	remaining: 6m 1s
5:	learn: 0.5374860	test: 0.5360518	best: 0.5360518 (5)	total: 2.3s	remaining: 6m 21s
6:	learn: 0.5267675	test: 0.5253494	best: 0.5253494 (6)	total: 2.8s	remaining: 6m 37s
7:	learn: 0.5187120	test: 0.5172005	best: 0.5172005 (7)	total: 3.23s	remaining: 6m 40s
8:	learn: 0.5112822	test: 0.5096483	best: 0.5096483 (8)	total: 3.58s	remaining: 6m 34s
9:	learn: 0.5056024	test: 0.5038960	best: 0.5038960 (9)	total: 3.85s	remaining: 6m 21s
10:	learn: 0.5008571	test: 0.4990828	best: 0.4990828 (10)	total: 4.18s	remaining: 6m 16s
11:	learn: 0.49

<catboost.core.CatBoostClassifier at 0x7833e1b50160>

In [25]:
pred_test = clf.predict_proba(test_ft_df)[:, 1]
sub_df = pd.DataFrame({
    "ID": test_df["ID"], 
    "PredictedProb": pred_test
}).to_csv("tanaka_sub_cat.csv", index=False)

In [26]:
# from sklearn.model_selection import GridSearchCV

In [27]:
'''
cv_params = {
    'depth': [8, 16, 32],
    'min_data_in_leaf': [1, 10],
    # 9時間経っても終わらなかったので，パラメータ減らした
}

clf = CatBoostClassifier(iterations=1000, loss_function='Logloss', cat_features=categorical_columns)

gridcv = GridSearchCV(
    clf, cv_params, cv=3,
    scoring='neg_log_loss', n_jobs=-1
)
gridcv.fit(
    train_X, train_y,
    eval_set = (valid_X, valid_y),
    early_stopping_rounds=10, logging_level='Verbose'
)

best_params = gridcv.best_params_
best_score = gridcv.best_score_
print(f'最適パラメータ {best_params}\nスコア {best_score}')
'''

"\ncv_params = {\n    'depth': [8, 16, 32],\n    'min_data_in_leaf': [1, 10],\n    # 9時間経っても終わらなかったので，パラメータ減らした\n}\n\nclf = CatBoostClassifier(iterations=1000, loss_function='Logloss', cat_features=categorical_columns)\n\ngridcv = GridSearchCV(\n    clf, cv_params, cv=3,\n    scoring='neg_log_loss', n_jobs=-1\n)\ngridcv.fit(\n    train_X, train_y,\n    eval_set = (valid_X, valid_y),\n    early_stopping_rounds=10, logging_level='Verbose'\n)\n\nbest_params = gridcv.best_params_\nbest_score = gridcv.best_score_\nprint(f'最適パラメータ {best_params}\nスコア {best_score}')\n"

In [28]:
'''
pred_test = gridcv.predict_proba(test_ft_df)[:, 1]
sub_df = pd.DataFrame({
    "ID": test_df["ID"], 
    "PredictedProb": pred_test
}).to_csv("tanaka_sub_cat.csv", index=False)
'''

'\npred_test = gridcv.predict_proba(test_ft_df)[:, 1]\nsub_df = pd.DataFrame({\n    "ID": test_df["ID"], \n    "PredictedProb": pred_test\n}).to_csv("tanaka_sub_cat.csv", index=False)\n'