<a href="https://colab.research.google.com/github/dAn-solution/competition/blob/main/Prob_kiva_016.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Kiva／クラウドファンディングの資金調達額予測
- GBDT（勾配ブースティング木）を実施

### Google Driveのマウント

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# IPython magic: move the working directory to the Drive root ("My Drive").
%cd /content/drive/'My Drive'

Mounted at /content/drive
/content/drive/My Drive


### ライブラリのインストール、インポート

In [None]:
# Switch the working directory to the project folder on Google Drive so that
# the relative paths used later resolve from there.
import os

project_dir = '/content/drive/My Drive/Probdata/kiva/'
os.chdir(project_dir)
print(os.getcwd())

/content/drive/My Drive/Probdata/kiva


In [None]:
class Config:
    """Notebook-wide settings: directory layout, model name, seed and debug flag."""

    # Run-level switches.
    seed = 42
    debug = False
    bert_model_name = 'bert-base-uncased'

    # Directory layout; every folder is resolved relative to root_path.
    root_path = './'
    input_path = os.path.join(root_path, 'input')
    output_path = os.path.join(root_path, 'output')
    result_path = os.path.join(root_path, 'result')

In [None]:
import pandas as pd
import numpy as np
import datetime
import warnings

import matplotlib.pyplot as plt

import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor

### データの読み込み

In [None]:
# Competition tables read from the input folder.
train_df, test_df = (
    pd.read_csv(f'{Config.input_path}/train.csv'),
    pd.read_csv(f'{Config.input_path}/test.csv'),
)
sample_df = pd.read_csv(f'{Config.input_path}/sample_submission.csv')

# Pre-computed feature arrays stored in the result folder
# (presumably PCA-reduced features written by earlier notebooks -- TODO confirm).
# Training-side arrays:
train_X_pca = np.load(f'{Config.result_path}/train_X_pca.npy')
train_tags_pca = np.load(f'{Config.result_path}/train_tags_pca.npy')
train_bert_pca = np.load(f'{Config.result_path}/train_bert_pca.npy')
train_bert_018_pca = np.load(f'{Config.result_path}/train_bert_018_pca.npy')

# Matching test-side arrays:
test_X_pca = np.load(f'{Config.result_path}/test_X_pca.npy')
test_tags_pca = np.load(f'{Config.result_path}/test_tags_pca.npy')
test_bert_pca = np.load(f'{Config.result_path}/test_bert_pca.npy')
test_bert_018_pca = np.load(f'{Config.result_path}/test_bert_018_pca.npy')

## GBDT（勾配ブースティング木）

In [None]:
# Select the regression target from the training table.
# The raw categorical columns are kept below for reference only; they are
# currently disabled as model inputs.
# train_columns = ["ORIGINAL_LANGUAGE", "ACTIVITY_NAME", "SECTOR_NAME", "COUNTRY_CODE","CURRENCY_POLICY",
#        "CURRENCY","REPAYMENT_INTERVAL", "DISTRIBUTION_MODEL"]
# X = train_df[train_columns]
# test_X = test_df[train_columns]
target_column = "LOAN_AMOUNT"

y = train_df.loc[:, target_column]

In [None]:
# Stack the pre-computed feature arrays column-wise. Train and test list their
# arrays in the same order so the resulting columns line up.
train_blocks = [train_X_pca, train_tags_pca, train_bert_pca, train_bert_018_pca]
test_blocks = [test_X_pca, test_tags_pca, test_bert_pca, test_bert_018_pca]
train_pca = np.concatenate(train_blocks, axis=1)
test_pca = np.concatenate(test_blocks, axis=1)

# Disabled variant that would prepend the raw categorical columns:
# X = pd.concat([X, pd.DataFrame(train_pca)], axis=1)
# test_X = pd.concat([test_X, pd.DataFrame(test_pca)], axis=1)
X = pd.DataFrame(train_pca)
test_X = pd.DataFrame(test_pca)

# Report both shapes (train first, then test).
for frame in (X, test_X):
    print(frame.shape)

(91333, 18)
(91822, 18)


In [None]:
# Split the labelled data into a training part (80%) and a held-out part (20%),
# seeded from Config so the split is repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=Config.seed,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Training / prediction
#
# 5-fold cross-validation on the training split only. The held-out split
# (X_test / y_test) is never used for fitting, so it can be scored afterwards
# as an honest check of the fold-averaged model.

y_oof = np.zeros(len(y_train))  # out-of-fold predictions, positionally aligned with X_train
meta_preds = []                 # per-fold predictions on the full labelled set X
y_preds = []                    # per-fold predictions on the competition test set
holdout_preds = []              # per-fold predictions on the held-out split
kf = StratifiedKFold(n_splits=5)
for fold, (tr_idx, vl_idx) in enumerate(kf.split(X_train, y_train)):
    # kf.split returns positions within X_train / y_train, so those are the
    # objects that must be indexed. Indexing the unsplit X / y with the same
    # positions selects different rows, because train_test_split shuffled them.
    x_tr_fold = X_train.iloc[tr_idx]
    y_tr_fold = y_train.iloc[tr_idx]
    x_vl_fold = X_train.iloc[vl_idx]
    y_vl_fold = y_train.iloc[vl_idx]

    # Seeded so repeated runs build the same trees.
    model = GradientBoostingRegressor(random_state=Config.seed)
    model.fit(x_tr_fold, y_tr_fold)
    y_oof[vl_idx] = model.predict(x_vl_fold)
    # NOTE(review): these predictions cover all of X, including the rows this
    # fold's model was fitted on, so the meta-model feature built from them is
    # partly in-sample. Kept as-is to preserve the shape of the exported file.
    meta_preds.append(model.predict(X))
    y_preds.append(model.predict(test_X))
    holdout_preds.append(model.predict(X_test))
    print(
        f'fold {fold} score:', mean_absolute_error(y_vl_fold, y_oof[vl_idx])
    )

# Overall out-of-fold error across the whole training split.
print('OOF score:', mean_absolute_error(y_train, y_oof))
# Error of the fold-averaged prediction on rows no fold model was fitted on.
print('holdout score:', mean_absolute_error(y_test, np.mean(holdout_preds, axis=0)))
# Error of the last fold's model on all labelled rows (mostly in-sample).
print('X score:', mean_absolute_error(y, model.predict(X)))



fold 0 score: 415.2923276311962
fold 1 score: 401.9372392136452
fold 2 score: 393.6854717308769
fold 3 score: 406.61001707428454
fold 4 score: 411.03315985867505
X score: 402.45418291847034


In [None]:
# Hand-off to the meta model: average the per-fold predictions and save them
# as single-column CSV files in the result folder.
train_meta_feature = np.mean(meta_preds, axis=0)
train_016_path = os.path.join(Config.result_path, "train_016.csv")
train_016_df = pd.DataFrame(train_meta_feature)
train_016_df.to_csv(train_016_path, index=False)

test_meta_feature = np.mean(y_preds, axis=0)
test_016_path = os.path.join(Config.result_path, "test_016.csv")
test_016_df = pd.DataFrame(test_meta_feature)
test_016_df.to_csv(test_016_path, index=False)

### 提出データの作成

In [None]:
# Fill the submission template with the fold-averaged test predictions.
fold_mean_pred = np.mean(y_preds, axis=0)
sample_df["LOAN_AMOUNT"] = fold_mean_pred
print(sample_df)

       LOAN_ID  LOAN_AMOUNT
0      2041445   222.302455
1      1944435  1005.340629
2      2083354   626.858837
3      1993565   509.742378
4      2064272   194.177299
...        ...          ...
91817  1993862   458.409999
91818  2015070   328.913609
91819  1950349   389.232059
91820  1921580  1282.064429
91821  1976733   718.144139

[91822 rows x 2 columns]


In [None]:
# Capture the creation time that goes into the submission file name.
# NOTE(review): the timestamp uses a fixed UTC+1 offset -- confirm this is the
# intended timezone.
file_tz = datetime.timezone(datetime.timedelta(hours=1))
now = datetime.datetime.now(file_tz)

# Rendered as yyyymmdd_hhmm.
date_time = "{0:%Y%m%d_%H%M}".format(now)

In [None]:
# Write the submission file (saved to Google Drive via the output folder).
# File name format: submit_yyyymmdd_hhmm.csv
submit_path = os.path.join(Config.output_path, f"submit_{date_time}.csv")
sample_df.to_csv(submit_path, index=False)
