# XGBoost

In [1]:
import xgboost as xgb

# XGBoostの関数

In [None]:
# Default training configuration passed as ``params`` to ``xgb.train``.
xgb_params = dict(
    objective='binary:logistic',  # binary classification objective
    eval_metric='auc',            # metric reported on the evaluation sets
    learning_rate=0.01,
    tree_method='gpu_hist',       # GPU tree construction method
)

In [None]:
def fit_xgboost(X, y, params, folds, add_suffix=''):
    """Train one XGBoost booster per fold and return out-of-fold predictions.

    For every distinct value in ``folds``, the rows carrying that value form
    the validation set and all remaining rows form the training set. Each
    fitted booster is pickled to ``xgb_fold{fold}{add_suffix}.pkl`` (relative
    to the current working directory), and the per-fold and overall ROC AUC
    are printed.

    Args:
        X: Feature table, row-selectable with a boolean mask built from
            ``folds`` (presumably a pandas DataFrame; confirm with callers).
        y: Target values, row-aligned with ``X`` and selectable with the
            same boolean mask.
        params: Parameter dict forwarded to ``xgb.train``, for example::

                {
                    'objective': 'binary:logistic',
                    'eval_metric': 'auc',
                    'learning_rate': 0.01,
                    'tree_method': 'gpu_hist',
                }
        folds: Fold label of every row (the output of a validation split).
            Must provide ``.unique()`` and element-wise comparison, and be
            in the same row order as ``X`` and ``y``.
        add_suffix: Text appended to each saved model's file name.

    Returns:
        A float32 NumPy array of length ``len(y)`` in which position ``i``
        holds the prediction for row ``i`` made by the booster that did not
        train on that row.
    """
    oof_pred = np.zeros(len(y), dtype=np.float32)

    for fold in sorted(folds.unique()):
        idx_train = (folds != fold)
        idx_valid = (folds == fold)
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]

        # XGBoost datasets for this fold.
        xgb_train = xgb.DMatrix(x_train, label=y_train)
        xgb_valid = xgb.DMatrix(x_valid, label=y_valid)
        evals = [(xgb_train, 'train'), (xgb_valid, 'eval')]

        # The round cap is deliberately huge; training is expected to end via
        # early stopping once the evaluation metric stops improving.
        model = xgb.train(
            params,
            xgb_train,
            num_boost_round=100000,
            early_stopping_rounds=100,
            evals=evals,
            verbose_eval=100,
        )

        # Persist the booster; the context manager guarantees the file is
        # flushed and closed even if pickling fails.
        with open(f'xgb_fold{fold}{add_suffix}.pkl', 'wb') as model_file:
            pickle.dump(model, model_file)

        # Predict with the trees up to the best iteration only. The existing
        # validation DMatrix is reused instead of building a second one.
        # NOTE(review): ntree_limit / best_ntree_limit are deprecated in newer
        # XGBoost releases in favour of iteration_range; confirm against the
        # installed version before upgrading.
        pred_i = model.predict(xgb_valid, ntree_limit=model.best_ntree_limit)

        # Write by row position (boolean mask), not by index label, so the
        # result is correct even when X does not have a default 0..n-1 index.
        oof_pred[np.asarray(idx_valid, dtype=bool)] = pred_i

        # Per-fold AUC, rounded to 5 decimals.
        score = round(roc_auc_score(y_valid, pred_i), 5)
        print(f'Performance of the prediction: {score}\n')

        # Drop the references so the fold's objects can be reclaimed.
        del model, xgb_train, xgb_valid

    score = round(roc_auc_score(y, oof_pred), 5)
    print(f'All Performance of the prediction: {score}')
    gc.collect()
    return oof_pred

# XGBoostの予測の関数

In [None]:
def pred_xgboost(X, data_dir: Path, add_suffix=''):
    """Average the predictions of every pickled XGBoost booster in a directory.

    Args:
        X: Feature data accepted by ``xgb.DMatrix``.
        data_dir: Directory searched for files matching
            ``xgb*{add_suffix}.pkl``.
        add_suffix: Suffix that the model file names must end with (before
            ``.pkl``). NOTE(review): with the default ``''`` the pattern
            matches every ``xgb*.pkl`` file, including ones saved with a
            non-empty suffix; confirm this is intended.

    Returns:
        NumPy array holding the element-wise mean of all models' predictions.

    Raises:
        FileNotFoundError: If no model file matches the pattern (previously
            this silently produced NaN).
    """
    pattern = str(data_dir / f'xgb*{add_suffix}.pkl')
    # Sorted so the models are always loaded and averaged in the same order.
    model_paths = sorted(glob(pattern))
    if not model_paths:
        raise FileNotFoundError(f'No XGBoost model files match {pattern}')

    # Unpickle each booster, closing every file as soon as it has been read.
    # NOTE: pickle can execute arbitrary code; only load trusted model files.
    models = []
    for model_path in model_paths:
        with open(model_path, 'rb') as model_file:
            models.append(pickle.load(model_file))

    # Build the DMatrix once; it is identical for every model.
    dmatrix = xgb.DMatrix(X)
    # NOTE(review): ntree_limit / best_ntree_limit are deprecated in newer
    # XGBoost releases in favour of iteration_range; confirm against the
    # installed version before upgrading.
    preds = np.array([
        model.predict(dmatrix, ntree_limit=model.best_ntree_limit)
        for model in models
    ])
    # Average across models (axis 0 indexes the models).
    return np.mean(preds, axis=0)