# XGBoost

XGBoostを使ったランキング学習
* https://xgboost.readthedocs.io/en/latest/python/examples/learning_to_rank.html#sphx-glr-python-examples-learning-to-rank-py

<a href="https://colab.research.google.com/github/fuyu-quant/data-science-wiki/blob/develop/tabledata/ranking/xgboost.ipynb" target="_blank" rel="noopener noreferrer"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install --upgrade xgboost

In [9]:
from sklearn.datasets import make_classification
import numpy as np
import pandas as pd

import xgboost as xgb

### データセットの作成
- 各データについて
    - qid:各データをグループ化するための識別子(ユーザー情報など)．説明変数ではなく，どのデータ同士を同じグループとして順位付けするかを示すために参照する
    - Relevance Score:目的変数

In [19]:
# Build a synthetic dataset. The labels returned by make_classification are
# used below as the relevance scores (the ranking target).
X, y = make_classification(random_state=3655)
rng = np.random.default_rng(seed=3655)

# Randomly assign every sample to one of `n_query_groups` query groups.
# The bound comes from the variable so the group count is set in one place.
n_query_groups = 3
qid = rng.integers(0, n_query_groups, size=X.shape[0])

# Reorder all arrays by query id so that rows belonging to the same group are
# contiguous (qid in non-decreasing order), which is the layout the XGBoost
# learning-to-rank documentation asks for when passing `qid`.
sorted_idx = np.argsort(qid)
X = X[sorted_idx, :]
y = y[sorted_idx]
qid = qid[sorted_idx]

# Assemble a DataFrame purely for inspection; training below uses the arrays.
df = pd.DataFrame(X)
df['Relevance Score'] = y
df['qid'] = qid
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,Relevance Score,qid
0,1.290693,1.416972,0.311624,-0.321043,-0.440702,-0.823294,-1.673228,0.202391,-2.505758,-0.085083,...,1.069398,-1.515416,-0.257536,-2.283387,-1.010772,0.504195,1.271866,1.04298,1,0
1,0.374348,-1.13297,0.841416,-0.485837,-0.711883,-0.823499,-0.68193,-0.042279,1.187146,-1.514537,...,1.606523,-0.077827,2.009051,-0.511834,0.862724,0.130596,-0.958867,1.611975,0,0
2,-1.497527,0.858449,-1.617516,0.608672,0.026382,-0.353365,-0.859686,-0.934579,-0.134187,-1.05723,...,0.46018,-0.819763,-1.573229,2.137575,-0.362956,0.02141,-0.94834,-1.432481,1,0
3,-0.08734,0.00413,0.830821,0.601432,-0.04362,1.114204,-0.263136,1.457169,0.888081,1.171067,...,0.317865,-0.169299,0.238399,0.641452,0.958037,0.472043,-0.514248,0.502564,0,0
4,-0.364459,-0.958211,0.931423,-0.32329,0.728914,1.464944,0.00553,-1.345502,-0.71107,-0.693699,...,0.649086,0.305922,-1.161558,-1.677371,-1.116972,-1.352549,2.098834,-0.65626,0,0


### XGBoostのランキング学習

In [4]:
# Configure the learning-to-rank model.
ranker = xgb.XGBRanker(
    # Ranking objective and tree construction algorithm.
    objective="rank:ndcg",
    tree_method="hist",
    # Pair construction settings: with the "topk" method, the value below is
    # the number of top-ranked documents to focus on per query (see the
    # XGBoost learning-to-rank documentation).
    lambdarank_num_pair_per_sample=8,
    lambdarank_pair_method="topk",
)

# `qid` tells the ranker which rows belong to the same query group.
ranker.fit(X, y, qid=qid)

### 予測

In [14]:
# Predict a relevance score for every row of X.
scores = ranker.predict(X)

# Indices that order the scores from highest to lowest: ascending argsort,
# then reversed.
sorted_idx = np.flip(np.argsort(scores))

# Reorder the scores from most relevant to least relevant and display them.
scores = np.take(scores, sorted_idx)
scores

array([ 1.7109468 ,  1.7109468 ,  1.7109468 ,  1.7109468 ,  1.7109468 ,
        1.7109468 ,  1.5420871 ,  1.1923326 ,  1.1923326 ,  1.1923326 ,
        1.1923326 ,  1.1923326 ,  1.1923326 ,  1.1923326 ,  1.1923326 ,
        1.1406763 ,  1.1406763 ,  1.1406763 ,  1.1406763 ,  1.1406763 ,
        1.1406763 ,  1.1406763 ,  1.1406763 ,  0.97181636,  0.6220619 ,
        0.6220619 ,  0.6220619 ,  0.6220619 ,  0.6220619 ,  0.6220619 ,
        0.6220619 ,  0.6220619 ,  0.6220619 ,  0.6220619 ,  0.4532021 ,
        0.22649904, -0.79644126, -0.79644126, -0.79644126, -0.79644126,
       -0.8974627 , -0.8974627 , -0.8974627 , -0.8974627 , -0.98964775,
       -0.98964775, -1.0362681 , -1.0362681 , -1.0362681 , -1.0362681 ,
       -1.0362681 , -1.0362681 , -1.2228703 , -1.2228703 , -1.2228703 ,
       -1.2228703 , -1.2240134 , -1.2240134 , -1.2240134 , -1.2240134 ,
       -1.2240134 , -1.2240134 , -1.2240134 , -1.2240134 , -1.2294749 ,
       -1.2294749 , -1.2294749 , -1.2294749 , -1.2294749 , -1.22