# Recommendation re-ranking using *XGB*

In [1]:
import xgboost as xgb
import numpy as np

In [2]:
gbm = xgb.XGBRegressor(objective="rank:pairwise")
# 정규분포 난수 추출(element 1000개인 n차원)
X = np.random.normal(0, 1, 1000).reshape(100,10)
y = np.random.randint(0,5,100) 
# 균일분포 정규 난수 생성(0~4까지 난수 100개)

In [3]:
X

array([[-1.44823773e+00, -6.39182447e-01, -6.30627581e-01,
         3.71276218e-01,  6.17066574e-01, -5.09403269e-01,
         3.11905601e-01, -1.21134103e+00,  6.78949386e-01,
        -2.84189009e-02],
       [ 1.15749190e+00,  5.73342479e-01,  1.37671734e-01,
         7.27839460e-03, -5.63736313e-01,  5.54640883e-01,
        -1.21020618e-03,  5.88176492e-01, -5.12605760e-01,
         1.20325149e+00],
       [-8.53947467e-01, -3.94894409e-01, -2.93040105e-01,
        -2.49452180e-01, -9.98634575e-01, -4.20076594e-02,
         8.64188509e-01, -1.12632228e+00,  4.91051421e-01,
         1.27113212e+00],
       [-1.45540572e+00, -1.16134778e+00,  1.03581565e+00,
        -5.80683150e-01, -2.97158372e+00,  2.54971641e-01,
         9.18364598e-01, -6.50475986e-01,  5.03376940e-01,
        -1.25394488e+00],
       [-1.14715964e+00,  2.72071152e+00, -1.42468603e+00,
         4.96246884e-01,  4.31476265e-02,  2.28196293e-01,
         6.99549043e-01, -7.43966900e-01, -2.23641324e-01,
        -8.

In [4]:
y # 난수 100개

array([3, 2, 2, 0, 1, 1, 1, 2, 1, 0, 2, 0, 0, 0, 1, 0, 4, 3, 1, 0, 4, 2,
       2, 0, 4, 0, 0, 0, 1, 4, 4, 2, 4, 0, 4, 2, 3, 2, 3, 1, 1, 4, 1, 4,
       1, 1, 3, 2, 3, 1, 3, 2, 0, 1, 1, 2, 2, 3, 0, 0, 3, 2, 0, 2, 4, 3,
       2, 4, 1, 4, 0, 4, 4, 3, 4, 0, 4, 1, 2, 3, 1, 3, 0, 4, 0, 3, 2, 1,
       4, 2, 3, 1, 1, 4, 0, 1, 2, 0, 3, 2])

In [5]:
gbm.fit(X, y)
pred = gbm.predict(X).tolist()

*****

In [6]:
import pandas as pd
import numpy as np
from xgboost import DMatrix, train

In [7]:
xgb_rank_params1 ={    
    'booster' : 'gbtree',
    'eta': 0.1,
    'gamma' : 1.0 ,
    'min_child_weight' : 0.1,
    'objective' : 'rank:pairwise',
    'eval_metric' : 'merror',
    'max_depth' : 6,
    'num_boost_round':10,
    'save_period' : 0 
}

xgb_rank_params2 = {
    'bst:max_depth':2, 
    'bst:eta':1, 'silent':1, 
    'objective':'rank:pairwise',
    'nthread':4,
    'eval_metric':'ndcg' ## ndcg !
}
  

In [8]:
# generate training dataset

n_group=2 # group 수
n_choice=5  # 각 group의 샘플 수
feature_dim = 4 # 4차원인 애들이 5개씩 두 그룹에 들어가있는거임

# 균일분포 생성(0~100)
dtrain=np.random.uniform(0,100,[n_group*n_choice,feature_dim])  # from, to, size(10*4)
print(dtrain) # 총 10개고, feature는 4차원(10*4) 

[[5.18245937e-03 5.94938035e+01 1.85793643e+01 5.12755404e+01]
 [8.67063167e+01 3.93389559e+01 5.29781284e+00 9.58519260e+01]
 [5.43421320e+01 8.34286121e+01 8.26542564e+00 7.54895132e+01]
 [8.55276497e+01 6.21378949e+00 6.83275397e+01 9.30750678e+01]
 [4.15033332e+01 9.14526809e+01 3.13502130e+01 1.29604805e+01]
 [4.25790104e+01 5.01926186e+01 7.55707948e+01 3.02879591e+01]
 [3.51333581e+01 7.39277154e+01 6.64250664e+01 8.49960410e+00]
 [2.16783088e+00 2.38907992e+01 8.48971935e+01 1.29834793e+00]
 [4.98442490e+01 8.30889715e+01 9.30418541e+01 5.95698157e+01]
 [8.90145487e+00 9.04442997e+01 9.24514236e+01 7.34515284e+01]]


In [9]:
# dtarget : 그룹별 순위 > 1차원으로 펼치기
dtarget=np.array([np.random.choice([0,1,2,4,5],5,False) for i in range(n_group)])

In [10]:
print(dtarget) # 순위 랜덤부여

[[4 1 5 0 2]
 [2 4 5 1 0]]


In [11]:
dtarget = dtarget.flatten()

In [12]:
dtarget # 1차원으로 펼치기

array([4, 1, 5, 0, 2, 2, 4, 5, 1, 0])

In [13]:
# dgroup : n_choice를 그룹별로 나열
dgroup = np.array([n_choice for i in range(n_group)])

In [14]:
dgroup

array([5, 5])

In [15]:
dgroup = dgroup.flatten() # 이것도 1차원으로 펼치기

In [16]:
dgroup

array([5, 5])

In [17]:
# dtrain : group별 차원 들어간 그냥 나열된 array
# dtarget : 그룹별 라벨(1차원으로 펼침)
xgbTrain = DMatrix(dtrain, label=dtarget)
print(xgbTrain)

<xgboost.core.DMatrix object at 0x000002587E7B29C8>


In [18]:
# group 설정
xgbTrain.set_group(dgroup)
print(xgbTrain)

<xgboost.core.DMatrix object at 0x000002587E7B29C8>


In [19]:
# eval 데이터 형성
dtrain_eval = np.random.uniform(0, 100, [n_group*n_choice, feature_dim])
xgbTrain_eval = DMatrix(dtrain_eval, label=dtarget)

xgbTrain_eval.set_group(dgroup)
evallist = [(xgbTrain, 'train'), (xgbTrain_eval, 'eval')]

In [20]:
evallist

[(<xgboost.core.DMatrix at 0x2587e7b29c8>, 'train'),
 (<xgboost.core.DMatrix at 0x2587e7c4748>, 'eval')]

In [21]:
xgb_rank_params3 = {
    'bst:max_depth':3, 
    'bst:eta':1, 'silent':1, 
    'objective':'rank:pairwise',
    'nthread':4,
    'eval_metric':'ndcg'
}

rankModel = train(xgb_rank_params3,xgbTrain,num_boost_round=20,evals=evallist)

[0]	train-ndcg:0.85589	eval-ndcg:0.675781
[1]	train-ndcg:0.925534	eval-ndcg:0.619886
[2]	train-ndcg:0.927085	eval-ndcg:0.559092
[3]	train-ndcg:0.92872	eval-ndcg:0.559116
[4]	train-ndcg:0.92872	eval-ndcg:0.546034
[5]	train-ndcg:0.930355	eval-ndcg:0.546034
[6]	train-ndcg:0.930355	eval-ndcg:0.546034
[7]	train-ndcg:0.930355	eval-ndcg:0.546034
[8]	train-ndcg:0.930355	eval-ndcg:0.534588
[9]	train-ndcg:0.930355	eval-ndcg:0.534588
[10]	train-ndcg:0.930355	eval-ndcg:0.534588
[11]	train-ndcg:0.930355	eval-ndcg:0.534588
[12]	train-ndcg:0.930355	eval-ndcg:0.534588
[13]	train-ndcg:0.930355	eval-ndcg:0.534588
[14]	train-ndcg:0.930355	eval-ndcg:0.534588
[15]	train-ndcg:0.930355	eval-ndcg:0.525882
[16]	train-ndcg:0.930355	eval-ndcg:0.525882
[17]	train-ndcg:0.930355	eval-ndcg:0.525882
[18]	train-ndcg:0.930355	eval-ndcg:0.525882
[19]	train-ndcg:0.930355	eval-ndcg:0.525882


In [22]:
dtarget

array([4, 1, 5, 0, 2, 2, 4, 5, 1, 0])

In [23]:
train_pred = rankModel.predict(xgbTrain)
train_pred

array([ 1.7354156, -0.65436  ,  1.2623098, -1.7718759,  0.1375234,
        0.6219508,  1.9347881,  2.3170996, -0.2562829, -1.1234348],
      dtype=float32)

In [24]:
pred1, pred2 = np.split(train_pred, 2, axis=0)

In [25]:
pred1

array([ 1.7354156, -0.65436  ,  1.2623098, -1.7718759,  0.1375234],
      dtype=float32)

In [26]:
target1, target2 = np.split(dtarget, 2, axis=0)

In [27]:
target1

array([4, 1, 5, 0, 2])

In [28]:
s = pred1.argsort() 
s

array([3, 1, 4, 2, 0], dtype=int64)

In [29]:
target1 = target1[s]
target1 # 오 예측 개잘하네? train이니까 바보야...당연한거아니냐?

array([0, 1, 2, 5, 4])

In [30]:
# test dataset
dtest = np.random.uniform(0, 100, [n_group*n_choice, feature_dim])
dtestgroup = np.array([n_choice for i in range(n_group)]).flatten()

In [31]:
dtest

array([[38.15548811, 76.6454024 , 60.23436805, 38.24816621],
       [93.05507232, 37.98874927, 37.43968921, 44.7032316 ],
       [33.19005346, 80.223466  , 58.37517573, 17.39639438],
       [38.07842284, 71.98580339, 94.98607841, 35.58107631],
       [69.10609653, 53.22597327, 53.60123785, 77.74045291],
       [64.93431008, 42.67971236, 24.89307559, 58.98668852],
       [54.13393138, 40.50496873, 30.92704902, 51.33134176],
       [92.63480046, 89.6914333 , 38.43712275, 24.98936772],
       [90.61155233, 52.30892738, 93.49361809, 19.33131169],
       [59.29844937, 74.62105912, 13.84243572, 17.4352382 ]])

In [32]:
dtestgroup

array([5, 5])

In [33]:
xgbTest = DMatrix(dtest)
xgbTest.set_group(dtestgroup)

In [34]:
xgbTest

<xgboost.core.DMatrix at 0x2587e7e2b48>

In [35]:
print(rankModel.predict(xgbTest))

[ 1.3657153  -0.5664927   1.1304033   1.3595874  -1.342095   -0.3272335
 -0.42728204  0.2972179   0.02341947  1.0797875 ]


In [36]:
song_meta = pd.read_json('song_meta.json', typ = 'frame',encoding='UTF-8')
train = pd.read_json('train.json', typ = 'frame',encoding='UTF-8')

In [37]:
train['num_songs'] = train['songs'].apply(len)

In [38]:
train['num_songs'].describe()

count    115071.000000
mean         45.935735
std          43.950335
min           1.000000
25%          19.000000
50%          30.000000
75%          54.000000
max         200.000000
Name: num_songs, dtype: float64

In [39]:
over100= train.query('num_songs>=100')

In [40]:
len(over100)

11759