
### How to use XGBoost in RecSys?
##### Hint: reranking and hybridization
Steps:

* Run your best algorithm and select a number of recommendations higher than the target cutoff, for example if you have to compute MAP@10, get 20 recommendations
*    Build a dataframe whose samples are the user-item recommendations
*    Add for each interaction some content features: item features, user features
*    Add for each interaction some features derived by other algorithms: CBF prediction, hybrid prediction
*    Add for each interaction other miscellaneous information: profile length, item popularity ..



In [1]:
#import section
from DataParser import DataParser
import numpy as np
import pandas as pd
import scipy.sparse as sp

from Base.Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import \
split_train_in_two_percentage_global_sample

from KNN.ItemKNNCBFRecommender import ItemKNNCBFRecommender
from Base.NonPersonalizedRecommender import TopPop
from SLIM_ElasticNet.SLIMElasticNetRecommender import SLIMElasticNetRecommender
from GraphBased.RP3betaRecommender import RP3betaRecommender


import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
%matplotlib inline

In [3]:
from Base.NonPersonalizedRecommender import TopPop, Random
from Hybrid.HybridCombinationSearch import HybridCombinationMergedSearch,HybridCombinationSearch
from KNN.ItemKNNCBFRecommender import ItemKNNCBFRecommender
from KNN.UserKNNCFRecommender import UserKNNCFRecommender
from KNN.ItemKNNCFRecommender import ItemKNNCFRecommender
from MatrixFactorization.IALSRecommender import IALSRecommender
from SLIM_BPR.Cython.SLIM_BPR_Cython import SLIM_BPR_Cython
from SLIM_ElasticNet.SLIMElasticNetRecommender import SLIMElasticNetRecommender
from SLIM_ElasticNet.SSLIM_ElasticNet import SSLIMElasticNet
from GraphBased.P3alphaRecommender import P3alphaRecommender
from GraphBased.RP3betaRecommender import RP3betaRecommender
from MatrixFactorization.Cython.MatrixFactorization_Cython import MatrixFactorization_BPR_Cython, MatrixFactorization_FunkSVD_Cython
from MatrixFactorization.PureSVDRecommender import PureSVDRecommender, PureSVDItemRecommender
from MatrixFactorization.NMFRecommender import NMFRecommender

In [2]:
seed = 1205
parser = DataParser()

In [4]:
URM_all = parser.get_URM_all()
ICM_all = parser.get_ICM_all()


URM_train, URM_test = split_train_in_two_percentage_global_sample(URM_all, train_percentage=0.90, seed=seed)

evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[10])



##### HERE WE NEED OUR BEST ALGORITHM (THAT COULD/SHOULD BE AN HYBRID)

In [6]:
sslim = SSLIMElasticNet(URM_train, ICM_all, verbose=False)
sslim_params = {'beta': 0.4849594591575789, 'topK': 1000, 'l1_ratio': 1e-05, 'alpha': 0.001}
try:
    sslim.load_model(f'stored_recommenders/seed_1205_S-SLIMElasticNet/','for_notebook_analysis')
    print(f"{sslim.RECOMMENDER_NAME} loaded.")
except:
    print(f"Fitting {sslim.RECOMMENDER_NAME} ...")
    sslim.fit(**sslim_params)
    print(f"done.")
    sslim.save_model(f'stored_recommenders/seed_{str(seed)}_{sslim.RECOMMENDER_NAME}/','for_notebook_analysis')

S-SLIMElasticNetRecommender loaded.


In [7]:
ucf = UserKNNCFRecommender(URM_train, verbose=False)
ucf_params = {'topK': 190, 'shrink': 0, 'similarity': 'cosine', 'normalize': True}
try:
    ucf.load_model(f'stored_recommenders/seed_{str(seed)}_{ucf.RECOMMENDER_NAME}/','for_notebook_analysis')
    print(f"{ucf.RECOMMENDER_NAME} loaded.")
except:
    print(f"Fitting {ucf.RECOMMENDER_NAME} ...")
    ucf.fit(**ucf_params)
    print(f"done.")
    ucf.save_model(f'stored_recommenders/seed_{str(seed)}_{ucf.RECOMMENDER_NAME}/','for_notebook_analysis')

icb = ItemKNNCBFRecommender(URM_train, ICM_all, verbose=False)
icb_params = {'topK': 65, 'shrink': 0, 'similarity': 'dice', 'normalize': True}
try:
    icb.load_model(f'stored_recommenders/seed_{str(seed)}_{icb.RECOMMENDER_NAME}/','for_notebook_analysis')
    print(f"{icb.RECOMMENDER_NAME} loaded.")
except:
    print(f"Fitting {icb.RECOMMENDER_NAME} ...")
    icb.fit(**icb_params)
    print(f"done.")
    icb.save_model(f'stored_recommenders/seed_{str(seed)}_{icb.RECOMMENDER_NAME}/','for_notebook_analysis')

UserKNNCFRecommender loaded.
ItemKNNCBFRecommender loaded.


In [8]:
list_recommender = [sslim, icb, ucf]
best_recommender = HybridCombinationSearch(URM_train, ICM_all, list_recommender)
params={'alpha': 0.6461624491197696, 'l1_ratio': 0.7617220099582368}
best_recommender.fit(**params)

HybridCombinationSearch: URM Detected 27 (0.34 %) cold users.
HybridCombinationSearch: URM Detected 1634 (6.29 %) cold items.


Build the dataframe with the predictions

In [9]:
#user range???
user_ids = parser.get_ratings().user_id.unique()

In [13]:
cutoff = 20
user_recommendations_items = []
user_recommendations_user_id = []
target = []

for n_user in user_ids:
    
    recommendations = best_recommender.recommend(n_user, cutoff = 20)
    user_recommendations_items.extend(recommendations)
    user_recommendations_user_id.extend([n_user]*len(recommendations))


In [25]:
target=[]
for _ in user_ids:
    for _ in range(int(cutoff/2)):
        target.append(1)
    for _ in range(int(cutoff/2)):
        target.append(0)

In [19]:
len(user_recommendations_items)

158940

In [20]:
len(user_recommendations_user_id)

158940

In [27]:
len(target)

158940

In [22]:
len(user_ids)

7947

In [28]:
train_dataframe = pd.DataFrame({"user_id":user_recommendations_user_id, "item_id":user_recommendations_items, 'target':target})
train_dataframe

Unnamed: 0,user_id,item_id,target
0,0,1447,1
1,0,13219,1
2,0,25878,1
3,0,15830,1
4,0,12543,1
5,0,4927,1
6,0,23481,1
7,0,8544,1
8,0,7639,1
9,0,5161,1


In [30]:
train_dataframe.to_csv(r'xgboost-test.csv',index=False)

### addition of more feature
what kind of feature we can add?  
-> content features: item features, user features  
-> features derived by other algorithms: CBF prediction, hybrid prediction  
-> miscellaneous information: profile length, item popularity ..

In [31]:
from Base.NonPersonalizedRecommender import TopPop

topPop = TopPop(URM_train)
topPop.fit()


topPop_score_list = []

for user_id, item_id in zip(user_recommendations_user_id, user_recommendations_items):
    
    topPop_score = topPop._compute_item_score([user_id])[0,item_id]
    topPop_score_list.append(topPop_score)

TopPopRecommender: URM Detected 27 (0.34 %) cold users.
TopPopRecommender: URM Detected 1634 (6.29 %) cold items.


In [32]:
train_dataframe['item_popularity'] = pd.Series(topPop_score_list, index=train_dataframe.index)

In [34]:
cbf_rec = ItemKNNCBFRecommender(URM_train, ICM_all)
params ={'topK': 65, 'shrink': 0, 'similarity': 'dice', 'normalize': True}
cbf_rec.fit(**params)


cbf_score_list = []

for user_id, item_id in zip(user_recommendations_user_id, user_recommendations_items):
    
    cbf_score = cbf_rec._compute_item_score([user_id])[0,item_id]
    cbf_score_list.append(cbf_score)

ItemKNNCBFRecommender: URM Detected 27 (0.34 %) cold users.
ItemKNNCBFRecommender: URM Detected 1634 (6.29 %) cold items.
Similarity column 25975 ( 100 % ), 4853.83 column/sec, elapsed time 0.09 min


In [35]:
train_dataframe['cbf_score'] = pd.Series(cbf_score_list, index=train_dataframe.index)

In [36]:
user_profile_len = np.ediff1d(URM_train.indptr)

In [37]:
user_profile_len_list = []

target_feature = 1

for user_id, item_id in zip(user_recommendations_user_id, user_recommendations_items):
    
    user_profile_len_list.append(user_profile_len[user_id])

In [38]:
train_dataframe['user_profile_len'] = pd.Series(user_profile_len_list, index=train_dataframe.index)

In [39]:
train_dataframe

Unnamed: 0,user_id,item_id,target,item_popularity,cbf_score,user_profile_len
0,0,1447,1,18.0,0.000000,2
1,0,13219,1,19.0,0.105263,2
2,0,25878,1,16.0,0.096774,2
3,0,15830,1,34.0,0.000000,2
4,0,12543,1,17.0,0.000000,2
5,0,4927,1,48.0,0.000000,2
6,0,23481,1,58.0,0.000000,2
7,0,8544,1,82.0,0.000000,2
8,0,7639,1,38.0,0.000000,2
9,0,5161,1,6.0,0.000000,2


### TRAIN XGBoost
to rerank those prediction using as lable whether they should be recommended or not

In [41]:
!pip install xgboost
import xgboost as xgb

Collecting xgboost
  Using cached xgboost-1.2.1-py3-none-manylinux2010_x86_64.whl (148.9 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.2.1


In [43]:
import numpy as np

In [58]:
X_train1 = train_dataframe[['item_popularity', 'cbf_score', 'user_profile_len']].to_numpy()
y_train1 = train_dataframe[['target']].to_numpy()

In [60]:
X_train1

array([[18.        ,  0.        ,  2.        ],
       [19.        ,  0.10526316,  2.        ],
       [16.        ,  0.09677419,  2.        ],
       ...,
       [ 2.        ,  0.        ,  2.        ],
       [ 2.        ,  0.09375   ,  2.        ],
       [ 2.        ,  0.08571428,  2.        ]])

In [61]:
y_train1 = y_train1.squeeze()

In [62]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_train1, y_train1, test_size=0.15, random_state=seed)

In [63]:
params = {
    'max_depth': 3,  # the maximum depth of each tree
    'eta': 0.3,  # step for each iteration
    'silent': 1, # keep it quiet
    'objective': 'multi:softprob',  # error evaluation for multiclass training
    'num_class': 2, # the number of classes 
    'eval_metric': 'merror'} # evaluation metric 

num_round = 100  # the number of training iterations (number of trees)



In [64]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
model = xgb.train(params,
                  dtrain,
                  num_round,
                  verbose_eval=2,
                  evals=[(dtrain, 'train'), (dval, 'validation')],
                  early_stopping_rounds=20)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-merror:0.43056	validation-merror:0.43367
Multiple eval metrics have been passed: 'validation-merror' will be used for early stopping.

Will train until validation-merror hasn't improved in 20 rounds.
[2]	train-merror:0.42237	validation-merror:0.42733
[4]	train-merror:0.41718	validation-merror:0.42007
[6]	train-merror:0.41542	validation-merror:0.42028
[8]	train-merror:0.41356	validation-merror:0.41706
[10]	train-merror:0.41329	validation-merror:0.41580
[12]	train-merror:0.41188	validation-merror:0.41609
[14]	train-merror:0.41160	validation-merror:0.41685
[16]	train-merror:0.41159	validation-merror:0.41613
[18]	train-merror:0.41052	validation-merror:0.41559
[20]	train-merror:0.40996	validation-merror:

In [94]:
dtrain1 = xgb.DMatrix(X_train1, label=y_train1)

preds = model.predict(dtrain1)
best_preds = np.asarray([np.argmax(line) for line in preds])
best_preds
main = train_dataframe[['user_id', 'item_id', 'target']].to_numpy()
final_frame=pd.DataFrame({'user_id': main[:,0], 'item_id': main[:,1], 'target': main[:,2], 'pred':best_preds})

In [95]:
final_frame

Unnamed: 0,user_id,item_id,target,pred
0,0,1447,1,0
1,0,13219,1,1
2,0,25878,1,1
3,0,15830,1,0
4,0,12543,1,0
5,0,4927,1,1
6,0,23481,1,1
7,0,8544,1,1
8,0,7639,1,1
9,0,5161,1,0


In [83]:
len(best_preds)

158940