
### How to use XGBoost in RecSys?
##### Hint: reranking and hybridization
Steps:

* Run your best algorithm and select a number of recommendations larger than the target cutoff; for example, if you have to compute MAP@10, get 20 recommendations
*    Build a dataframe whose samples are the user-item recommendations
*    Add for each interaction some content features: item features, user features
*    Add for each interaction some features derived by other algorithms: CBF prediction, hybrid prediction
*    Add for each interaction other miscellaneous information: profile length, item popularity, etc.



In [2]:
#import section
from DataParser import DataParser
import numpy as np
import pandas as pd
import scipy.sparse as sp

from Base.Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import \
split_train_in_two_percentage_global_sample

from KNN.ItemKNNCBFRecommender import ItemKNNCBFRecommender
from Base.NonPersonalizedRecommender import TopPop
from SLIM_ElasticNet.SLIMElasticNetRecommender import SLIMElasticNetRecommender
from GraphBased.RP3betaRecommender import RP3betaRecommender

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
%matplotlib inline

In [3]:
# Parser object that provides the interaction and content data loaded below.
parser = DataParser()
# Seed handed to the train/test split so the split can be reproduced.
seed = 1024

In [6]:
# Matrices supplied by the parser (presumably user-item interactions and
# item-content features, going by the getter names).
URM_all = parser.get_URM_all()
ICM_all = parser.get_ICM_all()

# Hold out part of the interactions for testing; 85% is kept for training.
URM_train, URM_test = split_train_in_two_percentage_global_sample(
    URM_all,
    train_percentage=0.85,
    seed=seed,
)

# Evaluator over the held-out interactions, configured with cutoff 10.
evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[10])



##### HERE WE NEED OUR BEST ALGORITHM (THAT COULD/SHOULD BE A HYBRID)

In [7]:
# Hyperparameters for the RP3beta model, kept together so they are easy to swap.
rp3beta_hyperparameters = {
    "alpha": 0.47193263239089045,
    "beta": 0.0316773658685341,
    "topK": 946,
    "normalize_similarity": False,
}

# Candidate generator: the model whose recommendations will later be reranked.
best_recommender = RP3betaRecommender(URM_train)
best_recommender.fit(**rp3beta_hyperparameters)

RP3betaRecommender: URM Detected 42 (0.53 %) cold users.
RP3betaRecommender: URM Detected 1971 (7.59 %) cold items.


Build the dataframe with the predictions

In [10]:
# Distinct user ids found in the ratings returned by the parser.
# TODO(review): confirm these ids fall within the user range of URM_train.
ratings = parser.get_ratings()
user_ids = ratings.user_id.unique()

In [11]:
# Number of candidates requested per user. It is deliberately larger than the
# evaluation cutoff (10) so that the reranker has items to promote or demote.
cutoff = 20

# Parallel lists: position i holds the user id and the recommended item id of
# the i-th (user, item) candidate pair.
user_recommendations_items = []
user_recommendations_user_id = []

for n_user in user_ids:

    # Use the `cutoff` variable (previously a repeated literal 20) so that
    # changing it above actually changes the number of candidates.
    recommendations = best_recommender.recommend(n_user, cutoff=cutoff)
    user_recommendations_items.extend(recommendations)
    # Repeat the user id once per returned item; the recommender may return
    # fewer than `cutoff` items, so the length is taken from the result.
    user_recommendations_user_id.extend([n_user] * len(recommendations))



In [12]:
# One row per (user, recommended item) candidate pair.
train_dataframe = pd.DataFrame(
    {
        "user_id": user_recommendations_user_id,
        "item_id": user_recommendations_items,
    }
)
# Preview only the first rows instead of rendering the whole candidate table.
train_dataframe.head(10)

Unnamed: 0,user_id,item_id
0,0,1447
1,0,4927
2,0,9851
3,0,5161
4,0,20869
5,0,6915
6,0,649
7,0,20761
8,0,10817
9,0,12543


### Addition of more features
What kind of features can we add?  
-> content features: item features, user features  
-> features derived by other algorithms: CBF prediction, hybrid prediction  
-> miscellaneous information: profile length, item popularity, etc.

In [14]:
# TopPop is already imported in the notebook's import cell, so it is not
# imported a second time here.
topPop = TopPop(URM_train)
topPop.fit()


# One TopPop score per (user, item) candidate pair, in the same order as the
# user_recommendations_* lists.
topPop_score_list = []

# The candidate lists hold each user's items consecutively, so the score row
# computed for a user is reused for all of that user's items instead of being
# recomputed for every single pair.
last_scored_user_id = None
last_user_scores = None

for user_id, item_id in zip(user_recommendations_user_id, user_recommendations_items):

    if last_user_scores is None or user_id != last_scored_user_id:
        last_user_scores = topPop._compute_item_score([user_id])
        last_scored_user_id = user_id

    # Row 0 is the only requested user; pick the score of the candidate item.
    topPop_score = last_user_scores[0, item_id]
    topPop_score_list.append(topPop_score)

TopPopRecommender: URM Detected 42 (0.53 %) cold users.
TopPopRecommender: URM Detected 1971 (7.59 %) cold items.


In [16]:
# Attach the TopPop scores as a feature column, aligned on the frame's own index.
item_popularity_column = pd.Series(topPop_score_list, index=train_dataframe.index)
train_dataframe['item_popularity'] = item_popularity_column

In [19]:
# Content-based item KNN, used here only as a source of an extra score feature.
cbf_rec = ItemKNNCBFRecommender(URM_train, ICM_all)
cbf_rec.fit(topK=40, shrink=1000, similarity='cosine', feature_weighting='BM25')


# One CBF score per (user, item) candidate pair, in the same order as the
# user_recommendations_* lists.
cbf_score_list = []

# The candidate lists hold each user's items consecutively, so the score row
# computed for a user is reused for all of that user's items instead of being
# recomputed for every single pair.
last_scored_user_id = None
last_user_scores = None

for user_id, item_id in zip(user_recommendations_user_id, user_recommendations_items):

    if last_user_scores is None or user_id != last_scored_user_id:
        last_user_scores = cbf_rec._compute_item_score([user_id])
        last_scored_user_id = user_id

    # Row 0 is the only requested user; pick the score of the candidate item.
    cbf_score = last_user_scores[0, item_id]
    cbf_score_list.append(cbf_score)

ItemKNNCBFRecommender: URM Detected 42 (0.53 %) cold users.
ItemKNNCBFRecommender: URM Detected 1971 (7.59 %) cold items.
Similarity column 25975 ( 100 % ), 5520.09 column/sec, elapsed time 0.08 min


In [20]:
# Attach the CBF scores as a feature column, aligned on the frame's own index.
cbf_score_column = pd.Series(cbf_score_list, index=train_dataframe.index)
train_dataframe['cbf_score'] = cbf_score_column

In [22]:
# Differences between consecutive indptr entries of URM_train.
# NOTE(review): this is the per-user profile length only if URM_train is
# stored in CSR format - confirm.
row_pointers = URM_train.indptr
user_profile_len = row_pointers[1:] - row_pointers[:-1]

In [23]:
# NOTE(review): target_feature is not used in this cell; it is kept in case a
# later cell reads it.
target_feature = 1

# One profile-length entry per candidate row. Only the user id is needed, so
# the item ids are no longer iterated alongside it.
user_profile_len_list = [
    user_profile_len[user_id] for user_id in user_recommendations_user_id
]

In [24]:
# Attach the profile lengths as a feature column, aligned on the frame's own index.
user_profile_len_column = pd.Series(user_profile_len_list, index=train_dataframe.index)
train_dataframe['user_profile_len'] = user_profile_len_column

In [25]:
# Preview only the first rows of the candidate table with its feature columns.
train_dataframe.head(10)

Unnamed: 0,user_id,item_id,item_popularity,cbf_score,user_profile_len
0,0,1447,19.0,0.000000,2
1,0,4927,49.0,0.000000,2
2,0,9851,97.0,0.000000,2
3,0,5161,6.0,0.000000,2
4,0,20869,5.0,0.000000,2
5,0,6915,19.0,0.000000,2
6,0,649,4.0,0.000000,2
7,0,20761,6.0,0.000000,2
8,0,10817,13.0,0.000000,2
9,0,12543,16.0,0.000000,2


### TRAIN XGBoost
Rerank those predictions, using as label whether each item should be recommended or not.

In [1]:
import xgboost as xgb