In [2]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from tqdm import tqdm

#----Recommenders----
from SLIM.SLIM_BPR_Python import SLIM_BPR_Python
from SLIM.SlimElasticNet import SLIMElasticNetRecommender
from cf.item_cf2 import ItemBasedCollaborativeFiltering
from cf.user_cf import UserBasedCollaborativeFiltering
from MF.ALS import AlternatingLeastSquare
from cbf.cbf import ContentBasedFiltering
from SlimBPR.SlimBPRRec import SlimBPRRec
from SlimBPR.SlimBPR import SlimBPR
#---------------

from sklearn.model_selection import train_test_split

**Dataset loading with pandas**

The function read_csv from pandas provides a wonderful and fast interface to load tabular data like this. For better results and performance we provide the separator ::, the column names ["user_id", "item_id", "ratings", "timestamp"], and the types of each attribute in the dtype parameter.

In [3]:
def load_data():
  return pd.read_csv("./data_train.csv")

In [4]:
ratings=load_data()
d ={'user_id': ratings['row'],'item_id':ratings['col'],'ratings':ratings['data']}
ratings=pd.DataFrame(data=d)

In [5]:
ratings.dtypes

user_id      int64
item_id      int64
ratings    float64
dtype: object

In [6]:
userList=list(d['user_id'])
itemList=list(d['item_id'])
ratingList=list(d['ratings'])

In [7]:
URM = sp.coo_matrix((ratingList,(userList,itemList)))
URM = URM.tocsr()

In [8]:
URM

<7947x25975 sparse matrix of type '<class 'numpy.float64'>'
	with 113268 stored elements in Compressed Sparse Row format>

In [9]:
def load_data_ICM():
  return pd.read_csv("./data_ICM_title_abstract.csv")

In [10]:
features=load_data_ICM()
d ={'item_id': features['row'],'feature_id':features['col'],'value':features['data']}
features=pd.DataFrame(data=d)
itemList=list(d['item_id'])

In [11]:
featureList=list(d['feature_id'])

In [12]:
valueList=list(d['value'])
ICM = sp.coo_matrix((valueList,(itemList,featureList)))
ICM = ICM.tocsr()

In [13]:
ICM

<25975x20000 sparse matrix of type '<class 'numpy.float64'>'
	with 490691 stored elements in Compressed Sparse Row format>

In [14]:
num_users=URM.shape[0]

In [15]:
num_items=URM.shape[1]

In [16]:
num_users,num_items

(7947, 25975)

**Dataset splitting into train,validation and test**

This is the last part before creating the recommender. However, this step is super important, as it is the base for the training, parameters optimization, and evaluation of the recommender(s).

In here we read the ratings (which we loaded and preprocessed before) and create the train, validation, and test User-Rating Matrices (URM). It's important that these are disjoint to avoid information leakage from the train into the validation/test set, in our case, we are safe to use the train_test_split function from scikit-learn as the dataset only contains one datapoint for every (user,item) pair. On another topic, we first create the test set and then we create the validation by splitting again the train set.

train_test_split takes an array (or several arrays) and divides it into train and test according to a given size (in our case testing_percentage and validation_percentage, which need to be a float between 0 and 1).

After we have our different splits, we create the sparse URMs by using the csr_matrix function from scipy.




In [17]:
def dataset_splits(ratings, num_users, num_items, validation_percentage: float, testing_percentage: float):
    seed = 1234
    
    (user_ids_training, user_ids_test,
     item_ids_training, item_ids_test,
     ratings_training, ratings_test) = train_test_split(ratings.user_id,
                                                        ratings.item_id,
                                                        ratings.ratings,
                                                        test_size=testing_percentage,
                                                        shuffle=True,
                                                        random_state=seed)
    
    (user_ids_training, user_ids_validation,
     item_ids_training, item_ids_validation,
     ratings_training, ratings_validation) = train_test_split(user_ids_training,
                                                              item_ids_training,
                                                              ratings_training,
                                                              test_size=validation_percentage,
                                                             )
    
    urm_train = sp.csr_matrix((ratings_training, (user_ids_training, item_ids_training)), 
                              shape=(num_users, num_items))
    
    
    urm_validation = sp.csr_matrix((ratings_validation, (user_ids_validation, item_ids_validation)), 
                              shape=(num_users, num_items))
    
    urm_test = sp.csr_matrix((ratings_test, (user_ids_test, item_ids_test)), 
                              shape=(num_users, num_items))
    

    
    return urm_train, urm_validation, urm_test



In [None]:
URM_train, URM_validation, URM_test = dataset_splits(ratings, 
                                                     num_users, 
                                                     num_items, 
                                                     validation_percentage=0.10, 
                                                     testing_percentage=0.15)

In [None]:
URM_train_validation = URM_train + URM_validation

In [None]:
URM_train_validation

In [18]:
from Base.Evaluation.Evaluator import EvaluatorHoldout
from Notebooks_utils.data_splitter import train_test_holdout

urm_train_validation, urm_test = train_test_holdout(URM, train_perc = 0.85)
urm_train, urm_validation = train_test_holdout(urm_train_validation, train_perc = 0.85)

evaluator_validation = EvaluatorHoldout(urm_validation, cutoff_list=[10])
evaluator_test = EvaluatorHoldout(urm_test, cutoff_list=[10])

In [19]:
from skopt.space import Real, Integer, Categorical

hyperparameters_range_dictionary = {}
hyperparameters_range_dictionary["knn"] = Integer(5, 1000)
hyperparameters_range_dictionary["shrink"] = Integer(0, 1000)
hyperparameters_range_dictionary["similarity"] = Categorical(["cosine"])

In [20]:
from ParameterTuning.SearchBayesianSkopt import SearchBayesianSkopt

recommender_class = ItemBasedCollaborativeFiltering

parameterSearch = SearchBayesianSkopt(recommender_class,
                                 evaluator_validation=evaluator_validation,
                                 evaluator_test=evaluator_test)

In [21]:
from ParameterTuning.SearchAbstractClass import SearchInputRecommenderArgs
  
recommender_input_args = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [urm_train],     # For a CBF model simply put [URM_train, ICM_train]
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = {}
)

In [22]:
recommender_input_args_last_test = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [urm_train_validation],     # For a CBF model simply put [URM_train_validation, ICM_train]
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = {}
)

In [23]:
import os

output_folder_path = "result_experiments/"

# If directory does not exist, create
if not os.path.exists(output_folder_path):
    os.makedirs(output_folder_path)
    
n_cases = 50
n_random_starts = int(n_cases*0.3)
metric_to_optimize = "MAP"

In [24]:
parameterSearch.search(recommender_input_args,
                       recommender_input_args_last_test = recommender_input_args_last_test,
                       parameter_search_space = hyperparameters_range_dictionary,
                       n_cases = n_cases,
                       n_random_starts = n_random_starts,
                       save_model = "last",
                       output_folder_path = output_folder_path,
                       output_file_name_root = recommender_class.RECOMMENDER_NAME,
                       metric_to_optimize = metric_to_optimize,
                      )

Iteration No: 1 started. Evaluating function at random point.
SearchBayesianSkopt: Testing config: {'knn': 906, 'shrink': 596, 'similarity': 'cosine'}
Similarity column 25975 ( 100 % ), 2174.56 column/sec, elapsed time 0.20 min
EvaluatorHoldout: Processed 4625 ( 100.00% ) in 3.13 sec. Users per second: 1477
SearchBayesianSkopt: New best config found. Config 0: {'knn': 906, 'shrink': 596, 'similarity': 'cosine'} - results: ROC_AUC: 0.1036527, PRECISION: 0.0207351, PRECISION_RECALL_MIN_DEN: 0.0893407, RECALL: 0.0878838, MAP: 0.0380978, MRR: 0.0749986, NDCG: 0.0581518, F1: 0.0335537, HIT_RATE: 0.2073514, ARHR: 0.0821898, NOVELTY: 0.0046803, AVERAGE_POPULARITY: 0.1371222, DIVERSITY_MEAN_INTER_LIST: 0.9698614, DIVERSITY_HERFINDAHL: 0.9969652, COVERAGE_ITEM: 0.3577286, COVERAGE_ITEM_CORRECT: 0.0193263, COVERAGE_USER: 0.5819806, COVERAGE_USER_CORRECT: 0.0987794, DIVERSITY_GINI: 0.0979485, SHANNON_ENTROPY: 10.5834203, 

EvaluatorHoldout: Processed 5033 ( 100.00% ) in 3.20 sec. Users per second

EvaluatorHoldout: Processed 4625 ( 100.00% ) in 2.89 sec. Users per second: 1600
SearchBayesianSkopt: Config 6 is suboptimal. Config: {'knn': 921, 'shrink': 512, 'similarity': 'cosine'} - results: ROC_AUC: 0.1041619, PRECISION: 0.0208000, PRECISION_RECALL_MIN_DEN: 0.0893479, RECALL: 0.0878425, MAP: 0.0382200, MRR: 0.0750897, NDCG: 0.0582486, F1: 0.0336355, HIT_RATE: 0.2080000, ARHR: 0.0823736, NOVELTY: 0.0046820, AVERAGE_POPULARITY: 0.1362314, DIVERSITY_MEAN_INTER_LIST: 0.9702527, DIVERSITY_HERFINDAHL: 0.9970043, COVERAGE_ITEM: 0.3586526, COVERAGE_ITEM_CORRECT: 0.0194033, COVERAGE_USER: 0.5819806, COVERAGE_USER_CORRECT: 0.0989052, DIVERSITY_GINI: 0.0982307, SHANNON_ENTROPY: 10.5919234, 

Iteration No: 7 ended. Evaluation done at random point.
Time taken: 15.7280
Function value obtained: -0.0382
Current minimum: -0.0385
Iteration No: 8 started. Evaluating function at random point.
SearchBayesianSkopt: Testing config: {'knn': 109, 'shrink': 672, 'similarity': 'cosine'}
Similarity column 

EvaluatorHoldout: Processed 4625 ( 100.00% ) in 2.41 sec. Users per second: 1915
SearchBayesianSkopt: New best config found. Config 13: {'knn': 98, 'shrink': 364, 'similarity': 'cosine'} - results: ROC_AUC: 0.1074082, PRECISION: 0.0212757, PRECISION_RECALL_MIN_DEN: 0.0889103, RECALL: 0.0874063, MAP: 0.0402684, MRR: 0.0765140, NDCG: 0.0598839, F1: 0.0342215, HIT_RATE: 0.2127568, ARHR: 0.0848347, NOVELTY: 0.0047958, AVERAGE_POPULARITY: 0.1158799, DIVERSITY_MEAN_INTER_LIST: 0.9788308, DIVERSITY_HERFINDAHL: 0.9978619, COVERAGE_ITEM: 0.3926853, COVERAGE_ITEM_CORRECT: 0.0212897, COVERAGE_USER: 0.5819806, COVERAGE_USER_CORRECT: 0.1004153, DIVERSITY_GINI: 0.1207335, SHANNON_ENTROPY: 11.0352535, 

EvaluatorHoldout: Processed 5033 ( 100.00% ) in 2.73 sec. Users per second: 1840
SearchBayesianSkopt: Best config evaluated with evaluator_test. Config: {'knn': 98, 'shrink': 364, 'similarity': 'cosine'} - results:
CUTOFF: 10 - ROC_AUC: 0.1181330, PRECISION: 0.0229883, PRECISION_RECALL_MIN_DEN: 0.0882



SearchBayesianSkopt: Testing config: {'knn': 5, 'shrink': 1000, 'similarity': 'cosine'}
Similarity column 25975 ( 100 % ), 3856.03 column/sec, elapsed time 0.11 min
EvaluatorHoldout: Processed 4625 ( 100.00% ) in 2.13 sec. Users per second: 2172
SearchBayesianSkopt: Config 18 is suboptimal. Config: {'knn': 5, 'shrink': 1000, 'similarity': 'cosine'} - results: ROC_AUC: 0.0920431, PRECISION: 0.0166486, PRECISION_RECALL_MIN_DEN: 0.0727929, RECALL: 0.0716476, MAP: 0.0335909, MRR: 0.0653182, NDCG: 0.0496981, F1: 0.0270189, HIT_RATE: 0.1664865, ARHR: 0.0704033, NOVELTY: 0.0048544, AVERAGE_POPULARITY: 0.0824199, DIVERSITY_MEAN_INTER_LIST: 0.9636730, DIVERSITY_HERFINDAHL: 0.9963465, COVERAGE_ITEM: 0.3462560, COVERAGE_ITEM_CORRECT: 0.0181328, COVERAGE_USER: 0.5819806, COVERAGE_USER_CORRECT: 0.0835535, DIVERSITY_GINI: 0.1034933, SHANNON_ENTROPY: 10.7533644, 

Iteration No: 19 ended. Search finished for the next optimal point.
Time taken: 9.1670
Function value obtained: -0.0336
Current minimum: -


Iteration No: 26 ended. Search finished for the next optimal point.
Time taken: 15.0882
Function value obtained: -0.0296
Current minimum: -0.0403
Iteration No: 27 started. Searching for the next optimal point.
SearchBayesianSkopt: Testing config: {'knn': 1000, 'shrink': 1000, 'similarity': 'cosine'}
Similarity column 25975 ( 100 % ), 3031.34 column/sec, elapsed time 0.14 min
EvaluatorHoldout: Processed 4625 ( 100.00% ) in 2.82 sec. Users per second: 1639
SearchBayesianSkopt: Config 26 is suboptimal. Config: {'knn': 1000, 'shrink': 1000, 'similarity': 'cosine'} - results: ROC_AUC: 0.1035725, PRECISION: 0.0205622, PRECISION_RECALL_MIN_DEN: 0.0884486, RECALL: 0.0869381, MAP: 0.0379696, MRR: 0.0744895, NDCG: 0.0577570, F1: 0.0332583, HIT_RATE: 0.2056216, ARHR: 0.0815138, NOVELTY: 0.0046718, AVERAGE_POPULARITY: 0.1396544, DIVERSITY_MEAN_INTER_LIST: 0.9686369, DIVERSITY_HERFINDAHL: 0.9968427, COVERAGE_ITEM: 0.3572666, COVERAGE_ITEM_CORRECT: 0.0189028, COVERAGE_USER: 0.5819806, COVERAGE_USER



SearchBayesianSkopt: Testing config: {'knn': 1000, 'shrink': 1000, 'similarity': 'cosine'}
Similarity column 25975 ( 100 % ), 2948.53 column/sec, elapsed time 0.15 min
EvaluatorHoldout: Processed 4625 ( 100.00% ) in 2.94 sec. Users per second: 1573
SearchBayesianSkopt: Config 27 is suboptimal. Config: {'knn': 1000, 'shrink': 1000, 'similarity': 'cosine'} - results: ROC_AUC: 0.1035725, PRECISION: 0.0205622, PRECISION_RECALL_MIN_DEN: 0.0884486, RECALL: 0.0869381, MAP: 0.0379696, MRR: 0.0744895, NDCG: 0.0577570, F1: 0.0332583, HIT_RATE: 0.2056216, ARHR: 0.0815138, NOVELTY: 0.0046718, AVERAGE_POPULARITY: 0.1396544, DIVERSITY_MEAN_INTER_LIST: 0.9686369, DIVERSITY_HERFINDAHL: 0.9968427, COVERAGE_ITEM: 0.3572666, COVERAGE_ITEM_CORRECT: 0.0189028, COVERAGE_USER: 0.5819806, COVERAGE_USER_CORRECT: 0.0984019, DIVERSITY_GINI: 0.0972295, SHANNON_ENTROPY: 10.5496369, 

Iteration No: 28 ended. Search finished for the next optimal point.
Time taken: 15.5042
Function value obtained: -0.0380
Current min



SearchBayesianSkopt: Testing config: {'knn': 1000, 'shrink': 1000, 'similarity': 'cosine'}
Similarity column 25975 ( 100 % ), 2979.44 column/sec, elapsed time 0.15 min
EvaluatorHoldout: Processed 4625 ( 100.00% ) in 2.94 sec. Users per second: 1573
SearchBayesianSkopt: Config 28 is suboptimal. Config: {'knn': 1000, 'shrink': 1000, 'similarity': 'cosine'} - results: ROC_AUC: 0.1035725, PRECISION: 0.0205622, PRECISION_RECALL_MIN_DEN: 0.0884486, RECALL: 0.0869381, MAP: 0.0379696, MRR: 0.0744895, NDCG: 0.0577570, F1: 0.0332583, HIT_RATE: 0.2056216, ARHR: 0.0815138, NOVELTY: 0.0046718, AVERAGE_POPULARITY: 0.1396544, DIVERSITY_MEAN_INTER_LIST: 0.9686369, DIVERSITY_HERFINDAHL: 0.9968427, COVERAGE_ITEM: 0.3572666, COVERAGE_ITEM_CORRECT: 0.0189028, COVERAGE_USER: 0.5819806, COVERAGE_USER_CORRECT: 0.0984019, DIVERSITY_GINI: 0.0972295, SHANNON_ENTROPY: 10.5496369, 

Iteration No: 29 ended. Search finished for the next optimal point.
Time taken: 15.3535
Function value obtained: -0.0380
Current min



SearchBayesianSkopt: Testing config: {'knn': 1000, 'shrink': 1000, 'similarity': 'cosine'}
Similarity column 25975 ( 100 % ), 2869.92 column/sec, elapsed time 0.15 min
EvaluatorHoldout: Processed 4625 ( 100.00% ) in 2.81 sec. Users per second: 1647
SearchBayesianSkopt: Config 29 is suboptimal. Config: {'knn': 1000, 'shrink': 1000, 'similarity': 'cosine'} - results: ROC_AUC: 0.1035725, PRECISION: 0.0205622, PRECISION_RECALL_MIN_DEN: 0.0884486, RECALL: 0.0869381, MAP: 0.0379696, MRR: 0.0744895, NDCG: 0.0577570, F1: 0.0332583, HIT_RATE: 0.2056216, ARHR: 0.0815138, NOVELTY: 0.0046718, AVERAGE_POPULARITY: 0.1396544, DIVERSITY_MEAN_INTER_LIST: 0.9686369, DIVERSITY_HERFINDAHL: 0.9968427, COVERAGE_ITEM: 0.3572666, COVERAGE_ITEM_CORRECT: 0.0189028, COVERAGE_USER: 0.5819806, COVERAGE_USER_CORRECT: 0.0984019, DIVERSITY_GINI: 0.0972295, SHANNON_ENTROPY: 10.5496369, 

Iteration No: 30 ended. Search finished for the next optimal point.
Time taken: 15.5253
Function value obtained: -0.0380
Current min



SearchBayesianSkopt: Testing config: {'knn': 1000, 'shrink': 1000, 'similarity': 'cosine'}
Similarity column 25975 ( 100 % ), 2912.33 column/sec, elapsed time 0.15 min
EvaluatorHoldout: Processed 4625 ( 100.00% ) in 2.76 sec. Users per second: 1677
SearchBayesianSkopt: Config 32 is suboptimal. Config: {'knn': 1000, 'shrink': 1000, 'similarity': 'cosine'} - results: ROC_AUC: 0.1035725, PRECISION: 0.0205622, PRECISION_RECALL_MIN_DEN: 0.0884486, RECALL: 0.0869381, MAP: 0.0379696, MRR: 0.0744895, NDCG: 0.0577570, F1: 0.0332583, HIT_RATE: 0.2056216, ARHR: 0.0815138, NOVELTY: 0.0046718, AVERAGE_POPULARITY: 0.1396544, DIVERSITY_MEAN_INTER_LIST: 0.9686369, DIVERSITY_HERFINDAHL: 0.9968427, COVERAGE_ITEM: 0.3572666, COVERAGE_ITEM_CORRECT: 0.0189028, COVERAGE_USER: 0.5819806, COVERAGE_USER_CORRECT: 0.0984019, DIVERSITY_GINI: 0.0972295, SHANNON_ENTROPY: 10.5496369, 

Iteration No: 33 ended. Search finished for the next optimal point.
Time taken: 15.3859
Function value obtained: -0.0380
Current min



SearchBayesianSkopt: Testing config: {'knn': 1000, 'shrink': 1000, 'similarity': 'cosine'}
Similarity column 25975 ( 100 % ), 2975.20 column/sec, elapsed time 0.15 min
EvaluatorHoldout: Processed 4625 ( 100.00% ) in 2.87 sec. Users per second: 1614
SearchBayesianSkopt: Config 33 is suboptimal. Config: {'knn': 1000, 'shrink': 1000, 'similarity': 'cosine'} - results: ROC_AUC: 0.1035725, PRECISION: 0.0205622, PRECISION_RECALL_MIN_DEN: 0.0884486, RECALL: 0.0869381, MAP: 0.0379696, MRR: 0.0744895, NDCG: 0.0577570, F1: 0.0332583, HIT_RATE: 0.2056216, ARHR: 0.0815138, NOVELTY: 0.0046718, AVERAGE_POPULARITY: 0.1396544, DIVERSITY_MEAN_INTER_LIST: 0.9686369, DIVERSITY_HERFINDAHL: 0.9968427, COVERAGE_ITEM: 0.3572666, COVERAGE_ITEM_CORRECT: 0.0189028, COVERAGE_USER: 0.5819806, COVERAGE_USER_CORRECT: 0.0984019, DIVERSITY_GINI: 0.0972295, SHANNON_ENTROPY: 10.5496369, 

Iteration No: 34 ended. Search finished for the next optimal point.
Time taken: 15.3376
Function value obtained: -0.0380
Current min


Iteration No: 41 ended. Search finished for the next optimal point.
Time taken: 15.2908
Function value obtained: -0.0278
Current minimum: -0.0403
Iteration No: 42 started. Searching for the next optimal point.
SearchBayesianSkopt: Testing config: {'knn': 998, 'shrink': 312, 'similarity': 'cosine'}
Similarity column 25975 ( 100 % ), 2987.38 column/sec, elapsed time 0.14 min
EvaluatorHoldout: Processed 4625 ( 100.00% ) in 2.80 sec. Users per second: 1653
SearchBayesianSkopt: Config 41 is suboptimal. Config: {'knn': 998, 'shrink': 312, 'similarity': 'cosine'} - results: ROC_AUC: 0.1043325, PRECISION: 0.0209297, PRECISION_RECALL_MIN_DEN: 0.0896236, RECALL: 0.0880608, MAP: 0.0381846, MRR: 0.0747181, NDCG: 0.0582275, F1: 0.0338211, HIT_RATE: 0.2092973, ARHR: 0.0820534, NOVELTY: 0.0046873, AVERAGE_POPULARITY: 0.1328731, DIVERSITY_MEAN_INTER_LIST: 0.9716174, DIVERSITY_HERFINDAHL: 0.9971407, COVERAGE_ITEM: 0.3575746, COVERAGE_ITEM_CORRECT: 0.0195188, COVERAGE_USER: 0.5819806, COVERAGE_USER_COR


Iteration No: 49 ended. Search finished for the next optimal point.
Time taken: 15.5712
Function value obtained: -0.0380
Current minimum: -0.0403
Iteration No: 50 started. Searching for the next optimal point.
SearchBayesianSkopt: Testing config: {'knn': 1000, 'shrink': 466, 'similarity': 'cosine'}
Similarity column 25975 ( 100 % ), 2958.78 column/sec, elapsed time 0.15 min
EvaluatorHoldout: Processed 4625 ( 100.00% ) in 2.79 sec. Users per second: 1656
SearchBayesianSkopt: Config 49 is suboptimal. Config: {'knn': 1000, 'shrink': 466, 'similarity': 'cosine'} - results: ROC_AUC: 0.1039946, PRECISION: 0.0207351, PRECISION_RECALL_MIN_DEN: 0.0890792, RECALL: 0.0875450, MAP: 0.0382083, MRR: 0.0747751, NDCG: 0.0581097, F1: 0.0335289, HIT_RATE: 0.2073514, ARHR: 0.0818898, NOVELTY: 0.0046788, AVERAGE_POPULARITY: 0.1364083, DIVERSITY_MEAN_INTER_LIST: 0.9701090, DIVERSITY_HERFINDAHL: 0.9969899, COVERAGE_ITEM: 0.3575746, COVERAGE_ITEM_CORRECT: 0.0192108, COVERAGE_USER: 0.5819806, COVERAGE_USER_C

AttributeError: 'ItemBasedCollaborativeFiltering' object has no attribute 'save_model'

**Evaluation Metrics**

In [25]:
def recall(recommendations: np.array, relevant_items: np.array) -> float:
    is_relevant = np.in1d(recommendations, relevant_items, assume_unique=True)
    
    recall_score = np.sum(is_relevant) / relevant_items.shape[0]
    
    return recall_score
    
    
def precision(recommendations: np.array, relevant_items: np.array) -> float:
    is_relevant = np.in1d(recommendations, relevant_items, assume_unique=True)
    
    precision_score = np.sum(is_relevant) / recommendations.shape[0]

    return precision_score

def mean_average_precision(recommendations: np.array, relevant_items: np.array) -> float:
    is_relevant = np.in1d(recommendations, relevant_items, assume_unique=True)
    
    precision_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))

    map_score = np.sum(precision_at_k) / np.min([relevant_items.shape[0], is_relevant.shape[0]])

    return map_score

**Evaluation Procedure**

The evaluation procedure returns the averaged accuracy scores (in terms of precision, recall and MAP) for all users (that have at least 1 rating in the test set). It also calculates the number of evaluated and skipped users. It receives a recommender instance, and the train and test URMs.

In [26]:
def evaluator(recommender: object, urm_train: sp.csr_matrix, urm_test: sp.csr_matrix):
    recommendation_length = 10
    accum_precision = 0
    accum_recall = 0
    accum_map = 0
    
    num_users = urm_train.shape[0]
    
    num_users_evaluated = 0
    num_users_skipped = 0
    for user_id in range(num_users):
        user_profile_start = urm_test.indptr[user_id]
        user_profile_end = urm_test.indptr[user_id+1]
        
        relevant_items = urm_test.indices[user_profile_start:user_profile_end]
        
        if relevant_items.size == 0:
            num_users_skipped += 1
            continue
            
        recommendations = recommender.recommend(user_id_array=user_id,
                                               cutoff=recommendation_length,
                                               remove_seen_flag=True
                                               )
        
        recommendations = np.array(recommendations)
        
        accum_precision += precision(recommendations, relevant_items)
        accum_recall += recall(recommendations, relevant_items)
        accum_map += mean_average_precision(recommendations, relevant_items)
        
        num_users_evaluated += 1
        
    
    accum_precision /= max(num_users_evaluated, 1)
    accum_recall /= max(num_users_evaluated, 1)
    accum_map /=  max(num_users_evaluated, 1)
    
    return accum_precision, accum_recall, accum_map, num_users_evaluated, num_users_skipped

In [29]:
#Params for the recommenders
cbf_param = {
    "knn": 140,
    "shrink": 0
}

user_cf_param = {
    "knn": 98,
    "shrink": 364
}

item_cf_param = {
    "knn": 211,
    "shrink": 145
}

slim_bpr_param = {
    "learning_rate" : 0.05,
    "epochs": 10,
    "nnz" : 1,
    "knn": 200
}

als_param = {
    "n_factors": 300,
    "regularization": 0.15,
    "iterations": 30
}


In [30]:
userCF = UserBasedCollaborativeFiltering(knn=user_cf_param["knn"], shrink=user_cf_param["shrink"])

In [31]:
userCF.fit(urm_train)

Similarity column 7947 ( 100 % ), 2553.13 column/sec, elapsed time 0.05 min


In [32]:
accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped = evaluator(userCF, 
                                                                                            urm_train_validation, 
                                                                                            urm_test)
accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped

TypeError: recommend() got an unexpected keyword argument 'user_id_array'

In [34]:
itemCF = ItemBasedCollaborativeFiltering(urm_train)

In [35]:
itemCF.fit(knn=item_cf_param["knn"], shrink=item_cf_param["shrink"], similarity="cosine")

Similarity column 25975 ( 100 % ), 2352.77 column/sec, elapsed time 0.18 min


In [None]:
accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped = evaluator(itemCF, 
                                                                                            urm_train_validation, 
                                                                                            urm_test)
accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped

In [36]:
SlimBpr=SlimBPRRec(learning_rate=slim_bpr_param["learning_rate"], epochs=slim_bpr_param["epochs"], nnz=slim_bpr_param["nnz"], knn=slim_bpr_param["knn"])

In [37]:
SlimBpr.fit(urm_train)

Get S SLIM BPR...
Epoch: 0


100%|██████████████████████████████████████████████████████████████████████████| 81911/81911 [00:51<00:00, 1576.35it/s]


Epoch: 1


 19%|██████████████▏                                                           | 15682/81911 [00:10<00:45, 1462.18it/s]

KeyboardInterrupt: 

In [33]:
accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped = evaluator(SlimBpr, 
                                                                                            urm_train_validation, 
                                                                                            urm_test)
accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped

NameError: name 'SlimBpr' is not defined

In [None]:
slim_elastic = SLIMElasticNetRecommender()

In [None]:
slim_elastic.fit(urm_train)

In [None]:
accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped = evaluator(slim_elastic, 
                                                                                            urm_train_validation, 
                                                                                            urm_test)
accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped

In [38]:
cbf = ContentBasedFiltering(knn=cbf_param["knn"],shrink=cbf_param["shrink"])

In [39]:
cbf.fit(urm_train,ICM)

 19%|██████████████▏                                                           | 15682/81911 [00:30<00:45, 1462.18it/s]

Similarity column 25975 ( 100 % ), 1344.74 column/sec, elapsed time 0.32 min


In [40]:
accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped = evaluator(cbf, 
                                                                                            urm_train_validation, 
                                                                                            urm_test)
accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped

TypeError: recommend() got an unexpected keyword argument 'user_id_array'

In [41]:
ALS = AlternatingLeastSquare(n_factors=als_param["n_factors"], regularization=als_param["regularization"],iterations=als_param["iterations"])

In [42]:
ALS.fit(urm_train)

HBox(children=(IntProgress(value=0, max=30), HTML(value='')))

In [None]:
accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped = evaluator(ALS, 
                                                                                            urm_train_validation, 
                                                                                            urm_test)
accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped

**Hybrid**

In [57]:
class HybridRecommender(object):
    def __init__(self, URM, ICM):
        
        self.urm = URM
        
        self.ICM = ICM

#         self.userCF = UserBasedCollaborativeFiltering(URM.copy())

        self.itemCF = ItemBasedCollaborativeFiltering(URM.copy())
        
#         self.cbf = ContentBasedFiltering(URM.copy(), ICM.copy())
        
#         self.slim_random = SLIM_BPR_Python(URM.copy())
        
#         self.slim_elastic = SLIMElasticNetRecommender()
        
#         self.ALS = AlternatingLeastSquare(URM.copy())
    

    def fit(self, w, user_cf_param, item_cf_param, cbf_param, slim_param, als_param,):
        self.URM = URM

        ### SUB-FITTING ###
        print("Fitting user cf...")
#         self.userCF.fit(knn=user_cf_param["knn"], shrink=user_cf_param["shrink"])

        print("Fitting item cf...")
        self.itemCF.fit(knn=item_cf_param["knn"], shrink=item_cf_param["shrink"], similarity="cosine")
        
        print("Fitting cbf...")
#         self.cbf.fit(knn=cbf_param["knn"],shrink=cbf_param["shrink"])
        
        print("Fitting slim bpr...")
#         self.slim_random.fit(topK=slim_param["topK"],epochs=slim_param["epochs"])
        
        print("Fitting slim elastic...")
#         self.slim_elastic.fit(URM.copy())
        
        print("Fitting ALS...")
#         self.ALS.fit(n_factors=als_param["n_factors"], regularization=als_param["regularization"],iterations=als_param["iterations"])


    def recommend(self,user_id,urm_train: sp.csr_matrix,at=10):
     
#           self.userCF_ratings = self.userCF.get_expected_ratings(user_id)
#         self.itemCF_ratings = self.itemCF.get_expected_ratings(user_id)
#         self.cbf_ratings = self.cbf.get_expected_ratings(user_id)
#         self.slim_ratings = self.slim_random.get_expected_ratings(user_id)
#         self.slim_elastic_ratings = self.slim_elastic.get_expected_ratings(user_id)
#         self.ALS_ratings = self.ALS.get_expected_ratings(user_id)

        self.hybrid_ratings = None 

#         self.hybrid_ratings = self.userCF_ratings * w["user_cf"]
        self.hybrid_ratings = self.itemCF_ratings * w["item_cf"]
#         self.hybrid_ratings += self.cbf_ratings * w_right["cbf"]
#         self.hybrid_ratings += self.slim_ratings * w["slim"]
#         self.hybrid_ratings += self.ALS_ratings * w["als"]
#         self.hybrid_ratings += self.slim_elastic_ratings * w["elastic"]

        recommended_items = np.flip(np.argsort(self.hybrid_ratings), 0)

        # REMOVING SEEN
        unseen_items_mask = np.in1d(recommended_items,urm_train[user_id].indices,
                                    assume_unique=True, invert=True)
        recommended_items = recommended_items[unseen_items_mask]

        return recommended_items[0:at]

w = {
    "user_cf": 0,
    "item_cf": 1,
    "cbf": 0,
    "icm_svd": 0,
    "als": 0,
    "slim": 0,
    "elastic": 0
}


In [58]:
recommender = HybridRecommender(urm_train, ICM)

In [59]:
recommender.fit(w, user_cf_param=user_cf_param,item_cf_param=item_cf_param,cbf_param=cbf_param,slim_param=slim_bpr_param,als_param=als_param)

Fitting user cf...
Fitting item cf...
Similarity column 25975 ( 100 % ), 2299.55 column/sec, elapsed time 0.19 min
Fitting cbf...
Fitting slim bpr...
Fitting slim elastic...
Fitting ALS...


In [60]:
accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped = evaluator(recommender, 
                                                                                            urm_train_validation, 
                                                                                            urm_test)
accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped

TypeError: recommend() got an unexpected keyword argument 'user_id_array'

**Submission to competition**

This step serves as a similar step that you will perform when preparing a submission to the competition. Specially after you have chosen and trained your recommender.

For this step the best suggestion is to select the most-performing configuration obtained in the hyperparameter tuning step and to train the recommender using both the train and validation set. Remember that in the competition you do not have access to the test set.

Another consideration is that, due to easier and faster calculations, we replaced the user/item identifiers with new ones in the preprocessing step. For the competition, you are required to generate recommendations using the dataset's original identifiers. Due to this, this step also reverts back the newer identifiers with the ones originally found in the dataset.

Last, this step creates a function that writes the recommendations for each user in the same file in a tabular format following this format:

csv
<user_id>,<item_id_1> <item_id_2> <item_id_3> <item_id_4> <item_id_5> <item_id_6> <item_id_7> <item_id_8> <item_id_9> <item_id_10>
Always verify the competitions' submission file model as it might vary from the one we presented here.

In [61]:
def load_goodguys():
  return pd.read_csv("./data_target_users_test.csv")
goodguys=load_goodguys()

In [62]:
goodguys

Unnamed: 0,user_id
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9


In [63]:
users_to_recommend = np.random.choice(goodguys.user_id,size=goodguys.size, replace=False)
users_to_recommend

array([3409, 3111, 2660, ..., 2590, 4286,  107], dtype=int64)

In [None]:
mapping_to_item_id = dict(zip(ratings.mapped_item_id, ratings.item_id))

In [None]:
mapping_to_item_id

In [91]:
def prepare_submission(ratings: pd.DataFrame, users_to_recommend: np.array, urm_train: sp.csr_matrix, recommender: object):
    """
    users_ids_and_mappings = ratings[ratings.user_id.isin(users_to_recommend)][["user_id", "mapped_user_id"]].drop_duplicates()
    items_ids_and_mappings = ratings[["item_id", "mapped_item_id"]].drop_duplicates()
    
    mapping_to_item_id = dict(zip(ratings.mapped_item_id, ratings.item_id))
    
    
    recommendation_length = 10
    submission = []
    for idx, row in users_ids_and_mappings.iterrows():
        user_id = row.user_id
        mapped_user_id = row.mapped_user_id
        
        recommendations = recommender.recommend(user_id=mapped_user_id,
                                                urm_train=urm_train,
                                                at=recommendation_length,
                                                #remove_seen=True)
                                               )
        
        submission.append((user_id, [mapping_to_item_id[item_id] for item_id in recommendations]))
        
    return submission
"""
    
    
    recommendation_length = 10
    submission = []
    
    for user_id in users_to_recommend :
        
        recommendations = recommender.recommend(user_id_array=user_id,
                                                cutoff=recommendation_length,
                                                #remove_seen=True)
                                               )
        
        submission.append(user_id)
        for item_id in recommendations :
            submission.append(item_id)
   
    return submission

In [92]:
submission = prepare_submission(ratings, users_to_recommend, urm_train_validation,itemCF)

In [93]:
submission

[3409,
 [13649, 2677, 13912, 23225, 15011, 8839, 538, 25547, 19211, 15345],
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 3111,
 [12945, 11365, 4351, 14356, 20410, 5930, 6446, 11110, 9298, 14776],
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 2660,
 [8544, 20148, 11384, 7639, 15830, 18150, 12710, 9343, 21552, 364],
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 903,
 [10269, 9438, 25675, 9851, 10786, 12061, 22554, 13118, 25260, 5942],
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 4525,
 [5952, 23154, 25596, 25407, 3737, 2426, 2492, 9555, 1085, 8097],
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 7189,
 [6920, 25060, 2982, 14701, 9126, 12995, 20247, 23325, 19790, 13538],
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 5188,
 [15130, 17960, 20651, 1249, 17026, 9592, 23001, 5115, 7564, 11089],
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 5113,
 [25046, 18293, 8590, 18804, 22991, 7337, 6632, 5380, 14012, 477],
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 1554,
 [16767, 3348, 7788, 7000, 17841, 462, 12966, 18458, 6059, 462

In [74]:
import os
from datetime import datetime

csv_fname = './submission'
csv_fname += datetime.now().strftime('%b%d_%H-%M-%S') + '.csv'

def write_submission(submissions):
    with open(csv_fname, "w") as f:
        f.write(f"user_id,item_list\n")
        for user_id, items in submissions:
            f.write(f"{user_id},{' '.join([str(item) for item in items])}\n")


In [75]:
write_submission(submission)