# Setting up the environment

In [None]:
!git clone https://github.com/MaurizioFD/RecSys_Course_AT_PoliMi

In [None]:
!mv RecSys_Course_AT_PoliMi/* /kaggle/working/

In [None]:
!cp -r ../input/updated-code-3-2/* /kaggle/working/

In [None]:
!pip install -r requirements.txt

In [None]:
!python run_compile_all_cython.py

# Imports

In [None]:
!pip install lightfm tqdm optuna ipykernel matplotlib

In [None]:
import numpy as np
import matplotlib.pyplot as pyplot
import pandas as pd
import scipy.sparse as sps
import matplotlib.pyplot as plt
from scipy.stats import loguniform
from lightfm import LightFM
from lightfm.evaluation import auc_score, precision_at_k
import time
import optuna

In [None]:
import os
os.chdir("../input/updated-code-2/")

In [None]:
from Recommenders.KNN.ItemKNNCFRecommender import ItemKNNCFRecommender
from Recommenders.NonPersonalizedRecommender import TopPop
from Recommenders.KNN.UserKNNCFRecommender import UserKNNCFRecommender
from Recommenders.GraphBased.RP3betaRecommender import RP3betaRecommender
from Recommenders.BaseRecommender import BaseRecommender
from Evaluation.Evaluator import EvaluatorHoldout
from Recommenders.KNN.UserKNNCFRecommender import UserKNNCFRecommender
from Recommenders.KNN.ItemKNNCFRecommender import ItemKNNCFRecommender
from Recommenders.KNN.ItemKNNCustomSimilarityRecommender import ItemKNNCustomSimilarityRecommender
from Recommenders.MatrixFactorization.NMFRecommender import NMFRecommender

#----remove

In [None]:
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample
from Recommenders.SLIM.SLIMElasticNetRecommender import SLIMElasticNetRecommender, MultiThreadSLIM_SLIMElasticNetRecommender
from multiprocessing import cpu_count

In [None]:
import pandas as pd
import numpy as np
import scipy.sparse as sps


In [None]:
# Seed NumPy's global random generator so that code drawing from np.random
# produces the same numbers on every run of the notebook.
seed = 69
np.random.seed(seed)

# Preprocessing

In [None]:
# Import training data
# NOTE: the path is relative, so it is resolved against the working directory
# set by the earlier os.chdir(...) cell.
URM_path = "../data-books/data_train.csv"
URM_all_dataframe = pd.read_csv(filepath_or_buffer=URM_path,
                                header=0,
                                dtype={0:int, 1:int, 2:int},
                                engine='python')

# Replace the header read from the file with the column names used by the rest of the notebook.
URM_all_dataframe.columns = ["user_id", "item_id", "interaction"]

In [None]:
# Import target users
# (the users for which the final submission must contain recommendations)
target_path = "../data-books/data_target_users_test.csv"
target_dataframe= pd.read_csv(filepath_or_buffer=target_path,
                                header=0,
                                dtype={0:int},
                                engine='python')
target_dataframe.columns = ["user_id"]
target_dataframe

In [None]:
def preprocess_data(ratings: pd.DataFrame):
    """Attach contiguous zero-based indices for users and items to ``ratings``.

    Every distinct ``user_id`` / ``item_id`` is numbered in order of first
    appearance, and the numbers are added as the ``mapped_user_id`` and
    ``mapped_item_id`` columns. For each of the two id columns the count,
    minimum and maximum of the original ids are printed.

    Args:
        ratings: frame with at least the ``user_id`` and ``item_id`` columns.

    Returns:
        A new frame holding the input columns plus the two mapped columns.
    """
    id_columns = ("user_id", "item_id")

    # Collect the distinct ids of both columns before any merge takes place.
    distinct_ids = {column: ratings[column].unique() for column in id_columns}

    for column in id_columns:
        ids = distinct_ids[column]
        print(ids.size, ids.min(), ids.max())

    for column in id_columns:
        ids = distinct_ids[column]
        lookup = pd.DataFrame({f"mapped_{column}": np.arange(ids.size), column: ids})
        ratings = pd.merge(left=ratings, right=lookup, how="inner", on=column)

    return ratings

In [None]:
# Add the zero-based mapped_user_id / mapped_item_id columns to the interactions.
ratings = preprocess_data(URM_all_dataframe)

## From DF to Sparse matrix

In [None]:
# Rows are mapped user indices, columns are mapped item indices, values are the interaction column.
URM = sps.coo_matrix((ratings.interaction.values, (ratings.mapped_user_id.values, ratings.mapped_item_id.values)))

## Data Splitting

In [None]:
# Two successive splits with train_percentage=0.80: first train+validation vs test,
# then train vs validation on the remaining part.
# NOTE(review): presumably this yields roughly 64% / 16% / 20% of the interactions - confirm against the helper.
urm_train, urm_test = split_train_in_two_percentage_global_sample(URM, train_percentage = 0.80)
urm_train, urm_validation = split_train_in_two_percentage_global_sample(urm_train, train_percentage = 0.80)

# Both evaluators use a single cutoff of 10.
evaluator_validation = EvaluatorHoldout(urm_validation, cutoff_list=[10])
evaluator_test = EvaluatorHoldout(urm_test, cutoff_list=[10])

# Model

In [None]:
class ScoresHybridRecommender(BaseRecommender):
    """Score-level hybrid of two recommenders.

    The blended score is ``R = R1 * alpha + R2 * (1 - alpha)``, where ``R1``
    and ``R2`` are the item scores returned by the two wrapped recommenders.
    The wrapped recommenders are used as they are; this class never fits them.
    """

    RECOMMENDER_NAME = "ScoresHybridRecommender"

    def __init__(self, URM_train, recommender_1, recommender_2):
        """Store the two recommenders to blend.

        Args:
            URM_train: user-item interaction matrix, kept in CSR format.
            recommender_1: recommender whose scores are weighted by ``alpha``.
            recommender_2: recommender whose scores are weighted by ``1 - alpha``.
        """
        super(ScoresHybridRecommender, self).__init__(URM_train)

        self.URM_train = sps.csr_matrix(URM_train)
        self.recommender_1 = recommender_1
        self.recommender_2 = recommender_2

    def fit(self, alpha=0.5):
        """Set the blending weight; no training happens here.

        Args:
            alpha: weight given to ``recommender_1`` (``recommender_2`` gets ``1 - alpha``).
        """
        self.alpha = alpha

    def _compute_item_score(self, user_id_array, items_to_compute=None):
        """Return the weighted sum of the two recommenders' item scores.

        Args:
            user_id_array: users to score.
            items_to_compute: optional item restriction, forwarded unchanged
                to both wrapped recommenders.

        Returns:
            ``scores_1 * alpha + scores_2 * (1 - alpha)``.
        """
        # In a simple extension this could be a loop over a list of pretrained recommender objects.
        # The restriction is forwarded so both operands are computed on the same item set.
        item_weights_1 = self.recommender_1._compute_item_score(user_id_array, items_to_compute=items_to_compute)
        item_weights_2 = self.recommender_2._compute_item_score(user_id_array, items_to_compute=items_to_compute)

        return item_weights_1 * self.alpha + item_weights_2 * (1 - self.alpha)

In [None]:
class LightFMCFRecommender(BaseRecommender):
    """Wrap a LightFM model, trained on the interaction matrix only, behind the BaseRecommender scoring interface."""

    RECOMMENDER_NAME = "LightFMCFRecommender"

    def __init__(self, URM_train):
        super(LightFMCFRecommender, self).__init__(URM_train)

    def fit(self, epochs = 300, user_alpha=1e-6, item_alpha = 1e-6, n_factors = 10, n_threads = 4, max_sampled=3, loss='warp', learning_schedule='adagrad'):
        """Build and train the LightFM model on ``self.URM_train``.

        Args:
            epochs: number of training epochs.
            user_alpha: passed to LightFM as ``user_alpha``.
            item_alpha: passed to LightFM as ``item_alpha``.
            n_factors: passed to LightFM as ``no_components``.
            n_threads: passed to ``LightFM.fit`` as ``num_threads``.
            max_sampled: passed to LightFM as ``max_sampled``.
            loss: passed to LightFM as ``loss`` (default ``'warp'``).
            learning_schedule: passed to LightFM as ``learning_schedule``.
        """
        self.lightFM_model = LightFM(loss=loss,
                                     user_alpha=user_alpha,
                                     item_alpha=item_alpha,
                                     no_components=n_factors,
                                     max_sampled=max_sampled,
                                     learning_schedule=learning_schedule)

        self.lightFM_model = self.lightFM_model.fit(self.URM_train,
                                                    epochs=epochs,
                                                    num_threads=n_threads,
                                                    verbose=True)

    def _compute_item_score(self, user_id_array, items_to_compute = None):
        """Score items for each requested user.

        Args:
            user_id_array: users to score.
            items_to_compute: optional item indices to score. When ``None``
                every item is scored.

        Returns:
            Array of shape ``(len(user_id_array), n_items)``. Items that were
            not requested keep a score of ``-inf``.
        """
        # Honour the caller's restriction instead of always scoring every item.
        if items_to_compute is None:
            item_ids = np.arange(self.n_items)
        else:
            item_ids = np.asarray(items_to_compute, dtype=np.int32)

        item_scores = np.full((len(user_id_array), self.n_items), -np.inf)

        for user_index, user_id in enumerate(user_id_array):
            # Only the requested columns are written; the rest stay at -inf.
            item_scores[user_index, item_ids] = self.lightFM_model.predict(int(user_id),
                                                                           item_ids)

        return item_scores

In [None]:
# !/usr/bin/env python3
# -*- coding: utf-8 -*-

from Recommenders.BaseRecommender import BaseRecommender

class GeneralizedLinearHybridRecommender(BaseRecommender):
    """
    This recommender merges N recommenders by taking a weighted sum of their item scores.

    The wrapped recommenders are used as they are; ``fit`` only stores the weights.
    """

    RECOMMENDER_NAME = "GeneralizedLinearHybridRecommender"

    def __init__(self, URM_train, recommenders: list, verbose=True):
        """Store the recommenders and derive a composite name from theirs.

        Args:
            URM_train: user-item interaction matrix passed to the base class.
            recommenders: recommender objects whose scores are combined.
            verbose: forwarded to the base class.
        """
        suffix = "Recommender"
        name_parts = []
        for recommender in recommenders:
            name = recommender.RECOMMENDER_NAME
            # Strip the suffix only when it is really there, so other names are not truncated.
            if name.endswith(suffix):
                name = name[:-len(suffix)]
            name_parts.append(name)
        self.RECOMMENDER_NAME = ''.join(name_parts) + 'HybridRecommender'

        super(GeneralizedLinearHybridRecommender, self).__init__(URM_train, verbose=verbose)

        self.recommenders = recommenders

    def fit(self, alphas=None):
        """Set one weight per recommender.

        Args:
            alphas: sequence of weights, in the same order as ``recommenders``.
                When ``None``, every recommender gets the same weight
                ``1 / len(recommenders)``.

        Raises:
            ValueError: if there are no recommenders, or if the number of
                weights differs from the number of recommenders.
        """
        n_recommenders = len(self.recommenders)
        if n_recommenders == 0:
            raise ValueError("At least one recommender is required.")

        if alphas is None:
            alphas = [1.0 / n_recommenders] * n_recommenders
        else:
            alphas = list(alphas)

        if len(alphas) != n_recommenders:
            raise ValueError(
                "Expected {} alphas (one per recommender), got {}.".format(n_recommenders, len(alphas))
            )

        self.alphas = alphas

    def save_model(self, folder_path, file_name=None):
        """Intentionally a no-op: nothing is written to disk."""
        pass

    def _compute_item_score(self, user_id_array, items_to_compute=None):
        """Return ``sum(alpha_i * scores_i)`` over all wrapped recommenders.

        Args:
            user_id_array: users to score.
            items_to_compute: optional item restriction, forwarded to every recommender.
        """
        result = None
        for alpha, recommender in zip(self.alphas, self.recommenders):
            weighted = alpha * recommender._compute_item_score(user_id_array, items_to_compute)
            result = weighted if result is None else result + weighted
        return result

# Model params

In [None]:
'''ItemKNN_params = {
    'topK': 11,
    'shrink': 8
}'''

In [None]:
# Keyword arguments unpacked into ItemKNNCFRecommender.fit in the fitting cells.
ItemKNN_params = {
    'topK': 6,
    'shrink': 15,
    'similarity': 'jaccard',
    'normalize': False
}

In [None]:
# Keyword arguments unpacked into RP3betaRecommender.fit in the fitting cells.
RP3beta_params = {
    'alpha': 0.307953246083667, 
    'beta': 0.3073797221110665, 
    'topK': 59, 
    'normalize_similarity': True
}

In [None]:
#alpha_itemknn_rp3beta=0.8726915476982722

In [None]:
# Blend weight for the merged similarity: RP3beta's W_sparse is multiplied by this value
# and ItemKNN's W_sparse by (1 - value).
alpha_itemknn_rp3beta=0.7381515719042592

In [None]:
'''UserKNN_params = {
    'shrink':0,
    'topK':313
}'''

In [None]:
# Keyword arguments unpacked into UserKNNCFRecommender.fit.
UserKNN_params = {
    'topK': 470,
    'shrink': 0,
    'similarity': 'cosine',
    'normalize': True
}

In [None]:
'''LightFM_params = {
    'alpha': 9.874597034935863e-05,
    'n_factors': 365,
    'max_sampled':3,
    'loss':'warp',
    'learning_schedule':'adagrad'
}'''

In [None]:
# Keyword arguments unpacked into LightFMCFRecommender.fit; the keys match that method's parameter names.
LightFM_params = {
                  'n_factors': 482,
                  'max_sampled': 5,
                  'user_alpha': 0.00023989649900734266,
                  'item_alpha': 9.740651135253414e-05
                 }

In [None]:
# Keyword arguments unpacked into MultiThreadSLIM_SLIMElasticNetRecommender.fit.
SLIM_params = {
    'l1_ratio': 0.013752256221164005,
    'alpha': 0.0031943927190071775,
    'topK': 622
}

In [None]:
# Keyword arguments unpacked into NMFRecommender.fit.
NMF_params = {
    'l1_ratio': 0.005734775635120469,
    'num_factors': 134,
    'beta_loss': 'frobenius',
    'init_type': 'nndsvda',
    'solver': 'multiplicative_update'
}

# Fitting models

In [None]:
# All models in this section are fitted on urm_train only, so that the
# validation and test splits stay unseen.
itemKNNCF = ItemKNNCFRecommender(urm_train)
itemKNNCF.fit(**ItemKNN_params)

In [None]:
rp3beta = RP3betaRecommender(urm_train)
rp3beta.fit(**RP3beta_params)

In [None]:
# Merge the two item-item similarity matrices into one with a convex combination.
new_similarity = (1 - alpha_itemknn_rp3beta) * itemKNNCF.W_sparse + alpha_itemknn_rp3beta * rp3beta.W_sparse
    
itemKNN_rp3beta = ItemKNNCustomSimilarityRecommender(urm_train)
itemKNN_rp3beta.fit(new_similarity)

In [None]:
#userKNNCF = UserKNNCFRecommender(urm_train)
#userKNNCF.fit(**UserKNN_params)

In [None]:
# One training thread per CPU reported by cpu_count().
lightfm = LightFMCFRecommender(urm_train)
lightfm.fit(**LightFM_params, n_threads=int(cpu_count()))

In [None]:
slim = MultiThreadSLIM_SLIMElasticNetRecommender(urm_train)
slim.fit(**SLIM_params, workers = int(cpu_count()))

In [None]:
nmf_model = NMFRecommender(urm_train)
nmf_model.fit(**NMF_params, verbose=True)

In [None]:
# Switch to the Kaggle working directory; relative paths used from here on resolve against it.
os.chdir("/kaggle/working/")

# Tuning Alpha for full hybrid

In [None]:
class SaveResults(object):
    """Optuna callback that accumulates one row per trial in ``results_df``.

    Each row holds the trial's hyper-parameters plus a ``result`` column with
    the trial's first objective value.
    """

    def __init__(self):
        # Start from an empty table that already has the "result" column.
        self.results_df = pd.DataFrame(columns=["result"])

    def __call__(self, optuna_study, optuna_trial):
        """Append the given trial to ``results_df``.

        Args:
            optuna_study: the running study (unused).
            optuna_trial: trial exposing ``params`` (dict) and ``values`` (sequence).
        """
        row = {**optuna_trial.params, "result": optuna_trial.values[0]}
        single_row_frame = pd.DataFrame([row])

        # Use concat instead of append
        self.results_df = pd.concat([self.results_df, single_row_frame], ignore_index=True)

In [None]:
# Order matters: the hybrid pairs weights with recommenders by position, and
# objective_function_hybrid_full suggests its alphas in this same order.
recommenders = [itemKNN_rp3beta, slim, lightfm, nmf_model]

In [None]:
def objective_function_hybrid_full(optuna_trial):
    """Optuna objective: validation MAP@10 of the four-model linear hybrid.

    One weight in [0, 2] is suggested for each entry of the module-level
    ``recommenders`` list, a hybrid is built with those weights and evaluated
    with ``evaluator_validation``.

    Args:
        optuna_trial: trial used to sample the four weights.

    Returns:
        The "MAP" value at cutoff 10 from the validation results.
    """
    weight_names = ("alpha_itemKNN_rp3beta", "alpha_SLIM", "alpha_lightfm", "alpha_nmf")
    weights = [optuna_trial.suggest_float(weight_name, 0, 2) for weight_name in weight_names]

    candidate = GeneralizedLinearHybridRecommender(URM_train=urm_train, recommenders=recommenders)
    candidate.fit(weights)

    validation_results, _ = evaluator_validation.evaluateRecommender(candidate)
    map_at_10 = validation_results.loc[10]["MAP"]

    return map_at_10

In [None]:
# The objective returns a MAP value, so the study maximises it.
optuna_study_hybrid_full = optuna.create_study(direction="maximize")

# Callback that records every finished trial in save_results_hybrid_full.results_df.
save_results_hybrid_full = SaveResults()

optuna_study_hybrid_full.optimize(objective_function_hybrid_full,
                      callbacks=[save_results_hybrid_full],
                      n_trials = 300)

In [None]:
# Output logged by a previous run of the study (kept for reference; this is not executable code):
# Trial 43 finished with value: 0.0317516106162691 and parameters: {'alpha_itemKNN_rp3beta': 1.4677480416401336, 'alpha_SLIM': 1.0085607578989502, 'alpha_lightfm': 0.08166189287323965, 'alpha_nmf': 0.005271968578302977}. Best is trial 43 with value: 0.0317516106162691.

In [None]:
# Partition the study's trials by final state for the summary below.
pruned_trials = [t for t in optuna_study_hybrid_full.trials if t.state == optuna.trial.TrialState.PRUNED]
complete_trials = [t for t in optuna_study_hybrid_full.trials if t.state == optuna.trial.TrialState.COMPLETE]

print("Study statistics: ")
# This count is over every trial stored in the study, whatever its state.
print("  Number of finished trials: ", len(optuna_study_hybrid_full.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
print("  Value Validation: ", optuna_study_hybrid_full.best_trial.value)

print("Best params:")
print(optuna_study_hybrid_full.best_trial.params)

# Model Evaluation

In [None]:
list(optuna_study_hybrid_full.best_trial.params.values())

In [None]:
{'alpha_itemKNN_rp3beta': 1.4677480416401336, 'alpha_SLIM': 1.0085607578989502, 'alpha_lightfm': 0.08166189287323965, 'alpha_nmf': 0.005271968578302977}

In [None]:
hybrid = GeneralizedLinearHybridRecommender(URM_train=urm_train+urm_validation, recommenders=recommenders)
hybrid.fit(list(alphas.values())) # TODO fix here if change fit function

In [None]:
result_df, _ = evaluator_test.evaluateRecommender(hybrid)
result_df

# Submission

In [None]:
urm_full = urm_train+urm_validation+urm_test

In [None]:
top_pop_final = TopPop(urm_full)
top_pop_final.fit()

In [None]:
itemKNNCF = ItemKNNCFRecommender(urm_full)
itemKNNCF.fit(**ItemKNN_params)

In [None]:
rp3beta = RP3betaRecommender(urm_full)
rp3beta.fit(**RP3beta_params)

In [None]:
userKNNCF = UserKNNCFRecommender(urm_full)
userKNNCF.fit(**UserKNN_params)

In [None]:
new_similarity = (1 - alpha_itemknn_rp3beta) * itemKNNCF.W_sparse + alpha_itemknn_rp3beta * rp3beta.W_sparse
    
itemKNN_rp3beta = ItemKNNCustomSimilarityRecommender(urm_full)
itemKNN_rp3beta.fit(new_similarity)

In [None]:
lightfm = LightFMCFRecommender(urm_full)
lightfm.fit(**LightFM_params)

In [None]:
slim = MultiThreadSLIM_SLIMElasticNetRecommender(urm_full)
slim.fit(**SLIM_params, workers = int(cpu_count()))

In [None]:
nmf_model = NMFRecommender(urm_full)
nmf_model.fit(**NMF_params, verbose=True)

In [None]:
recommenders = [itemKNN_rp3beta, slim, lightfm, nmf_model]

In [None]:
hybrid = GeneralizedLinearHybridRecommender(URM_train=urm_full, recommenders=recommenders)
hybrid.fit(list(alphas.values())) # TODO fix here if change fit function

In [None]:
def prepare_submission(ratings: pd.DataFrame, users_to_recommend: np.ndarray, urm_train: sps.csr_matrix, recommender: "BaseRecommender", recommendation_length):
    """Build ``(user_id, [item_id, ...])`` pairs for the target users found in ``ratings``.

    Args:
        ratings: interactions with the ``user_id``, ``item_id``,
            ``mapped_user_id`` and ``mapped_item_id`` columns.
        users_to_recommend: original ids of the target users. Users that
            never appear in ``ratings`` are skipped.
        urm_train: unused; kept so existing calls keep working.
        recommender: object exposing ``recommend(user_id_array=..., cutoff=...)``
            that returns one list of mapped item indices per user.
        recommendation_length: number of items requested per user.

    Returns:
        List of ``(original_user_id, [original_item_id, ...])`` tuples, one
        per target user present in ``ratings``.
    """
    target_rows = ratings[ratings.user_id.isin(users_to_recommend)]
    users_ids_and_mappings = target_rows[["user_id", "mapped_user_id"]].drop_duplicates()

    # Nothing to recommend: avoid calling the recommender with an empty batch.
    if users_ids_and_mappings.empty:
        return []

    mapping_to_item_id = dict(zip(ratings.mapped_item_id, ratings.item_id))

    rec_users_arr = users_ids_and_mappings.mapped_user_id.to_numpy()
    recommendations = recommender.recommend(user_id_array=rec_users_arr, cutoff=recommendation_length)

    # recommendations[i] belongs to the i-th row of users_ids_and_mappings.
    original_user_ids = users_ids_and_mappings.user_id.to_numpy()
    return [
        (user_id, [mapping_to_item_id[item_id] for item_id in recommended_items])
        for user_id, recommended_items in zip(original_user_ids, recommendations)
    ]

In [None]:
users_to_recommend = target_dataframe.to_numpy().flatten()
users_to_recommend.shape

## Mappings

In [None]:
mapping_to_item_id = dict(zip(ratings.mapped_item_id, ratings.item_id))

In [None]:
urm_ids = np.unique(ratings.user_id)
missing_users = set([i for i in users_to_recommend]) - set([i for i in urm_ids])
missing_users = np.array([x for x in missing_users])

# Generation

In [None]:
!mkdir Results

In [None]:
def write_submission(submissions, name, output_dir="/kaggle/working/Results"):
    """Write the recommendations to ``<output_dir>/<name>.csv``.

    The file has a ``user_id,item_list`` header followed by one line per
    user, with the recommended item ids separated by single spaces.

    Args:
        submissions: iterable of ``(user_id, items)`` pairs.
        name: file name without the ``.csv`` extension.
        output_dir: destination directory; created if it does not exist.
            Defaults to the original hard-coded location.
    """
    import os

    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"{name}.csv")

    # The context manager closes the file; no explicit close() is needed.
    with open(output_path, "w") as f:
        f.write("user_id,item_list\n")
        for user_id, items in submissions:
            f.write(f"{user_id},{' '.join(str(item) for item in items)}\n")

In [None]:
# Hybrid recommendations for the target users that appear in `ratings`.
submission = prepare_submission(ratings, users_to_recommend, urm_full, hybrid, recommendation_length=10)

In [None]:
# Popularity-based fallback for the target users that have no interactions.
# NOTE(review): missing_users holds original user ids, not mapped indices; this
# presumably works only because remove_seen_flag=False - confirm against the recommender.
rec_missing = top_pop_final.recommend(missing_users, cutoff=10, remove_seen_flag=False)

In [None]:
# Every missing user receives the same list: the first row of rec_missing.
for user_id in missing_users:
  submission.append((user_id, [mapping_to_item_id[item_id] for item_id in rec_missing[0]]))

In [None]:
# Sanity check: the two lengths should be equal if every target user got a row.
(len(submission), len(users_to_recommend))

In [None]:
write_submission(submission, "submission_hybrid_4-fold_slim_nmf")