### Importing of libraries

In [16]:
import os
from typing import Tuple, Callable, Dict, Optional, List

import time
import numpy as np
import pandas as pd
import scipy.sparse as sps

from sklearn.model_selection import train_test_split

### Dataset Loading

In [17]:
from modUtils import dataManager as dm

urm_matrix = dm.load_data()
urm_matrix

Unnamed: 0,user_id,item_id,impl_rating
0,0,10080,1
1,0,19467,1
2,1,2665,1
3,1,7494,1
4,1,17068,1
...,...,...,...
113263,7945,2476,1
113264,7945,12319,1
113265,7945,21384,1
113266,7946,8699,1


### Data Preprocessing

In [18]:
urm_matrix = dm.preprocess_data(urm_matrix)

Numero di users: 7947, UserId minimo: 0, UserId massimo: 7946
Numero di items: 24896, ItemId minimo: 0, ItemId massimo: 25974
Sparsity della URM: 0.057 %


In [19]:
urm_matrix

Unnamed: 0,user_id,item_id,impl_rating,mapped_user_id,mapped_item_id
0,0,10080,1,0,0
1,4342,10080,1,4342,0
2,5526,10080,1,5526,0
3,5923,10080,1,5923,0
4,0,19467,1,0,1
...,...,...,...,...,...
113263,7944,22542,1,7944,24891
113264,7944,24806,1,7944,24892
113265,7944,24912,1,7944,24893
113266,7944,24990,1,7944,24894


### Dataset Splitting (NO MAPPING)

In [20]:
def dataset_splits(ratings, num_users, num_items, val_perc: float) -> \
        (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """DIVIDE IL DATASET DEL URM IN TRAINING, VALUATION AND TEST SETS SENZA MAPPING"""
    seed = 9876

    (uid_training, uid_val,
     iid_training, iid_val,
     ratings_training, ratings_val) = train_test_split(ratings.user_id,
                                                        ratings.item_id,
                                                        ratings.impl_rating,
                                                        test_size=val_perc,
                                                        shuffle=True,
                                                        random_state=seed)

    urm_train = sps.csr_matrix((ratings_training, (uid_training, iid_training)), shape=(7947,25975))
    urm_val = sps.csr_matrix((ratings_val, (uid_val, iid_val)), shape=(7947,25975))

    return urm_train, urm_val

In [21]:
urm_train,urm_val = dataset_splits(urm_matrix, 
                                    num_users=7947, 
                                    num_items=24896, 
                                    val_perc=0.2)

In [22]:
urm_train

<7947x25975 sparse matrix of type '<class 'numpy.intc'>'
	with 90614 stored elements in Compressed Sparse Row format>

In [23]:
urm_val

<7947x25975 sparse matrix of type '<class 'numpy.intc'>'
	with 22654 stored elements in Compressed Sparse Row format>

### Evaluator

In [24]:
def mean_average_precision(recommendations: np.array, relevant_items: np.array) -> float:
    is_relevant = np.in1d(recommendations, relevant_items, assume_unique=True)

    precision_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))

    map_score = np.sum(precision_at_k) / np.min([relevant_items.shape[0], is_relevant.shape[0]])

    return map_score

def evaluate(recommender: object, urm_train: sps.csr_matrix, urm_val: sps.csr_matrix) -> float:
    """VALUTA UN RECOMMENDER SYSTEM E RITORNA I VALORI DI MAP"""

    recommendation_length = 10
    accum_map = 0

    num_users = urm_train.shape[0]
    num_users_evaluated = 0

    for user_id in range(num_users):
        user_profile_start = urm_val.indptr[user_id]
        user_profile_end = urm_val.indptr[user_id + 1]

        relevant_items = urm_val.indices[user_profile_start:user_profile_end]
        if relevant_items.size == 0:
            continue

        recommendations = recommender.recommend(user_id=user_id,
                                                at=recommendation_length,
                                                urm_train=urm_train)

        accum_map += mean_average_precision(recommendations, relevant_items)
        num_users_evaluated += 1

    accum_map /= max(num_users_evaluated, 1)

    return accum_map

### Item-Based SLIM Recommender

In [25]:
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [26]:
%%cython
import numpy as np
import time
from libc.stdlib cimport rand, srand, RAND_MAX

def train_multiple_epochs(URM_train,simil_matrix, loss_value, learning_rate_input):

    URM_train_coo = URM_train.tocoo()
    cdef int n_items = URM_train.shape[1]
    cdef int n_interactions = URM_train.nnz
    cdef int[:] URM_train_row = URM_train_coo.row
    cdef int[:] URM_train_col = URM_train_coo.col
    cdef int[:] URM_train_data = URM_train_coo.data
    cdef int[:] URM_train_indices = URM_train.indices
    cdef int[:] URM_train_indptr = URM_train.indptr

    cdef double[:,:] item_item_S = simil_matrix
    cdef double learning_rate = learning_rate_input
    cdef double loss = loss_value
    cdef long start_time
    cdef double rating, predicted_rating, prediction_error
    cdef int start_profile, end_profile
    cdef int index, sample_num, user_id, item_id, seen_item_id
  
    for sample_num in range(n_interactions):

        # Randomly pick sample
        index = rand() % n_interactions

        user_id = URM_train_row[index]
        item_id = URM_train_col[index]
        rating = URM_train_data[index]

        # Compute prediction
        start_profile = URM_train_indptr[user_id]
        end_profile = URM_train_indptr[user_id+1]
        predicted_rating = 0.0

        for index in range(start_profile, end_profile):
            seen_item_id = URM_train_indices[index]
            predicted_rating += item_item_S[seen_item_id,item_id]

        # Compute prediction error, or gradient
        prediction_error = rating - predicted_rating
        loss += prediction_error**2

        # Update model, in this case the similarity
        for index in range(start_profile, end_profile):
            seen_item_id = URM_train_indices[index]
            item_item_S[seen_item_id,item_id] += prediction_error * learning_rate

    return np.array(item_item_S,dtype=np.float),loss

In [27]:
from Base.Similarity.Compute_Similarity_Python import Compute_Similarity_Python

class ItemKNNSLIMRecommender(object):

    def __init__(self, URM):
        self.URM = URM
        n_items = URM.shape[1]
        self.W_sparse = np.zeros((n_items, n_items), dtype = np.float)

    def fit(self,validation_set, learning_rate, n_epochs, early_stopping = False, max_out = 5):
        max_map = 0
        prev_map = 0
        n_out = 0
        W_best = self.W_sparse.copy()
        for n_epoch in range(n_epochs):
            loss = 0.0
            start_time = time.time()
            self.W_sparse,loss = train_multiple_epochs(self.URM,self.W_sparse, loss, learning_rate)
            sample_num = self.URM.nnz
            elapsed_time = time.time() - start_time
            samples_per_second = sample_num/elapsed_time
            print("Epoch {} complete in in {:.2f} seconds, loss is {:.3E}. Samples per second {:.2f}".format(n_epoch+1, time.time() - start_time, loss/sample_num, samples_per_second))
            map_val = evaluate(self,self.URM,validation_set)
            print("Validation Complete - MAP = {:.5f}".format(map_val))
            if map_val > max_map:
                n_out=0
                max_map = map_val
                W_best = self.W_sparse.copy()
            elif map_val < prev_map:
                n_out+=1
                if early_stopping and n_out>max_out:
                    print(f"Applying Early Stopping - Stop to Epoch={n_epoch+1} and MAP={max_map}")
                    self.W_sparse = W_best
                    break
            prev_map = map_val
        return max_map


    def recommend(self, user_id: int, urm_train: sps.csr_matrix, at: Optional[int] = None, remove_unseen: bool = True):
        # compute the scores using the dot product
        user_profile = urm_train[user_id]
        scores = user_profile.dot(self.W_sparse).ravel()

        if remove_unseen:
            scores = self.filter_seen(user_id, scores)

        # rank items
        ranking = scores.argsort()[::-1]

        return ranking[:at]

    def filter_seen(self, user_id, scores):
        start_pos = urm_train.indptr[user_id]
        end_pos = urm_train.indptr[user_id + 1]

        user_profile = urm_train.indices[start_pos:end_pos]

        scores[user_profile] = -np.inf

        return scores

In [28]:
itemslimknn_recommender = ItemKNNSLIMRecommender(URM=urm_train)

In [29]:
_ = itemslimknn_recommender.fit(validation_set = urm_val,
                                learning_rate = 1e-3,
                                n_epochs = 1,
                                early_stopping = False,
                                max_out = 5)

Epoch 1 complete in in 15.89 seconds, loss is 8.152E-01. Samples per second 5702.00
Validation Complete - MAP = 0.02776


In [30]:
for user_id in range(10):
    print(itemslimknn_recommender.recommend(user_id=user_id,
                                  at=10, 
                                  urm_train=urm_train))

[19704 20108 20711 25582   544  3169  7639  8544  1611  9726]
[23600 19709  3570 12409 18317 20095 14895 19089 22558 24075]
[10676 15835 14359     2  9635 19910  8251 17264 11006 17783]
[25974  8660  8650  8651  8652  8653  8654  8655  8656  8657]
[21552 11384  2839  7124  3169 11658 24908 11452 15327 21464]
[ 8097  5044 19709  7494 15691  2665  1240  7013   627 11900]
[18173 25044  8894  9090 18798 14356 12227 25407 24075   681]
[18527 21270  1217  8808   133 20955  6124 11141 13840 12252]
[22848 17819  8990 12466 18392 21123 12319 11834  1240 17877]
[20856 14383 22788 16830 17898 21291 22522 19996 17350 15320]


### Hyperparameter Tuning

In [31]:
def hyperparameter_tuning(val_set,n_epochs = 200, early_stopping = True, max_out = 10):
    results = []
    values = [1e-3,3e-3,5e-3,3e-2,3e-1]
    for lrate in values:
        print(f"Currently trying learning rate = {lrate}")
        
        recommender = ItemKNNSLIMRecommender(URM=urm_train)
        
        ev_map = recommender.fit(validation_set = val_set,
                                 learning_rate = lrate,
                                 n_epochs = n_epochs,
                                 early_stopping = early_stopping,
                                 max_out = max_out)
        
        results.append((lrate, ev_map))      
    return results

### What is the better learning rate?

In [32]:
hyperparameter_results = hyperparameter_tuning(val_set = urm_val)

Currently trying learning rate = 0.001
Epoch 1 complete in in 19.91 seconds, loss is 8.133E-01. Samples per second 4550.09
Validation Complete - MAP = 0.02896


KeyboardInterrupt: 

In [None]:
hyperparameter_results

### Submission to competition

In [None]:
urm_train_validation = urm_train + urm_val

In [None]:
best_recommender = ItemKNNSLIMRecommender(URM=urm_train)
best_recommender.fit(validation_set = urm_val,
                     learning_rate = "DA_METTERE",
                     n_epochs = 200,
                     early_stopping = True,
                     max_out = 10)

In [None]:
def prepare_submission(urm_train: sps.csr_matrix, recommender: object) -> list:
    """CREA LA LISTA DI TUPLE (USERID,LISTA DI ITEMIDS RACCOMANDATI)"""
    users_to_recommend = pd.read_csv("./data/data_target_users_test.csv",
                                     names=["user_id"],
                                     header=None,
                                     skiprows=1,
                                     dtype={"user_id": np.int32})

    recommendation_length = 10
    submission = []
    for user_id in users_to_recommend.user_id:
        recommendations = recommender.recommend(user_id=user_id,
                                                urm_train=urm_train,
                                                at=recommendation_length)
        submission.append((user_id, recommendations))

    return submission

In [None]:
submission = prepare_submission(urm_train_validation, best_recommender)

In [None]:
submission

In [None]:
from datetime import date
today = date.today().strftime("%d-%m-%y")
dm.write_submission(submission, "ItemKNNSLIMRecommender-"+today)