### Importing of libraries

In [1]:
import os
from typing import Tuple, Callable, Dict, Optional, List

import numpy as np
import pandas as pd
import scipy.sparse as sps

from sklearn.model_selection import train_test_split

### Dataset Loading

In [2]:
from modUtils import dataManager as dm

urm_matrix = dm.load_data()
urm_matrix

Unnamed: 0,user_id,item_id,impl_rating
0,0,10080,1
1,0,19467,1
2,1,2665,1
3,1,7494,1
4,1,17068,1
...,...,...,...
113263,7945,2476,1
113264,7945,12319,1
113265,7945,21384,1
113266,7946,8699,1


### Data Preprocessing

In [3]:
urm_matrix = dm.preprocess_data(urm_matrix)

Numero di users: 7947, UserId minimo: 0, UserId massimo: 7946
Numero di items: 24896, ItemId minimo: 0, ItemId massimo: 25974
Sparsity della URM: 0.057 %


In [4]:
urm_matrix

Unnamed: 0,user_id,item_id,impl_rating,mapped_user_id,mapped_item_id
0,0,10080,1,0,0
1,4342,10080,1,4342,0
2,5526,10080,1,5526,0
3,5923,10080,1,5923,0
4,0,19467,1,0,1
...,...,...,...,...,...
113263,7944,22542,1,7944,24891
113264,7944,24806,1,7944,24892
113265,7944,24912,1,7944,24893
113266,7944,24990,1,7944,24894


### Dataset Splitting (NO MAPPING)

In [5]:
def dataset_splits(ratings, num_users, num_items, val_perc: float) -> \
        (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """DIVIDE IL DATASET DEL URM IN TRAINING, VALUATION AND TEST SETS SENZA MAPPING"""
    seed = 9876

    (uid_training, uid_val,
     iid_training, iid_val,
     ratings_training, ratings_val) = train_test_split(ratings.user_id,
                                                        ratings.item_id,
                                                        ratings.impl_rating,
                                                        test_size=val_perc,
                                                        shuffle=True,
                                                        random_state=seed)

    urm_train = sps.csr_matrix((ratings_training, (uid_training, iid_training)), shape=(7947,25975))
    urm_val = sps.csr_matrix((ratings_val, (uid_val, iid_val)), shape=(7947,25975))

    return urm_train, urm_val

In [6]:
urm_train,urm_val = dataset_splits(urm_matrix, 
                                    num_users=7947, 
                                    num_items=24896, 
                                    val_perc=0.2)

In [7]:
urm_train

<7947x25975 sparse matrix of type '<class 'numpy.intc'>'
	with 90614 stored elements in Compressed Sparse Row format>

In [8]:
urm_val

<7947x25975 sparse matrix of type '<class 'numpy.intc'>'
	with 22654 stored elements in Compressed Sparse Row format>

### Item-Based Collaborative Filtering Recommender

In [9]:
from Base.Similarity.Compute_Similarity_Python import Compute_Similarity_Python

class ItemKNNCFRecommender(object):

    def __init__(self, URM):
        self.URM = URM

    def fit(self, topK=50, shrink=100, normalize=False, similarity="cosine"):
        similarity_object = Compute_Similarity_Python(self.URM, shrink=shrink, 
                                                  topK=topK, normalize=normalize, 
                                                  similarity = similarity)
        
        self.W_sparse = similarity_object.compute_similarity()

    def recommend(self, user_id: int, urm_train: sps.csr_matrix, at: Optional[int] = None, remove_unseen: bool = True):
        # compute the scores using the dot product
        user_profile = urm_train[user_id]
        scores = user_profile.dot(self.W_sparse).toarray().ravel()

        if remove_unseen:
            scores = self.filter_seen(user_id, scores)

        # rank items
        ranking = scores.argsort()[::-1]

        return ranking[:at]

    def filter_seen(self, user_id, scores):
        start_pos = urm_train.indptr[user_id]
        end_pos = urm_train.indptr[user_id + 1]

        user_profile = urm_train.indices[start_pos:end_pos]

        scores[user_profile] = -np.inf

        return scores

In [10]:
itemcfknn_recommender = ItemKNNCFRecommender(URM=urm_train)

In [11]:
%%time

itemcfknn_recommender.fit()

Similarity column 25975 ( 100 % ), 2662.76 column/sec, elapsed time 0.16 min
Wall time: 10.6 s


In [12]:
for user_id in range(10):
    print(itemcfknn_recommender.recommend(user_id=user_id,
                                  at=10, 
                                  urm_train=urm_train))

[ 8887 21102 24902   649 11310  8486 22469 19927 13560 23885]
[19089 23600 12409 19709  8894  3165 20095 16630 18317  9438]
[22230 14031 11548 17912 17336 17815  6696 14359 12352  9549]
[24093  5412 18569  2442 11341 16084  9087  2074 15550 22277]
[ 9007  1611 19704 11796  9243  6812  1051 18692 21365 21150]
[ 8097  7494 19709 19089 22445 23600 11535  8894 19480  5044]
[23906 14356  1848 25044 18173  1136 13102  4824  6120  9989]
[ 1532  4209  1467 11141 12556  7224 17305 15214 22772  5976]
[ 1583  2426 10594  6878 23154 17156  6734  2551 21123 18392]
[23600 15562  9609   916 20988  2548 19524  8785 10273 20856]


### Evaluator

In [13]:
from modUtils import evaluator as ev

In [14]:
%%time

accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped = ev.evaluate(itemcfknn_recommender,
                                                                                            urm_train,
                                                                                            urm_val)

Wall time: 21 s


In [15]:
accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped

(0.029191991419377377, 0.0960005574332159, 0.047424180172075434, 5594, 2353)

### Hyperparameter Tuning

In [16]:
def hyperparameter_tuning(hyperparam: str, **kargs):
    """
    :param hyperparam:      "normalized"   
                            "similarity"    
                            "shrink"       
                            "topK"
                            
    :param kargs:     topK: int, shrink: int, normalize: boolean, similarity: str     
    """
    results = []
    if hyperparam == "normalized":
        values = [False,True]
        for norm in values:
            print(f"Currently trying normalized {norm}")
            recommender = ItemKNNCFRecommender(URM=urm_train)
            recommender.fit(topK=kargs['topK'], shrink=kargs['shrink'], normalize=norm, similarity=kargs['similarity'])
            _, _, ev_map,_,_  = ev.evaluate(recommender, urm_train, urm_val)
            results.append((norm, ev_map))
    elif hyperparam == "similarity":
        similarities = ["cosine","jaccard", "dice","tversky", "tanimoto"]
        for similarity in similarities:
            print(f"Currently trying similarity {similarity}")
            recommender = ItemKNNCFRecommender(URM=urm_train)
            recommender.fit(topK=kargs['topK'], shrink=kargs['shrink'], normalize=kargs['normalize'], similarity=similarity)
            _, _, ev_map,_,_ = ev.evaluate(recommender, urm_train, urm_val)
            results.append((similarity, ev_map))
    elif hyperparam == "shrink":
        shrinks = range(65,76)
        for shrink in shrinks:
            print(f"Currently trying shrink {shrink}")
            recommender = ItemKNNCFRecommender(URM=urm_train)
            recommender.fit(topK=kargs['topK'], shrink=shrink, normalize=kargs['normalize'], similarity=kargs['similarity'])
            _, _, ev_map,_,_  = ev.evaluate(recommender, urm_train, urm_val)
            results.append((shrink,ev_map))        
    elif hyperparam == "topK":
        values = list(range(50,201,10))
        for topK in values:
            print(f"Currently trying topK {topK}")
            recommender = ItemKNNCFRecommender(URM=urm_train)
            recommender.fit(topK=topK, shrink=kargs['shrink'], normalize=kargs['normalize'], similarity=kargs['similarity'])
            _, _, ev_map,_,_  = ev.evaluate(recommender, urm_train, urm_val)
            results.append((topK,ev_map))   
        
    return results

### Normalize or Not?

In [17]:
%%time

hyperparameter_results = hyperparameter_tuning(hyperparam="normalized",
                                               topK=50,
                                               shrink=100,
                                               normalize=None,
                                               similarity="cosine")

Currently trying normalized False
Similarity column 25975 ( 100 % ), 2504.52 column/sec, elapsed time 0.17 min
Currently trying normalized True
Similarity column 25975 ( 100 % ), 2551.12 column/sec, elapsed time 0.17 min
Wall time: 1min 2s


In [18]:
hyperparameter_results

[(False, 0.047424180172075434), (True, 0.049382332113250105)]

### Which type of similarity?

In [19]:
%%time

hyperparameter_results = hyperparameter_tuning(hyperparam="similarity",
                                               topK=50,
                                               shrink=100,
                                               normalize=True,
                                               similarity=None)

Currently trying similarity cosine
Similarity column 25975 ( 100 % ), 2418.02 column/sec, elapsed time 0.18 min
Currently trying similarity jaccard
Similarity column 25975 ( 100 % ), 2131.30 column/sec, elapsed time 0.20 min
Currently trying similarity dice
Similarity column 25975 ( 100 % ), 2559.15 column/sec, elapsed time 0.17 min
Currently trying similarity tversky
Similarity column 25975 ( 100 % ), 1512.69 column/sec, elapsed time 0.29 min
Currently trying similarity tanimoto
Similarity column 25975 ( 100 % ), 2088.74 column/sec, elapsed time 0.21 min
Wall time: 2min 47s


In [20]:
hyperparameter_results

[('cosine', 0.049382332113250105),
 ('jaccard', 0.04921968948797221),
 ('dice', 0.04921044174764135),
 ('tversky', 0.04921968948797221),
 ('tanimoto', 0.04921968948797221)]

### What is the optimal value of shrink?

In [21]:
%%time

hyperparameter_results = hyperparameter_tuning(hyperparam="shrink",
                                               topK=50,
                                               shrink=None,
                                               normalize=True,
                                               similarity="cosine")

Currently trying shrink 65
Similarity column 25975 ( 100 % ), 2428.15 column/sec, elapsed time 0.18 min
Currently trying shrink 66
Similarity column 25975 ( 100 % ), 2408.40 column/sec, elapsed time 0.18 min
Currently trying shrink 67
Similarity column 25975 ( 100 % ), 2422.24 column/sec, elapsed time 0.18 min
Currently trying shrink 68
Similarity column 25975 ( 100 % ), 2333.90 column/sec, elapsed time 0.19 min
Currently trying shrink 69
Similarity column 25975 ( 100 % ), 2430.12 column/sec, elapsed time 0.18 min
Currently trying shrink 70
Similarity column 25975 ( 100 % ), 2515.64 column/sec, elapsed time 0.17 min
Currently trying shrink 71
Similarity column 25975 ( 100 % ), 2348.68 column/sec, elapsed time 0.18 min
Currently trying shrink 72
Similarity column 25975 ( 100 % ), 2397.76 column/sec, elapsed time 0.18 min
Currently trying shrink 73
Similarity column 25975 ( 100 % ), 2286.61 column/sec, elapsed time 0.19 min
Currently trying shrink 74
Similarity column 25975 ( 100 % ), 22

In [22]:
hyperparameter_results

[(65, 0.04983249929760425),
 (66, 0.04962338525566338),
 (67, 0.049877351280161575),
 (68, 0.049767507628390585),
 (69, 0.04974886520538633),
 (70, 0.04968555962918003),
 (71, 0.049692423975972774),
 (72, 0.04972217371998908),
 (73, 0.0498023403388685),
 (74, 0.049694085888238294),
 (75, 0.049731360093594364)]

### What is the optimal value of topK?

In [23]:
%%time

hyperparameter_results = hyperparameter_tuning(hyperparam="topK",
                                               topK=None,
                                               shrink=67,
                                               normalize=True,
                                               similarity="cosine")

Currently trying topK 50
Similarity column 25975 ( 100 % ), 2456.79 column/sec, elapsed time 0.18 min
Currently trying topK 60
Similarity column 25975 ( 100 % ), 2527.12 column/sec, elapsed time 0.17 min
Currently trying topK 70
Similarity column 25975 ( 100 % ), 2512.25 column/sec, elapsed time 0.17 min
Currently trying topK 80
Similarity column 25975 ( 100 % ), 2495.41 column/sec, elapsed time 0.17 min
Currently trying topK 90
Similarity column 25975 ( 100 % ), 2461.90 column/sec, elapsed time 0.18 min
Currently trying topK 100
Similarity column 25975 ( 100 % ), 2450.32 column/sec, elapsed time 0.18 min
Currently trying topK 110
Similarity column 25975 ( 100 % ), 2439.07 column/sec, elapsed time 0.18 min
Currently trying topK 120
Similarity column 25975 ( 100 % ), 2420.94 column/sec, elapsed time 0.18 min
Currently trying topK 130
Similarity column 25975 ( 100 % ), 2420.71 column/sec, elapsed time 0.18 min
Currently trying topK 140
Similarity column 25975 ( 100 % ), 2388.09 column/se

In [24]:
hyperparameter_results

[(50, 0.049877351280161575),
 (60, 0.04999342396421746),
 (70, 0.050233507934616235),
 (80, 0.05047950765735117),
 (90, 0.050352268171599414),
 (100, 0.05082419546196098),
 (110, 0.0509450104708523),
 (120, 0.050863538570461955),
 (130, 0.05083768077173525),
 (140, 0.050697854407590066),
 (150, 0.050771889843146976),
 (160, 0.05093109306051096),
 (170, 0.051320385410285024),
 (180, 0.05074233706379957),
 (190, 0.05112974435725629),
 (200, 0.05081828187716502)]

### Submission to competition

In [25]:
urm_train_validation = urm_train + urm_val

In [26]:
best_recommender = ItemKNNCFRecommender(URM=urm_train_validation)
best_recommender.fit(normalize=True,
                    topK=170,
                    shrink=67,
                    similarity="cosine")

Similarity column 25975 ( 100 % ), 2253.18 column/sec, elapsed time 0.19 min


In [27]:
def prepare_submission(urm_train: sps.csr_matrix, recommender: object) -> list:
    """CREA LA LISTA DI TUPLE (USERID,LISTA DI ITEMIDS RACCOMANDATI)"""
    users_to_recommend = pd.read_csv("./data/data_target_users_test.csv",
                                     names=["user_id"],
                                     header=None,
                                     skiprows=1,
                                     dtype={"user_id": np.int32})

    recommendation_length = 10
    submission = []
    for user_id in users_to_recommend.user_id:
        recommendations = recommender.recommend(user_id=user_id,
                                                urm_train=urm_train,
                                                at=recommendation_length)
        submission.append((user_id, recommendations))

    return submission

In [28]:
submission = prepare_submission(urm_train_validation, best_recommender)

In [29]:
submission

[(0,
  array([ 5085, 25878,   637,  1447, 22850,  5209, 13657,  8486,   909,
         18390], dtype=int64)),
 (1,
  array([19089, 12409, 23600, 19709,  8894,  3165, 16630,  8431, 20095,
         19480], dtype=int64)),
 (2,
  array([ 3164,  1876, 15427, 14031, 21626, 19515, 11889,  4842,  8374,
         19473], dtype=int64)),
 (3,
  array([25892, 22487, 12473,  4175,    57,  5412, 18569, 18374, 17820,
         24093], dtype=int64)),
 (4,
  array([ 9007,  9243, 11792,  1611, 18692, 11384, 11067, 19704, 19781,
         11796], dtype=int64)),
 (5,
  array([ 8097,  7494, 12214,  4257, 15691, 19874, 17819, 11535, 19709,
         25407], dtype=int64)),
 (6,
  array([24383, 25044, 13102, 23906,  4824, 14356, 15130,  1848, 18173,
          1136], dtype=int64)),
 (7,
  array([14846, 24783,  4102, 21950,  4209, 22862, 16257,  1532, 20955,
         23155], dtype=int64)),
 (8,
  array([ 2426, 25407, 15691,  1583, 10834, 21123,  6878, 23154, 18392,
         16928], dtype=int64)),
 (9,
  array([ 8059

In [None]:
from datetime import date
today = date.today().strftime("%d-%m-%y")
dm.write_submission(submission, "ItemKNNCFRecommender-"+today)

### User-Based Collaborative Filtering Recommender

In [30]:
from Base.Similarity.Compute_Similarity_Python import Compute_Similarity_Python

class UserKNNCFRecommender(object):

    def __init__(self, URM: sps.csr_matrix):
        self.URM = URM
        
    def use_idf(self):
        """Considero gli items come features degli utenti ed eseguo una feature selection con weigths usando il metodo TF-IDF"""
        num_tot_users = self.URM.shape[0]
        users_per_item = (self.URM > 0).sum(axis=0) + 1e-6
        IDF = np.array(np.log(num_tot_users / users_per_item))[0]
        col_nnz = np.diff(sps.csc_matrix(self.URM).indptr)
        self.URM.data = (self.URM.data*np.repeat(IDF, col_nnz))     

    def fit(self, topK=50, shrink=100, normalize=True, similarity="cosine", idf=False):
        if (idf) : self.use_idf()
        similarity_object = Compute_Similarity_Python(self.URM.T, shrink=shrink, 
                                                  topK=topK, normalize=normalize, 
                                                  similarity = similarity)
        
        self.W_sparse = similarity_object.compute_similarity()

    def recommend(self, user_id: int, urm_train: sps.csr_matrix, at: Optional[int] = None, remove_unseen: bool = True):
        # compute the scores using the dot product
        scores = self.W_sparse[user_id, :].dot(self.URM).toarray().ravel()

        if remove_unseen:
            scores = self.filter_seen(user_id, scores)

        # rank items
        ranking = scores.argsort()[::-1]

        return ranking[:at]

    def filter_seen(self, user_id, scores):
        start_pos = urm_train.indptr[user_id]
        end_pos = urm_train.indptr[user_id + 1]

        user_profile = urm_train.indices[start_pos:end_pos]

        scores[user_profile] = -np.inf

        return scores

In [31]:
usercfknn_recommender = UserKNNCFRecommender(URM=urm_train)
# usercfknn_recommender.use_idf()

In [32]:
%%time

usercfknn_recommender.fit()

Similarity column 7947 ( 100 % ), 2817.63 column/sec, elapsed time 0.05 min
Wall time: 3.05 s


In [33]:
for user_id in range(10):
    print(usercfknn_recommender.recommend(user_id=user_id,
                                  at=10, 
                                  urm_train=urm_train))

[25974  8660  8650  8651  8652  8653  8654  8655  8656  8657]
[19089 23600 20095  3165 20982 24128 19525 19709  5044  6076]
[ 4842 13251 22230  9549  5396 17336 11548  6242 17959  8653]
[ 5412 24093 18569 24075  3942 22487 20905  2074 21552  9591]
[10997 14271  1611 18632  4927 18692  9243 19843  4437 21142]
[11535  8322  1240 12978 14706  8097 22445  7013  6620 12214]
[23906 25044 14354  4824  1288  4774  7049 14020  1246 24383]
[17305 23267 11159 17322  4747  4911  4910 18597 23885  4521]
[18984 10594  2426  8272  5966   823 22848 25511 13711 25407]
[ 9609 22619 11550 13132 23324 18926  1470 21671  3891  4437]


### Evaluator

In [34]:
from modUtils import evaluator as ev

In [35]:
%%time

accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped = ev.evaluate(usercfknn_recommender,
                                                                                            urm_train,
                                                                                            urm_val)

Wall time: 6.13 s


In [36]:
accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped

(0.03146228101537306, 0.09876424396954941, 0.050394297256634815, 5594, 2353)

### Hyperparameter Tuning

In [37]:
def hyperparameter_tuning(hyperparam: str, **kargs):
    """
    :param hyperparam:      "normalized"   
                            "idf"
                            "similarity"    
                            "shrink"       
                            "topK"
                            
    :param kargs:     topK: int, shrink: int, normalize: boolean, similarity: str, idf: bool    
    """
    results = []
    if hyperparam == "normalized":
        values = [False,True]
        for norm in values:
            print(f"Currently trying normalized {norm}")
            recommender = UserKNNCFRecommender(URM=urm_train)
            recommender.fit(topK=kargs['topK'], shrink=kargs['shrink'], normalize=norm, similarity=kargs['similarity'],
                            idf=kargs['idf'])
            _, _, ev_map,_,_  = ev.evaluate(recommender, urm_train, urm_val)
            results.append((norm, ev_map))
    elif hyperparam == "idf":
        values = [False,True]
        for idf in values:
            print(f"Currently trying idf = {idf}")
            recommender = UserKNNCFRecommender(URM=urm_train)
            recommender.fit(topK=kargs['topK'], shrink=kargs['shrink'], normalize=kargs['normalize'], 
                            similarity=kargs['similarity'],
                            idf=idf)
            _, _, ev_map,_,_  = ev.evaluate(recommender, urm_train, urm_val)
            results.append((idf, ev_map))
    elif hyperparam == "similarity":
        similarities = ["cosine","jaccard", "dice","tversky", "tanimoto"]
        for similarity in similarities:
            print(f"Currently trying similarity {similarity}")
            recommender = UserKNNCFRecommender(URM=urm_train)
            recommender.fit(topK=kargs['topK'], shrink=kargs['shrink'], normalize=kargs['normalize'], similarity=similarity,
                            idf=kargs['idf'])
            _, _, ev_map,_,_ = ev.evaluate(recommender, urm_train, urm_val)
            results.append((similarity, ev_map))
    elif hyperparam == "shrink":
        shrinks = range(1,25)
        for shrink in shrinks:
            print(f"Currently trying shrink {shrink}")
            recommender = UserKNNCFRecommender(URM=urm_train)
            recommender.fit(topK=kargs['topK'], shrink=shrink, normalize=kargs['normalize'], similarity=kargs['similarity'],
                            idf=kargs['idf'])
            _, _, ev_map,_,_  = ev.evaluate(recommender, urm_train, urm_val)
            results.append((shrink,ev_map))        
    elif hyperparam == "topK":
        values = list(range(190,301,10))
        for topK in values:
            print(f"Currently trying topK {topK}")
            recommender = UserKNNCFRecommender(URM=urm_train)
            recommender.fit(topK=topK, shrink=kargs['shrink'], normalize=kargs['normalize'], similarity=kargs['similarity'],
                            idf=kargs['idf'])
            _, _, ev_map,_,_  = ev.evaluate(recommender, urm_train, urm_val)
            results.append((topK,ev_map))   
        
    return results

### Normalize or Not?

In [38]:
%%time

hyperparameter_results = hyperparameter_tuning(hyperparam="normalized",
                                               topK=50,
                                               shrink=100,
                                               normalize=None,
                                               similarity="cosine",
                                               idf=False)

Currently trying normalized False
Similarity column 7947 ( 100 % ), 3105.36 column/sec, elapsed time 0.04 min
Currently trying normalized True
Similarity column 7947 ( 100 % ), 2900.71 column/sec, elapsed time 0.05 min
Wall time: 17.9 s


In [39]:
hyperparameter_results

[(False, 0.04060372455037436), (True, 0.050394297256634815)]

### Which type of similarity?

In [40]:
%%time

hyperparameter_results = hyperparameter_tuning(hyperparam="similarity",
                                               topK=50,
                                               shrink=100,
                                               normalize=True,
                                               similarity=None,
                                               idf=False)

Currently trying similarity cosine
Similarity column 7947 ( 100 % ), 2766.64 column/sec, elapsed time 0.05 min
Currently trying similarity jaccard
Similarity column 7947 ( 100 % ), 2649.88 column/sec, elapsed time 0.05 min
Currently trying similarity dice
Similarity column 7947 ( 100 % ), 2882.87 column/sec, elapsed time 0.05 min
Currently trying similarity tversky
Similarity column 7947 ( 100 % ), 2344.27 column/sec, elapsed time 0.06 min
Currently trying similarity tanimoto
Similarity column 7947 ( 100 % ), 2722.33 column/sec, elapsed time 0.05 min
Wall time: 46.8 s


In [41]:
hyperparameter_results

[('cosine', 0.050394297256634815),
 ('jaccard', 0.05193837617609042),
 ('dice', 0.05186638546293662),
 ('tversky', 0.05193837617609042),
 ('tanimoto', 0.05193837617609042)]

### What is the optimal value of shrink?

In [42]:
%%time

hyperparameter_results = hyperparameter_tuning(hyperparam="shrink",
                                               topK=50,
                                               shrink=None,
                                               normalize=True,
                                               similarity="dice",
                                               idf=False)

Currently trying shrink 1
Similarity column 7947 ( 100 % ), 2880.82 column/sec, elapsed time 0.05 min
Currently trying shrink 2
Similarity column 7947 ( 100 % ), 2758.11 column/sec, elapsed time 0.05 min
Currently trying shrink 3
Similarity column 7947 ( 100 % ), 2869.38 column/sec, elapsed time 0.05 min
Currently trying shrink 4
Similarity column 7947 ( 100 % ), 2870.41 column/sec, elapsed time 0.05 min
Currently trying shrink 5
Similarity column 7947 ( 100 % ), 2771.56 column/sec, elapsed time 0.05 min
Currently trying shrink 6
Similarity column 7947 ( 100 % ), 2856.83 column/sec, elapsed time 0.05 min
Currently trying shrink 7
Similarity column 7947 ( 100 % ), 2873.47 column/sec, elapsed time 0.05 min
Currently trying shrink 8
Similarity column 7947 ( 100 % ), 2892.28 column/sec, elapsed time 0.05 min
Currently trying shrink 9
Similarity column 7947 ( 100 % ), 2844.76 column/sec, elapsed time 0.05 min
Currently trying shrink 10
Similarity column 7947 ( 100 % ), 2884.92 column/sec, e

In [43]:
hyperparameter_results

[(1, 0.05392288293016847),
 (2, 0.053900673749219925),
 (3, 0.05389349305414229),
 (4, 0.05378020340738675),
 (5, 0.054150098776782035),
 (6, 0.05412673325530627),
 (7, 0.053897967786274883),
 (8, 0.053605671795393446),
 (9, 0.0534526024918898),
 (10, 0.05376373989723532),
 (11, 0.05383971393017008),
 (12, 0.05393062783317215),
 (13, 0.05368843630985139),
 (14, 0.05335651996235924),
 (15, 0.05350068289905493),
 (16, 0.053418703315685656),
 (17, 0.0534422732896968),
 (18, 0.053531052728497334),
 (19, 0.05343695251309898),
 (20, 0.05345833929547646),
 (21, 0.05388173755412976),
 (22, 0.05392528034234371),
 (23, 0.05362522987750143),
 (24, 0.053437670644536396)]

### What is the optimal value of topK?

In [44]:
%%time

hyperparameter_results = hyperparameter_tuning(hyperparam="topK",
                                               topK=None,
                                               shrink=7,
                                               normalize=True,
                                               similarity="dice",
                                               idf=False)

Currently trying topK 190
Similarity column 7947 ( 100 % ), 2674.85 column/sec, elapsed time 0.05 min
Currently trying topK 200
Similarity column 7947 ( 100 % ), 2630.65 column/sec, elapsed time 0.05 min
Currently trying topK 210
Similarity column 7947 ( 100 % ), 2742.03 column/sec, elapsed time 0.05 min
Currently trying topK 220
Similarity column 7947 ( 100 % ), 2836.65 column/sec, elapsed time 0.05 min
Currently trying topK 230
Similarity column 7947 ( 100 % ), 2841.56 column/sec, elapsed time 0.05 min
Currently trying topK 240
Similarity column 7947 ( 100 % ), 2759.09 column/sec, elapsed time 0.05 min
Currently trying topK 250
Similarity column 7947 ( 100 % ), 2770.63 column/sec, elapsed time 0.05 min
Currently trying topK 260
Similarity column 7947 ( 100 % ), 2754.32 column/sec, elapsed time 0.05 min
Currently trying topK 270
Similarity column 7947 ( 100 % ), 2726.05 column/sec, elapsed time 0.05 min
Currently trying topK 280
Similarity column 7947 ( 100 % ), 2705.72 column/sec, el

In [45]:
hyperparameter_results

[(190, 0.056047726884156636),
 (200, 0.05629240505475352),
 (210, 0.05583320238011917),
 (220, 0.05579680003266285),
 (230, 0.05549742448920138),
 (240, 0.055299089202942416),
 (250, 0.05527648217982363),
 (260, 0.05520658243931592),
 (270, 0.05511175826545612),
 (280, 0.055211064152617166),
 (290, 0.055137211128244275),
 (300, 0.055018001225172586)]

### Is it better using idf or not?

In [46]:
%%time

hyperparameter_results = hyperparameter_tuning(hyperparam="idf",
                                               topK=280,
                                               shrink=7,
                                               normalize=True,
                                               similarity="dice",
                                               idf=None)

Currently trying idf = False
Similarity column 7947 ( 100 % ), 2697.59 column/sec, elapsed time 0.05 min
Currently trying idf = True
Similarity column 7947 ( 100 % ), 2762.92 column/sec, elapsed time 0.05 min
Wall time: 20.2 s


In [47]:
hyperparameter_results

[(False, 0.055211064152617166), (True, 0.055119550206899334)]

### Submission to competition

In [48]:
best= {}

urm_train_validation = urm_train + urm_val

In [49]:
best_recommender = UserKNNCFRecommender(URM=urm_train_validation)
best_recommender.fit(normalize=True,
                    topK=280,
                    shrink=7,
                    similarity="dice",
                    idf=False)

Similarity column 7947 ( 100 % ), 2602.28 column/sec, elapsed time 0.05 min


In [50]:
def prepare_submission(urm_train: sps.csr_matrix, recommender: object) -> list:
    """CREA LA LISTA DI TUPLE (USERID,LISTA DI ITEMIDS RACCOMANDATI)"""
    users_to_recommend = pd.read_csv("./data/data_target_users_test.csv",
                                     names=["user_id"],
                                     header=None,
                                     skiprows=1,
                                     dtype={"user_id": np.int32})

    recommendation_length = 10
    submission = []
    for user_id in users_to_recommend.user_id:
        recommendations = recommender.recommend(user_id=user_id,
                                                urm_train=urm_train,
                                                at=recommendation_length)
        submission.append((user_id, recommendations))

    return submission

In [51]:
submission = prepare_submission(urm_train_validation, best_recommender)

In [52]:
submission

[(0,
  array([ 9851,  1447,  8267, 25903,  7500, 11310,  8572,  9050, 20108,
          6140], dtype=int64)),
 (1,
  array([19089, 23600, 20095, 12409, 19709, 16630,  3165, 18317, 19525,
         24075], dtype=int64)),
 (2,
  array([11548,  9549, 13251,  5396,  4842,  6242, 17336, 22230, 17959,
           991], dtype=int64)),
 (3,
  array([12473, 18736, 11679, 11681, 22487, 24075,  3942, 14446, 17578,
         19572], dtype=int64)),
 (4,
  array([ 9243, 10997,  4927,  1611, 14271, 18692, 18632,  9578, 10979,
         13427], dtype=int64)),
 (5,
  array([ 5581,  8097,  7583,  9658,  7494, 20345, 25407,  4257, 20095,
         19874], dtype=int64)),
 (6,
  array([25044, 23906, 24383,  1282, 14356,  5044,  4824, 15137,  3397,
          8894], dtype=int64)),
 (7,
  array([ 4209,  1467, 17305, 14266,  1124, 19189,  5214,  6752,  7646,
          7238], dtype=int64)),
 (8,
  array([25407,  1583, 10594, 23154,  2426,  6734, 10834, 13711, 16928,
          6878], dtype=int64)),
 (9,
  array([ 8059

In [None]:
from datetime import date
today = date.today().strftime("%d-%m-%y")
dm.write_submission(submission, "UserKNNCFRecommender-"+today)