### Importing of libraries

In [1]:
import os
from typing import Tuple, Callable, Dict, Optional, List

import numpy as np
import pandas as pd
import scipy.sparse as sps

from sklearn.model_selection import train_test_split

### Dataset Loading

In [2]:
from modUtils import dataManager as dm

urm_matrix = dm.load_data()
urm_matrix

Unnamed: 0,user_id,item_id,impl_rating
0,0,10080,1
1,0,19467,1
2,1,2665,1
3,1,7494,1
4,1,17068,1
...,...,...,...
113263,7945,2476,1
113264,7945,12319,1
113265,7945,21384,1
113266,7946,8699,1


In [3]:
icm_all = dm.load_icm()
icm_all

Unnamed: 0,item_id,feature_id,weighted_value
0,0,1185,1.015524
1,0,2507,0.459024
2,0,3534,0.227742
3,0,8766,0.501549
4,0,10862,0.297011
...,...,...,...
490686,25974,12554,0.963016
490687,25974,13003,0.104613
490688,25974,16236,0.118760
490689,25974,18797,0.363301


### Data Preprocessing

In [4]:
urm_matrix = dm.preprocess_data(urm_matrix)

Numero di users: 7947, UserId minimo: 0, UserId massimo: 7946
Numero di items: 24896, ItemId minimo: 0, ItemId massimo: 25974
Sparsity della URM: 0.057 %


In [5]:
urm_matrix

Unnamed: 0,user_id,item_id,impl_rating,mapped_user_id,mapped_item_id
0,0,10080,1,0,0
1,4342,10080,1,4342,0
2,5526,10080,1,5526,0
3,5923,10080,1,5923,0
4,0,19467,1,0,1
...,...,...,...,...,...
113263,7944,22542,1,7944,24891
113264,7944,24806,1,7944,24892
113265,7944,24912,1,7944,24893
113266,7944,24990,1,7944,24894


In [6]:
icm_matrix = dm.preprocess_icm(icm_all)

Numero di items: 25950, ItemId minimo: 0, ItemId massimo: 25974
Numero di features: 19998, FeatureId minimo: 0, FeatureId massimo: 19999
Sparsity della ICM: 0.095 %


In [7]:
icm_matrix

Unnamed: 0,item_id,feature_id,weighted_value,mapped_item_id,mapped_feature_id
0,0,1185,1.015524,0,0
1,640,1185,1.010318,640,0
2,2052,1185,0.963280,2050,0
3,3416,1185,1.077346,3413,0
4,4188,1185,0.963479,4185,0
...,...,...,...,...,...
490686,25944,11329,0.939752,25919,19994
490687,25566,6013,1.065640,25541,19995
490688,25934,6013,0.953319,25909,19995
490689,25634,6416,1.056006,25609,19996


In [8]:
icm_matrix = sps.csr_matrix((icm_matrix.weighted_value, (icm_matrix.item_id, icm_matrix.feature_id)))
icm_matrix

<25975x20000 sparse matrix of type '<class 'numpy.float64'>'
	with 490691 stored elements in Compressed Sparse Row format>

### Dataset Splitting (NO MAPPING)

In [9]:
def dataset_splits(ratings, num_users, num_items, val_perc: float, test_perc: float) -> \
        (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """DIVIDE IL DATASET DEL URM IN TRAINING, VALUATION AND TEST SETS SENZA MAPPING"""
    seed = 9876

    (uid_training, uid_test,
     iid_training, iid_test,
     ratings_training, ratings_test) = train_test_split(ratings.user_id,
                                                        ratings.item_id,
                                                        ratings.impl_rating,
                                                        test_size=test_perc,
                                                        shuffle=True,
                                                        random_state=seed)
    (uid_training, uid_validation,
     iid_training, iid_validation,
     ratings_training, ratings_validation) = train_test_split(uid_training,
                                                              iid_training,
                                                              ratings_training,
                                                              test_size=val_perc)

    urm_train = sps.csr_matrix((ratings_training, (uid_training, iid_training)), shape=(7947,25975))
    urm_val = sps.csr_matrix((ratings_validation, (uid_validation, iid_validation)), shape=(7947,25975))
    urm_test = sps.csr_matrix((ratings_test, (uid_test, iid_test)), shape=(7947,25975))

    return urm_train, urm_val, urm_test

In [10]:
urm_train,urm_val,urm_test = dataset_splits(urm_matrix, 
                                            num_users=7947, 
                                            num_items=24896, 
                                            val_perc=0.1, 
                                            test_perc=0.2)

In [11]:
urm_train

<7947x25975 sparse matrix of type '<class 'numpy.intc'>'
	with 81552 stored elements in Compressed Sparse Row format>

In [12]:
urm_val

<7947x25975 sparse matrix of type '<class 'numpy.intc'>'
	with 9062 stored elements in Compressed Sparse Row format>

In [13]:
urm_test

<7947x25975 sparse matrix of type '<class 'numpy.intc'>'
	with 22654 stored elements in Compressed Sparse Row format>

### ItemKNN Content-Based Filtering Recommender

In [14]:
from Base.Similarity.Compute_Similarity_Python import Compute_Similarity_Python

class ItemKNNCBFRecommender(object):

    def __init__(self, ICM):
        self.ICM = ICM

    def fit(self, topK=50, shrink=100, normalize=False, similarity="cosine"):
        similarity_object = Compute_Similarity_Python(self.ICM.T, shrink=shrink,
                                                      topK=topK, normalize=normalize,
                                                      similarity=similarity)

        self.W_sparse = similarity_object.compute_similarity()

    def recommend(self, user_id: int, urm_train: sps.csr_matrix, at: Optional[int] = None, remove_unseen: bool = True):
        # compute the scores using the dot product
        user_profile = urm_train[user_id]
        scores = user_profile.dot(self.W_sparse).toarray().ravel()

        if remove_unseen:
            scores = self.filter_seen(user_id, scores)

        # rank items
        ranking = scores.argsort()[::-1]

        return ranking[:at]

    def filter_seen(self, user_id, scores):
        start_pos = urm_train.indptr[user_id]
        end_pos = urm_train.indptr[user_id + 1]

        user_profile = urm_train.indices[start_pos:end_pos]

        scores[user_profile] = -np.inf

        return scores

In [15]:
cbfknnUN_recommender = ItemKNNCBFRecommender(ICM=icm_matrix)

In [16]:
%%time

cbfknnUN_recommender.fit()

Similarity column 25975 ( 100 % ), 1432.72 column/sec, elapsed time 0.30 min
Wall time: 19.2 s


In [34]:
for user_id in range(10):
    print(cbfknnUN_recommender.recommend(user_id=user_id,
                                  at=10, 
                                  urm_train=urm_train))

[ 7908 13005 14168   413 17300 23649 24573 13653  9772   541]
[ 6198 16814    26 14053 20310 19434 18829  5664 22461  9142]
[14353  5593  1353 11936  2368  9835  8877 21613 18003  5923]
[16086 12157   519 13592   825 13209  8561 11626 25476  6394]
[ 3162 17336  7704  6397 18390  3363  1671 19601 17097 12756]
[17819 22517  9301 15150 20473 19288  4549  8097 13833 23204]
[  426 25930 11295  6640  3041  3569 21470 17590 11619  3161]
[17332  4314 20620  8104 13840 17963 11300 15897 20104  7949]
[14059  9316 24317  1014 18148 13091  4044  3502 16761 17673]
[ 5741  8211  5045 18270  3699 11024 15782  5014 11580 13648]


### Evaluator

In [35]:
from modUtils import evaluator as ev

In [36]:
%%time

accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped = ev.evaluate(cbfknnUN_recommender,
                                                                                            urm_train,
                                                                                            urm_test)

Wall time: 24.7 s


In [37]:
accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped

(0.01363961387200573, 0.043407766945681714, 0.02022342701431265, 5594, 2353)

### Hyperparameter Tuning

In [44]:
def hyperparameter_tuning(hyperparam: str, **kargs):
    """
    :param hyperparam:      "normalized"   
                            "similarity"    
                            "shrink"       
                            "topK"
                            
    :param kargs:     topK: int, shrink: int, normalize: boolean, similarity: str     
    """
    results = []
    if hyperparam == "normalized":
        values = [False,True]
        for norm in values:
            print(f"Currently trying normalized {norm}")
            recommender = ItemKNNCBFRecommender(ICM=icm_matrix)
            recommender.fit(topK=kargs['topK'], shrink=kargs['shrink'], normalize=norm, similarity=kargs['similarity'])
            _, _, ev_map,_,_  = ev.evaluate(recommender, urm_train, urm_val)
            results.append((norm, ev_map))
    elif hyperparam == "similarity":
        similarities = ["cosine","jaccard", "dice","tversky", "tanimoto"]
        for similarity in similarities:
            print(f"Currently trying similarity {similarity}")
            recommender = ItemKNNCBFRecommender(ICM=icm_matrix)
            recommender.fit(topK=kargs['topK'], shrink=kargs['shrink'], normalize=kargs['normalize'], similarity=similarity)
            _, _, ev_map,_,_ = ev.evaluate(recommender, urm_train, urm_val)
            results.append((similarity, ev_map))
    elif hyperparam == "shrink":
        shrinks = [0,1,5,10,50,100,500,1000]
        for shrink in shrinks:
            print(f"Currently trying shrink {shrink}")
            recommender = ItemKNNCBFRecommender(ICM=icm_matrix)
            recommender.fit(topK=kargs['topK'], shrink=shrink, normalize=kargs['normalize'], similarity=kargs['similarity'])
            _, _, ev_map,_,_  = ev.evaluate(recommender, urm_train, urm_val)
            results.append((shrink,ev_map))        
    elif hyperparam == "topK":
        values = list(range(50,201,10))
        for topK in values:
            print(f"Currently trying topK {topK}")
            recommender = ItemKNNCBFRecommender(ICM=icm_matrix)
            recommender.fit(topK=topK, shrink=kargs['shrink'], normalize=kargs['normalize'], similarity=kargs['similarity'])
            _, _, ev_map,_,_  = ev.evaluate(recommender, urm_train, urm_val)
            results.append((topK,ev_map))   
        
    return results

### Normalize or Not?

In [45]:
%%time

hyperparameter_results = hyperparameter_tuning(hyperparam="normalized",
                                               topK=50,
                                               shrink=100,
                                               normalize=None,
                                               similarity="cosine")

Currently trying normalized False
Similarity column 25975 ( 100 % ), 1499.18 column/sec, elapsed time 0.29 min
Currently trying normalized True
Similarity column 25975 ( 100 % ), 1411.40 column/sec, elapsed time 0.31 min
Wall time: 1min 10s


In [46]:
hyperparameter_results

[(False, 0.0214745533867517), (True, 0.021710597290978843)]

### Which type of similarity?

In [47]:
%%time

hyperparameter_results = hyperparameter_tuning(hyperparam="similarity",
                                               topK=50,
                                               shrink=100,
                                               normalize=True,
                                               similarity=None)

Currently trying similarity cosine
Similarity column 25975 ( 100 % ), 1361.02 column/sec, elapsed time 0.32 min
Currently trying similarity asymmetric
Similarity column 25975 ( 100 % ), 1394.98 column/sec, elapsed time 0.31 min
Currently trying similarity jaccard
Similarity column 25975 ( 100 % ), 1210.13 column/sec, elapsed time 0.36 min
Currently trying similarity dice
Similarity column 25975 ( 100 % ), 1369.61 column/sec, elapsed time 0.32 min
Currently trying similarity tversky
Similarity column 25975 ( 100 % ), 1021.15 column/sec, elapsed time 0.42 min
Currently trying similarity tanimoto
Similarity column 25975 ( 100 % ), 1247.21 column/sec, elapsed time 0.35 min
Wall time: 3min 48s


In [48]:
hyperparameter_results

[('cosine', 0.021710597290978843),
 ('asymmetric', 0.021710597290978843),
 ('jaccard', 0.028561863687013363),
 ('dice', 0.028568689585264463),
 ('tversky', 0.028561863687013363),
 ('tanimoto', 0.028561863687013363)]

### What is the optimal value of shrink?

In [55]:
%%time

hyperparameter_results = hyperparameter_tuning(hyperparam="shrink",
                                               topK=50,
                                               shrink=None,
                                               normalize=True,
                                               similarity="dice")

Currently trying shrink 0
Similarity column 25975 ( 100 % ), 1340.91 column/sec, elapsed time 0.32 min
Currently trying shrink 1
Similarity column 25975 ( 100 % ), 1417.08 column/sec, elapsed time 0.31 min
Currently trying shrink 5
Similarity column 25975 ( 100 % ), 1413.46 column/sec, elapsed time 0.31 min
Currently trying shrink 10
Similarity column 25975 ( 100 % ), 1406.06 column/sec, elapsed time 0.31 min
Currently trying shrink 50
Similarity column 25975 ( 100 % ), 1401.51 column/sec, elapsed time 0.31 min
Currently trying shrink 100
Similarity column 25975 ( 100 % ), 1400.16 column/sec, elapsed time 0.31 min
Currently trying shrink 500
Similarity column 25975 ( 100 % ), 1383.21 column/sec, elapsed time 0.31 min
Currently trying shrink 1000
Similarity column 25975 ( 100 % ), 1384.76 column/sec, elapsed time 0.31 min
Wall time: 4min 47s


In [56]:
hyperparameter_results

[(0, 0.027975149484089433),
 (1, 0.02780364364626841),
 (5, 0.02777522122997794),
 (10, 0.027878135537774237),
 (50, 0.02905615413247087),
 (100, 0.028568689585264463),
 (500, 0.02823105243707349),
 (1000, 0.028179352857113926)]

### What is the optimal value of topK?

In [57]:
%%time

hyperparameter_results = hyperparameter_tuning(hyperparam="topK",
                                               topK=None,
                                               shrink=50,
                                               normalize=True,
                                               similarity="dice")

Currently trying topK 50
Similarity column 25975 ( 100 % ), 1372.64 column/sec, elapsed time 0.32 min
Currently trying topK 60
Similarity column 25975 ( 100 % ), 1405.00 column/sec, elapsed time 0.31 min
Currently trying topK 70
Similarity column 25975 ( 100 % ), 1363.80 column/sec, elapsed time 0.32 min
Currently trying topK 80
Similarity column 25975 ( 100 % ), 1380.57 column/sec, elapsed time 0.31 min
Currently trying topK 90
Similarity column 25975 ( 100 % ), 1327.99 column/sec, elapsed time 0.33 min
Currently trying topK 100
Similarity column 25975 ( 100 % ), 1337.33 column/sec, elapsed time 0.32 min
Currently trying topK 110
Similarity column 25975 ( 100 % ), 1335.81 column/sec, elapsed time 0.32 min
Currently trying topK 120
Similarity column 25975 ( 100 % ), 1307.39 column/sec, elapsed time 0.33 min
Currently trying topK 130
Similarity column 25975 ( 100 % ), 1357.40 column/sec, elapsed time 0.32 min
Currently trying topK 140
Similarity column 25975 ( 100 % ), 1252.20 column/se

In [58]:
hyperparameter_results

[(50, 0.02905615413247087),
 (60, 0.028700917493640215),
 (70, 0.02813595297432908),
 (80, 0.028193692305912094),
 (90, 0.028035139204518462),
 (100, 0.028301011221672705),
 (110, 0.027816253718849795),
 (120, 0.027569108194783493),
 (130, 0.027535258947452387),
 (140, 0.027333058048564177),
 (150, 0.027366299498050837),
 (160, 0.027370060061567898),
 (170, 0.027358091967119132),
 (180, 0.027768083404311236),
 (190, 0.027593743271562932),
 (200, 0.02718883707777099)]

### Submission to competition

In [60]:
best= {}

urm_train_validation = urm_train + urm_val + urm_test

In [64]:
best_recommender = ItemKNNCBFRecommender(ICM=icm_matrix)
best_recommender.fit(normalize=True,
                    topK=50,
                    shrink=50,
                    similarity="dice")

Similarity column 25975 ( 100 % ), 1398.51 column/sec, elapsed time 0.31 min


In [67]:
def prepare_submission(urm_train: sps.csr_matrix, recommender: object) -> list:
    """CREA LA LISTA DI TUPLE (USERID,LISTA DI ITEMIDS RACCOMANDATI)"""
    users_to_recommend = pd.read_csv("./data/data_target_users_test.csv",
                                     names=["user_id"],
                                     header=None,
                                     skiprows=1,
                                     dtype={"user_id": np.int32})

    recommendation_length = 10
    submission = []
    for user_id in users_to_recommend.user_id:
        recommendations = recommender.recommend(user_id=user_id,
                                                urm_train=urm_train,
                                                at=recommendation_length)
        submission.append((user_id, recommendations))

    return submission

In [68]:
submission = prepare_submission(urm_train_validation, best_recommender)

In [69]:
submission

[(0,
  array([ 4518, 24497, 13219, 21339, 20699, 18857, 11252,  6856,  8480,
         25878], dtype=int64)),
 (1,
  array([ 6198, 12119, 18554, 10829,  7640, 21225,  3856, 12045,  3165,
         17059], dtype=int64)),
 (2,
  array([14353,  5566, 12684, 11563,  1291, 14275, 13782, 17514,  7849,
         11764], dtype=int64)),
 (3,
  array([10689,  9479, 12860,  7471, 10012, 23128, 16086,  1651,  2373,
         12157], dtype=int64)),
 (4,
  array([14715, 13427, 13480,  7281,  1428, 12587,  5191, 21101, 20785,
         17996], dtype=int64)),
 (5,
  array([15150, 22517,  9039,  9301, 16172, 19152, 18882, 18109, 18495,
         19797], dtype=int64)),
 (6,
  array([ 5808, 13776, 24302, 20999, 20237, 11281,  2786,  7342,  4166,
         14020], dtype=int64)),
 (7,
  array([17332, 13840,  8499,  7646,  3850, 12181, 21019,  9314,  3903,
          4314], dtype=int64)),
 (8,
  array([25138,  1014, 17967, 20181,  2293,  1249, 23857, 14059,  2993,
         13017], dtype=int64)),
 (9,
  array([ 3699

In [70]:
from datetime import date
today = date.today().strftime("%d-%m-%y")
dm.write_submission(submission, "ItemKNNCBFRecommender-"+today)