### Importing libraries

In [1]:
import os
from typing import Tuple, Callable, Dict, Optional, List

import numpy as np
import pandas as pd
import scipy.sparse as sps

from sklearn.model_selection import train_test_split

### Dataset Loading

In [2]:
from modUtils import dataManager as dm

# Load the raw interactions table through the project's data manager.
# The frame displayed in the next cell has columns user_id, item_id and impl_rating.
impl_ratings = dm.load_data()

In [3]:
impl_ratings

Unnamed: 0,user_id,item_id,impl_rating
0,0,10080,1
1,0,19467,1
2,1,2665,1
3,1,7494,1
4,1,17068,1
...,...,...,...
113263,7945,2476,1
113264,7945,12319,1
113265,7945,21384,1
113266,7946,8699,1


### Data Preprocessing

In [4]:
# Preprocess the raw interactions. The resulting frame (displayed in the next cell)
# carries two extra columns, mapped_user_id and mapped_item_id.
# NOTE(review): the numbers printed under this cell come from dm.preprocess_data itself;
# they look like (count, min id, max id) for users and items — confirm in the module.
ratings = dm.preprocess_data(impl_ratings)

7947 0 7946
24896 0 25974


In [5]:
ratings

Unnamed: 0,user_id,item_id,impl_rating,mapped_user_id,mapped_item_id
0,0,10080,1,0,0
1,4342,10080,1,4342,0
2,5526,10080,1,5526,0
3,5923,10080,1,5923,0
4,0,19467,1,0,1
...,...,...,...,...,...
113263,7944,22542,1,7944,24891
113264,7944,24806,1,7944,24892
113265,7944,24912,1,7944,24893
113266,7944,24990,1,7944,24894


### Dataset Splitting

In [6]:
# Build the train / validation / test user-rating matrices from the preprocessed ratings.
# num_users and num_items are hard-coded; they match the 7947x24896 shape reported
# for the three sparse matrices displayed below.
# NOTE(review): val_perc / test_perc are presumably hold-out fractions — confirm in dm.dataset_splits.
urm_train,urm_val,urm_test = dm.dataset_splits(ratings, 
                                            num_users=7947, 
                                            num_items=24896, 
                                            val_perc=0.1, 
                                            test_perc=0.2)

In [7]:
urm_train

<7947x24896 sparse matrix of type '<class 'numpy.intc'>'
	with 81552 stored elements in Compressed Sparse Row format>

In [8]:
urm_val

<7947x24896 sparse matrix of type '<class 'numpy.intc'>'
	with 9062 stored elements in Compressed Sparse Row format>

In [9]:
urm_test

<7947x24896 sparse matrix of type '<class 'numpy.intc'>'
	with 22654 stored elements in Compressed Sparse Row format>

### First Function for similarity

In [13]:
def similarity(urm: sps.csc_matrix, shrink: int):
    """Compute the shrunk cosine similarity between every pair of items.

    Items are the columns of ``urm``. For items ``i`` and ``j`` the weight is
    ``dot(urm[:, i], urm[:, j]) / (norm_i * norm_j + shrink + 1e-6)``.

    Args:
        urm: Sparse user-rating matrix (rows are users, columns are items).
        shrink: Constant added to the denominator.

    Returns:
        A dense ``(num_items, num_items)`` ndarray whose diagonal is zero.
    """
    # The norms must come from the matrix passed in, not from a notebook-level
    # variable, otherwise the result is wrong for any other input.
    item_weights = np.sqrt(
        np.asarray(urm.power(2).sum(axis=0))
    ).ravel()

    num_items = urm.shape[1]
    # Dense item-by-item matrix of dot products between item columns.
    item_dot_product = urm.T.dot(urm).toarray()

    weights = np.empty(shape=(num_items, num_items))
    for item_id in range(num_items):
        numerator = item_dot_product[item_id]
        # The 1e-6 term avoids a division by zero when shrink is 0 and a norm is 0.
        denominator = item_weights[item_id] * item_weights + shrink + 1e-6

        weights[item_id] = numerator / denominator

    # An item must not be reported as similar to itself.
    np.fill_diagonal(weights, 0.0)

    return weights
    

In [14]:
%%time

# Time the first similarity implementation on the full training matrix (converted to CSC).
weights = similarity(urm_train.tocsc(), shrink=5)

Wall time: 7.73 s


### Cosine Similarity

We can implement different versions of a cosine similarity. Some of these are faster and others are slower.

The simplest version loops over every pair of items and computes their similarity one pair at a time.
$$ W_{i,j} 
= \cos(v_i, v_j) 
= \frac{v_i \cdot v_j}{\lVert v_i \rVert \, \lVert v_j \rVert + shrink} 
= \frac{\sum_{u \in U}{URM_{u,i} \cdot URM_{u,j}}}{\sqrt{\sum_{u \in U}{URM_{u,i}^2}} \cdot \sqrt{\sum_{u \in U}{URM_{u,j}^2}} + shrink} $$


In [15]:
def naive_similarity(urm: sps.csc_matrix, shrink: int):
    """Shrunk cosine similarity computed one item pair at a time.

    Every pair of columns of ``urm`` is visited with two nested loops, so the
    cost grows quadratically with the number of items.

    Args:
        urm: Sparse user-rating matrix; each column is one item profile.
        shrink: Constant added to the denominator.

    Returns:
        A dense ``(num_items, num_items)`` ndarray with a zero diagonal.
    """
    n_items = urm.shape[1]
    sim = np.empty(shape=(n_items, n_items))

    for i in range(n_items):
        col_i = urm[:, i]  # column vector for item i
        norm_i = np.sqrt(col_i.power(2).sum())

        for j in range(n_items):
            col_j = urm[:, j]  # column vector for item j
            norm_j = np.sqrt(col_j.power(2).sum())

            # 1x1 sparse product -> plain scalar
            dot_ij = col_i.T.dot(col_j).toarray()[0, 0]

            sim[i, j] = dot_ij / (norm_i * norm_j + shrink + 1e-6)

    # Self-similarity is not wanted.
    np.fill_diagonal(sim, 0.0)
    return sim


Another (faster) version computes the similarity one item at a time using vector products:
$$ W_{i,I} 
= cos(v_i, URM_{I}) 
= \frac{v_i \cdot URM_{I}}{|| v_i || IW_{I} + shrink} $$

where

$$ IW_{i} = \sqrt{{\Sigma_{u \in U}{URM_{u,i}^2}}}$$

In [16]:
def vector_similarity(urm: sps.csc_matrix, shrink: int):
    """Shrunk cosine similarity computed one item (row of the result) at a time.

    For each item, a single sparse matrix-vector product yields its dot
    product with every other item.

    Args:
        urm: Sparse user-rating matrix; each column is one item profile.
        shrink: Constant added to the denominator.

    Returns:
        A dense ``(num_items, num_items)`` ndarray with a zero diagonal.
    """
    # Euclidean norm of every item column, as a flat 1-D array.
    norms = np.asarray(np.sqrt(urm.power(2).sum(axis=0))).ravel()

    n_items = urm.shape[1]
    urm_transposed = urm.T
    sim = np.empty(shape=(n_items, n_items))

    for idx, norm in enumerate(norms):
        # (items x users) . (users x 1) -> dot product of item idx with all items
        dots = urm_transposed.dot(urm[:, idx]).toarray().ravel()
        sim[idx] = dots / (norm * norms + shrink + 1e-6)

    # Self-similarity is not wanted.
    np.fill_diagonal(sim, 0.0)
    return sim
    

Lastly, a faster but more memory-intensive version computes the whole similarity matrix at once using matrix products:
$$ W  
= \frac{URM^{t} \cdot URM}{IW^{t} IW + shrink} $$

In [10]:
def matrix_similarity(urm: sps.csc_matrix, shrink: int):
    """Shrunk cosine similarity computed with whole-matrix products.

    The item-by-item dot products and the outer product of the item norms
    are each built in one shot, so a dense ``(num_items, num_items)``
    denominator is materialised in memory.

    Args:
        urm: Sparse user-rating matrix; each column is one item profile.
        shrink: Constant added to the denominator.

    Returns:
        The ``(num_items, num_items)`` result of dividing the sparse product
        by the dense denominator, with a zero diagonal. In the run displayed
        in this notebook that result is a ``numpy.matrix``.
    """
    # Item norms kept as a 2-D row vector of shape (1, num_items).
    norms_row = np.asarray(np.sqrt(urm.power(2).sum(axis=0)))

    gram = urm.T.dot(urm)
    # (num_items, 1) . (1, num_items) -> outer product of the norms
    scale = norms_row.T.dot(norms_row) + shrink + 1e-6

    sim = gram / scale
    # Self-similarity is not wanted.
    np.fill_diagonal(sim, 0.0)

    return sim

In [12]:
# Shared inputs for the timing comparison of the three similarity implementations.
urm_csc = urm_train.tocsc()
shrink = 5
# The cells below slice urm_csc to its first slice_size rows and columns.
slice_size = 100

In [19]:
%%time 
# Run the pairwise implementation on the small slice only.
naive_weights = naive_similarity(urm_csc[:slice_size,:slice_size], shrink)
naive_weights

Wall time: 7.8 s


array([[0.        , 0.16666664, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.16666664, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.16666664,
        0.16666664],
       [0.        , 0.        , 0.        , ..., 0.16666664, 0.        ,
        0.16666664],
       [0.        , 0.        , 0.        , ..., 0.16666664, 0.16666664,
        0.        ]])

In [20]:
%%time
# Same slice, one matrix-vector product per item.
vector_weights = vector_similarity(urm_csc[:slice_size,:slice_size], shrink)
vector_weights

Wall time: 35.9 ms


array([[0.        , 0.16666664, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.16666664, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.16666664,
        0.16666664],
       [0.        , 0.        , 0.        , ..., 0.16666664, 0.        ,
        0.16666664],
       [0.        , 0.        , 0.        , ..., 0.16666664, 0.16666664,
        0.        ]])

In [13]:
%%time
# Same slice, whole-matrix products; the displayed result is a numpy matrix rather than an ndarray.
matrix_weights = matrix_similarity(urm_csc[:slice_size,:slice_size], shrink)
matrix_weights

Wall time: 12 ms


matrix([[0.        , 0.16666664, 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.16666664, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.16666664,
         0.16666664],
        [0.        , 0.        , 0.        , ..., 0.16666664, 0.        ,
         0.16666664],
        [0.        , 0.        , 0.        , ..., 0.16666664, 0.16666664,
         0.        ]])

In [22]:
# Compare with a floating-point tolerance: the two implementations reach the same
# values through different operations, so bit-for-bit equality is not guaranteed.
np.allclose(naive_weights, vector_weights)

True

In [23]:
# Compare with a floating-point tolerance: the two implementations reach the same
# values through different operations, so bit-for-bit equality is not guaranteed.
np.allclose(vector_weights, matrix_weights)

True

### Collaborative Filtering ItemKNN Recommender

In [14]:
class CFItemKNN(object):
    """Item-based collaborative-filtering KNN recommender.

    ``fit`` stores an item-item weight matrix produced by a pluggable
    similarity function; ``recommend`` scores items for one user by
    multiplying the user's profile row with that matrix.
    """

    def __init__(self, shrink: int):
        """Store the shrink term that ``fit`` forwards to the similarity function."""
        self.shrink = shrink
        # Item-item similarity matrix; stays None until fit() is called.
        self.weights = None

    def fit(self, urm_train: sps.csc_matrix, similarity_func: Callable[[sps.csc_matrix, int], np.ndarray]):
        """Compute and store the item-item weights.

        Args:
            urm_train: User-rating matrix in CSC format.
            similarity_func: Callable taking ``(urm, shrink)`` and returning
                the item-item weight matrix.

        Raises:
            TypeError: If ``urm_train`` is not a CSC matrix.
        """
        if not sps.isspmatrix_csc(urm_train):
            raise TypeError(f"We expected a CSC matrix, we got {type(urm_train)}")

        self.weights = similarity_func(urm_train, self.shrink)

    def recommend(self, user_id: int, urm_train: sps.csr_matrix, at: Optional[int] = None, remove_unseen: bool = True):
        """Return item indices ranked by decreasing score for one user.

        Args:
            user_id: Row index of the user in ``urm_train``.
            urm_train: User-rating matrix holding the user's profile.
            at: Length of the returned list; ``None`` returns every item.
            remove_unseen: Despite its name, when True the items already
                present in the user's profile (the *seen* ones) are pushed
                to the end of the ranking so they are not recommended.

        Returns:
            1-D ndarray of item indices, best first, truncated to ``at``.

        Raises:
            RuntimeError: If ``fit`` has not been called yet.
        """
        if self.weights is None:
            raise RuntimeError("CFItemKNN.fit() must be called before recommend()")

        user_profile = urm_train[user_id]

        # The product is an ndarray or a numpy matrix depending on the type of
        # self.weights; np.asarray handles both, whereas `.A` only exists on matrices.
        ranking = np.asarray(user_profile.dot(self.weights)).ravel()

        if remove_unseen:
            # Column indices stored in the user's own row. Reading them from
            # the row slice (in CSR form) stays correct whatever sparse format
            # urm_train uses.
            seen_items = user_profile.tocsr().indices

            ranking[seen_items] = -np.inf

        ranking = np.argsort(-ranking)
        return ranking[:at]


In [15]:
# Instantiate the recommender; the shrink value is forwarded to the similarity function in fit().
itemknn_recommender = CFItemKNN(shrink=50)

In [16]:
%%time

# fit() only accepts CSC input, hence the conversion; the weights cover all items of urm_train.
itemknn_recommender.fit(urm_train.tocsc(),matrix_similarity)

Wall time: 1min 4s


In [17]:
# Preview the top-10 recommendations for the first ten users.
for user_id in range(10):
    top_items = itemknn_recommender.recommend(user_id=user_id, urm_train=urm_train, at=10)
    print(top_items)

[ 1836 23196  1849  8104  8418  9408  2896  3021  1025  1707]
[1066 1704 1065 1105 1460 1526 2772  133 3235 5563]
[ 7240   562   736  1788 17378 24086 24087 14952 14949 20584]
[23336 23335 19640 18030  9120 10668  3377 19544   926  5233]
[1971 2841 1015 6855 1338 1642 3115 9825 7124  951]
[1912 5434  329    3 1502 1460  551  186 1065 1105]
[ 1609  3470  7617  6294  3420  3476  1424  1395 12479  1489]
[  338  1788 11632 11651  9473 11641 11616 11644 10346  2789]
[3041  303 2083 1824 6586 3974 4754 6268  552 2712]
[ 1066  4764  1065   489   445  8108  9090 10162  5004   238]


### Evaluator

In [19]:
from modUtils import evaluator as ev

In [21]:
%%time

# Evaluate the fitted recommender against the test split; ev.evaluate returns five values.
(accum_precision,
 accum_recall,
 accum_map,
 num_user_evaluated,
 num_users_skipped) = ev.evaluate(itemknn_recommender, urm_train, urm_test)

Wall time: 9.06 s


In [22]:
accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped

(0.027368609224168304, 0.08757787863894913, 0.042599220283393996, 5594, 2353)

### Hyperparameter Tuning

In [23]:
def hyperparameter_tuning(shrinks: Optional[List[int]] = None):
    """Evaluate CFItemKNN on the validation split for several shrink values.

    Reads the notebook-level ``urm_train`` and ``urm_val`` matrices, fits one
    recommender per shrink value with ``matrix_similarity`` and scores it with
    ``ev.evaluate``.

    Args:
        shrinks: Shrink values to try. Defaults to ``[0, 1, 5, 10, 50]``.

    Returns:
        A list of ``(shrink, (precision, recall, map))`` tuples, one per
        shrink value, in the order they were tried.
    """
    if shrinks is None:
        shrinks = [0, 1, 5, 10, 50]

    # The CSC conversion does not depend on the shrink value, so do it once.
    urm_train_csc = urm_train.tocsc()

    results = []
    for shrink in shrinks:
        print(f"Currently trying shrink {shrink}")
        recommender = CFItemKNN(shrink=shrink)
        recommender.fit(urm_train_csc, matrix_similarity)

        # The evaluator module is imported as `ev`; its entry point is `evaluate`,
        # which returns five values (the last two are user counts, unused here).
        ev_precision, ev_recall, ev_map, _, _ = ev.evaluate(recommender, urm_train, urm_val)

        results.append((shrink, (ev_precision, ev_recall, ev_map)))

    return results

In [None]:
%%time

hyperparameter_results = hyperparameter_tuning()

In [None]:
hyperparameter_results

### Submission to competition

In [25]:
# NOTE(review): best_shrink is hard-coded; the tuning cells above show no execution
# count, so this value is not derived from hyperparameter_results in this run.
best_shrink = 50
# Merge the train and validation interactions for the final fit.
urm_train_validation = urm_train + urm_val

In [26]:
# Final model: fitted on train + validation interactions with the chosen shrink.
best_recommender = CFItemKNN(shrink=best_shrink)
best_recommender.fit(urm_train_validation.tocsc(), matrix_similarity)

### User to recommend

In [35]:
# Development aid: re-import modUtils.dataManager so that edits to the module are
# picked up without restarting the kernel.
import importlib
importlib.reload(dm)

<module 'modUtils.dataManager' from 'C:\\Users\\delta\\Desktop\\Scuola\\Recommended_Systems\\Project\\modUtils\\dataManager.py'>

In [36]:
# Build the submission; the displayed result is a list of (user id, [10 item ids]) tuples.
# NOTE(review): the ids shown presumably are the original (unmapped) ones — confirm in dm.prepare_submission.
submission = dm.prepare_submission(ratings, urm_train_validation, best_recommender)

In [37]:
submission

[(0, [1447, 20451, 20869, 649, 8887, 23245, 24911, 18150, 25878, 14462]),
 (1, [23600, 12409, 19089, 19709, 3165, 8894, 2533, 16630, 20095, 18317]),
 (2, [13658, 10554, 23266, 991, 25491, 14433, 25693, 22328, 17912, 18900]),
 (3, [5412, 18569, 24093, 2074, 15833, 17793, 25400, 13689, 11341, 2442]),
 (4, [9007, 9243, 1611, 11384, 11792, 12061, 19704, 23127, 13837, 19781]),
 (5, [8097, 7494, 22445, 19709, 25407, 1240, 19089, 13928, 15691, 8894]),
 (6, [25044, 23906, 14356, 13102, 4824, 24383, 18173, 1848, 4774, 1136]),
 (7, [4209, 25693, 17305, 5665, 23267, 11159, 11453, 8104, 133, 9081]),
 (8, [1583, 21123, 2426, 25407, 2551, 10594, 17819, 17156, 15691, 18392]),
 (9, [9609, 21552, 23600, 9637, 14291, 19089, 15562, 22359, 1863, 19480]),
 (10, [23361, 660, 3556, 19792, 22923, 16921, 12375, 23919, 6354, 297]),
 (11, [12788, 7856, 3220, 17482, 6015, 12951, 18969, 17797, 20876, 21085]),
 (12, [17858, 5711, 5789, 19255, 23127, 23207, 19114, 24561, 5906, 23711]),
 (13, [25407, 6446, 15691, 209

In [38]:
# Tag the submission with today's date formatted as dd-mm-yy.
# NOTE(review): presumably dm.write_submission uses the tag to name the output file — confirm.
from datetime import date
today = date.today().strftime("%d-%m-%y")
dm.write_submission(submission, today)