### Importing of libraries

In [1]:
import os
from typing import Tuple, Callable, Dict, Optional, List

import numpy as np
import pandas as pd
import scipy.sparse as sps

from sklearn.model_selection import train_test_split

### Dataset Loading

In [22]:
def load_data():
    return pd.read_csv("./data/data_train.csv",
                      sep=",",
                      names=["user_id","item_id","impl_rating"],
                      header=None,
                      skiprows=1,
                      dtype={"user_id": np.int32,
                             "item_id": np.int32,
                             "impl_rating": np.int32})


In [23]:
impl_ratings = load_data()

In [25]:
impl_ratings

Unnamed: 0,user_id,item_id,impl_rating
0,0,10080,1
1,0,19467,1
2,1,2665,1
3,1,7494,1
4,1,17068,1
...,...,...,...
113263,7945,2476,1
113264,7945,12319,1
113265,7945,21384,1
113266,7946,8699,1


### Data Preprocessing

In [43]:
def preprocess_data(ratings: pd.DataFrame):
    unique_users = impl_ratings.user_id.unique()
    unique_items = impl_ratings.item_id.unique()
    
    num_users, min_user_id, max_user_id = unique_users.size, unique_users.min(), unique_users.max()
    num_items, min_item_id, max_item_id = unique_items.size, unique_items.min(), unique_items.max()
    
    print(num_users, min_user_id, max_user_id)
    print(num_items, min_item_id, max_item_id)
    
    mapping_user_id = pd.DataFrame({"mapped_user_id": np.arange(num_users), "user_id": unique_users})
    mapping_item_id = pd.DataFrame({"mapped_item_id": np.arange(num_items), "item_id": unique_items})
    
    ratings = pd.merge(left=ratings,
                      right=mapping_user_id,
                      how="inner",
                      on="user_id")
    
    ratings = pd.merge(left=ratings,
                      right=mapping_item_id,
                      how="inner",
                      on="item_id")
    return ratings

In [45]:
ratings = preprocess_data(impl_ratings)

7947 0 7946
24896 0 25974


In [46]:
ratings

Unnamed: 0,user_id,item_id,impl_rating,mapped_user_id,mapped_item_id
0,0,10080,1,0,0
1,4342,10080,1,4342,0
2,5526,10080,1,5526,0
3,5923,10080,1,5923,0
4,0,19467,1,0,1
...,...,...,...,...,...
113263,7944,22542,1,7944,24891
113264,7944,24806,1,7944,24892
113265,7944,24912,1,7944,24893
113266,7944,24990,1,7944,24894


### Dataset Splitting

In [55]:
def dataset_splits(ratings,num_users,num_items, val_perc: float, test_perc: float):
    seed=9876
    
    (uid_training, uid_test,
    iid_training, iid_test,
    ratings_training, ratings_test) = train_test_split(ratings.mapped_user_id,
                                                      ratings.mapped_item_id,
                                                      ratings.impl_rating,
                                                      test_size=test_perc,
                                                      shuffle=True,
                                                      random_state=seed)
    (uid_training, uid_validation,
    iid_training, iid_validation,
    ratings_training, ratings_validation) = train_test_split(uid_training,
                                                            iid_training,
                                                            ratings_training,
                                                            test_size=val_perc)
    
    urm_train = sps.csr_matrix((ratings_training,(uid_training,iid_training)), shape=(num_users,num_items))
    urm_val = sps.csr_matrix((ratings_validation,(uid_validation,iid_validation)), shape=(num_users,num_items))
    urm_test = sps.csr_matrix((ratings_test,(uid_test,iid_test)), shape=(num_users,num_items))
    
    return urm_train,urm_val,urm_test

In [56]:
urm_train,urm_val,urm_test = dataset_splits(ratings, 
                                            num_users=7947, 
                                            num_items=24896, 
                                            val_perc=0.1, 
                                            test_perc=0.2)

In [57]:
urm_train

<7947x24896 sparse matrix of type '<class 'numpy.intc'>'
	with 81552 stored elements in Compressed Sparse Row format>

In [58]:
urm_val

<7947x24896 sparse matrix of type '<class 'numpy.intc'>'
	with 9062 stored elements in Compressed Sparse Row format>

In [59]:
urm_test

<7947x24896 sparse matrix of type '<class 'numpy.intc'>'
	with 22654 stored elements in Compressed Sparse Row format>

## Cosine Similarity

We can implement different versions of a cosine similarity. Some of these are faster and others are slower.

The most simple version is just to loop item by item and calculate the similarity of item pairs.
$$ W_{i,j} 
= cos(v_i, v_j) 
= \frac{v_i \cdot v_j}{|| v_i || ||v_j ||} 
= \frac{\Sigma_{u \in U}{URM_{u,i} \cdot URM_{u,j}}}{\sqrt{\Sigma_{u \in U}{URM_{u,i}^2}} \cdot \sqrt{\Sigma_{u \in U}{URM_{u,j}^2}} + shrink} $$


Another (faster) version of the similarity is by operating on vector products
$$ W_{i,I} 
= cos(v_i, URM_{I}) 
= \frac{v_i \cdot URM_{I}}{|| v_i || IW_{I} + shrink} $$

and where 

$$ IW_{i} = \sqrt{{\Sigma_{u \in U}{URM_{u,i}^2}}}$$

Lastly, a faster but more memory-intensive version of the similarity is by operating on matrix products
$$ W  
= \frac{URM^{t} \cdot URM}{IW^{t} IW + shrink} $$

In [92]:
def similarity(urm: sps.csc_matrix, shrink: int):
    item_weights = np.sqrt(
                        np.sum(
                        urm_train.tocsc().power(2),
                        axis=0
                        )
                   ).A.flatten()
    
    num_items = urm.shape[1]
    item_dot_product = urm.T.dot(urm).todense()
    
    weights = np.empty(shape=(num_items, num_items))
    for item_id in range(num_items):
        numerator = item_dot_product[item_id]
        denominator = item_weights[item_id]* item_weights + shrink + 1e-6 
        
        weights[item_id] = numerator / denominator
        
    np.fill_diagonal(weights, 0.0)
    
    return weights
    

In [93]:
%%time

weights = similarity(urm_train.tocsc(), shrink=5)

Wall time: 13.4 s
