# User Collaborative Filtering

This script accommodates implementation of user-collaborative filtering from the original GroupLens paper ([Resnick et al 1994](http://dx.doi.org/10.1145/192844.192905)).
See here for a full write up

## Generate Synthetic Data 

In [254]:
### Imports 
import pandas as pd 
import numpy as np
from scipy import sparse  
from copy import deepcopy
import matplotlib.pyplot as plt 

In [267]:
# Generate some fake rating data to play with 
n_users = 1_000 # number of users
n_items = 100 # of items
ratings = sparse.random(n_users, n_items, density=0.10, format='csr', random_state=1985)
ratings.data = np.random.randint(0, 5, size=ratings.data.shape).astype(float)


In [268]:
# example rating for a single row
print(ratings[0])

  (0, 11)	3.0
  (0, 16)	4.0
  (0, 23)	4.0
  (0, 26)	4.0
  (0, 34)	1.0
  (0, 38)	3.0
  (0, 86)	4.0
  (0, 89)	2.0
  (0, 95)	0.0
  (0, 98)	1.0


## Working with scipy.sparse

In [269]:
print(dir(ratings[0]))

['__abs__', '__add__', '__array_priority__', '__bool__', '__class__', '__delattr__', '__dict__', '__dir__', '__div__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__iadd__', '__idiv__', '__imul__', '__init__', '__init_subclass__', '__isub__', '__iter__', '__itruediv__', '__le__', '__len__', '__lt__', '__matmul__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__nonzero__', '__pow__', '__radd__', '__rdiv__', '__reduce__', '__reduce_ex__', '__repr__', '__rmatmul__', '__rmul__', '__round__', '__rsub__', '__rtruediv__', '__setattr__', '__setitem__', '__sizeof__', '__slotnames__', '__str__', '__sub__', '__subclasshook__', '__truediv__', '__weakref__', '_add_dense', '_add_sparse', '_arg_min_or_max', '_arg_min_or_max_axis', '_asindices', '_binopt', '_cs_matrix__get_has_canonical_format', '_cs_matrix__get_sorted', '_cs_matrix__set_has_canonical_format', '_cs_matrix__set_sorted', '_deduped_data', '_divide', '_

In [270]:
# shape of first row
ratings[0].shape

(1, 100)

In [271]:
# Accessing and shape of certain rows and columns
print(f'''
    Row 1            dimension {ratings[0].shape}
    Row 1 transposed dimension {ratings[0].T.shape}
    Col 1            dimension {ratings[:, 0].shape}
    Col 1 transposed dimension {ratings[:, 0].T.shape}
    '''
)



    Row 1            dimension (1, 100)
    Row 1 transposed dimension (100, 1)
    Col 1            dimension (1000, 1)
    Col 1 transposed dimension (1, 1000)
    


In [272]:
# Multiplying rows
r1_norm2 = ratings[0].dot(ratings[0].T)
print(f'Inner product\n\t class: {type(r1_norm2)}\n\t shape: {r1_norm2.shape}\n\t value: {r1_norm2[0, 0]}')

Inner product
	 class: <class 'scipy.sparse.csr.csr_matrix'>
	 shape: (1, 1)
	 value: 88.0


## Writting a Similarity Method

In [273]:
def similarity(u, v, method='cosine'):   
    '''Return the similarity between entity u and v based on previous ratings

    Args: 
        u (sparse array) - the ratings vector for entity u 
        v (sparse array) - the ratings vector for entity v 
        method (str) - the similarity methods (cosine, adjusted-cosine, pearson)

    Returns: 
        float: A numerical score of the similarity between entities u and v ratings
    '''
    # format matrices
    if method=='cosine':
        sim = u.dot(v.T)[0,0] / (1e-5 +1.0 * np.sqrt(u.dot(u.T)[0,0] * v.dot(v.T)[0,0]))
        return(sim)        

    elif method=='pearson':
        # center vectors 
        u_cen, v_cen = deepcopy(u), deepcopy(v)
        u_cen.data -= u_cen.data.mean()
        v_cen.data -= v_cen.data.mean()

        # comptute similarity
        sim = u_cen.dot(v_cen.T)[0,0] / (1e-5 + 1.0 * np.sqrt(u_cen.dot(u_cen.T)[0,0] * v_cen.dot(v_cen.T)[0,0]))
        return(sim)

    else: 
        return("Please use method = {'cosine', 'pearson'}")

In [274]:
# Examples! 
ind1, ind2 = 0, 3
pearson = similarity(ratings[ind1], ratings[ind2], method='pearson')
cosine = similarity(ratings[ind1], ratings[ind2], method='cosine')

f'The pearson similarity is {np.round(pearson, 2)} and cosine similarity is {np.round(cosine, 2)}.'


'The pearson similarity is -0.14 and cosine similarity is 0.0.'

## User to User CF (Resnick 1994)

In [275]:
def cf_user_to_user(id, ratings, sim_method='pearson'):
    '''Return the imputed ratings for unobserved-ratings for entity u

    Args: 
        id (int) - id of the entity for the ratings vector to be applied for
        ratings (sparse matrix) - the ratings matrix to apply CF towards
        method (str) - the similarity methods (cosine, pearson)

    Returns: 
        cf_ratings (np.array): An np array with imputed estimates of ratings for un-rated articles. 
    '''
    # fetch average user rating
    n_users, n_items = ratings.shape
    
    # compute user-to-user similarity
    sims = np.array([similarity(ratings[id], ratings[user]) for user in range(n_users)])
    total_sim = np.sum(np.abs(sims))

    # residualize ratings
    ratings_resid = deepcopy(ratings)
    ratings_resid.data = ratings_resid.data.astype(float)
    for user in range(n_users): 
        ratings_resid[user].data -= ratings_resid[user].data.mean()
    
    # store previously rated item ratingss
    cf_ratings = np.full(n_items, np.nan)
    items_ranked = ratings[id].indices
    cf_ratings[items_ranked] = ratings[id].data

    # compute average rating for every item not ranked
    unrated_items = np.setdiff1d(np.arange(n_items), items_ranked)
    for item in unrated_items: 
        cf_ratings[item] = ratings[id].data.mean() + np.sum(sims * ratings[:,item])/total_sim

    # return ratings
    return(cf_ratings)



In [276]:
cf_user_to_user(1, ratings)

array([1.82957415, 1.82499751, 1.85299315, 1.87906179, 1.83613618,
       1.86996127, 1.83657094, 1.8580771 , 1.94402725, 1.88251208,
       1.87371842, 1.86202115, 1.8200888 , 1.8981778 , 1.91203758,
       1.        , 1.86173336, 1.87208495, 1.86493801, 4.        ,
       1.82864583, 1.86936068, 1.78605192, 1.86331963, 1.82654638,
       1.92066823, 1.        , 3.        , 1.86705059, 0.        ,
       1.8646681 , 1.        , 1.86397559, 1.81707945, 1.85286705,
       1.82227709, 1.83254404, 1.85650547, 0.        , 1.89703854,
       1.78317043, 1.90312655, 1.89427064, 1.89091876, 1.82548385,
       1.81282133, 1.82874793, 1.88443935, 1.86564691, 1.84104925,
       1.        , 1.90751662, 1.85747663, 1.95335813, 0.        ,
       1.89969457, 1.84086516, 1.84756262, 1.83743379, 2.        ,
       1.82107291, 1.84559239, 1.90879037, 1.81514726, 1.84039083,
       1.88859077, 1.90118094, 1.84929746, 1.83720612, 1.86472832,
       2.        , 1.88060354, 1.90305685, 1.89286469, 1.88215

In [None]:
import numpy as np
from scipy import sparse  
from copy import deepcopy

class CollabFilter(): 
    def __init__(self): 
        self.n_items = None
        self.n_users = None
        self.ratings = None
        self.cf_ratings = None 

    def generate_data(self, n_users, n_items, density=0.1, random_state=1985): 
        """
        Generate a random sparse ratings matrix.
        """
        # initialize
        self.n_users = n_users
        self.n_items = n_items

        # set up random generate 
        rng = np.random.default_rng(random_state)
        self.ratings = sparse.random(n_users, n_items, density=density, format='csr', random_state=rng)
        self.ratings.data = rng.integers(1, 6, size=self.ratings.data.shape).astype(float)

    def load_data(self, ratings):
        """
        Load an external sparse matrix as the ratings matrix.
        """
        if not isinstance(ratings, scipy.sparse.csr.csr_matrix): 
            raise ValueError("Ratings must be a scipy.sparse.csr_matrix.")
        
        self.n_users, self.n_items = ratings.shape
        self.ratings = ratings 
        
    @staticmethod 
    def similarity(u, v, method='cosine'):   
        '''Return the similarity between entity u and v based on previous ratings

        Args: 
            u (sparse array) - the ratings vector for entity u 
            v (sparse array) - the ratings vector for entity v 
            method (str) - the similarity methods (cosine, adjusted-cosine, pearson)

        Returns: 
            float: A numerical score of the similarity between entities u and v ratings
        '''
        # format matrices
        if method=='cosine':
            # compute l2-norms 
            u_norm = np.sqrt(u.dot(u.T)[0,0])
            v_norm = np.sqrt(v.dot(v.T)[0,0])

            # return similarity
            if u_norm == 0.0 or v_norm == 0.0: 
                return(0.0)
            return u.dot(v.T)[0,0] / (u_norm * v_norm) 

        elif method=='pearson':
            # center vectors 
            u_cen, v_cen = deepcopy(u), deepcopy(v)
            u_cen.data -= u_cen.data.mean()
            v_cen.data -= v_cen.data.mean()

            # comptute similarity
            sim = u_cen.dot(v_cen.T)[0,0] / (1e-5 + 1.0 * np.sqrt(u_cen.dot(u_cen.T)[0,0] * v_cen.dot(v_cen.T)[0,0]))
            return(sim)

        else: 
            raise ValueError("Please use method = {'cosine', 'pearson'}")
        

    def collaborative_filter(self, id, ratings, cf_method='user-to-user', sim_method='pearson'):
        '''Return the imputed ratings for unobserved-ratings for entity u

        Args: 
            id (int) - id of the entity for the ratings vector to be applied for
            ratings (sparse matrix) - the ratings matrix to apply CF towards
            method (str) - the similarity methods (cosine, pearson)

        Returns: 
            cf_ratings (np.array): An np array with imputed estimates of ratings for un-rated articles. 
        '''
        if self.ratings is None:
            raise ValueError("Ratings matrix not initialized. Use generate_data or load_data.")

        if not (0 <= id < self.n_users):
            raise ValueError(f"Invalid user ID. Must be in range [0, {self.n_users - 1}].")

        # compute user-to-user similarity
        sims = np.array([self.similarity(self.ratings[id], self.ratings[user]) for user in range(self.n_users)])
        total_sim = np.sum(np.abs(sims))

        # residualize ratings
        ratings_resid = deepcopy(self.ratings)
        ratings_resid.data = ratings_resid.data.astype(float)
        for user in range(self.n_users): 
            ratings_resid[user].data -= ratings_resid[user].data.mean()
        
        # store previously rated item ratings
        self.cf_ratings = np.full(self.n_items, np.nan)
        items_ranked = self.ratings[id].indices
        self.cf_ratings[items_ranked] = self.ratings[id].data

        # compute average rating for every item not ranked
        unrated_items = np.setdiff1d(np.arange(self.n_items), items_ranked)
        for item in unrated_items: 
            self.cf_ratings[item] = ratings[id].data.mean() + np.sum(sims * self.ratings[:,item])/total_sim

        return(1)

    


IndentationError: unindent does not match any outer indentation level (<tokenize>, line 19)